In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn import datasets, linear_model, preprocessing
import numpy.polynomial.polynomial as poly

import lime
import lime.lime_tabular

import copy

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [2]:
# Read the raw HELOC table and keep its column labels for later reference.
df = pd.read_csv("xML Challenge Dataset and Data Dictionary/heloc_dataset_v1.csv")
feature_names = df.columns.tolist()
data = df.values

# The first column is split off as y; everything else becomes X.
# Both slices stay 2-D so they can be joined column-wise again later.
y_original, X_original = data[:, :1], data[:, 1:]

In [3]:
# --- Basic Functions for Matrix editing --- 

def remove_row_with_all_same_val(data, target, val):
    """Drop every row of ``data`` whose entries all equal ``val``.

    The rows at the same positions are dropped from ``target`` so the two
    arrays stay aligned.

    Parameters
    ----------
    data : array-like, 2-D
        Rows to inspect.
    target : array-like
        Array with one row per row of ``data``.
    val : object
        Value a row must consist of entirely in order to be removed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, target)`` without the removed rows.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    # A row without any cell cannot be "filled with val", so it is kept.
    # Building one mask avoids re-allocating both arrays for every removed row.
    drop = np.array(
        [len(row) > 0 and all(cell == val for cell in row) for row in data],
        dtype=bool,
    )
    keep = ~drop
    return data[keep], target[keep]

def remove_row_with_vals(data, target, vals):
    """Drop every row of ``data`` that contains at least one value from ``vals``.

    The rows at the same positions are dropped from ``target`` as well.

    Parameters
    ----------
    data : array-like, 2-D
        Rows to inspect.
    target : array-like
        Array with one row per row of ``data``.
    vals : container
        Values that disqualify a row (tested with ``in``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, target)`` without the removed rows.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    # One boolean mask replaces the per-row np.delete calls, each of which
    # copied both arrays in full.
    keep = np.array(
        [not any(cell in vals for cell in row) for row in data],
        dtype=bool,
    )
    return data[keep], target[keep]


def remove_col_with_vals(data, vals):
    """Drop every column of ``data`` that contains a value from ``vals`` in any row.

    Parameters
    ----------
    data : array-like, 2-D
        Matrix to filter.
    vals : container
        Values that disqualify a column (tested with ``in``).

    Returns
    -------
    numpy.ndarray
        ``data`` restricted to the columns free of every value in ``vals``;
        the relative order of the surviving columns is unchanged.
    """
    data = np.asarray(data)

    # Decide per column once, then slice once, instead of deleting columns
    # one at a time while scanning the matrix row by row.
    keep = np.array(
        [
            not any(cell in vals for cell in data[:, col])
            for col in range(data.shape[1])
        ],
        dtype=bool,
    )
    return data[:, keep]


def scaled_row(row):
    """Standardise the first ``features`` entries of ``row``.

    Uses the module-level ``scaler``: each entry has ``scaler.mean_`` subtracted
    and is divided by ``scaler.scale_`` at the same position. The number of
    entries processed comes from the module-level ``features``.

    Returns
    -------
    numpy.ndarray
        The standardised entries.
    """
    return np.array([
        (row[pos] - scaler.mean_[pos]) / scaler.scale_[pos]
        for pos in range(features)
    ])
        
        
def masked_arr(A, mask):
    """Return the columns of ``A`` whose ``mask`` entry is non-zero.

    Parameters
    ----------
    A : array-like, 2-D
        Matrix to take columns from.
    mask : sequence
        One flag per column of ``A``; a column is kept when its flag ``!= 0``.

    Returns
    -------
    numpy.ndarray
        Copy of ``A`` restricted to the selected columns (an empty array when
        ``A`` has no rows).
    """
    if len(A) == 0:
        return np.array([])

    A = np.asarray(A)
    # Only the flags that line up with an existing column are consulted.
    keep = np.asarray(mask)[:A.shape[1]] != 0
    # A single boolean column selection replaces the cell-by-cell Python copy.
    return A[:, keep]

def distance(row1, row2):
    """Euclidean distance between ``row1`` and ``row2``.

    Only the first ``len(row1)`` positions of ``row2`` are read.
    """
    squared_sum = sum(
        (row1[pos] - row2[pos]) ** 2 for pos in range(len(row1))
    )
    return np.sqrt(squared_sum)

# predict features using kNN imputation
# need to test for both weighted and simple mean
# using 3 or 5 neighbors
def predict_feature_weighted(row, C, k, originalArr, ft_idx):
    """Impute one feature as the distance-weighted mean of its k nearest neighbours.

    Parameters
    ----------
    row : array-like, 1-D
        Query vector, expressed in the same columns as ``C``.
    C : array-like, 2-D
        Candidate rows the query is compared against (Euclidean distance).
    k : int
        Number of neighbours to use; capped at the number of candidate rows.
    originalArr : array-like, 2-D
        Array indexed with the same row numbers as ``C``; its column ``ft_idx``
        supplies the neighbours' values.
    ft_idx : int
        Column of ``originalArr`` to impute from.

    Returns
    -------
    float
        Weighted mean of the neighbours' values. Each neighbour is weighted by
        ``1 - d / d_max``, with ``d_max`` the largest distance to any candidate.

    Raises
    ------
    ValueError
        If ``C`` has no rows or ``k`` is smaller than 1.
    """
    candidates = np.asarray(C, dtype=float)
    query = np.asarray(row, dtype=float)
    if candidates.shape[0] == 0 or k < 1:
        raise ValueError("need at least one candidate row and k >= 1")

    # Euclidean distance from the query to every candidate row.
    distances = np.sqrt(((candidates - query) ** 2).sum(axis=1))

    # kth = k - 1 is enough to put the k smallest distances first, and it
    # stays in bounds when k equals the number of candidates.
    k = min(k, distances.shape[0])
    nearest = np.argpartition(distances, k - 1)[:k]
    values = np.array([originalArr[n][ft_idx] for n in nearest], dtype=float)

    max_dist = distances.max()
    if max_dist == 0:
        # Every candidate coincides with the query: no basis for weighting.
        return values.mean()

    weights = 1.0 - distances[nearest] / max_dist
    weight_sum = weights.sum()
    if weight_sum == 0:
        # All selected neighbours sit at the maximum distance.
        return values.mean()

    # Normalise by the weight total so the result is a mean, not a sum.
    return (weights * values).sum() / weight_sum

def predict_feature_mean(row, C, k, originalArr,ft_idx):
    """Impute one feature as the plain mean of its k nearest neighbours.

    Parameters
    ----------
    row : array-like, 1-D
        Query vector, expressed in the same columns as ``C``.
    C : array-like, 2-D
        Candidate rows the query is compared against (Euclidean distance).
    k : int
        Number of neighbours to average; capped at the number of candidate rows.
    originalArr : array-like, 2-D
        Array indexed with the same row numbers as ``C``; its column ``ft_idx``
        supplies the neighbours' values.
    ft_idx : int
        Column of ``originalArr`` to impute from.

    Returns
    -------
    float
        Arithmetic mean of the k nearest neighbours' values.

    Raises
    ------
    ValueError
        If ``C`` has no rows or ``k`` is smaller than 1.
    """
    candidates = np.asarray(C, dtype=float)
    query = np.asarray(row, dtype=float)
    if candidates.shape[0] == 0 or k < 1:
        raise ValueError("need at least one candidate row and k >= 1")

    # Euclidean distance from the query to every candidate row.
    distances = np.sqrt(((candidates - query) ** 2).sum(axis=1))

    # kth = k - 1 is enough to put the k smallest distances first, and it
    # stays in bounds when k equals the number of candidates.
    k = min(k, distances.shape[0])
    nearest = np.argpartition(distances, k - 1)[:k]

    values = np.array([originalArr[n][ft_idx] for n in nearest], dtype=float)
    return values.mean()


In [4]:
def test_model_degrees(X_tr,y_tr,degrees):
    RSS_list = []
    for deg in degrees:
        model = poly.polyfit(X_tr,y_tr,degree)
        pred = poly.polyval(X_tr, model)
        RSS = np.mean((Y_tr-pred)**2)
        RSS_list.append(RSS)
        
        
    plt.figure()
    plt.plot(degrees,RSS_list,'.-',color='r') 
    plt.xlabel('Degree')                            
    plt.ylabel('Performance')
    

def predict_values_poly_reg(X_tr,y_tr,X_test,deg):
    """Fit a polynomial of degree ``deg`` and evaluate it at ``X_test``.

    Parameters
    ----------
    X_tr : array-like, 1-D
        Training sample points (``polyfit`` is univariate).
    y_tr : array-like
        Training targets.
    X_test : array-like
        Points at which the fitted polynomial is evaluated.
    deg : int
        Degree of the fitted polynomial.

    Returns
    -------
    numpy.ndarray
        Polynomial values at ``X_test``.
    """
    # Use the ``deg`` argument; the previous body read an undefined ``degree``.
    model = poly.polyfit(X_tr, y_tr, deg)
    return poly.polyval(X_test, model)

def predict_values_lin_reg(X_tr,y_tr,X_test):
    """Fit an ordinary linear regression on the training pair and predict ``X_test``.

    Returns whatever the fitted ``LinearRegression.predict`` yields for ``X_test``.
    """
    regressor = linear_model.LinearRegression()
    # ``fit`` hands back the estimator itself, so the calls can be chained.
    return regressor.fit(X_tr, y_tr).predict(X_test)
    

In [5]:
# --- Drop the rows that consist of -9 in every column ---
X, y = remove_row_with_all_same_val(X_original, y_original, -9)

# --- Sanity check: shapes before and after the filter ---
for before, after in ((X_original, X), (y_original, y)):
    print(before.shape)
    print(after.shape)

# Independent copy of the filtered features; the following cells work on it.
Y = np.copy(X)


(10459, 23)
(9871, 23)
(10459, 1)
(9871, 1)


In [6]:
# --- Keep only the rows of Y that contain no negative entry ---

samples, features = Y.shape

# A row is kept when none of its cells is below zero.
Z = np.array([
    Y[r]
    for r in range(samples)
    if not any(Y[r][c] < 0 for c in range(features))
])

In [7]:
# --- Replacing -8 with the plain mean of the k nearest neighbours ---

# Standardisation statistics come from Z; neighbours are searched in Z_std.
scaler = StandardScaler()
Z_std = scaler.fit_transform(Z)

# Imputed values are written into a copy, so Y itself is never modified.
imputed_8_Y = np.copy(Y)

k = 5

cols_with_8 = [1,8,14,17,18,19,20,21,22]

for q in cols_with_8:
    for i in range(samples):
        if Y[i][q] != -8:
            continue

        scaled = scaled_row(Y[i])

        # Only the non-negative entries of this row take part in the distance.
        mask = np.array([1 if Y[i][j] >= 0 else 0 for j in range(features)])
        row_to_comp = np.array(
            [scaled[j] for j in range(features) if mask[j] == 1]
        )

        # Restrict the reference rows to the same columns as the query.
        S = masked_arr(Z_std, mask)

        imputed = predict_feature_mean(row_to_comp, S, k, Z_std, q)

        # Undo the standardisation before storing the value.
        imputed_8_Y[i][q] = imputed*scaler.scale_[q] + scaler.mean_[q]



In [78]:
# Put y in front of the imputed features, then recode "Bad" -> 0 and "Good" -> 1.
data_no8 = np.concatenate((y, imputed_8_Y), axis=1)
data_no8[data_no8 == "Bad"] = 0
data_no8[data_no8 == "Good"] = 1

In [79]:
def data_organizer(all_data,target_col,target_val,no_col=0):
    """Split ``all_data`` into a regression problem for one column.

    Column ``target_col`` becomes y and the remaining columns become X. Rows
    whose y equals ``target_val`` are set aside as the rows to predict.

    Parameters
    ----------
    all_data : array-like, 2-D
        Full table.
    target_col : int
        Index of the column to treat as the target.
    target_val : object
        Marker value identifying the rows whose target must be predicted.
    no_col : int, optional
        Currently unused; kept so existing calls stay valid.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, X_target)``: features and single-column targets of the rows
        whose target differs from ``target_val``, and the features of the rows
        whose target equals it. Row order is preserved within each part.
    """
    all_data = np.asarray(all_data)
    y = all_data[:, target_col:target_col + 1]
    X = np.delete(all_data, target_col, 1)

    # One mask replaces the per-row np.append / np.delete calls, each of which
    # copied whole arrays.
    is_target = np.array(
        [cell == target_val for cell in y[:, 0]],
        dtype=bool,
    )

    X_target = X[is_target]
    return X[~is_target], y[~is_target], X_target

def combine_parts(X,y,X_target,y_target):
    """Stitch the predicted rows and the untouched rows back into one table.

    ``y_target`` is reshaped into a single column and placed in front of
    ``X_target``; ``y`` is placed in front of ``X``. The predicted rows are
    stacked above the untouched ones, so the target always ends up as the
    first column of the result.
    """
    predicted_col = y_target.reshape((y_target.shape[0], 1))
    predicted_rows = np.concatenate((predicted_col, X_target), axis=1)
    known_rows = np.concatenate((y, X), axis=1)
    return np.concatenate((predicted_rows, known_rows), axis=0)
    
    

In [80]:
# Column 1 is the target here; rows holding -9 in it are the ones to predict.
X_nine, y_nine, X_nine_missing = data_organizer(
    data_no8, target_col=1, target_val=-9
)

In [81]:
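# --- Creating a training and test case from the column with outstanding -9 values ---
# NOTE(review): everything below is commented out; it appears to be an earlier inline
# draft of the split that data_organizer(data_no8, 1, -9) now performs.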

# """
# y_nine
# X_nine
# X_nine_missing
# """
# y_nine = data_no8[:,1:2]
# X_nine = np.delete(data_no8,1,1)

# nine = -9

# X_nine_missing = np.zeros((1,X_nine.shape[1]))

# row_no = 1 
# for val in y_nine:
#     if (val[0] == nine):
#         X_nine_missing = np.append(X_nine_missing,X_nine[row_no:row_no+1,:],axis=0)

#         X_nine = np.delete(X_nine, row_no, 0)
#         y_nine = np.delete(y_nine, row_no, 0) 
#     else:
#         row_no += 1
# X_nine_missing = np.delete(X_nine_missing,0,0)


In [82]:
# --- Removing the columns with -7 values ---
y_nine_tr = np.copy(y_nine)

# Filter training and test rows together. Filtering them separately drops a
# column from one set only when -7 happens to appear in that set, which can
# leave the two matrices with different feature columns.
n_nine_tr = X_nine.shape[0]
X_nine_all = remove_col_with_vals(
    np.append(X_nine, X_nine_missing, axis=0), [-7]
)
X_nine_tr, X_nine_test = X_nine_all[:n_nine_tr], X_nine_all[n_nine_tr:]

In [83]:
# --- Sanity check: test features, training features, training targets ---
for part in (X_nine_test, X_nine_tr, y_nine_tr):
    print(part.shape)

(10, 21)
(9861, 21)
(9861, 1)


In [84]:
# --- Linear regression supplies the values that were -9 ---
y_nine_missing = predict_values_lin_reg(X_nine_tr, y_nine_tr, X_nine_test)

# --- Piecing it together ---
# combine_parts puts the predicted rows first and the target as the first column.
data_no89 = combine_parts(
    X=X_nine, y=y_nine, X_target=X_nine_missing, y_target=y_nine_missing
)

print(data_no89.shape)



(9871, 24)


In [85]:
# --- Selecting the first column with -7  ---

X_seven,y_seven,X_seven_missing = data_organizer(data_no89,9,-7)

# --- Removing the other column with -7 values ---
y_seven_tr = np.copy(y_seven)

# Filter training and test rows together. Filtering them separately drops a
# column from one set only when -7 happens to appear in that set, which can
# leave the two matrices with different feature columns.
n_seven_tr = X_seven.shape[0]
X_seven_all = remove_col_with_vals(
    np.append(X_seven, X_seven_missing, axis=0), [-7]
)
X_seven_tr, X_seven_test = X_seven_all[:n_seven_tr], X_seven_all[n_seven_tr:]

print(X_seven_test.shape)
print(X_seven_tr.shape)
print(y_seven_tr.shape)

(4664, 22)
(5207, 22)
(5207, 1)


In [None]:
# The original line ended in a stray ':' (a syntax error) and referred to an
# undefined ``degrees``. numpy's polyfit is univariate, so the sweep is run
# against a single feature column rather than the whole feature matrix.
# NOTE(review): the feature column and the candidate degrees below are
# placeholders -- confirm which ones were intended.
poly_feature_idx = 0
candidate_degrees = [1, 2, 3, 4, 5]
test_model_degrees(
    X_seven_tr[:, poly_feature_idx].astype(float),
    y_seven_tr[:, 0].astype(float),
    candidate_degrees,
)


In [86]:
# --- Linear regression supplies the values that were -7 in the selected column ---
y_seven_missing = predict_values_lin_reg(
    X_tr=X_seven_tr, y_tr=y_seven_tr, X_test=X_seven_test
)

In [87]:
# Re-assemble the table: predicted rows first, target column in front.
# This overwrites data_no89 with the version whose first -7 column is filled in.
data_no89 = combine_parts(
    X=X_seven, y=y_seven, X_target=X_seven_missing, y_target=y_seven_missing
)
print(data_no89.shape)

(9871, 24)


In [94]:
# --- Selecting the second column with -7  ---

X_seven, y_seven, X_seven_missing = data_organizer(
    data_no89, target_col=15, target_val=-7
)

# --- No column is dropped this time: train/test parts are plain copies ---
y_seven_tr, X_seven_tr, X_seven_test = (
    np.copy(part) for part in (y_seven, X_seven, X_seven_missing)
)

for part in (X_seven_test, X_seven_tr, y_seven_tr):
    print(part.shape)

(1855, 23)
(8016, 23)
(8016, 1)


In [95]:
# --- Linear regression supplies the values that were -7 in the selected column ---
y_seven_missing = predict_values_lin_reg(
    X_tr=X_seven_tr, y_tr=y_seven_tr, X_test=X_seven_test
)

In [96]:
# Final table: predicted rows first, the column imputed last as the first column.
X_processed = combine_parts(
    X=X_seven, y=y_seven, X_target=X_seven_missing, y_target=y_seven_missing
)
print(X_processed)

[[3.748628528655166 56.39038425695428 75.95160492833523 ..., 1.8
  2.8000000000000003 81.0]
 [8.249145749738885 60.232763974821054 85.39024838260647 ..., 1.8
  2.8000000000000003 82.200000000000003]
 [5.022676224208863 46.55772837014838 88 ..., 3 0 71]
 ..., 
 [7 80 73 ..., 2 0 100]
 [1 28 65 ..., 2 1 80]
 [6 35 72 ..., 1 0 38]]


In [100]:
# Convert to a float array and round every entry to the nearest whole number.
# The builtin ``float`` is used because the ``np.float`` alias no longer exists
# in current NumPy releases; it referred to the same type.
X_processed = np.around(X_processed.astype(float), 0)

In [101]:
# Wrap the processed matrix in a DataFrame and write it out as CSV
# (default options, so the row index and positional column labels are included).
new_df = pd.DataFrame(data=X_processed)
new_df.to_csv(path_or_buf="test_2.csv")
