In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn import datasets, linear_model, preprocessing
import numpy.polynomial.polynomial as poly

import lime
import lime.lime_tabular

import copy

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [2]:
# Read the HELOC challenge CSV and keep both the DataFrame and its raw value matrix.
df = pd.read_csv("xML Challenge Dataset and Data Dictionary/heloc_dataset_v1.csv")
# NOTE(review): this lists every CSV column, including column 0, which is split
# off as the target just below — confirm whether downstream code expects that.
feature_names = df.columns.tolist()
data = df.values

# Column 0 becomes the (n_rows, 1) target slice; the remaining columns are the features.
y_original, X_original = data[:, :1], data[:, 1:]

In [3]:
# --- Basic Functions for Matrix editing --- 

def remove_row_with_all_same_val(data, target, val):
    """Drop every row of ``data`` whose entries are all equal to ``val``.

    The rows at the same positions are dropped from ``target`` so that the two
    arrays stay aligned.

    Parameters
    ----------
    data : array-like, 2-D
        Feature matrix to filter.
    target : array-like
        Array with one entry (or row) per row of ``data``.
    val : scalar
        Sentinel value; a row is removed only if *all* of its entries equal it.

    Returns
    -------
    tuple of np.ndarray
        ``(filtered_data, filtered_target)``. Inputs are never modified.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    # With no rows or no columns there is nothing that can "all equal val";
    # the previous loop hit an unbound `remove` flag in the zero-column case.
    if data.size == 0:
        return data, target

    # One boolean mask replaces the repeated np.delete calls, each of which
    # copied the whole array.
    all_match = np.asarray(data == val, dtype=bool).all(axis=1)
    keep = ~all_match
    return data[keep], target[keep]

def remove_row_with_vals(data, target, vals):
    """Drop every row of ``data`` that contains at least one value from ``vals``.

    The rows at the same positions are dropped from ``target`` so that the two
    arrays stay aligned.

    Parameters
    ----------
    data : array-like, 2-D
        Feature matrix to filter.
    target : array-like
        Array with one entry (or row) per row of ``data``.
    vals : container
        Values that disqualify a row; membership is tested with ``in``.

    Returns
    -------
    tuple of np.ndarray
        ``(filtered_data, filtered_target)``; always ndarrays, even when no
        row is removed. Inputs are never modified.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    # Build the keep-mask in one pass instead of calling np.delete per removed
    # row, which copied the whole array each time.
    keep = np.array(
        [not any(entry in vals for entry in row) for row in data],
        dtype=bool,
    )
    return data[keep], target[keep]


def remove_col_with_vals(data, vals):
    """Drop every column of ``data`` in which some row holds a value from ``vals``.

    ``data`` must expose ``.shape`` (a 2-D ndarray). If no column is affected
    the input object itself is returned; otherwise a new array without the
    flagged columns is returned.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]

    # A column is flagged as soon as any of its cells is a member of `vals`.
    flagged = [
        c for c in range(n_cols)
        if any(data[r][c] in vals for r in range(n_rows))
    ]

    if not flagged:
        return data
    return np.delete(data, flagged, 1)


def scaled_row(row):
    """Standardise the first ``features`` entries of ``row``.

    Relies on two notebook globals: ``features`` (number of entries to scale)
    and ``scaler`` (an object exposing per-column ``mean_`` and ``scale_``).
    Each entry is transformed as ``(value - mean) / scale`` and the result is
    returned as a new numpy array.
    """
    return np.array([
        (row[idx] - scaler.mean_[idx]) / scaler.scale_[idx]
        for idx in range(features)
    ])
        
        
def masked_arr(A, mask):
    """Return the columns of ``A`` whose ``mask`` entry is non-zero.

    Parameters
    ----------
    A : array-like, 2-D
        Matrix to take columns from.
    mask : sequence
        One flag per column of ``A``; column ``j`` is kept when ``mask[j] != 0``.

    Returns
    -------
    np.ndarray
        Array with the same number of rows as ``A`` and only the selected
        columns (dtype of ``A`` is preserved). An ``A`` without rows yields an
        empty 1-D array.
    """
    A = np.asarray(A)
    if A.shape[0] == 0:
        return np.array([])

    # Boolean column indexing replaces the element-by-element Python loops;
    # this function is called once per imputed row, so the loop cost added up.
    keep = np.asarray(mask)[:A.shape[1]] != 0
    return A[:, keep]

def distance(row1, row2):
    """Euclidean distance between ``row1`` and ``row2``.

    Iterates over the positions of ``row1``; ``row2`` must be indexable at each
    of those positions.
    """
    squared_sum = 0
    for pos, left in enumerate(row1):
        squared_sum += (left - row2[pos]) ** 2
    return np.sqrt(squared_sum)

# predict features using kNN imputation
# need to test for both weighted and simple mean
# using 3 or 5 neighbors
def predict_feature_weighted(row, C, k, originalArr, ft_idx):
    """Impute one feature as the distance-weighted mean of the k nearest rows.

    Parameters
    ----------
    row : sequence of float
        Query row, restricted to the features it has in common with ``C``.
    C : array-like, 2-D
        Candidate rows, column-aligned with ``row``.
    k : int
        Number of neighbours to use; clamped to ``len(C)`` when larger.
    originalArr : indexable
        ``originalArr[i][ft_idx]`` supplies the feature value of candidate ``i``.
    ft_idx : int
        Index of the feature being imputed inside ``originalArr``.

    Returns
    -------
    float
        Weighted mean of the neighbours' feature values, with weight
        ``1 - d / max_dist`` (``max_dist`` is the largest distance over all
        candidates). Falls back to the plain mean when every weight is zero.

    Raises
    ------
    ValueError
        If ``k < 1``, ``C`` has no rows, or rows of ``C`` are shorter than ``row``.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    row = np.asarray(row, dtype=float).ravel()
    C = np.asarray(C, dtype=float)
    n_candidates = len(C)
    if n_candidates == 0:
        raise ValueError("C must contain at least one candidate row")
    if C.ndim != 2 or C.shape[1] < row.size:
        raise ValueError("every row of C must have at least len(row) entries")

    # Euclidean distance from the query row to every candidate.
    diffs = C[:, :row.size] - row
    distances = np.sqrt((diffs ** 2).sum(axis=1))

    # argpartition needs kth < len(distances); with too few candidates use them all.
    n_neighbors = min(k, n_candidates)
    if n_neighbors < n_candidates:
        nearest = np.argpartition(distances, n_neighbors)[:n_neighbors]
    else:
        nearest = np.arange(n_candidates)

    values = np.array([originalArr[i][ft_idx] for i in nearest], dtype=float)

    max_dist = distances.max()
    if max_dist > 0:
        weights = 1.0 - distances[nearest] / max_dist
    else:
        # All candidates coincide with the query row: no distance information.
        weights = np.zeros(n_neighbors)

    total_weight = weights.sum()
    if total_weight <= 0:
        return values.mean()

    # Normalise by the total weight so the result is a weighted *mean* that
    # stays within the range of the neighbours' values (previously the raw
    # weighted sum was returned).
    return np.dot(weights, values) / total_weight

def predict_feature_mean(row, C, k, originalArr,ft_idx):
    """Impute one feature as the plain mean over the k nearest rows.

    Parameters
    ----------
    row : sequence of float
        Query row, restricted to the features it has in common with ``C``.
    C : array-like, 2-D
        Candidate rows, column-aligned with ``row``.
    k : int
        Number of neighbours to use; clamped to ``len(C)`` when larger.
    originalArr : indexable
        ``originalArr[i][ft_idx]`` supplies the feature value of candidate ``i``.
    ft_idx : int
        Index of the feature being imputed inside ``originalArr``.

    Returns
    -------
    float
        Arithmetic mean of the neighbours' feature values.

    Raises
    ------
    ValueError
        If ``k < 1``, ``C`` has no rows, or rows of ``C`` are shorter than ``row``.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    row = np.asarray(row, dtype=float).ravel()
    C = np.asarray(C, dtype=float)
    n_candidates = len(C)
    if n_candidates == 0:
        raise ValueError("C must contain at least one candidate row")
    if C.ndim != 2 or C.shape[1] < row.size:
        raise ValueError("every row of C must have at least len(row) entries")

    # Euclidean distance from the query row to every candidate.
    diffs = C[:, :row.size] - row
    distances = np.sqrt((diffs ** 2).sum(axis=1))

    # argpartition needs kth < len(distances); with too few candidates use them all.
    n_neighbors = min(k, n_candidates)
    if n_neighbors < n_candidates:
        nearest = np.argpartition(distances, n_neighbors)[:n_neighbors]
    else:
        nearest = np.arange(n_candidates)

    values = np.array([originalArr[i][ft_idx] for i in nearest], dtype=float)
    return values.mean()


In [4]:
# --- Drop the rows in which every feature equals -9 (target rows dropped in step) ---
X, y = remove_row_with_all_same_val(X_original, y_original, -9)

# --- Sanity check: shape before / after, features first, then target ---
for before, after in ((X_original, X), (y_original, y)):
    print(before.shape)
    print(after.shape)

# Working copy of the filtered feature matrix, so X itself stays untouched.
Y = np.copy(X)


(10459, 23)
(9871, 23)
(10459, 1)
(9871, 1)


In [5]:
# --- Collect the rows of Y that contain no negative entry ---

samples, features = Y.shape

# A row is kept only when none of its `features` entries is below zero.
Z = np.array([
    Y[r] for r in range(samples)
    if not any(Y[r][c] < 0 for c in range(features))
])

In [6]:
# --- Replacing -8 with k-nearest neighbours (average) ---

# Fit the scaler on Z (rows without negative entries) so that distances are
# computed in standardised units.
scaler = StandardScaler()
Z_std = scaler.fit_transform(Z)

# Imputed values are written into a copy; Y keeps the original -8 markers.
imputed_8_Y = np.copy(Y)

k = 5

# Column indices that are scanned for -8 entries.
cols_with_8 = [1,8,14,17,18,19,20,21,22]

for q in cols_with_8:

    # Progress marker: one line per processed column.
    print(q)
    print()
    print()

    for i in range(samples):

        if Y[i][q] != -8:
            continue

        # Only the non-negative entries of this row take part in the
        # neighbour search.
        scaled = scaled_row(Y[i])
        mask = np.array([1 if Y[i][j] >= 0 else 0 for j in range(features)])
        row_to_comp = np.array([scaled[j] for j in range(features) if mask[j]])

        # Restrict the reference rows to the same columns as the query row.
        S = masked_arr(Z_std, mask)

        imputed = predict_feature_mean(row_to_comp, S, k, Z_std, q)

        # Map the standardised estimate back to the original scale.
        # NOTE(review): if imputed_8_Y has an integer dtype this assignment
        # truncates the value — confirm the dtype.
        imputed_8_Y[i][q] = imputed*scaler.scale_[q] + scaler.mean_[q]

        # Sanity check: a negative imputed value is treated as an error. Fail
        # loudly instead of spinning forever, which used to hang the kernel.
        if imputed_8_Y[i][q] < 0:
            raise ValueError(
                "kNN imputation produced a negative value {} for row {}, column {}; "
                "original row: {}".format(imputed_8_Y[i][q], i, q, Y[i])
            )



1


8


14


17


18


19


20


21


22




In [7]:
# Independent copy of the matrix produced by the -8 imputation cell, so later
# edits to X_no8 do not write back into imputed_8_Y.
X_no8 = np.copy(imputed_8_Y)

In [8]:
# --- Selecting the column with -9  ---
# Takes column 0 of X_no8 as an (n_rows, 1) slice; basic slicing returns a view
# of X_no8, not a copy.
# NOTE(review): presumably column 0 is the one that still contains -9 markers — TODO confirm.
y_nine = X_no8[:,:1]



[[55]
 [61]
 [67]
 ..., 
 [74]
 [72]
 [66]]


In [None]:
# --- Selecting the columns with -7  ---
# NOTE(review): this cell only re-copies imputed_8_Y into X_no8 (the same
# statement as the In [7] cell); no -7 column selection happens here — TODO confirm intent.
X_no8 = np.copy(imputed_8_Y)




In [None]:
# # --- Removing the columns that have -8 values ---
# X2 = remove_col_with_vals(X,[-8])

# # --- Testing ---
# print(X.shape)
# print(X2.shape)


In [None]:
def test_model_degrees(X_tr,y_tr,degrees):
    """Fit one polynomial per degree and plot the training error against degree.

    Parameters
    ----------
    X_tr : array-like, 1-D
        Sample points passed to ``poly.polyfit``.
    y_tr : array-like, shape (M,) or (M, K)
        Training targets.
    degrees : sequence of int
        Polynomial degrees to evaluate, plotted on the x-axis in this order.

    Returns
    -------
    None
        A new matplotlib figure is created as a side effect. The plotted
        quantity is the mean squared residual on the training data.
    """
    y_arr = np.asarray(y_tr)

    RSS_list = []
    for deg in degrees:
        # Use the loop variable and the y_tr argument; the names `degree` and
        # `Y_tr` used previously are not defined in this function.
        model = poly.polyfit(X_tr, y_tr, deg)
        pred = poly.polyval(X_tr, model)
        if y_arr.ndim == 2:
            # For 2-D targets polyval returns shape (K, M); transpose so the
            # residual is taken element-wise instead of being broadcast.
            pred = pred.T
        RSS = np.mean((y_arr - pred)**2)
        RSS_list.append(RSS)

    plt.figure()
    plt.plot(degrees,RSS_list,'.-',color='r')
    plt.xlabel('Degree')
    plt.ylabel('Performance')
    

def predict_values_poly_reg(X_tr,y_tr,X_test,deg):
    """Fit a polynomial of degree ``deg`` on the training data and evaluate it on ``X_test``.

    Parameters
    ----------
    X_tr : array-like, 1-D
        Training sample points.
    y_tr : array-like
        Training targets.
    X_test : array-like
        Points at which the fitted polynomial is evaluated.
    deg : int
        Degree of the fitted polynomial.

    Returns
    -------
    np.ndarray
        Output of ``poly.polyval(X_test, coefficients)``.
    """
    # Use the `deg` argument; the name `degree` used previously is undefined here.
    model = poly.polyfit(X_tr, y_tr, deg)
    return poly.polyval(X_test, model)

def predict_values_lin_reg(X_tr,y_tr,X_test):
    """Fit an ordinary least-squares ``LinearRegression`` on the training data
    and return its predictions for ``X_test``."""
    regressor = linear_model.LinearRegression()
    # fit() returns the estimator itself, so the prediction can be chained.
    return regressor.fit(X_tr, y_tr).predict(X_test)
    