In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn import datasets, linear_model, preprocessing
import numpy.polynomial.polynomial as poly

import lime
import lime.lime_tabular

import copy

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [2]:
df = pd.read_csv("xML Challenge Dataset and Data Dictionary/heloc_dataset_v1.csv")
feature_names = list(df)
data = df.values

# --- Converting target to binary --- 
np.place(data, data == "Bad", 0)
np.place(data, data == "Good", 1)


In [3]:
# --- Shift categorical features so they match  ---

# Assumes array without labels where MaxDelq2PublicRecLast12M is in [10] and MaxDelqEver is in [11]

def shift_categorical(data):
    col_ten = data[:,10]
    np.place(col_ten, col_ten == 5, 6)
    np.place(col_ten, col_ten == 7, 5)
    np.place(col_ten, col_ten == 8, 7)
    np.place(col_ten, col_ten == 9, 7)
    
    col_eleven = data[:,11]
    np.place(col_eleven, col_eleven == 2, 0)
    np.place(col_eleven, col_eleven == 4, 2)
    np.place(col_eleven, col_eleven == 6, 4)
    np.place(col_eleven, col_eleven == 7, 6)
    np.place(col_eleven, col_eleven == 9, 7)
    np.place(col_eleven, col_eleven == 1, 7)
    np.place(col_eleven, col_eleven == 3, 1)
    np.place(col_eleven, col_eleven == 5, 3)
    np.place(col_eleven, col_eleven == 8, 5)

    data[:,10] = col_ten
    data[:,11] = col_eleven
    
    return data


In [4]:
# --- Basic Functions for Matrix editing --- 

def remove_row_with_all_same_val(data, target, val):
    row_no = 0 
    for row in data:
        for col in row:
            if (col == val):
                remove = True
            else:
                remove = False
                break

        if remove:
            data = np.delete(data, row_no, 0)
            target = np.delete(target, row_no, 0)     

        else:
            row_no += 1
    return data,target

def remove_row_with_vals(data, target, vals):
    row_no = 0 
    for row in data:
        for col in row:
            if (col in vals):
                data = np.delete(data, row_no, 0)
                target = np.delete(target, row_no, 0) 
                row_no -= 1
                break
        row_no += 1
    return data,target


def remove_col_with_vals(data, vals):
    no_cols = data.shape[1]
    no_rows = data.shape[0]
    row = 0
    while (no_rows > row):
        col = 0
        while (no_cols > col):
            if (data[row][col] in vals):
                data = np.delete(data, col, 1)
                no_cols -= 1
            else:
                col += 1
        row += 1     
    return data


def scaled_row(row):
    scld = []
    for k in range(features):
        scld.append((row[k] - scaler.mean_[k])/scaler.scale_[k])
    scld = np.array(scld)
    return scld
        
        
def masked_arr(A, mask):
    B = []
    for i in range(len(A)):
        row = []
        for j in range(len(A[0])):
            if mask[j] != 0:
                row.append(A[i][j])
        B.append(row)
    B = np.array(B)
    return B


def distance(row1, row2):
    dist = 0
    for i in range(len(row1)):
        t = (row1[i]-row2[i])**2
        dist += t
    dist = np.sqrt(dist)
    return dist

# predict features using kNN imputation
# need to test for both weighted and simple mean
# using 3 or 5 neighbors
def predict_feature_weighted(row, C, k, originalArr, ft_idx):
    
    distances = []
    for i in range(len(C)):
        distances.append(distance(row,C[i]))
    distances = np.array(distances)
    
    max_dist = np.max(distances)
        
    idx = np.argpartition(distances, k)

    values = []
    min_dists = []
    
    for i in range(k):
        values.append(originalArr[idx[i]][ft_idx])
#         print(Z[idx[i]])
        min_dists.append(distances[idx[i]])
    values = np.array(values) 
    min_dists = np.array(min_dists)

    weights = []
    for i in min_dists:
        weights.append(1 - (i/max_dist))
        
    imputed_val = 0
    for i in range(len(weights)):
        imputed_val += weights[i] * values[i]
        
    return imputed_val    


def predict_feature_mean(row, C, k, originalArr, ft_idx):
    
    distances = []
    for i in range(len(C)):
        distances.append(distance(row,C[i]))
    distances = np.array(distances)
        
    idx = np.argpartition(distances, k)
    
    values = []
    min_dists = []
    for i in range(k):
        values.append(originalArr[idx[i]][ft_idx])
#         print(Z[idx[i]])
        min_dists.append(distances[idx[i]])
    values = np.array(values) 
    min_dists = np.array(min_dists)
        
    imputed_val = 0
    for i in range(len(values)):
        imputed_val += values[i]/(len(values))
        
    return imputed_val

In [5]:
#data = shift_categorical(data)

In [6]:
# --- Removing the rows with only -9 values ---
y_original= data[:,:1]
X_original = data[:,1:]
X, y = remove_row_with_all_same_val(X_original,y_original,-9)

In [7]:
# --- Taking the array without any negatives present ---

samples, features = X.shape

X_good = []

for i in range(samples):
    remove = False
    for j in range(features):
        if X[i][j] < 0:
            remove = True
            break
    if remove == False:
        X_good.append(X[i])
        
X_good = np.array(X_good)

In [8]:
# --- Replacing -8 with k-nearest neighbours (average) ---

scaler = StandardScaler()
X_good_std = scaler.fit_transform(X_good)

X_no_8 = np.copy(X)

kNN = 5

cols_with_8 = [1,8,14,17,18,19,20,21,22]

for q in cols_with_8:
    for i in range(samples):

        if X[i][q] == -8:
            row_to_comp = []
            mask = []
            scaled = scaled_row(X[i])
            for j in range(features):
                if X[i][j] >= 0:
                    mask.append(1)
                    row_to_comp.append(scaled[j])
                else:
                    mask.append(0)
            row_to_comp = np.array(row_to_comp)
            mask = np.array(mask)

            S = masked_arr(X_good_std, mask)

            imputed = predict_feature_weighted(row_to_comp, S, kNN, X_good_std, q)

            X_no_8[i][q] = imputed*scaler.scale_[q] + scaler.mean_[q]



In [9]:
# --- Creating one whole dataset ---
data_set = np.append(y,np.copy(X_no_8),axis=1)

# --- Write intermediate result to file ---
new_df = pd.DataFrame(data_set)
new_df.to_csv("datano8_weighted5.csv")

In [14]:
class ModelError(Exception):
    pass

def predict_values_lin_reg(X_tr,y_tr,X_test):
    model = linear_model.LinearRegression()
    model.fit(X_tr, y_tr)
    pred = model.predict(X_test)
    return pred

def data_spliter(all_data,target_col,target_val):
    y = all_data[:,target_col:target_col+1]
    X = np.delete(all_data,target_col,1)
    
    # Will hold the X for the y values that need to be predicted
    X_target = np.zeros((1,X.shape[1]))
    
    row_no = 0 
    for val in y:
        if (val[0] == target_val):
            X_target = np.append(X_target,X[row_no:row_no+1,:],axis=0)
            X = np.delete(X, row_no, 0)
            y = np.delete(y, row_no, 0) 
        else:
            row_no += 1
            
    X_target = np.delete(X_target,0,0)
    
    return X,y,X_target

def combine_parts_inorder(X,y,X_target,y_target,target_col):
    y_target = y_target.reshape((y_target.shape[0],1))
    y_full = np.append(y_target,y,axis=0)
    X_full = np.append(X_target,X,axis=0)
    
    data = np.append(X_full[:,:target_col],y_full,axis=1)
    data = np.append(data,X_full[:,target_col:],axis=1)
    return data

def average_each_feature(X):
    X_target = np.zeros((1,X.shape[1]))
    
    for i in range(X.shape[1]):
        col = X[:,i]
        col = np.mean(col,axis=0)
        print(col)
        X_target[:,i] = col
        
    return X_target


def process_and_predict(all_data,target_col,target_val,exclude=None,model="linear"):
    
    # -- Split data --
    X,y,X_target = data_spliter(all_data,target_col,target_val)
    
    # -- Remove certain columns --
    if (exclude != None):
        y_tr = np.copy(y)
        X_tr = remove_col_with_vals(X,exclude)
        X_pred = remove_col_with_vals(X_target,exclude) # The x used to predict
    else:
        y_tr = np.copy(y)
        X_tr = np.copy(X)
        X_pred = np.copy(X_target)
        
    # -- Run regression --
    if (model == "linear"):
        y_target = predict_values_lin_reg(X_tr,y_tr,X_pred)
    
    elif (model == "polynomial"):
        pass
    
    elif (model == "special"):
        X_avg = average_each_feature(X_pred)
        pred = predict_values_lin_reg(X_tr,y_tr,X_avg)
        print(pred)
    
    else:
        raise ModelError("Model currently not available")
        
    final_data = combine_parts_inorder(X,y,X_target,y_target,target_col)
    
    return final_data

In [16]:
# --- Splitting again to training and target data (to avoid data leaking) --- 

y = data_set[:,:1]
X = data_set[:,1:]

# --- Finding the missing -9 values ---
X = process_and_predict(X,0,-9,[-7])

# --- Finding the missing -7 values ---
X = process_and_predict(X,8,-7,[-7],None,"special")
X = process_and_predict(X,14,-7,None,"special")

# --- Rounding the values to nearest whole value ---
X = np.around(X.astype(np.float),0)

ValueError: Found array with 0 sample(s) (shape=(0, 22)) while a minimum of 1 is required.

In [12]:
# --- Creating one whole dataset ---
data_set = np.append(y,np.copy(X),axis=1)

# --- Write final result to file ---
new_df = pd.DataFrame(data_set)
new_df.to_csv("data_weighted5.csv")