# Fixing -8

In [1]:
import pandas as pd
import numpy as np

# Load the HELOC table. Column 0 is split off into `y`; the remaining columns
# are kept in `data`, stored transposed (one row of `data` per CSV column).
df = pd.read_csv("xML Challenge Dataset and Data Dictionary/heloc_dataset_v1.csv")
feature_names = df.columns.tolist()

table = df.values

y = table[:, 0]
data = table[:, 1:].T

In [2]:
# Column names as read from the CSV. Position 0 is the column that was split
# off into `y`, so column j of the feature arrays is feature_names[j + 1].
print(feature_names)

['RiskPerformance', 'ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'MSinceMostRecentTradeOpen', 'AverageMInFile', 'NumSatisfactoryTrades', 'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec', 'PercentTradesNeverDelq', 'MSinceMostRecentDelq', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'NumTotalTrades', 'NumTradesOpeninLast12M', 'PercentInstallTrades', 'MSinceMostRecentInqexcl7days', 'NumInqLast6M', 'NumInqLast6Mexcl7days', 'NetFractionRevolvingBurden', 'NetFractionInstallBurden', 'NumRevolvingTradesWBalance', 'NumInstallTradesWBalance', 'NumBank2NatlTradesWHighUtilization', 'PercentTradesWBalance']


In [3]:
# Remove all completely empty rows (588)
# A row is treated as empty when every one of its features holds the -9 code.

X = data.T

samples, features = X.shape[0], X.shape[1]

print(X, X.shape)

Y = np.array([
    X[r]
    for r in range(samples)
    if any(X[r][c] != -9 for c in range(features))
])
print(Y, Y.shape)

[[55 144 4 ..., 1 1 69]
 [61 58 15 ..., -8 -8 0]
 [67 66 5 ..., 2 1 86]
 ..., 
 [74 129 6 ..., -8 0 56]
 [72 234 12 ..., 1 0 38]
 [66 28 1 ..., 1 0 100]] (10459, 23)
[[55 144 4 ..., 1 1 69]
 [61 58 15 ..., -8 -8 0]
 [67 66 5 ..., 2 1 86]
 ..., 
 [74 129 6 ..., -8 0 56]
 [72 234 12 ..., 1 0 38]
 [66 28 1 ..., 1 0 100]] (9871, 23)


In [7]:
# Data where to look for similar instances (all non-negative values)
# Only rows of Y without a single negative entry are kept as candidates.

samples = Y.shape[0]

Z = np.array([
    Y[r]
    for r in range(samples)
    if not any(Y[r][c] < 0 for c in range(features))
])
print(Z, Z.shape)

[[66 169 1 ..., 4 3 91]
 [59 137 11 ..., 4 3 94]
 [54 88 7 ..., 7 2 100]
 ..., 
 [76 353 2 ..., 2 0 80]
 [57 176 4 ..., 3 1 100]
 [65 147 39 ..., 2 1 80]] (2502, 23)


In [8]:
# Sanity check: report every negative entry still present in Z
# (prints nothing when Z is clean).
for r, record in enumerate(Z):
    for c in range(len(Z[0])):
        if record[c] < 0:
            print(record)
            print(r, c)

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise the candidate rows. The fitted statistics stay on `scaler`
# (mean_ / scale_) and are reused later to scale and un-scale single rows.
scaler = StandardScaler().fit(Z)
Z_std = scaler.transform(Z)

print(Z_std, Z_std.shape)
print(scaler.mean_)
print(scaler.scale_)

[[-0.03828426 -0.39124088 -0.93039838 ...,  0.50698896  1.1496406
   1.08058076]
 [-0.93229828 -0.7399746   0.6050259  ...,  0.50698896  1.1496406
   1.2404097 ]
 [-1.57087972 -1.27397312 -0.00914381 ...,  2.23595721  0.48226223
   1.56006758]
 ..., 
 [ 1.23887863  1.61397805 -0.77685595 ..., -0.64565654 -0.85249451
   0.49454132]
 [-1.18773086 -0.31495537 -0.4697711  ..., -0.06933379 -0.18511614
   1.56006758]
 [-0.16600055 -0.63099531  4.90421388 ..., -0.64565654 -0.18511614
   0.49454132]] (2502, 23)
[  66.29976019  204.90047962    7.05955236   76.45723421   24.23261391
    0.95883293    0.57873701   87.09192646   21.11630695    4.72022382
    5.00719424   26.38888889    2.17226219   38.37529976    2.11830536
    1.67306155    1.61111111   41.11510791   68.54236611    4.76139089
    3.12030376    1.2773781    70.71742606]
[  7.82985483  91.7605541    6.51285781  26.839197    11.20655959
   1.48274916   1.16777762  10.83505533  20.70564857   1.59787987
   1.41447786  12.61536074   1.



# MSinceOldestTradeOpen

Months Since Oldest Trade Open

In [10]:
# Monotonically Decreasing
# Avg: 200.769
# Stddev: 7.946
# Min: 2
# Max: 803
# -7: 0
# -8: 239
# -9: 588
# Data is centred in the range 160-240. 
# Slight positive correlation
# Slightly more negative in special case

In [11]:
# Do similarity search for closest points ignoring the negative features.
# Standardize each feature and use Euclidean distance.

# Set a maximum radius for similarity and weight the values from each neighbour accordingly.

In [15]:
from sklearn.preprocessing import StandardScaler

# masked_arr (defined below) returns the array for similarity search without the columns with a 0 in mask

def scaled_row(row):
    """Standardise one row with the statistics of the global ``scaler``.

    Reads two module-level names: ``features`` (how many leading entries of
    ``row`` to process) and ``scaler`` (its ``mean_`` and ``scale_`` arrays).

    Returns a NumPy array whose entry ``c`` is
    ``(row[c] - scaler.mean_[c]) / scaler.scale_[c]``.
    """
    return np.array([
        (row[col] - scaler.mean_[col]) / scaler.scale_[col]
        for col in range(features)
    ])
        
        
def masked_arr(A, mask):
    """Return a copy of ``A`` restricted to the columns where ``mask`` is non-zero.

    ``A`` is indexed as ``A[i][j]``; the number of columns is taken from the
    first row. Column order is preserved. The result is a NumPy array built
    from the selected entries (an empty array when ``A`` has no rows).
    """
    kept_rows = []
    for record in A:
        kept_rows.append(
            [record[col] for col in range(len(A[0])) if mask[col] != 0]
        )
    return np.array(kept_rows)

def distance(row1, row2):
    """Euclidean distance between two equally indexed sequences.

    Iterates over the positions of ``row1``; ``row2`` must be indexable at
    each of them. Squared differences are accumulated left to right and the
    square root of the total is returned.
    """
    squared_total = sum(
        (row1[pos] - row2[pos]) ** 2 for pos in range(len(row1))
    )
    return np.sqrt(squared_total)

# predict features using kNN imputation
# need to test for both weighted and simple mean
# using 3 or 5 neighbors
def predict_feature_weighted(row, C, k, originalArr, ft_idx):
    """Impute one feature as a distance-weighted mean of the k nearest rows.

    Each of the k nearest candidates gets the weight ``1 - d / max_dist``,
    where ``d`` is its Euclidean distance to ``row`` and ``max_dist`` is the
    largest distance to any candidate in ``C``. The weights are normalised by
    their sum, so the result is a weighted *mean* and always lies within the
    range of the neighbour values.

    Parameters
    ----------
    row : 1-D array-like
        Query row, restricted to the columns used for the comparison.
    C : 2-D array-like
        Candidate rows, restricted to the same columns as ``row``.
    k : int
        Number of nearest neighbours to use. Values larger than the number
        of candidates are clamped to it.
    originalArr : 2-D array-like
        Array the neighbour values are read from; its row ``i`` must belong
        to the same sample as row ``i`` of ``C``.
    ft_idx : int
        Column of ``originalArr`` holding the feature to impute.

    Returns
    -------
    float
        The weighted mean of ``originalArr[i][ft_idx]`` over the neighbours.
        When every weight is zero (e.g. all candidates are equally far from
        ``row``) the plain mean of the neighbour values is returned instead.

    Raises
    ------
    ValueError
        If ``C`` contains no rows or ``k`` is smaller than 1.
    """
    candidates = np.asarray(C, dtype=float)
    n_candidates = len(candidates)
    if n_candidates == 0:
        raise ValueError("C must contain at least one candidate row")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, n_candidates)

    query = np.asarray(row, dtype=float)
    distances = np.sqrt(((candidates - query) ** 2).sum(axis=1))
    max_dist = distances.max()

    # kth = k - 1 still puts the k smallest distances in the first k slots
    # and, unlike kth = k, remains valid when k equals the candidate count.
    nearest = np.argpartition(distances, k - 1)[:k]
    values = np.array([originalArr[i][ft_idx] for i in nearest], dtype=float)

    if max_dist > 0:
        weights = 1.0 - distances[nearest] / max_dist
    else:
        # Every candidate coincides with the query; no distance information.
        weights = np.zeros(k)

    weight_sum = weights.sum()
    if weight_sum <= 0:
        return values.mean()
    return np.dot(weights, values) / weight_sum

def predict_feature_mean(row, C, k, originalArr,ft_idx):
    """Impute one feature as the plain mean over the k nearest rows.

    Parameters
    ----------
    row : 1-D array-like
        Query row, restricted to the columns used for the comparison.
    C : 2-D array-like
        Candidate rows, restricted to the same columns as ``row``.
    k : int
        Number of nearest neighbours to use. Values larger than the number
        of candidates are clamped to it.
    originalArr : 2-D array-like
        Array the neighbour values are read from; its row ``i`` must belong
        to the same sample as row ``i`` of ``C``.
    ft_idx : int
        Column of ``originalArr`` holding the feature to impute.

    Returns
    -------
    float
        Arithmetic mean of ``originalArr[i][ft_idx]`` over the k candidates
        with the smallest Euclidean distance to ``row``.

    Raises
    ------
    ValueError
        If ``C`` contains no rows or ``k`` is smaller than 1.
    """
    candidates = np.asarray(C, dtype=float)
    n_candidates = len(candidates)
    if n_candidates == 0:
        raise ValueError("C must contain at least one candidate row")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, n_candidates)

    query = np.asarray(row, dtype=float)
    distances = np.sqrt(((candidates - query) ** 2).sum(axis=1))

    # kth = k - 1 still puts the k smallest distances in the first k slots
    # and, unlike kth = k, remains valid when k equals the candidate count.
    nearest = np.argpartition(distances, k - 1)[:k]
    values = np.array([originalArr[i][ft_idx] for i in nearest], dtype=float)

    return values.mean()

In [18]:
# predict using k neighbors
#
# For every column listed in cols_with_8, each row of Y holding the -8 code in
# that column gets a value imputed from its k nearest rows in Z_std. Distances
# only use the columns in which the row itself is non-negative. Lookups always
# read from the untouched Y, so earlier imputations never feed later ones.

imputed_8_Y = np.copy(Y)

k = 5

cols_with_8 = [1,8,14,17,18,19,20,21,22]

for q in cols_with_8:

    print(q)
    print()
    print()

    for i in range(samples):

        if Y[i][q] == -8:

            row_to_comp = []
            mask = []
            scaled = scaled_row(Y[i])
            for j in range(features):
                if Y[i][j] >= 0:
                    mask.append(1)
                    row_to_comp.append(scaled[j])
                else:
                    mask.append(0)
            row_to_comp = np.array(row_to_comp)
            mask = np.array(mask)

            S = masked_arr(Z_std, mask)

            imputed = predict_feature_mean(row_to_comp, S, k, Z_std, q)

            # The neighbour values come from Z_std, so undo the scaling.
            imputed_8_Y[i][q] = imputed*scaler.scale_[q] + scaler.mean_[q]
            if imputed_8_Y[i][q]<0:
                print(i)
                print(Y[i])
                print(imputed_8_Y[i][q])
                # Stop with an error instead of spinning forever, which
                # would hang the kernel until it is interrupted by hand.
                raise ValueError(
                    "negative imputed value %r for row %d, column %d"
                    % (imputed_8_Y[i][q], i, q)
                )


1


8


14


17


18


19


20


21


22




In [23]:
import pandas as pd 
# Write the imputed array to disk. The frame is built from a bare array, so
# to_csv with its default arguments also writes the integer row index and
# integer column labels instead of the feature names.
df_8 = pd.DataFrame(imputed_8_Y)
df_8.to_csv("dataset_fix_8.csv")

In [22]:
# Show every row that still contains the -9 code; a row is printed once per
# -9 entry it holds.
for record in imputed_8_Y:
    leftovers = [value for value in record if value == -9]
    for _ in leftovers:
        print(record)

[-9 17 17 17 1 0 0 100 -7 9 8 1 0 100 0 0 0 69.599999999999994
 75.599999999999994 2.3999999999999999 3.0 1.0 93.200000000000003]
[-9 24 24 24 1 3 3 100 -7 0 8 1 0 100 3 1 1 32.200000000000003
 61.399999999999999 0.99999999999999956 2.7999999999999998 0.0
 88.400000000000006]
[-9 92 82 87 2 0 0 100 -7 9 8 2 0 50 -7 0 0 65.200000000000003
 34.799999999999997 5.0 1.8 2.8000000000000003 81.0]
[-9 87 10 35 18 1 1 89 3 2 4 19 2 16 0 2 2 29 73 6 1 1 47]
[-9 115 55 78 3 0 0 100 -7 9 8 3 0 100 0 0 0 68.0 25.399999999999999
 3.3999999999999999 2.2000000000000002 1.8 78.799999999999997]
[-9 173 2 59 74 3 3 99 8 0 2 77 9 25 0 10 9 22 97 7 4 4 62]
[-9 175 159 167 2 1 1 100 -7 9 8 3 0 50 0 2 2 54.799999999999997
 36.400000000000006 5.4000000000000004 1.8 2.8000000000000003
 82.200000000000003]
[-9 383 383 383 1 1 1 100 -7 6 8 1 0 100 -7 1 1 54.799999999999997
 36.400000000000006 5.4000000000000004 1.8 2.8000000000000003
 82.200000000000003]
[-9 297 6 93 18 0 0 95 16 6 6 20 2 55 0 0 0 10 96 1 1 0 33