In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
# Literal text that is parsed as NaN (in addition to pandas' default NA markers)
# when the input files are read.
_MISSING_MARKER = '1.00000000000000e+99'


def _odd_sqrt_neighbors(num_samples):
    """Return floor(sqrt(num_samples)) rounded down to an odd number, never below 1."""
    k = int(math.sqrt(num_samples))
    if k % 2 == 0:
        k -= 1
    # sqrt(0) == 0 would otherwise become -1, which KNNImputer rejects.
    return max(k, 1)


def _impute_and_save(df, path_where_to_save):
    """Fill NaNs in ``df`` with KNNImputer, print the result and save it as TSV."""
    num_samples = len(df)
    print("Number of samples:", num_samples)

    # K is the square root of the row count, rounded down to an odd number.
    n_neighbors = _odd_sqrt_neighbors(num_samples)
    print("K =", n_neighbors)

    # Imputation runs over every column that was read from the file.
    # NOTE(review): per the scikit-learn docs, KNNImputer drops columns that are
    # entirely NaN by default, so the saved matrix can be narrower than the input.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_filled = imputer.fit_transform(df)

    print(df_filled)
    np.savetxt(path_where_to_save, df_filled, fmt='%1.14f', delimiter='\t')


def replace_missing_values(path_to_file, path_where_to_save, num_features):
    """Impute missing entries of a whitespace-delimited, header-less file.

    The file is read with pandas, entries equal to ``1.00000000000000e+99`` are
    treated as missing, the gaps are filled with ``KNNImputer`` and the filled
    matrix is written tab-separated with 14 decimal places.

    Parameters
    ----------
    path_to_file : str
        Path of the input file (columns separated by whitespace, no header).
    path_where_to_save : str
        Path of the tab-separated output file.
    num_features : int
        Kept for backward compatibility. It was only ever used to slice columns
        before counting rows, so it influences neither K nor the imputation.

    Returns
    -------
    None
        Deliberately returns nothing, so calling this as the last expression of
        a notebook cell does not echo the array a second time.
    """
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and parses the file the same way.
    df = pd.read_csv(path_to_file, na_values=_MISSING_MARKER, header=None, sep=r'\s+')
    _impute_and_save(df, path_where_to_save)


def replace_missing_values_v2(path_to_file, path_where_to_save, num_features):
    """Impute missing entries of a comma-separated, header-less file.

    Identical to ``replace_missing_values`` except that the input columns are
    separated by commas instead of whitespace.

    Parameters
    ----------
    path_to_file : str
        Path of the input file (comma-separated, no header).
    path_where_to_save : str
        Path of the tab-separated output file.
    num_features : int
        Kept for backward compatibility. It was only ever used to slice columns
        before counting rows, so it influences neither K nor the imputation.

    Returns
    -------
    None
    """
    df = pd.read_csv(path_to_file, na_values=_MISSING_MARKER, header=None, sep=',')
    _impute_and_save(df, path_where_to_save)

In [3]:
# Fill the missing entries of TrainData1 and store the result under ./clean_datasets/
replace_missing_values(
    path_to_file="./datasets/TrainData1.txt",
    path_where_to_save="./clean_datasets/TrainData1.txt",
    num_features=3312,
)

Number of samples: 150
K = 11
[[3.82425408 1.92376196 1.91844974 ... 2.46821436 2.47858092 2.30884176]
 [3.90418953 2.30952371 2.15293014 ... 2.05438321 2.6899035  2.09092785]
 [3.75090779 1.16106839 1.01703334 ... 1.44932409 2.6052296  1.36865871]
 ...
 [3.86019841 2.09777773 1.95185297 ... 1.         2.28254281 2.05570317]
 [3.90710165 1.         1.47363293 ... 1.         1.96848295 2.31850158]
 [3.76386656 2.2811925  2.19917903 ... 1.47334096 3.00529208 2.23279288]]


In [4]:
# Fill the missing entries of TestData1 and store the result under ./clean_datasets/
replace_missing_values(
    path_to_file="./datasets/TestData1.txt",
    path_where_to_save="./clean_datasets/TestData1.txt",
    num_features=3312,
)

Number of samples: 53
K = 7
[[3.84110595 1.73247418 1.71180723 ... 1.88896534 2.7219754  1.87943997]
 [3.81238855 2.12692651 1.94252889 ... 2.23813396 2.43379373 1.81577692]
 [3.84018265 1.89751713 1.89751713 ... 2.18281395 2.40642095 2.02292301]
 ...
 [3.82585695 2.05369294 1.77995705 ... 1.         2.2230804  2.18473401]
 [3.89021528 2.08601221 2.09363178 ... 1.         2.09645811 2.14185722]
 [3.79932056 1.63002085 1.63002085 ... 1.86616915 2.63344826 1.        ]]


In [5]:
# Fill the missing entries of TrainData3 and store the result under ./clean_datasets/
replace_missing_values(
    path_to_file="./datasets/TrainData3.txt",
    path_where_to_save="./clean_datasets/TrainData3.txt",
    num_features=13,
)

Number of samples: 6300
K = 79
[[2.         1.         5.         ... 1.         7.         1.02531646]
 [1.         1.         5.         ... 1.         7.         1.        ]
 [2.         1.         3.         ... 3.         7.         1.        ]
 ...
 [2.         1.         4.         ... 1.         7.         1.        ]
 [1.         3.         3.         ... 1.         7.         1.        ]
 [1.         1.         2.         ... 3.         7.         1.        ]]


In [6]:
# Fill the missing entries of TestData3 (read via the comma-separated loader)
# and store the result under ./clean_datasets/
replace_missing_values_v2(
    path_to_file="./datasets/TestData3.txt",
    path_where_to_save="./clean_datasets/TestData3.txt",
    num_features=13,
)

Number of samples: 2693
K = 51
[[1. 5. 3. ... 1. 7. 1.]
 [2. 1. 4. ... 1. 7. 1.]
 [1. 5. 2. ... 1. 7. 1.]
 ...
 [2. 5. 1. ... 1. 7. 1.]
 [1. 1. 6. ... 3. 7. 1.]
 [1. 5. 3. ... 3. 5. 1.]]


In [7]:
# Fill the missing entries of MissingData1.txt and store the result under ./results/
replace_missing_values(
    path_to_file="./datasets/MissingData1.txt",
    path_where_to_save="./results/ZamanKurienMissingDataEstimation1.txt",
    num_features=242,
)

Number of samples: 242
K = 15
[[-0.11  0.02 -0.36 ...  0.08 -0.45  0.25]
 [-0.3  -0.37 -0.18 ...  0.31  0.4  -0.07]
 [ 0.5   0.18  0.41 ... -0.01 -0.28 -0.09]
 ...
 [ 1.31  0.05 -0.17 ... -0.06  0.21 -0.01]
 [ 0.34  0.03 -0.12 ... -0.14  0.12 -0.3 ]
 [-0.03 -0.03 -0.22 ...  0.21 -0.26  0.1 ]]


In [8]:
# Fill the missing entries of MissingData2.txt and store the result under ./results/
replace_missing_values(
    path_to_file="./datasets/MissingData2.txt",
    path_where_to_save="./results/ZamanKurienMissingDataEstimation2.txt",
    num_features=50,
)

Number of samples: 758
K = 27
[[ 0.68872799 -0.21271986  0.49878314 ...  0.27281525  0.24299623
   0.0446016 ]
 [ 0.34345852  0.11024161 -0.2191136  ... -0.15204121 -0.48716497
  -0.60096978]
 [-0.738929   -0.10991489 -0.58472624 ... -0.18842155 -0.18041769
  -0.22630141]
 ...
 [ 0.42922617  0.27817391  0.00507326 ...  0.19688787  0.16375493
  -0.4069241 ]
 [ 0.24003554  0.11469736  0.05653686 ...  0.41545486  0.22499277
  -0.44735631]
 [-0.34445147 -0.00562868 -0.4868051  ... -0.12193684 -0.16557833
  -0.18063191]]


In [9]:
# Fill the missing entries of MissingData3.txt and store the result under ./results/
replace_missing_values(
    path_to_file="./datasets/MissingData3.txt",
    path_where_to_save="./results/ZamanKurienMissingDataEstimation3.txt",
    num_features=273,
)

Number of samples: 273
K = 15
[[10.14567746  4.35707143  6.4666     ... 10.67912679  9.24377854
   9.01102386]
 [11.          4.35707143  6.4666     ... 10.67912679  9.24377854
   9.01102386]
 [11.86170729  4.35707143  6.4666     ... 10.67912679  9.24377854
   9.01102386]
 ...
 [ 5.65096898  4.35707143  6.4666     ... 10.67912679 11.999
  10.28663476]
 [ 6.51763565  4.35707143  6.4666     ... 10.67912679 11.45219461
   8.08179409]
 [ 6.38430232  4.35707143  6.4666     ... 10.67912679 11.999
   9.62058641]]
