In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

In [None]:
# The same four columns are taken from both CSV files so the two frames
# can be compared column by column later on.
selected_columns = ['P_PARTKEY','P_SIZE','P_RETAILPRICE','P_BRAND']

# Frame read from 'part_corrupt.csv' (first CSV column becomes the index).
data = pd.read_csv('part_corrupt.csv',index_col=0)
df = data[selected_columns]

# Frame read from 'part_org.csv', loaded and trimmed the same way.
data_org = pd.read_csv('part_org.csv',index_col=0)
df_org = data_org[selected_columns]

In [None]:
def separate_columns(data):
  """Split the column names of ``data`` into categorical and numerical lists.

  A column is treated as categorical when its dtype compares equal to
  ``object``; every other column is reported as numerical. The dtype
  mapping and both resulting lists are printed before returning.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame whose ``dtypes`` are inspected.

  Returns
  -------
  tuple of (list, list)
      ``(categorical_cols, numerical_cols)``, each in column order.
  """
  dtype_by_column = dict(data.dtypes)
  print("All columns : ",dtype_by_column)
  categorical_cols = [name for name, kind in dtype_by_column.items() if kind == object]
  numerical_cols = [name for name, kind in dtype_by_column.items() if not (kind == object)]
  print("Categorical columns : ",categorical_cols)
  print("Numerical columns : ",numerical_cols)
  return categorical_cols,numerical_cols

In [None]:
def categorical_encoding(categorical_cols, ohe):
  categorical_ohe = pd.DataFrame()
  cat_length = dict({})
  for i in categorical_cols:
    each_cat = df[[i]]
    index = pd.isna(each_cat).to_numpy()
    each_cat = each_cat.dropna()
    each_cat_ohe = pd.DataFrame(ohe.fit_transform(each_cat),columns=ohe.categories_[0])
    cat_length[i] = len(each_cat_ohe.columns)
    nanrow = df[[i]].iloc[index]
    for i in range(len(nanrow)):
      line = pd.DataFrame(columns=each_cat_ohe.columns, index=[nanrow.index[i]])
      each_cat_ohe = each_cat_ohe.append(line, ignore_index=False)
      each_cat_ohe = each_cat_ohe.sort_index().reset_index(drop=True)
    categorical_ohe = pd.concat([categorical_ohe,each_cat_ohe],axis=1)
  return categorical_ohe, cat_length

In [None]:
def numerical_transformation(numerical_cols,scaler,data=None):
  """Scale the numerical columns and return them as a new frame.

  Parameters
  ----------
  numerical_cols : list
      Names of the columns to scale.
  scaler : object
      Object exposing ``fit_transform``; it is fitted on the selected
      columns as a side effect.
  data : pandas.DataFrame, optional
      Frame to read the columns from. Defaults to the notebook-level ``df``
      so existing two-argument calls keep working.

  Returns
  -------
  pandas.DataFrame
      The scaled values under the same column names. The result has a
      fresh 0..n-1 index, i.e. rows are aligned to the input by position.
  """
  if data is None:
    # Backward-compatible default: the frame the original code read implicitly.
    data = df
  scaled_values = scaler.fit_transform(data[numerical_cols])
  return pd.DataFrame(scaled_values,columns=numerical_cols)

In [None]:
categorical_cols,numerical_cols = separate_columns(df)

# Dense one-hot output is required because the encoded array is wrapped in a
# DataFrame. scikit-learn renamed `sparse` to `sparse_output` in 1.2 and
# removed the old name in 1.4, so try the new keyword first and fall back.
try:
  ohe = OneHotEncoder(sparse_output=False)
except TypeError:
  ohe = OneHotEncoder(sparse=False)

scaler = preprocessing.MinMaxScaler(feature_range = (0,1))

# Both helpers return frames with a fresh 0..n-1 index, so the column-wise
# concat below lines the rows up by position.
df_categorical, cat_length = categorical_encoding(categorical_cols, ohe)
df_numerical = numerical_transformation(numerical_cols, scaler)
df_transformed = pd.concat([df_numerical,df_categorical],axis=1)

All columns :  {'P_PARTKEY': dtype('float64'), 'P_SIZE': dtype('float64'), 'P_RETAILPRICE': dtype('float64'), 'P_BRAND': dtype('O')}
Categorical columns :  ['P_BRAND']
Numerical columns :  ['P_PARTKEY', 'P_SIZE', 'P_RETAILPRICE']


In [None]:
# Fixed random_state so the split (and every result derived from it) is
# reproducible across "Restart & Run All".
X_train, X_test = train_test_split(df_transformed, test_size=0.3, random_state=42)

In [None]:
neighbors = [3,5,7,10,12,15,17,20]

# --- Quantities that do not depend on the neighbour count: computed once ---
# A NaN anywhere in a row makes the row sum NaN, so this yields the positions
# (within X_test) of every row that has at least one missing value.
NanRowIndex = np.where(np.isnan(np.sum(X_test.to_numpy(),axis=1)))
total_cols = len(df_transformed.columns)
num_cols = len(numerical_cols)
# X_test keeps the row labels of df_transformed, which was built with a
# 0..n-1 index, so these labels double as row positions in df / df_org.
nan_row_positions = X_test.index[NanRowIndex[0]]
corrupt_df = df.iloc[nan_row_positions]
org_df = df_org.iloc[nan_row_positions]

for n in neighbors:
  print("neighbors : ",n)
  knn = KNNImputer(n_neighbors=n, add_indicator=True)
  knn.fit(X_train)

  # add_indicator=True appends indicator columns after the imputed features;
  # only the first `total_cols` columns are kept. The explicit copy makes
  # pred_df an independent frame so the in-place writes below do not act on
  # a slice of a temporary (SettingWithCopyWarning).
  imputed = pd.DataFrame(knn.transform(X_test))
  pred_df = imputed.iloc[NanRowIndex[0], 0:total_cols].copy()
  pred_df.columns = df_transformed.columns

  # Undo the scaling on the leading numerical columns.
  descaled_value = scaler.inverse_transform(pred_df.iloc[:,0:num_cols])
  pred_df.iloc[:,0:num_cols] = descaled_value

  # Decode each one-hot group back into a single label column. The groups
  # sit left to right after the numerical columns, `cat_length[j]` wide.
  prev_cols = num_cols
  for j in categorical_cols:
    next_cols = prev_cols + cat_length[j]
    ohe_decoded_value = ohe.inverse_transform(pred_df.iloc[:,prev_cols:next_cols])
    # inverse_transform returns an (n_rows, 1) array; flatten it so it is
    # assigned as an ordinary 1-D column.
    pred_df[j] = np.asarray(ohe_decoded_value).ravel()
    prev_cols = next_cols

  # Keep only the original columns, in the original order.
  pred_df = pred_df[df.columns]
  # NOTE(review): this unconditional break stops the sweep after the first
  # neighbour count, so later cells see the imputer fitted with
  # neighbors[0]. Kept to preserve that behaviour; remove it (and collect
  # pred_df per n) to actually compare neighbour counts.
  break
  

neighbors :  3
     P_PARTKEY  P_SIZE  P_RETAILPRICE   P_BRAND
4          NaN    15.0         905.00  Brand#32
120      121.0    13.0            NaN  Brand#14
71        72.0     NaN         972.07  Brand#23
     P_PARTKEY  P_SIZE  P_RETAILPRICE   P_BRAND
4          NaN    15.0         905.00  Brand#32
120      121.0    13.0            NaN  Brand#14
71        72.0     NaN         972.07  Brand#23


In [None]:
# A NaN anywhere in a row turns the row sum into NaN, so this finds the
# positions (within X_test) of rows with at least one missing value.
row_totals = X_test.to_numpy().sum(axis=1)
NanRowIndex = np.where(np.isnan(row_totals))

total_cols = df_transformed.shape[1]
num_cols = len(numerical_cols)

# Impute the whole test set with the last fitted imputer, then keep only the
# rows that had gaps and only the first `total_cols` columns (the imputer
# was created with add_indicator=True, which adds columns after those).
imputed_test = pd.DataFrame(knn.transform(X_test))
pred_df = imputed_test.iloc[NanRowIndex].iloc[:,0:total_cols]
pred_df.columns = df_transformed.columns
pred_df

Unnamed: 0,P_PARTKEY,P_SIZE,P_RETAILPRICE,Brand#11,Brand#12,Brand#13,Brand#14,Brand#15,Brand#21,Brand#22,Brand#23,Brand#24,Brand#25,Brand#31,Brand#32,Brand#33,Brand#34,Brand#35,Brand#41,Brand#42,Brand#43,Brand#44,Brand#45,Brand#51,Brand#52,Brand#53,Brand#54,Brand#55
4261,0.160624,0.285714,0.003339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48973,0.0006,0.244898,0.343578,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50111,0.000355,0.404082,0.059324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Map the leading numerical columns back from the scaled range to their
# original units. Note: this overwrites pred_df in place, so running the
# cell twice applies the inverse transform twice.
numeric_slice = slice(0, num_cols)
descaled_value = scaler.inverse_transform(pred_df.iloc[:, numeric_slice])
pred_df.iloc[:, numeric_slice] = descaled_value

In [None]:
# Walk the one-hot groups from left to right: they start right after the
# numerical columns and each group is `cat_length[j]` columns wide.
prev_cols = num_cols
for j in categorical_cols:
  next_cols = prev_cols + cat_length[j]
  one_hot_block = pred_df.iloc[:, prev_cols:next_cols]
  # Create the label column first, then fill it with the decoded categories.
  pred_df[[j]] = ''
  ohe_decoded_value = ohe.inverse_transform(one_hot_block)
  pred_df[j] = ohe_decoded_value
  prev_cols = next_cols

In [None]:
# Labels of the X_test rows that contain a missing value. They are used as
# row positions in `df`, which works because df_transformed (the source of
# X_test) was built with a 0..n-1 index.
nan_row_labels = X_test.iloc[NanRowIndex].index
corrupt_df = df.iloc[nan_row_labels]
corrupt_df

Unnamed: 0,P_PARTKEY,P_SIZE,P_RETAILPRICE,P_BRAND
4,,15.0,905.0,Brand#32
120,121.0,13.0,,Brand#14
71,72.0,,972.07,Brand#23


In [None]:
# Same rows as above, taken from the frame loaded from 'part_org.csv', for
# comparison against the imputed values.
nan_row_labels = X_test.iloc[NanRowIndex].index
org_df = df_org.iloc[nan_row_labels]
org_df

Unnamed: 0,P_PARTKEY,P_SIZE,P_RETAILPRICE,P_BRAND
4,5,15,905.0,Brand#32
120,121,13,1021.12,Brand#14
71,72,25,972.07,Brand#23


In [None]:
# Drop the one-hot helper columns: keep only the columns of `df`, in the
# same order, so the prediction lines up with corrupt_df / org_df.
output_columns = list(df.columns)
pred_df = pred_df[output_columns]
pred_df

Unnamed: 0,P_PARTKEY,P_SIZE,P_RETAILPRICE,P_BRAND
4261,32125.65,15.0,905.0,Brand#32
48973,121.0,13.0,1312.6035,Brand#14
50111,72.0,20.8,972.07,Brand#23
