<a href="https://colab.research.google.com/github/FalaahArifKhan/RAI-summer-stability/blob/main/examples/Preprocessing_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository into the current working directory.
# NOTE(review): later cells import `utils.*`; presumably that package comes from this clone,
# in which case the cloned directory must be on sys.path (or be the cwd) — confirm.
!git clone https://github.com/FalaahArifKhan/RAI-summer-stability.git

Cloning into 'RAI-summer-stability'...
remote: Enumerating objects: 194, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 194 (delta 4), reused 10 (delta 3), pack-reused 179[K
Receiving objects: 100% (194/194), 35.78 MiB | 21.90 MiB/s, done.
Resolving deltas: 100% (87/87), done.


# Loading the data

In [None]:
from IPython.display import clear_output
try:
    from folktables import ACSDataSource, ACSEmployment
except:
    !pip install folktables
    from folktables import ACSDataSource, ACSEmployment
    clear_output()
from sys import getsizeof
import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from utils.data_loader import *
from utils.null_handler import *
from utils.EDA_utils import *

import warnings
warnings.filterwarnings('ignore')

SEED=10

In [None]:
# Load features and labels for the ACSEmployment task (state 'AL', year 2016)
# through the project's ACSDataLoader helper.
X_data_load, y_data = ACSDataLoader(task=ACSEmployment, state=['AL'], year='2016')

Downloading data for 2016 1-Year person survey for AL...


In [None]:
# Report the in-memory size before and after optimize_ACSEmployment.
# The optimisation is run once and its result reused, instead of being recomputed
# only to measure its size.
print(f'Original: {int(getsizeof(X_data_load) / 1024**2)} mb')

X_data = optimize_ACSEmployment(X_data_load)
print(f'Optimized: {int(getsizeof(X_data) / 1024**2)} mb')

Original: 5 mb
Optimized: 2 mb


In [None]:
# Count the missing values in each feature column.
X_data.isna().sum()

AGEP            0
SCHL         1396
MAR             0
RELP            0
DIS             0
ESP         38956
CIT             0
MIG           444
MIL          8820
ANC             0
NATIVITY        0
DEAR            0
DEYE            0
DREM         2347
SEX             0
RAC1P           0
dtype: int64

# Imputation methods

## Deterministic or Stochastic Regression Imputation

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [68]:
def regression_imputation(input_data, column_names: dict, stochastic=True,
                          random_state=None, verbose=True):
    """Impute missing values column by column with a regression model.

    For each requested column, a model is fitted on the rows where the column is
    observed (all other columns are predictors, with their own NaNs replaced by 0)
    and used to predict the rows where it is missing. Columns are processed in dict
    order on a working copy, so values imputed for an earlier column are used as
    predictors for later ones.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to impute; it is deep-copied and never modified.
    column_names : dict
        Maps column name -> 'numerical' (LinearRegression) or any other value
        (treated as categorical, LogisticRegression).
    stochastic : bool, default True
        For numerical columns, add zero-mean Gaussian noise to the predictions.
        The noise scale is the standard deviation of the training residuals.
    random_state : int or None, default None
        Seed for the noise. None uses NumPy's global random state.
    verbose : bool, default True
        Print the first five imputed values of every processed column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_data`` with the requested columns imputed.

    Raises
    ------
    ValueError
        If a requested column has no observed values to train on.
    """
    data = input_data.copy(deep=True)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for column_name, column_type in column_names.items():
        missing_mask = data[column_name].isna()
        if not missing_mask.any():
            # Nothing to impute; predicting on zero rows would raise inside the model.
            continue
        if missing_mask.all():
            raise ValueError(f"Column '{column_name}' has no observed values to fit a model on")

        feature_columns = [col for col in data.columns if col != column_name]
        observed_rows = data[~missing_mask]

        # NaNs in the predictor columns are replaced by 0 so the models can be fitted.
        X_train = observed_rows[feature_columns].fillna(0).to_numpy()
        y_train = observed_rows[column_name].to_numpy()
        X_pred = data.loc[missing_mask, feature_columns].fillna(0).to_numpy()

        if column_type == 'numerical':
            model = LinearRegression().fit(X_train, y_train)
        else:
            model = LogisticRegression(multi_class='multinomial').fit(X_train, y_train)

        pred = model.predict(X_pred)
        if column_type == 'numerical' and stochastic:
            # The noise represents the unexplained variation, so it is scaled by the
            # residual std rather than by the std of the column itself.
            residuals = y_train.astype(float) - model.predict(X_train)
            pred = pred + rng.normal(loc=0, scale=residuals.std(), size=len(pred))
        if verbose:
            print(pred[:5])
        data.loc[missing_mask, column_name] = pred
    return data

In [69]:
# Treat every column that contains at least one NaN as categorical and impute it
# with the deterministic regression variant; the last line checks that no NaN is left.
column_names = {
    name: 'categorical'
    for name in X_data.columns[X_data.isna().any()].tolist()
}
imputed = regression_imputation(X_data, column_names, stochastic=False)
imputed.isna().any().any()

[2. 2. 2. 2. 2.]
[6. 6. 6. 6. 6.]
[1. 1. 1. 1. 1.]
[4. 4. 4. 4. 4.]
[2. 2. 2. 2. 2.]


False

In [70]:
# Rebuild the list of columns with NaNs from X_data (as the kNN cells do) so that this cell
# no longer depends on the `column_names` dict left behind by the previous cell.
column_names = X_data.columns[X_data.isna().any()].tolist()
column_names = dict(zip(column_names, ['numerical'] * len(column_names)))
# Same columns, now treated as numerical and imputed with the stochastic variant.
imputed = regression_imputation(X_data, column_names, stochastic=True)
imputed.isna().any().any()

[-1.63474912 -2.10466141 -7.75491565 10.58805528 10.02717085]
[7.80857922 3.20156056 4.64031247 8.40450935 1.75384929]
[1.33620945 1.24840552 0.97262184 0.96927577 0.61532494]
[4.11679212 4.81116361 4.26628639 4.78682204 3.75590149]
[2.00114142 1.89705917 2.2142358  2.18055729 1.83027737]


False

## kNN or Hot-Deck

In [None]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [53]:
def kNN_imputation(input_data, column_names: dict, n_neighbors=5, weights='distance',
                   verbose=True):
    """Impute missing values column by column with a k-nearest-neighbours model.

    For each requested column, a kNN model is fitted on the rows where the column is
    observed (all other columns are predictors, with their own NaNs replaced by 0)
    and used to predict the rows where it is missing. Columns are processed in dict
    order on a working copy, so values imputed for an earlier column are used as
    predictors for later ones.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to impute; it is deep-copied and never modified.
    column_names : dict
        Maps column name -> 'numerical' (KNeighborsRegressor) or any other value
        (treated as categorical, KNeighborsClassifier).
    n_neighbors : int, default 5
        Passed through to the scikit-learn estimator.
    weights : str, default 'distance'
        Passed through to the scikit-learn estimator.
    verbose : bool, default True
        Print the first five imputed values of every processed column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_data`` with the requested columns imputed.

    Raises
    ------
    ValueError
        If a requested column has no observed values to train on.
    """
    data = input_data.copy(deep=True)

    for column_name, column_type in column_names.items():
        missing_mask = data[column_name].isna()
        if not missing_mask.any():
            # Nothing to impute; predicting on zero rows would raise inside the model.
            continue
        if missing_mask.all():
            raise ValueError(f"Column '{column_name}' has no observed values to fit a model on")

        feature_columns = [col for col in data.columns if col != column_name]
        observed_rows = data[~missing_mask]

        # NaNs in the predictor columns are replaced by 0 so distances can be computed.
        X_train = observed_rows[feature_columns].fillna(0).to_numpy()
        y_train = observed_rows[column_name].to_numpy()
        X_pred = data.loc[missing_mask, feature_columns].fillna(0).to_numpy()

        if column_type == 'numerical':
            estimator = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
        else:
            estimator = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        model = estimator.fit(X_train, y_train)

        pred = model.predict(X_pred)
        if verbose:
            print(pred[:5])
        data.loc[missing_mask, column_name] = pred
    return data

In [54]:
# kNN imputation with every NaN-containing column treated as categorical;
# the last line checks that no NaN is left.
column_names = dict.fromkeys(X_data.columns[X_data.isna().any()].tolist(), 'categorical')
imputed = kNN_imputation(X_data, column_names)
imputed.isna().any().any()

[2. 1. 1. 1. 1.]
[7. 7. 1. 7. 2.]
[1. 1. 1. 1. 1.]
[4. 4. 4. 4. 4.]
[2. 2. 2. 2. 2.]


False

In [55]:
# kNN imputation with every NaN-containing column treated as numerical;
# the last line checks that no NaN is left.
column_names = dict.fromkeys(X_data.columns[X_data.isna().any()].tolist(), 'numerical')
imputed = kNN_imputation(X_data, column_names)
imputed.isna().any().any()

[2.         1.2        1.19430958 1.24264069 1.4       ]
[4.39999557 4.39993361 2.3999625  4.39984011 3.39964572]
[1.         2.09384221 1.         1.         2.02657024]
[4. 4. 4. 4. 4.]
[2. 2. 2. 2. 2.]


False

## Datawig

In [None]:
try:
    import datawig
except:
    !pip install datawig
    import datawig
    clear_output()

In [57]:
# Run Datawig's SimpleImputer.complete on X_data; the result is checked for remaining NaNs below.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, yet later cells read
# X_data_imputed — presumably it comes from an earlier, completed run. Re-run to completion
# before relying on the Datawig numbers.
X_data_imputed = datawig.SimpleImputer.complete(X_data)

KeyboardInterrupt: ignored

In [58]:
# True if any NaN remains after the Datawig imputation.
X_data_imputed.isna().any().any()

False

# Comparison of imputation methods

In [59]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

In [62]:
# Build the reference frames directly from folktables for the same survey slice
# (2016, 1-Year, person survey, AL).
data_source = ACSDataSource(survey_year='2016', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=['AL'], download=True)
features, label, group = ACSEmployment.df_to_numpy(acs_data)

X_full = pd.DataFrame(features, columns=ACSEmployment.features)
# Name the single label column after the task's target.
y_full = pd.DataFrame(label).rename(columns={0: ACSEmployment.target})

# NOTE(review): this check reports no NaNs although X_data has thousands. Presumably
# ACSEmployment.df_to_numpy replaces missing values with a placeholder, in which case X_full
# holds filler (not ground truth) at exactly the positions imputed from X_data — confirm.
X_full.isna().any().any()

False

**TODO:** Should the target column be added as well, so that imputation metrics can be compared for it too?

In [61]:
column_names = X_data.columns[X_data.isna().any()].tolist()
column_names = dict(zip(column_names, ['categorical'] * len(column_names)))


imputed_regression = regression_imputation(X_data, column_names, stochastic=False)
print()
imputed_kNN = kNN_imputation(X_data, column_names)

# Imputed frames to score, in reporting order.
imputed_by_method = {
    'regression': imputed_regression,
    'kNN': imputed_kNN,
    'Datawig': X_data_imputed,
}

for column_name in column_names:
    # Score only the entries that were missing in X_data, against the same positions in X_full.
    # NOTE(review): .loc aligns this boolean mask on index labels — assumes X_data and X_full
    # share the same row index; confirm.
    indexes = X_data[column_name].isna()
    true = X_full.loc[indexes, column_name]
    std = X_full[column_name].std()

    for position, (method, imputed_frame) in enumerate(imputed_by_method.items()):
        if position:
            print()
        pred = imputed_frame.loc[indexes, column_name]
        # Both errors are divided by the column's std; the first is a root-mean-squared
        # error, so the labels say RMSE/std and MAE/std rather than MSE and MAE.
        print(f'RMSE/std for {method} - {column_name}: {MSE(true, pred) ** .5 / std}')
        print(f'MAE/std for {method} - {column_name}: {MAE(true, pred) / std}')

    print('-' * 100)

[2. 2. 2. 2. 2.]
[6. 6. 6. 6. 6.]
[1. 1. 1. 1. 1.]
[4. 4. 4. 4. 4.]
[2. 2. 2. 2. 2.]

[2. 1. 1. 1. 1.]
[7. 7. 1. 7. 2.]
[1. 1. 1. 1. 1.]
[4. 4. 4. 4. 4.]
[2. 2. 2. 2. 2.]
MSE for regression - SCHL: 0.33464422154868556
MAE for regression - SCHL: 0.33464422154868556

MSE for kNN - SCHL: 0.19036613903213773
MAE for kNN - SCHL: 0.18374269041337213

MSE for Datawig - SCHL: 0.21917474057385292
MAE for Datawig - SCHL: 0.18266780376848127
----------------------------------------------------------------------------------------------------
MSE for regression - ESP: 3.0948704069037376
MAE for regression - ESP: 2.8313342434435858

MSE for kNN - ESP: 2.9473601139450576
MAE for kNN - ESP: 2.387706748881783

MSE for Datawig - ESP: 9.90962571214113
MAE for Datawig - ESP: 8.6203384497272
----------------------------------------------------------------------------------------------------
MSE for regression - MIG: 1.5092073168811968
MAE for regression - MIG: 1.4960858492062403

MSE for kNN - MIG: 1.88289

All results except those for SCHL appear to be very poor when measured as RMSE / std (the error relative to the standard deviation of the given column).

I also think that imputation methods should not be applied until the categorical columns have been converted to dummy (one-hot) variables.

# Null simulation and imputation

Sorry for breaking YAGNI...

## Unknown. AGEP

In [63]:
# True if the reference frame X_full contains any NaN.
X_full.isna().any().any()

False

In [64]:
# Inject simulated nulls into AGEP with the project's nulls_simulator helper.
# NOTE(review): presumably `fraction` of the rows whose RELP is one of `special_values`
# get AGEP set to NaN — the exact semantics live in nulls_simulator; confirm there.
# The input is X_data, which already contains NaNs in other columns.
special_values = (8, 10, 11, 12, 15)
condition_col = 'RELP'
target_col = 'AGEP'
fraction = .4
corrupted_data_AGEP = nulls_simulator(X_data, target_col, condition_col, special_values, fraction)

In [76]:
# Impute the simulated AGEP nulls as a numerical column with both methods.
column_names = {'AGEP': 'numerical'}

# stochastic=False: plain regression predictions, no added noise.
imputed_regression = regression_imputation(corrupted_data_AGEP, column_names, stochastic=False)
print()
imputed_kNN = kNN_imputation(corrupted_data_AGEP, column_names)

[58.00631736 29.61905465 32.55799475 31.446597   26.36660763]

[56.1723686  32.25402078 22.34590845 35.39805608 25.71572875]


In [77]:
# Imputed frames to score, in reporting order.
imputed_by_method = {'regression': imputed_regression, 'kNN': imputed_kNN}

for column_name in column_names:
    # Score only the entries that are null in the corrupted frame, against X_full.
    indexes = corrupted_data_AGEP[column_name].isna()
    true = X_full.loc[indexes, column_name]
    std = X_full[column_name].std()

    for position, (method, imputed_frame) in enumerate(imputed_by_method.items()):
        if position:
            print()
        pred = imputed_frame.loc[indexes, column_name]
        # Both errors are divided by the column's std; the first is a root-mean-squared
        # error, so the labels say RMSE/std and MAE/std rather than MSE and MAE.
        print(f'RMSE/std for {method} - {column_name}: {MSE(true, pred) ** .5 / std}')
        print(f'MAE/std for {method} - {column_name}: {MAE(true, pred) / std}')

    print('-' * 100)

MSE for regression - AGEP: 0.6274729506662078
MAE for regression - AGEP: 0.47722993815269676

MSE for kNN - AGEP: 0.7462729786565029
MAE for kNN - AGEP: 0.5298874886355066
----------------------------------------------------------------------------------------------------


## Special. SEX

In [79]:
# Inject simulated nulls into SEX, conditioning on SEX itself (value 1).
# NOTE(review): presumably `fraction` of the rows with SEX == 1 get SEX set to NaN —
# the exact semantics live in nulls_simulator; confirm there.
special_values = [1]
condition_col = 'SEX'
target_col = 'SEX'
fraction = .11
corrupted_data_SEX = nulls_simulator(X_data, target_col, condition_col, special_values, fraction)

In [80]:
# Impute the simulated SEX nulls as a categorical column with both methods.
column_names = {'SEX': 'categorical'}

# stochastic has no effect for categorical columns; it only applies to 'numerical' ones.
imputed_regression = regression_imputation(corrupted_data_SEX, column_names, stochastic=False)
print()
imputed_kNN = kNN_imputation(corrupted_data_SEX, column_names)

[1. 2. 2. 2. 2.]

[1. 1. 1. 2. 1.]


In [83]:
# Imputed frames to score, in reporting order.
imputed_by_method = {'regression': imputed_regression, 'kNN': imputed_kNN}

for column_name in column_names:
    # Score only the entries that are null in the corrupted frame, against X_full.
    indexes = corrupted_data_SEX[column_name].isna()
    true = X_full.loc[indexes, column_name]
    std = X_full[column_name].std()

    for position, (method, imputed_frame) in enumerate(imputed_by_method.items()):
        if position:
            print()
        pred = imputed_frame.loc[indexes, column_name]
        # Both errors are divided by the column's std; the first is a root-mean-squared
        # error, so the labels say RMSE/std and MAE/std rather than MSE and MAE.
        # NOTE(review): SEX is imputed as categorical; a classification metric such as
        # accuracy may be more meaningful than distance-based errors here.
        print(f'RMSE/std for {method} - {column_name}: {MSE(true, pred) ** .5 / std}')
        print(f'MAE/std for {method} - {column_name}: {MAE(true, pred) / std}')

    print('-' * 100)

(2520,)
(2520,)
MSE for regression - SEX: 1.737900159194137
MAE for regression - SEX: 1.50904208950754

MSE for kNN - SEX: 1.2178361022697717
MAE for kNN - SEX: 0.7410190892160711
----------------------------------------------------------------------------------------------------
