<a href="https://colab.research.google.com/github/FalaahArifKhan/RAI-summer-stability/blob/main/examples/Preprocessing_Techniques_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository that this notebook belongs to.
# NOTE(review): the `utils` / `config` imports further down presumably need the cloned
# repository root on sys.path (or as the working directory) — confirm.
!git clone https://github.com/FalaahArifKhan/RAI-summer-stability.git

Cloning into 'RAI-summer-stability'...
remote: Enumerating objects: 453, done.[K
remote: Total 453 (delta 0), reused 0 (delta 0), pack-reused 453[K
Receiving objects: 100% (453/453), 52.93 MiB | 13.24 MiB/s, done.
Resolving deltas: 100% (269/269), done.


# Loading the data

In [102]:
from IPython.display import clear_output
try:
    from folktables import ACSDataSource, ACSEmployment
except:
    !pip install folktables
    from folktables import ACSDataSource, ACSEmployment
    clear_output()
from sys import getsizeof
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from utils.data_loader import *
from utils.null_handler import *
from utils.EDA_utils import *
from utils.simple_utils import get_column_type
from config import COLUMN_TO_TYPE, SEED

import warnings
warnings.filterwarnings('ignore')

In [103]:
# Load the Alabama 2016 ACSEmployment data through the project's ACSDataLoader helper.
# NOTE(review): the returned pair is presumably (features, labels) — verify against utils.data_loader.
X_data_load, y_data = ACSDataLoader(task=ACSEmployment, state=['AL'], year='2016')

In [104]:
# Report the in-memory size of the frame before and after optimize_ACSEmployment.
# The optimized frame is built once and reused for both the size report and the rest
# of the notebook, instead of running the optimization a second time just to measure it.
print(f'Original: {int(getsizeof(X_data_load) / 1024**2)} mb')

X_data = optimize_ACSEmployment(X_data_load)
print(f'Optimized: {int(getsizeof(X_data) / 1024**2)} mb')

Original: 5 mb
Optimized: 2 mb


In [105]:
# Count the missing values in each column of the optimized frame.
X_data.isna().sum()

AGEP            0
SCHL         1396
MAR             0
RELP            0
DIS             0
ESP         38956
CIT             0
MIG           444
MIL          8820
ANC             0
NATIVITY        0
DEAR            0
DEYE            0
DREM         2347
SEX             0
RAC1P           0
dtype: int64

# Imputation methods

## Deterministic or Stochastic Regression Imputation

In [107]:
def regression_imputation(input_data, column_names):
    """
    Description: Imputes nulls in the given columns with a regression model that is fitted on the
    rows where the column is present and uses all the other columns as features
    Input:
    input_data: dataframe with missing values; it is deep-copied and never modified
    column_names: list of column names to impute; they are processed one by one in the given order,
    so a later column is predicted from the already imputed values of the earlier ones
    Output:
    copy of input_data with the nulls of column_names replaced by the model predictions
    Notes:
    - get_column_type picks the model: LinearRegression for 'numerical' columns,
      multinomial LogisticRegression for every other column type
    - columns without nulls are skipped, because sklearn estimators raise on an empty prediction set
    - the feature columns are handed to the model as-is, so nulls in them are not handled here
    """
    data = input_data.copy(deep=True)
    for column_name in column_names:
        indexes = data[column_name].isna()
        # Nothing to impute: asking the model to predict on zero rows would raise
        if not indexes.any():
            continue

        column_type = get_column_type(column_name)
        other_columns = [col for col in data.columns if col != column_name]

        # Train on the rows where the target is known, predict for the rows where it is missing
        not_null_df = data[~indexes]
        null_df = data[indexes]

        X_train = not_null_df[other_columns].to_numpy()
        y_train = not_null_df[column_name].to_numpy()

        X_pred = null_df[other_columns].to_numpy()

        if column_type == 'numerical':
            model = LinearRegression().fit(X_train, y_train)
        else:
            model = LogisticRegression(multi_class='multinomial').fit(X_train, y_train)

        data.loc[indexes, column_name] = model.predict(X_pred)
    return data

## kNN or Hot-Deck

In [203]:
def kNN_imputation(input_data, column_names, n_neighbors=4, weights='distance'):
    """
    Description: Imputes nulls in the given columns with a k-nearest-neighbours model that is fitted
    on the rows where the column is present and uses all the other columns as features
    Input:
    input_data: dataframe with missing values; it is deep-copied and never modified
    column_names: list of column names to impute; they are processed one by one in the given order,
    so a later column is predicted from the already imputed values of the earlier ones
    n_neighbors: number of neighbours forwarded to the sklearn estimator
    weights: neighbour weighting scheme forwarded to the sklearn estimator
    Output:
    copy of input_data with the nulls of column_names replaced by the model predictions
    Notes:
    - get_column_type picks the model: KNeighborsRegressor for 'numerical' columns,
      KNeighborsClassifier for every other column type
    - columns without nulls are skipped, because sklearn estimators raise on an empty prediction set
    - the feature columns are handed to the model as-is, so nulls in them are not handled here
    """
    data = input_data.copy(deep=True)
    for column_name in column_names:
        indexes = data[column_name].isna()
        # Nothing to impute: asking the model to predict on zero rows would raise
        if not indexes.any():
            continue

        column_type = get_column_type(column_name)
        other_columns = [col for col in data.columns if col != column_name]

        # Train on the rows where the target is known, predict for the rows where it is missing
        not_null_df = data[~indexes]
        null_df = data[indexes]

        X_train = not_null_df[other_columns].to_numpy()
        y_train = not_null_df[column_name].to_numpy()

        X_pred = null_df[other_columns].to_numpy()

        if column_type == 'numerical':
            model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights).fit(X_train, y_train)
        else:
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(X_train, y_train)

        data.loc[indexes, column_name] = model.predict(X_pred)
    return data

# Adding regression and kNN into handle_df_nulls

In [204]:
def handle_df_nulls(input_data, how, column_names, condition_column=None):
    """
    Description: Processes the null values in the dataset
    Input:
    input_data: dataframe with missing values; it is deep-copied and never modified
    how: processing method, currently supports
            - 'special': corresponds to 'not applicable' scenario, designates null values as their own special category
            - 'drop-column' : removes the column with nulls from the dataset
            - 'drop-rows' : removes all the rows with the nulls values from the dataset
            - 'predict-by-sklearn' : predict the values to impute nulls based on the features in the rows; used for multivariate data.
            Works for a single column only; the result holds only that column, rounded, on a fresh 0..n-1 index
            - 'regression' : predict the values to impute with linear (logistic in categorical column) regression based
            on all other dataset features;
            - 'kNN' : predict the values with kNN regressor (classifier) using the kNN_imputation defaults
            (4 neighbors, weighted by distance);
            - 'impute-by-mode' : impute nulls by mode of the column values without nulls
            - 'impute-by-mode-trimmed' : the same as 'impute-by-mode', but the column is filtered from nulls,
            sorted in descending order, and top and bottom k% are removed from it. After that 'impute-by-mode' logic is applied
            - 'impute-by-mean' : impute nulls by mean of the column values without nulls
            - 'impute-by-mean-trimmed' : the same as 'impute-by-mean', but the column is filtered from nulls,
            sorted in descending order, and top and bottom k% are removed from it. After that 'impute-by-mean' logic is applied
            - 'impute-by-median' : impute nulls by median of the column values without nulls
            - 'impute-by-median-trimmed' : the same as 'impute-by-median', but the column is filtered from nulls,
            sorted in descending order, and top and bottom k% are removed from it. After that 'impute-by-median' logic is applied
            - any method name containing 'conditional' : delegated to apply_conditional_technique together with condition_column
            (k is fixed to 10 for the '-trimmed' variants handled in this function)
    column_names: list of column names, for which the particular technique needs to be applied
    condition_column: column passed on to apply_conditional_technique; only used when `how` contains 'conditional'
    Output:
    dataframe with processed nulls
    Raises:
    ValueError: if `how` is not one of the supported techniques
    """
    data = input_data.copy(deep=True)

    if how == 'drop-column':
        data.drop(columns=column_names, inplace=True)
    elif how == 'drop-rows':
        data.dropna(subset=column_names, inplace=True)
    elif how == 'predict-by-sklearn':
        if len(column_names) > 1:
            print(f"\n\nERROR: {how} technique does not work with more than one column.\n\n")
            return data

        # Setting the random_state argument for reproducibility
        imputer = IterativeImputer(random_state=42,
                                   min_value=input_data[column_names[0]].min(),
                                   max_value=input_data[column_names[0]].max())
        imputed = imputer.fit_transform(data)
        data = pd.DataFrame(imputed, columns=data.columns)
        # NOTE(review): unlike every other branch, this returns only the imputed column on a
        # fresh RangeIndex — confirm that callers rely on this before changing it
        data = data[column_names].round()
    elif how == 'regression':
        data = regression_imputation(data, column_names)
    elif how == 'kNN':
        data = kNN_imputation(data, column_names)
    else:
        # Pick the function that turns the non-null column values into a single impute value
        get_impute_value = None
        if how == 'special':
            get_impute_value = decide_special_category
        elif 'impute-by-mode' in how:
            get_impute_value = find_column_mode
        elif 'impute-by-mean' in how:
            get_impute_value = find_column_mean
        elif 'impute-by-median' in how:
            get_impute_value = find_column_median

        if 'conditional' in how:
            data = apply_conditional_technique(data, column_names, condition_column, how, get_impute_value)
        else:
            if get_impute_value is None:
                # Fail with a readable message instead of "'NoneType' object is not callable"
                raise ValueError(f"Unsupported null handling technique: {how!r}")

            vals = {}
            for col in column_names:
                filtered_df = data[~data[col].isnull()][[col]].copy(deep=True)
                if 'trimmed' in how:
                    k_percent = 10
                    n_rows = filtered_df.shape[0]
                    reduce_n_rows = int(n_rows / 100 * k_percent)
                    filtered_df.sort_values(by=[col], ascending=False, inplace=True)
                    # An explicit end index keeps all rows when reduce_n_rows is 0;
                    # the slice [0:-0] would otherwise be empty
                    filtered_df = filtered_df.iloc[reduce_n_rows: n_rows - reduce_n_rows]

                vals[col] = get_impute_value(filtered_df[col].values)
            print("Impute values: ", vals)
            data.fillna(value=vals, inplace=True)
    return data

# Comparison of imputation methods

In [206]:
# Fetch the 2016 1-Year ACS person survey for Alabama with folktables and convert it
# to the ACSEmployment arrays.
data_source = ACSDataSource(survey_year='2016', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=['AL'], download=True)
features, label, group = ACSEmployment.df_to_numpy(acs_data)

# Wrap the arrays into frames, naming the columns after the task definition
X_full = pd.DataFrame(features, columns=ACSEmployment.features)
y_full = pd.DataFrame(label).rename(columns={0: ACSEmployment.target})

# True if any cell of the feature frame is null
X_full.isna().any().any()

False

## Evaluate imputation

In [216]:
def evaluate_imputation(real, imputed, corrupted, column_names):
    """
    Description: Scores imputed values against the original ones, column by column
    Input:
    real: dataframe holding the original values
    imputed: dataframe holding the values after imputation
    corrupted: dataframe with the nulls; its null positions select the cells that are scored
    column_names: list of column names to score
    Output:
    list with one metric per column, in the order of column_names: mean absolute error when
    get_column_type reports 'numerical', otherwise accuracy derived from the confusion matrix.
    Each metric is also printed (the printed label always says 'for regression').
    """
    scores = []
    for name in column_names:
        kind = get_column_type(name)

        # Only the cells that were null in the corrupted frame take part in the score
        missing_mask = corrupted[name].isna()
        expected = real.loc[missing_mask, name]
        predicted = imputed.loc[missing_mask, name]

        if kind != 'numerical':
            matrix = confusion_matrix(expected, predicted)
            # Correct predictions sit on the diagonal of the confusion matrix
            score = matrix.trace() / matrix.sum()
            message = 'Accuracy for regression - {}: {:.2f}'
        else:
            score = MAE(expected, predicted)
            message = 'MAE for regression - {}: {:.1f}'

        print(message.format(name, score))
        scores.append(score)
    return scores

# Null simulation and imputation

## Unknown. AGEP

In [217]:
# Sanity check before simulating nulls: True would mean the full frame already contains nulls.
X_full.isna().any().any()

False

In [218]:
# Settings for nulls_simulator, which builds a corrupted copy of X_full for the AGEP experiment.
# NOTE(review): presumably nulls are written into `target_col` for a `fraction` of the rows whose
# `condition_col` value is one of `special_values` — confirm against nulls_simulator.
special_values = (8, 10, 11, 12, 15)
condition_col = 'RELP'
target_col = 'AGEP'
fraction = .4
corrupted_data_AGEP = nulls_simulator(X_full, target_col, condition_col, special_values, fraction)

In [219]:
# Impute the simulated AGEP nulls with kNN, then score the imputed cells against the original values.
# The printout is labelled 'for regression' by evaluate_imputation regardless of the method used here.
column_names = ['AGEP']

imputed = handle_df_nulls(corrupted_data_AGEP, 'kNN', column_names)

evaluate_imputation(X_full, imputed, corrupted_data_AGEP, column_names)

MAE for regression - AGEP: 10.2


[10.1880990699547]

## Special. SEX

In [220]:
# Settings for nulls_simulator, which builds a corrupted copy of X_full for the SEX experiment;
# here the condition column and the target column are the same.
# NOTE(review): presumably nulls are written into `target_col` for a `fraction` of the rows whose
# `condition_col` value is one of `special_values` — confirm against nulls_simulator.
special_values = [1]
condition_col = 'SEX'
target_col = 'SEX'
fraction = .11
corrupted_data_SEX = nulls_simulator(X_full, target_col, condition_col, special_values, fraction)

In [221]:
# Impute the simulated SEX nulls with kNN, then score the imputed cells against the original values.
# The printout is labelled 'for regression' by evaluate_imputation regardless of the method used here.
column_names = ['SEX']

imputed = handle_df_nulls(corrupted_data_SEX, 'kNN', column_names)

evaluate_imputation(X_full, imputed, corrupted_data_SEX, column_names)

Accuracy for regression - SEX: 0.67


[0.6698412698412698]

## Optional. MAR

In [222]:
# Settings for nulls_simulator, which builds a corrupted copy of X_full for the MAR experiment;
# the condition column and the target column are the same.
special_values = [2, 3, 4]
condition_col = 'MAR'
target_col = 'MAR'
fraction = 0.9
corrupted_data_MAR = nulls_simulator(X_full, target_col, condition_col, special_values, fraction)

In [223]:
# Impute the simulated MAR nulls with kNN, then score the imputed cells against the original values.
# NOTE(review): the recorded accuracy (0.03) is far below the SEX experiment; presumably the
# simulator removed most rows with MAR in {2, 3, 4}, so kNN has few examples of exactly the
# classes it has to predict — confirm against nulls_simulator.
column_names = ['MAR']

imputed = handle_df_nulls(corrupted_data_MAR, 'kNN', column_names)

evaluate_imputation(X_full, imputed, corrupted_data_MAR, column_names)

Accuracy for regression - MAR: 0.03


[0.02793090775450202]