In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import helpers

from sklearn.impute import SimpleImputer

# Seed NumPy's global (legacy) random generator with the seed defined in the local `helpers` module.
np.random.seed(helpers.random_seed)

In [None]:
# Load the training CSV; its first column is used as the row index.
# The path is a plain string: it has no placeholders, so an f-string is not needed.
initial_train_df = pd.read_csv('../data/categorized_train.csv', index_col=0)

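Here I use simple median imputation for the missing values. The PDFs will, of course, be distorted by it, because every missing entry of a feature is replaced with the same value (that feature's median).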

In [None]:
# Split the columns by dtype. Selecting np.int64 is sufficient here because all the
# continuous features have a floating type; ints can be continuous in general, but in
# this dataset every int column is a 0/1 indicator produced by one-hot encoding.
categorical_cols = initial_train_df.select_dtypes(np.int64).columns.to_list()

# Keep the frame's own column order rather than going through set(): the iteration
# order of a set of strings can change between interpreter runs (hash randomisation),
# which would reshuffle the features given to the imputer, the columns of the saved
# CSVs and the order of the plots from one run to the next.
continuous_cols = [col for col in initial_train_df.columns if col not in categorical_cols]
continuous_cols

In [None]:
# Median imputation: learn the statistics from the continuous training columns,
# then fill the gaps in those same columns.
imputer = SimpleImputer(strategy='median')
imputer.fit(initial_train_df[continuous_cols])

imputed_train = imputer.transform(initial_train_df[continuous_cols])

In [None]:
# Wrap the imputed array back into a DataFrame, restoring the original row labels
# and the continuous column names, then print a summary of the result.
train_df = pd.DataFrame(
    data=imputed_train,
    index=initial_train_df.index,
    columns=continuous_cols,
)
train_df.info()

## Test Dataset

Apply the imputer fitted on the train dataset to the test dataset (transform only, without refitting).

In [None]:
# Load the test CSV; its first column is used as the row index.
# The path is a plain string: it has no placeholders, so an f-string is not needed.
initial_test_df = pd.read_csv('../data/categorized_test.csv', index_col=0)

In [None]:
# Print a summary (dtypes and non-null counts) of the raw test frame before imputing.
initial_test_df.info()

In [None]:
# Transform only (no refit): fill the test frame's continuous columns using the already-fitted imputer.
imputed_test = imputer.transform(initial_test_df[continuous_cols])

In [None]:
# Wrap the imputed array back into a DataFrame, restoring the test row labels
# and the continuous column names, then print a summary of the result.
test_df = pd.DataFrame(
    data=imputed_test,
    index=initial_test_df.index,
    columns=continuous_cols,
)
test_df.info()

## Saving the data

Merge the imputed values with the remaining (categorical) columns, then save the imputed datasets and the imputer model itself for future use.

In [None]:
# Column-wise concatenation: the imputed continuous columns come first,
# followed by the categorical columns taken from the original frames.
final_train_df = pd.concat(
    [train_df, initial_train_df[categorical_cols]],
    axis='columns',
)
final_test_df = pd.concat(
    [test_df, initial_test_df[categorical_cols]],
    axis='columns',
)

In [None]:
# Print a summary of the merged train frame (imputed continuous + categorical columns).
final_train_df.info()

In [None]:
# Print a summary of the merged test frame (imputed continuous + categorical columns).
final_test_df.info()

In [None]:
# Write the merged, imputed frames to disk (the index is written as the first column).
# The paths are plain strings: they have no placeholders, so f-strings are not needed.
final_train_df.to_csv('../data/post_impute_train.csv')
final_test_df.to_csv('../data/post_impute_test.csv')

In [None]:
# Serialize the fitted imputer to disk so it can be loaded again later without refitting.
joblib.dump(imputer, '../data/imputer.joblib')

## Plotting data

#### The distributions of train imputations

In [None]:
# One row per imputed column: histogram of the raw values (left) next to the
# histogram of the imputed values (right).
# squeeze=False keeps `axes` two-dimensional even when there is only one column,
# so every `ax` yielded by the loop is a (left, right) pair and `ax[0]`/`ax[1]` stay valid.
fig, axes = plt.subplots(
    nrows=train_df.shape[1], ncols=2, figsize=(25, 45), layout='constrained', squeeze=False
)
for col, ax in zip(train_df.columns, axes):
    sns.histplot(initial_train_df[col], ax=ax[0], bins=50).set(title=f'Initial: {col}', xlabel="")
    sns.histplot(train_df[col], ax=ax[1], bins=50).set(title=f'Imputed: {col}', xlabel="")
plt.show()

It seems the shapes of the PDFs are distorted by the imputed median values, especially for features that are missing a lot of values.

In [None]:
# One row per imputed column: scatter of the column against the target before
# imputation (left) and after imputation (right).
# Because the target is looked up in `train_df`, `helpers.target_feature` has to be
# one of the imputed (continuous) columns for this cell to run.
# squeeze=False keeps `axes` two-dimensional even when there is only one column,
# so every `ax` yielded by the loop is a (left, right) pair and `ax[0]`/`ax[1]` stay valid.
fig, axes = plt.subplots(
    nrows=train_df.shape[1], ncols=2, figsize=(25, 45), layout='constrained', squeeze=False
)
for col, ax in zip(train_df.columns, axes):
    sns.scatterplot(x=initial_train_df[col], y=initial_train_df[helpers.target_feature], ax=ax[0]).set(title=f'Initial: {col}', xlabel="")
    sns.scatterplot(x=train_df[col], y=train_df[helpers.target_feature], ax=ax[1]).set(title=f'Imputed: {col}', xlabel="")
plt.show()

#### The distributions of test imputations

In [None]:
# One row per imputed column: histogram of the raw test values (left) next to the
# histogram of the imputed test values (right).
# squeeze=False keeps `axes` two-dimensional even when there is only one column,
# so every `ax` yielded by the loop is a (left, right) pair and `ax[0]`/`ax[1]` stay valid.
fig, axes = plt.subplots(
    nrows=test_df.shape[1], ncols=2, figsize=(25, 45), layout='constrained', squeeze=False
)
for col, ax in zip(test_df.columns, axes):
    sns.histplot(initial_test_df[col], ax=ax[0], bins=50).set(title=f'Initial: {col}', xlabel="")
    sns.histplot(test_df[col], ax=ax[1], bins=50).set(title=f'Imputed: {col}', xlabel="")
plt.show()

Unfortunately, the imputation distorts the PDFs of the test dataset as well.

In [None]:
# One row per imputed column: scatter of the test column against the target before
# imputation (left) and after imputation (right).
# The loop iterates over `test_df.columns` so that the rows match the frame the grid
# is sized from and the data being plotted.
# Because the target is looked up in `test_df`, `helpers.target_feature` has to be
# one of the imputed (continuous) columns for this cell to run.
# squeeze=False keeps `axes` two-dimensional even when there is only one column,
# so every `ax` yielded by the loop is a (left, right) pair and `ax[0]`/`ax[1]` stay valid.
fig, axes = plt.subplots(
    nrows=test_df.shape[1], ncols=2, figsize=(25, 45), layout='constrained', squeeze=False
)
for col, ax in zip(test_df.columns, axes):
    sns.scatterplot(x=initial_test_df[col], y=initial_test_df[helpers.target_feature], ax=ax[0]).set(title=f'Initial: {col}', xlabel="")
    sns.scatterplot(x=test_df[col], y=test_df[helpers.target_feature], ax=ax[1]).set(title=f'Imputed: {col}', xlabel="")
plt.show()