#### Univariate Imputation

```plaintext
We fill the missing value in the data with some arbitary value. 

we fill the categorical missing value with the term Missing so that the ML model can learn to differentiate between missing data and non missing data.

for numerical data we fill with some random value
```

| **Advantages**       | **Disadvantages**                           |
|----------------------|---------------------------------------------|
| Easy to apply        | PDF gets distorted                          |
|                      | Variance changes                            |
|                      | Covariance structure changes                |


In [170]:
import pandas as pd
import numpy as np 
import polars as pl
import plotly.graph_objects as go
import plotly.express as px

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import gaussian_kde

In [172]:
df = pl.read_csv('Data_set\\titanic_toy.csv') 
df.head(2)

Age,Fare,Family,Survived
f64,f64,i64,i64
22.0,7.25,1,0
38.0,71.2833,1,1


In [173]:
x = df.drop('Survived')
y = df['Survived']

In [174]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 2)
x_train.shape, x_test.shape

((712, 3), (179, 3))

using random value imputation

In [175]:
x_train = x_train.with_columns([
    pl.col('Age').fill_null(99).alias('Age_99'),
    pl.col('Age').fill_null(-1).alias('Age_minus1'),
    pl.col('Fare').fill_null(999).alias('Fare_999'),
    pl.col('Fare').fill_null(-1).alias('Fare_minus1')
])

In [176]:
var_summary = pl.DataFrame({
    'Feature' : [
        'Age', 'Age_median', 'Age_mean', 'Fare', 'Fare_median', 'Fare_mean'
    ],
    'Variance' : [
        x_train['Age'].var(),
        x_train['Age_99'].var(),
        x_train['Age_minus1'].var(),
        x_train['Fare'].var(),
        x_train['Fare_999'].var(),
        x_train['Fare_minus1'].var()
    ]
})

var_summary

Feature,Variance
str,f64
"""Age""",204.349513
"""Age_median""",951.727557
"""Age_mean""",318.08962
"""Fare""",2448.197914
"""Fare_median""",47219.202652
"""Fare_mean""",2378.567678


In [None]:
age_orig = x_train['Age'].drop_nulls().to_numpy()
age_999 = x_train['Age_99'].drop_nulls().to_numpy()
age_minus1 = x_train['Age_minus1'].drop_nulls().to_numpy()

x_vals = np.linspace(min(age_orig.min(), age_999.min(), age_minus1.min()),
                     max(age_orig.max(), age_999.max(), age_minus1.max()), 200)

kde_original  = gaussian_kde(age_orig)(x_vals)
kde_median    = gaussian_kde(age_999)(x_vals)
kde_mean      = gaussian_kde(age_minus1)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_original,
    name = 'Original',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals, 
    y = kde_median, 
    name = 'Median Imputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals, 
    y = kde_mean, 
    name = 'Mean Imputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'green'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.update_layout(
    title = 'KDE Plot of Age Imputation Methods',
    xaxis_title = 'Age',
    yaxis_title = 'Density',
    plot_bgcolor = 'white',
    yaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1, griddash='dot'),
    xaxis=dict(showgrid=False)
)

fig.show()

In [178]:
fare_org = x_train['Fare'].drop_nulls().to_numpy()
Fare_999 = x_train['Fare_999'].drop_nulls().to_numpy()
Fare_minus1 = x_train['Fare_minus1'].drop_nulls().to_numpy()

x_vals = np.linspace(min(fare_org.min(), Fare_999.min(), Fare_minus1.min()),
                     max(fare_org.max(), Fare_999.max(), Fare_minus1.max()), 200)

kde_original  = gaussian_kde(fare_org)(x_vals)
kde_median    = gaussian_kde(Fare_999)(x_vals)
kde_mean      = gaussian_kde(Fare_minus1)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_original,
    name = 'Original',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals, 
    y = kde_median, 
    name = 'Median Imputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals, 
    y = kde_mean, 
    name = 'Mean Imputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'green'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.update_layout(
    title = 'KDE Plot of Age Imputation Methods',
    xaxis_title = 'Age',
    yaxis_title = 'Density',
    plot_bgcolor = 'white',
    yaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1, griddash='dot'),
    xaxis=dict(showgrid=False)
)

fig.show()

Checking the covariance matrix

In [179]:
cov_matrix = x_train.to_pandas().cov()
cov_matrix

Unnamed: 0,Age,Fare,Family,Age_99,Age_minus1,Fare_999,Fare_minus1
Age,204.349513,70.719262,-6.498901,204.349513,204.349513,162.79343,63.321188
Fare,70.719262,2448.197914,17.258917,-101.671097,125.558364,2448.197914,2448.197914
Family,-6.498901,17.258917,2.735252,-7.387287,-4.149246,11.528625,16.553989
Age_99,204.349513,-101.671097,-7.387287,951.727557,-189.53554,-159.931663,-94.3174
Age_minus1,204.349513,125.558364,-4.149246,-189.53554,318.08962,257.379887,114.394141
Fare_999,162.79343,2448.197914,11.528625,-159.931663,257.379887,47219.202652,762.474982
Fare_minus1,63.321188,2448.197914,16.553989,-94.3174,114.394141,762.474982,2378.567678


In [180]:
corr_matrix = x_train.to_pandas().corr()
corr_matrix

Unnamed: 0,Age,Fare,Family,Age_99,Age_minus1,Fare_999,Fare_minus1
Age,1.0,0.092644,-0.299113,1.0,1.0,0.051179,0.084585
Fare,0.092644,1.0,0.208268,-0.066273,0.142022,1.0,1.0
Family,-0.299113,0.208268,1.0,-0.144787,-0.140668,0.032079,0.205233
Age_99,1.0,-0.066273,-0.144787,1.0,-0.344476,-0.023857,-0.062687
Age_minus1,1.0,0.142022,-0.140668,-0.344476,1.0,0.066411,0.131514
Fare_999,0.051179,1.0,0.032079,-0.023857,0.066411,1.0,0.071946
Fare_minus1,0.084585,1.0,0.205233,-0.062687,0.131514,0.071946,1.0


Using Sklearn

In [181]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=2)

In [182]:
imputer1 = SimpleImputer(strategy='constant', fill_value=99)
imputer2 = SimpleImputer(strategy='constant', fill_value=999)

In [183]:
trf = ColumnTransformer([
    ('imputer1', imputer1, ['Age']),
    ('imputer2',imputer2,['Fare'])
],remainder='passthrough')

In [184]:
trf.fit(x_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [185]:
trf.named_transformers_['imputer1'].statistics_
trf.named_transformers_['imputer2'].statistics_

array([999.])

In [187]:
x_train.head()

Age,Fare,Family
f64,f64,i64
40.0,27.7208,0
4.0,16.7,2
47.0,9.0,0
9.0,31.3875,6
20.0,9.8458,0
