#### Filling the missing value with mean/median
```plaintext 
use this method when we have. 
1) Data missing completely at random 
2) less than 5%

when to use mean or median
- if the data is some what following normal distribution - use mean 
- if the data is some what skewed use median

| **Advantages**                                     | **Disadvantages**                                                 |
|----------------------------------------------------|-------------------------------------------------------------------|
| Simple to use                                      | Changes the distribution of the data                              |
| Applicable when missing data is less than 5%       | May introduce outliers                                            |
|                                                    | Covariance and correlation structure can change                   |


In [159]:
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [160]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import gaussian_kde

In [161]:
df = pl.read_csv('Data_set\\titanic_toy.csv') 
df.head(2)

Age,Fare,Family,Survived
f64,f64,i64,i64
22.0,7.25,1,0
38.0,71.2833,1,1


Checking the null % value in the df

In [162]:
null_percentage = df.select([
    (pl.col(c).is_null().sum()/df.height)
    for c in df.columns
]) * 100

null_percentage = null_percentage.unpivot(
    index = [],
    variable_name='column',
    value_name = 'Percentage (%)'
)

null_percentage

column,Percentage (%)
str,f64
"""Age""",19.86532
"""Fare""",5.050505
"""Family""",0.0
"""Survived""",0.0


In [163]:
x = df.drop('Survived')
y = df['Survived']

In [164]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 2)
x_train.shape, x_test.shape

((712, 3), (179, 3))

In [165]:
x_train_null_perc = df.select([
    (pl.col(c).is_null().sum()/df.height)
    for c in df.columns
]) * 100

null_percentage = x_train_null_perc.unpivot(
    index = [],
    variable_name='column',
    value_name = 'Percentage %'
)
null_percentage

column,Percentage %
str,f64
"""Age""",19.86532
"""Fare""",5.050505
"""Family""",0.0
"""Survived""",0.0


Getting the mean and median of the age, fair

In [166]:
mean_age = x_train.select(pl.col('Age').mean()).item()
median_age = x_train.select(pl.col('Age').median()).item()

mean_fare = x_train.select(pl.col('Fare').mean()).item()
median_fare = x_train.select(pl.col('Fare')).median()

In [167]:
x_train = x_train.with_columns([
    pl.col('Age').fill_null(median_age).alias('Age_median'),
    pl.col('Age').fill_null(mean_age).alias('Age_mean'),
    
    pl.col('Fare').fill_null(median_fare).alias('Fare_median'),
    pl.col('Fare').fill_null(mean_fare).alias('Fare_mean')
])

x_train.sample(3)

Age,Fare,Family,Age_median,Age_mean,Fare_median,Fare_mean
f64,f64,i64,f64,f64,f64,f64
70.0,71.0,2,70.0,70.0,71.0,71.0
30.0,13.0,0,30.0,30.0,13.0,13.0
15.0,7.2292,2,15.0,15.0,7.2292,7.2292


When we replace the the null values with mean, median - we see a change in the variance, mostly shrink in variance

In [168]:
var_summary = pl.DataFrame({
    'Feature' : [
        'Age', 'Age_median', 'Age_mean', 'Fare', 'Fare_median', 'Fare_mean'
    ],
    'Variance' : [
        x_train['Age'].var(),
        x_train['Age_median'].var(),
        x_train['Age_mean'].var(),
        x_train['Fare'].var(),
        x_train['Fare_median'].var(),
        x_train['Fare_mean'].var()
    ]
})

var_summary

Feature,Variance
str,f64
"""Age""",204.349513
"""Age_median""",161.989566
"""Age_mean""",161.812625
"""Fare""",2448.197914
"""Fare_median""",2340.091022
"""Fare_mean""",2324.238526


Plotting the KDE to see the distribution

In [169]:
age_orig = x_train['Age'].drop_nulls().to_numpy()
age_median = x_train['Age_median'].drop_nulls().to_numpy()
age_mean = x_train['Age_mean'].drop_nulls().to_numpy()

x_vals = np.linspace(min(age_orig.min(), age_median.min(), age_mean.min()),
                     max(age_orig.max(), age_median.max(), age_mean.max()), 200)

kde_original  = gaussian_kde(age_orig)(x_vals)
kde_median    = gaussian_kde(age_median)(x_vals)
kde_mean      = gaussian_kde(age_mean)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_original,
    name = 'Original',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals, 
    y = kde_median, 
    name = 'Median Imputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals, 
    y = kde_mean, 
    name = 'Mean Imputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'green'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.update_layout(
    title = 'KDE Plot of Age Imputation Methods',
    xaxis_title = 'Age',
    yaxis_title = 'Density',
    plot_bgcolor = 'white',
    yaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1, griddash='dot'),
    xaxis=dict(showgrid=False)
)

fig.show()


In [170]:
fare_orig = x_train['Fare'].drop_nulls().to_numpy()
fare_median = x_train['Fare_median'].drop_nulls().to_numpy()
fare_mean = x_train['Fare_mean'].drop_nulls().to_numpy()

x_vals = np.linspace(min(fare_orig.min(), fare_median.min(), fare_mean.min()),
                     max(fare_orig.max(), fare_median.max(), fare_mean.max()), 200)

kde_original = gaussian_kde(fare_orig)(x_vals)
kde_median   = gaussian_kde(fare_median)(x_vals)
kde_mean     = gaussian_kde(fare_mean)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_original,
    name = 'Original',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_trace(go.Scatter(
    x = x_vals,
    y = kde_median, 
    name = "Median Inputer",
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(0,128,0,0.3)'  
))

fig.add_traces(go.Scatter(
    x = x_vals,
    y = kde_mean,
    name = 'Mean Inputer',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'green'),
    fillcolor = 'rgba(0,128,0,0.3)'
))

fig.update_layout(
    title = 'KDE Plot Age Imputation Methods',
    xaxis_title = 'Age',
    yaxis_title = 'Density',
    plot_bgcolor = 'white',
    yaxis = dict(showgrid = True, gridcolor = 'lightgray', gridwidth = 1, griddash='dot'),
    xaxis = dict(showgrid = False)
)

fig.show()

Check the covariance matrix

In [171]:
cov_matrix = x_train.to_pandas().cov()
cov_matrix

Unnamed: 0,Age,Fare,Family,Age_median,Age_mean,Fare_median,Fare_mean
Age,204.349513,70.719262,-6.498901,204.349513,204.349513,64.858859,66.665205
Fare,70.719262,2448.197914,17.258917,57.957599,55.603719,2448.197914,2448.197914
Family,-6.498901,17.258917,2.735252,-5.112563,-5.146106,16.476305,16.385048
Age_median,204.349513,57.957599,-5.112563,161.989566,161.812625,53.553455,55.023037
Age_mean,204.349513,55.603719,-5.146106,161.812625,161.812625,51.358,52.788341
Fare_median,64.858859,2448.197914,16.476305,53.553455,51.358,2340.091022,2324.238526
Fare_mean,66.665205,2448.197914,16.385048,55.023037,52.788341,2324.238526,2324.238526


In [172]:
corr_matrix = x_train.to_pandas().corr()
corr_matrix

Unnamed: 0,Age,Fare,Family,Age_median,Age_mean,Fare_median,Fare_mean
Age,1.0,0.092644,-0.299113,1.0,1.0,0.087356,0.090156
Fare,0.092644,1.0,0.208268,0.091757,0.088069,1.0,1.0
Family,-0.299113,0.208268,1.0,-0.242883,-0.24461,0.205942,0.205499
Age_median,1.0,0.091757,-0.242883,1.0,0.999454,0.086982,0.089673
Age_mean,1.0,0.088069,-0.24461,0.999454,1.0,0.083461,0.086078
Fare_median,0.087356,1.0,0.205942,0.086982,0.083461,1.0,0.996607
Fare_mean,0.090156,1.0,0.205499,0.089673,0.086078,0.996607,1.0


In [173]:
df_long = x_train.select(['Age', 'Age_median', 'Age_mean']).to_pandas().melt(
    var_name = 'Imputation Method',
    value_name = 'Imputed Age'
)

fig = px.box(df_long, x='Imputation Method', y='Imputed Age', title = 'Boxplot of Age Imputation Method ')
fig.show()

---
Using Sklearn

In [174]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=2)

In [175]:
imputer1 = SimpleImputer(strategy = 'median')
imputer2 = SimpleImputer(strategy = 'mean')

In [176]:
trf = ColumnTransformer([
    ('imputer1', imputer1, ['Age']),
    ('imputer2', imputer2, ['Fare'])
],remainder='passthrough')

In [177]:
trf.fit(x_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [178]:
trf.named_transformers_['imputer1'].statistics_        # gets the values that are going to be used to fill the missing data
trf.named_transformers_['imputer2'].statistics_

array([32.61759689])

In [179]:
x_train = trf.transform(x_train)
x_test = trf.transform(x_test)

x_train

array([[ 40.    ,  27.7208,   0.    ],
       [  4.    ,  16.7   ,   2.    ],
       [ 47.    ,   9.    ,   0.    ],
       ...,
       [ 71.    ,  49.5042,   0.    ],
       [ 28.75  , 221.7792,   0.    ],
       [ 28.75  ,  25.925 ,   0.    ]])