#### Univariate Analysis

In [329]:
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [330]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import gaussian_kde

In [331]:
df = pl.read_csv(
    'Data_set\\train.csv',
    null_values = 'NA',     # the is NA value in int column treat it as a missing value 
).select(['GarageQual', 'FireplaceQu', 'SalePrice'])
df.head(2)

GarageQual,FireplaceQu,SalePrice
str,str,i64
"""TA""",,208500
"""TA""","""TA""",181500


In [332]:
null_percentage = df.select([
    (pl.col(c).is_null().sum()/df.height)
    for c in df.columns
]) * 100

null_percentage = null_percentage.unpivot(
    index = [],
    variable_name='column',
    value_name = 'Percentage (%)'
)

null_percentage

column,Percentage (%)
str,f64
"""GarageQual""",5.547945
"""FireplaceQu""",47.260274
"""SalePrice""",0.0


In [333]:
vc = df['GarageQual'].to_pandas().value_counts().reset_index()
vc.columns = ['GarageQual', 'Count']

fig = px.bar(vc, x='GarageQual', y='Count', title='GarageQual Value Counts')
fig.show()

In [334]:
df['GarageQual'].mode()    # returns the most frequent value in the column

GarageQual
str
"""TA"""


In [335]:
garage_ta = df.filter(pl.col('GarageQual') == 'TA').select('SalePrice').drop_nulls().to_series().to_pandas()    # selecting the column and removign the null values from it 
garage_na = df.filter(pl.col('GarageQual').is_null()).select('SalePrice').drop_nulls().to_series().to_pandas()

x_vals = np.linspace(min(garage_ta.min(), garage_na.min()),
                     max(garage_ta.max(), garage_na.max()),200)

kde_ta = gaussian_kde(garage_ta)(x_vals)
kde_na = gaussian_kde(garage_na)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_ta,
    name = 'House with TA',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_traces(go.Scatter(
    x = x_vals,
    y = kde_na, 
    name = 'House with NA',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(255,0,0,0.3)'
))

fig.update_layout(
    title = 'GarageQual vs SalePrice (KDE)',
    xaxis_title = 'SalePrice',
    yaxis_title = 'Density',
    width = 1000,
    height = 500,
    plot_bgcolor = 'white',
    yaxis = dict(showgrid = True, gridcolor = 'lightgray'), 
    xaxis = dict(showgrid = False)
)

fig.show()

Observation - 
```plaintext 
we can see that the distribution of house with TA and House with NA W.R.T to house price is very different. 

In [336]:
temp = df.filter(pl.col('GarageQual') == 'TA').select('SalePrice').drop_nulls().to_series().to_pandas()    # storing the value of the column when it had null values

##### Filling the missing values with The MODE of the particular column

In [337]:
df = df.with_columns(
    pl.col("GarageQual").fill_null('TA').alias('GarageQual')
)

In [338]:
vc = df['GarageQual'].to_pandas().value_counts().reset_index()
vc.columns = ['GarageQual', 'Count']

fig = px.bar(vc, x='GarageQual', y='Count', title='GarageQual Value Counts')
fig.show()

In [339]:
garage_ta = df.filter(pl.col('GarageQual') == 'TA').select('SalePrice').to_series().to_pandas()    # now this stores the column where there is no null values in it we are replaced the null values above

x_vals = np.linspace(min(garage_ta.min(), temp.min()),
                     max(garage_ta.max(), temp.max()),200)

kde_ta = gaussian_kde(garage_ta)(x_vals)
kde_na = gaussian_kde(temp)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_ta,
    name = 'TA with no NA IMPUTED',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_traces(go.Scatter(
    x = x_vals,
    y = kde_na, 
    name = 'TA with NA ORIGINAL',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(255,0,0,0.3)'
))

fig.update_layout(
    title = 'GarageQual vs SalePrice (KDE)',
    xaxis_title = 'SalePrice',
    yaxis_title = 'Density',
    width = 1000,
    height = 500,
    plot_bgcolor = 'white',
    yaxis = dict(showgrid = True, gridcolor = 'lightgray'), 
    xaxis = dict(showgrid = False)
)

fig.show()

Observation - we can see that the frequency of the bar graph for the TA column has increases the the distribution is now almost the same. in KDE


Now why does the distribution comes out to be almost the same. 
- the reason is because we had almost close to 5 percent missing value hence the distribution ended up matching.

but when we have a column with large % of missing value we see unexpected results the below code will give an demonstration. 

---

In [340]:
vc = df['FireplaceQu'].to_pandas().value_counts().reset_index()
vc.columns = ['FireplaceQu', 'Count']

fig = px.bar(vc, x = 'FireplaceQu', y = 'Count', title = 'FireplaceQu Value Counts')
fig.show()

Key Observation - 

When we are planning on replacing the missing values of a categorical colum with the most frequent value we need to make sure the MODE of the column has to be dominant, but we see that there are 2 category with almost the same frequency. hence we will be facing issues ahead. 

In [341]:
df['FireplaceQu'].mode()
# we will see null as most of the values in the column are null hence we refer the bar graph

FireplaceQu
str
""


In [342]:
FireplaceQu_ta = df.filter(pl.col('FireplaceQu') == 'Gd').select('SalePrice').drop_nulls().to_series().to_pandas()    # selecting the column and removign the null values from it 
FireplaceQu_na = df.filter(pl.col('FireplaceQu').is_null()).select('SalePrice').drop_nulls().to_series().to_pandas()

x_vals = np.linspace(min(FireplaceQu_ta.min(), FireplaceQu_na.min()),
                     max(FireplaceQu_ta.max(), FireplaceQu_na.max()),200)

kde_Gd = gaussian_kde(FireplaceQu_ta)(x_vals)
kde_na = gaussian_kde(FireplaceQu_na)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_Gd,
    name = 'FireplaceQu with no NA',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_traces(go.Scatter(
    x = x_vals,
    y = kde_na, 
    name = 'FireplaceQu with NA',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(255,0,0,0.3)'
))

fig.update_layout(
    title = 'GarageQual vs SalePrice (KDE)',
    xaxis_title = 'SalePrice',
    yaxis_title = 'Density',
    width = 1000,
    height = 500,
    plot_bgcolor = 'white',
    yaxis = dict(showgrid = True, gridcolor = 'lightgray'), 
    xaxis = dict(showgrid = False)
)

fig.show()

Observation the distribution is very different because we are missing more than 50% values in the column

In [343]:
temp = df.filter(pl.col('FireplaceQu') == 'Gd').select('SalePrice').drop_nulls().to_series().to_pandas()   # storign the colum value when it has NA (filtering it out over here) will be used for plotting later

filling the null values with Gd

In [344]:
df = df.with_columns(
    pl.col("FireplaceQu").fill_null('Gd').alias('FireplaceQu')
)

In [345]:
vc = df['FireplaceQu'].to_pandas().value_counts().reset_index()
vc.columns = ['FireplaceQu', 'Count']

fig = px.bar(vc, x = 'FireplaceQu', y = 'Count', title = 'FireplaceQu Value Counts')
fig.show()

We can see that the frequency has increased vary much of the Gd column

In [346]:
FireplaceQu_ta = df.filter(pl.col('FireplaceQu') == 'Gd').select('SalePrice').to_series().to_pandas()    # selecting the column and removign the null values from it 

x_vals = np.linspace(min(FireplaceQu_ta.min(), temp.min()),
                     max(FireplaceQu_ta.max(), temp.max()),200)

kde_Gd = gaussian_kde(FireplaceQu_ta)(x_vals)
kde_na = gaussian_kde(temp)(x_vals)

fig = go.Figure()

fig.add_traces(go.Scatter(
    x = x_vals, 
    y = kde_Gd,
    name = 'FireplaceQu with no NA',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'blue'),
    fillcolor = 'rgba(0,0,255,0.3)'
))

fig.add_traces(go.Scatter(
    x = x_vals,
    y = kde_na, 
    name = 'FireplaceQu with NA',
    mode = 'lines',
    fill = 'tozeroy',
    line = dict(color = 'red'),
    fillcolor = 'rgba(255,0,0,0.3)'
))

fig.update_layout(
    title = 'GarageQual vs SalePrice (KDE)',
    xaxis_title = 'SalePrice',
    yaxis_title = 'Density',
    width = 1000,
    height = 500,
    plot_bgcolor = 'white',
    yaxis = dict(showgrid = True, gridcolor = 'lightgray'), 
    xaxis = dict(showgrid = False)
)

fig.show()

We see that the distribution still differs a lot because we replaced 50% null values with the mode values. 

In [347]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [348]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('SalePrice'), df['SalePrice'], test_size=0.2)

In [349]:
imputer = SimpleImputer(strategy='most_frequent')

In [350]:
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_train)


X does not have valid feature names, but SimpleImputer was fitted with feature names



In [351]:
imputer.statistics_

array(['TA', 'Gd'], dtype=object)