## Analysing the Categorical Features - House Pricing

In [1]:
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
def display_scrollable_df(df, *, title="DataFrame"):
    """Render a DataFrame as an HTML table inside a fixed-height, scrollable box.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render; it is converted to markup with ``df.to_html()``.
    title : str, keyword-only
        Heading shown above the table. It is HTML-escaped, so characters such
        as ``<`` or ``&`` are shown literally instead of being parsed as markup.

    Returns
    -------
    None
        The markup is sent to the cell output through ``display(HTML(...))``.
    """
    from html import escape  # local import keeps this cell self-contained

    # Plain string: nothing is interpolated into the CSS, so no f-string needed.
    style = """
    <style>
    .scrollable-table { 
        overflow-y: auto; 
        height: 400px; 
        border: 1px solid #ddd; 
        padding: 8px; 
    }
    </style>
    """
    scrollable_div = f"""
    <h3>{escape(str(title))}</h3>
    <div class='scrollable-table'>{df.to_html()}</div>
    """

    display(HTML(style + scrollable_div))

### Loading data

In [3]:
# Read the full training set, then keep only the object-dtype columns for the
# categorical analysis below.
df = pd.read_csv(filepath_or_buffer="./../data/train.csv")
df_categorical = df.select_dtypes(include=["object"])

In [4]:
# Preview the first rows of the categorical subset in a scrollable HTML table.
display_scrollable_df(df_categorical.head(), title="The head of categorical df")

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [5]:
# Per-column non-null counts and dtypes: shows which categorical columns have gaps.
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   Alley          91 non-null     object
 3   LotShape       1460 non-null   object
 4   LandContour    1460 non-null   object
 5   Utilities      1460 non-null   object
 6   LotConfig      1460 non-null   object
 7   LandSlope      1460 non-null   object
 8   Neighborhood   1460 non-null   object
 9   Condition1     1460 non-null   object
 10  Condition2     1460 non-null   object
 11  BldgType       1460 non-null   object
 12  HouseStyle     1460 non-null   object
 13  RoofStyle      1460 non-null   object
 14  RoofMatl       1460 non-null   object
 15  Exterior1st    1460 non-null   object
 16  Exterior2nd    1460 non-null   object
 17  MasVnrType     588 non-null    object
 18  ExterQual      1460 non-null

In [6]:
# Summary per column (count, unique, top, freq); transposed so that each
# feature is a row instead of a column.
df_categorical.describe().transpose()

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [7]:
# Column labels of the full training frame (not only the categorical subset).
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Elimination of NaNs

In [8]:
# Labels of the categorical columns that contain at least one missing value.
NAN_COLS = df_categorical.columns[df_categorical.isna().any()]

In [9]:
# Number of missing values in each of the affected columns.
df_categorical.loc[:, NAN_COLS].isna().sum()

Alley           1369
MasVnrType       872
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [10]:
# Label frequencies for every column with missing values; value_counts()
# leaves NaN out, so only the labels that are actually present are listed.
for col in NAN_COLS:
    label_counts = df_categorical[col].value_counts()
    print(label_counts, '\n')

Alley
Grvl    50
Pave    41
Name: count, dtype: int64 

MasVnrType
BrkFace    445
Stone      128
BrkCmn      15
Name: count, dtype: int64 

BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64 

BsmtCond
TA    1311
Gd      65
Fa      45
Po       2
Name: count, dtype: int64 

BsmtExposure
No    953
Av    221
Gd    134
Mn    114
Name: count, dtype: int64 

BsmtFinType1
Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: count, dtype: int64 

BsmtFinType2
Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: count, dtype: int64 

Electrical
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: count, dtype: int64 

FireplaceQu
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: count, dtype: int64 

GarageType
Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: count, dtype: int64 

GarageFinish
Unf    605
RFn    422
Fin    352
Name: count, dtype: int64 


In [11]:
# Turn every missing categorical value into an explicit "None" label so that it
# is treated as a category of its own by the encoding further down.
# The result is re-bound instead of using inplace=True: df_categorical is derived
# from df, and pandas discourages in-place mutation of derived frames.
# NOTE(review): presumably NaN means "feature not present" for most of these
# columns (Alley, Garage*, PoolQC, ...). Electrical has a single NaN, which is
# more likely a genuinely missing record - confirm whether "None" is wanted there.
df_categorical = df_categorical.fillna("None")

In [12]:
# Sanity check: every previously affected column should now report zero NaNs.
df_categorical.loc[:, NAN_COLS].isna().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [13]:
# Number of distinct labels per column; to_frame() turns the Series into a
# one-column DataFrame so that it can be rendered as an HTML table.
unique_counts = df_categorical.nunique().to_frame()
display_scrollable_df(unique_counts, title="Number of unique counts")

Unnamed: 0,0
MSZoning,5
Street,2
Alley,3
LotShape,4
LandContour,4
Utilities,2
LotConfig,5
LandSlope,3
Neighborhood,25
Condition1,9


### Applying the ordinal encoding

In [14]:
def rank_categories_by_mean_price(categories, prices):
    """Map each category label to its rank by mean price (0 = lowest mean).

    Parameters
    ----------
    categories : pandas.Series
        Category label of every row.
    prices : pandas.Series
        Target value of every row, aligned with ``categories`` on the index.

    Returns
    -------
    dict
        ``{label: rank}`` with consecutive integer ranks starting at 0.
        Labels whose means are equal keep their order of first appearance.
    """
    mean_price = (
        prices.groupby(categories, sort=False)  # groups stay in first-appearance order
        .mean()
        .sort_values(kind="stable")  # stable sort: ties keep that order
    )
    return {label: rank for rank, label in enumerate(mean_price.index)}


# Every mapping is kept so that the same encoding can be re-applied to other
# data (e.g. a test set) or inverted later.
# NOTE(review): the ranks are derived from SalePrice of the very rows that are
# encoded, so the encoded features carry target information; when validating a
# model, the mappings should be fitted on the training folds only.
# Re-running this cell on an already encoded frame replaces the label mappings
# with identity mappings over the integer codes.
ORDINAL_MAPPINGS = {}

for col in df_categorical.columns:
    mapping = rank_categories_by_mean_price(df_categorical[col], df["SalePrice"])
    ORDINAL_MAPPINGS[col] = mapping
    df_categorical[col] = df_categorical[col].map(mapping)

# Any value without a rank (e.g. a NaN that was never filled) would become NaN here.
assert not df_categorical.isna().any().any(), "ordinal encoding produced missing values"

In [15]:
# Preview the encoded frame: every label has been replaced by an integer rank.
display_scrollable_df(df_categorical.head(), title="The head of categorical df")

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,3,1,2,0,1,1,0,0,16,4,4,4,6,1,2,11,12,2,2,3,5,3,3,1,6,5,5,4,1,5,2,6,1,5,2,3,5,2,0,4,3,4,4
1,3,1,2,0,1,1,1,0,20,2,4,4,5,1,2,4,5,1,1,3,2,3,3,4,4,5,5,4,1,5,1,6,3,5,2,3,5,2,0,4,3,4,4
2,3,1,2,1,1,1,0,0,16,4,4,4,6,1,2,11,12,2,2,3,5,3,3,2,6,5,5,4,1,5,2,6,3,5,2,3,5,2,0,4,3,4,4
3,3,1,2,1,1,1,2,0,17,4,4,4,6,1,2,5,8,1,1,3,1,2,4,1,4,5,5,3,1,5,2,6,4,2,1,3,5,2,0,4,3,4,1
4,3,1,2,1,1,1,1,0,24,4,4,4,6,1,2,11,12,2,2,3,5,3,3,3,6,5,5,4,1,5,2,6,3,5,2,3,5,2,0,4,3,4,4


### Saving the encoded data

In [16]:
# Persist the encoded features; index=False keeps the row index out of the file.
encoded_csv_path = "./../data/categorical_train.csv"
df_categorical.to_csv(encoded_csv_path, index=False)