## Analysing the Numerical Features - House Pricing

In [1]:
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
def display_scrollable_df(df, *, title="DataFrame"):
    style = f"""
    <style>
    .scrollable-table {{ 
        overflow-y: auto; 
        height: 400px; 
        border: 1px solid #ddd; 
        padding: 8px; 
    }}
    </style>
    """
    scrollable_div = f"""
    <h3>{title}</h3>
    <div class='scrollable-table'>{df.to_html()}</div>
    """
    
    display(HTML(style + scrollable_div))

### Loading data

In [3]:
df = pd.read_csv("./../data/train.csv")
df_numerical = df.select_dtypes(include=np.number)

In [4]:
display_scrollable_df(df_numerical.head(), title="Numerical DataFrame")

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,0,150,856,856,854,0,1710,1,0,2,1,3,1,8,0,2003.0,2,548,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,0,284,1262,1262,0,0,1262,0,1,2,0,3,1,6,1,1976.0,2,460,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,0,434,920,920,866,0,1786,1,0,2,1,3,1,6,1,2001.0,2,608,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,0,540,756,961,756,0,1717,1,0,1,0,3,1,7,1,1998.0,3,642,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,0,490,1145,1145,1053,0,2198,1,0,2,1,4,1,9,1,2000.0,3,836,192,84,0,0,0,0,0,12,2008,250000


In [5]:
df_numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [6]:
display_scrollable_df(df.describe().T, title="DataFrame Description")
print(df_numerical.shape)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


(1460, 38)


### Elimination of NaNs

In [7]:
display_scrollable_df(pd.concat([df_numerical.isna().sum()], axis=1), title="DataFrame NaN Counts")

Unnamed: 0,0
Id,0
MSSubClass,0
LotFrontage,259
LotArea,0
OverallQual,0
OverallCond,0
YearBuilt,0
YearRemodAdd,0
MasVnrArea,8
BsmtFinSF1,0


#### LotFrontage

In [8]:
bools = df_numerical.loc[:, "LotFrontage"].isna().tolist()
df_lotFrontage = df_numerical.loc[bools, ["LotFrontage", "LotArea"]]
display_scrollable_df(df_lotFrontage, title="NaNs of LotFrontage")
df_lotFrontage.shape

Unnamed: 0,LotFrontage,LotArea
7,,10382
12,,12968
14,,10920
16,,11241
24,,8246
31,,8544
42,,9180
43,,9200
50,,13869
64,,9375


(259, 2)

In [9]:
df_numerical["LotFrontage"].fillna(df_numerical["LotFrontage"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numerical["LotFrontage"].fillna(df_numerical["LotFrontage"].mean(), inplace=True)


In [10]:
display_scrollable_df(df_numerical[["LotFrontage", "LotArea"]], title="Lot Numerical DataFrame")

Unnamed: 0,LotFrontage,LotArea
0,65.0,8450
1,80.0,9600
2,68.0,11250
3,60.0,9550
4,84.0,14260
5,85.0,14115
6,75.0,10084
7,70.049958,10382
8,51.0,6120
9,50.0,7420


#### MasVnrArea

In [11]:
bools = df_numerical.loc[:, "MasVnrArea"].isna().tolist()
df_masVnrArea = df.loc[bools, ["MasVnrArea", "MasVnrType"]]
display_scrollable_df(df_masVnrArea, title="NaNs of MasVnrArea")
df_masVnrArea.shape

Unnamed: 0,MasVnrArea,MasVnrType
234,,
529,,
650,,
936,,
973,,
977,,
1243,,
1278,,


(8, 2)

In [12]:
df_numerical["MasVnrArea"].fillna(value=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numerical["MasVnrArea"].fillna(value=0, inplace=True)


In [13]:
display_scrollable_df(df_numerical[["MasVnrArea"]], title="MasVnrArea DataFrame")

Unnamed: 0,MasVnrArea
0,196.0
1,0.0
2,162.0
3,0.0
4,350.0
5,0.0
6,186.0
7,240.0
8,0.0
9,0.0


#### GarageYrBlt

In [14]:
df_garage = df.loc[:, ["GarageYrBlt", "GarageType", "GarageFinish", "GarageQual", "GarageCond"]]
display_scrollable_df(df_garage.iloc[df_garage.isna().any(axis=1).tolist(), :], title="Garage NaNs")

Unnamed: 0,GarageYrBlt,GarageType,GarageFinish,GarageQual,GarageCond
39,,,,,
48,,,,,
78,,,,,
88,,,,,
89,,,,,
99,,,,,
108,,,,,
125,,,,,
127,,,,,
140,,,,,


In [15]:
df_numerical["IsGarageIncluded"] = ~df_numerical["GarageYrBlt"].isna()
df_numerical["GarageYrBlt"].fillna(value=-1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numerical["GarageYrBlt"].fillna(value=-1, inplace=True)


In [16]:
display_scrollable_df(df_numerical[["GarageYrBlt", "IsGarageIncluded", "GarageCars"]], title="Garage Numerical DataFrame")

Unnamed: 0,GarageYrBlt,IsGarageIncluded,GarageCars
0,2003.0,True,2
1,1976.0,True,2
2,2001.0,True,2
3,1998.0,True,3
4,2000.0,True,3
5,1993.0,True,2
6,2004.0,True,2
7,1973.0,True,2
8,1931.0,True,2
9,1939.0,True,1


### Casting to right data types

#### Integer

In [17]:
INT_COLS = ["MSSubClass", "OverallQual", "OverallCond", 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']

In [18]:
df_numerical["MSSubClass"] /= 10

for col in INT_COLS:
    df_numerical[col] = df_numerical[col].astype(int)

In [19]:
display_scrollable_df(df_numerical.select_dtypes(include=np.integer), title="Integer DataFrame")

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,6,8450,7,5,2003,2003,706,0,150,856,856,854,0,1710,1,0,2,1,3,1,8,0,2003,2,548,0,61,0,0,0,0,0,2,2008,208500
1,2,2,9600,6,8,1976,1976,978,0,284,1262,1262,0,0,1262,0,1,2,0,3,1,6,1,1976,2,460,298,0,0,0,0,0,0,5,2007,181500
2,3,6,11250,7,5,2001,2002,486,0,434,920,920,866,0,1786,1,0,2,1,3,1,6,1,2001,2,608,0,42,0,0,0,0,0,9,2008,223500
3,4,7,9550,7,5,1915,1970,216,0,540,756,961,756,0,1717,1,0,1,0,3,1,7,1,1998,3,642,0,35,272,0,0,0,0,2,2006,140000
4,5,6,14260,8,5,2000,2000,655,0,490,1145,1145,1053,0,2198,1,0,2,1,4,1,9,1,2000,3,836,192,84,0,0,0,0,0,12,2008,250000
5,6,5,14115,5,5,1993,1995,732,0,64,796,796,566,0,1362,1,0,1,1,1,1,5,0,1993,2,480,40,30,0,320,0,0,700,10,2009,143000
6,7,2,10084,8,5,2004,2005,1369,0,317,1686,1694,0,0,1694,1,0,2,0,3,1,7,1,2004,2,636,255,57,0,0,0,0,0,8,2007,307000
7,8,6,10382,7,6,1973,1973,859,32,216,1107,1107,983,0,2090,1,0,2,1,3,1,7,2,1973,2,484,235,204,228,0,0,0,350,11,2009,200000
8,9,5,6120,7,5,1931,1950,0,0,952,952,1022,752,0,1774,0,0,2,0,2,2,8,2,1931,2,468,90,0,205,0,0,0,0,4,2008,129900
9,10,19,7420,5,6,1939,1950,851,0,140,991,1077,0,0,1077,1,0,1,0,2,2,5,2,1939,1,205,0,4,0,0,0,0,0,1,2008,118000


#### Float

In [20]:
FLOAT_COLS = ["LotArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "SalePrice"]

In [21]:
for col in FLOAT_COLS:
    df_numerical[col] = df_numerical[col].astype(float)

In [22]:
display_scrollable_df(df_numerical.select_dtypes(include=np.float_), title="Float DataFrame")

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice
0,65.0,8450.0,196.0,706.0,0.0,150.0,856.0,856.0,854.0,0.0,1710.0,548.0,0.0,61.0,0.0,0.0,0.0,0.0,0.0,208500.0
1,80.0,9600.0,0.0,978.0,0.0,284.0,1262.0,1262.0,0.0,0.0,1262.0,460.0,298.0,0.0,0.0,0.0,0.0,0.0,0.0,181500.0
2,68.0,11250.0,162.0,486.0,0.0,434.0,920.0,920.0,866.0,0.0,1786.0,608.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,223500.0
3,60.0,9550.0,0.0,216.0,0.0,540.0,756.0,961.0,756.0,0.0,1717.0,642.0,0.0,35.0,272.0,0.0,0.0,0.0,0.0,140000.0
4,84.0,14260.0,350.0,655.0,0.0,490.0,1145.0,1145.0,1053.0,0.0,2198.0,836.0,192.0,84.0,0.0,0.0,0.0,0.0,0.0,250000.0
5,85.0,14115.0,0.0,732.0,0.0,64.0,796.0,796.0,566.0,0.0,1362.0,480.0,40.0,30.0,0.0,320.0,0.0,0.0,700.0,143000.0
6,75.0,10084.0,186.0,1369.0,0.0,317.0,1686.0,1694.0,0.0,0.0,1694.0,636.0,255.0,57.0,0.0,0.0,0.0,0.0,0.0,307000.0
7,70.049958,10382.0,240.0,859.0,32.0,216.0,1107.0,1107.0,983.0,0.0,2090.0,484.0,235.0,204.0,228.0,0.0,0.0,0.0,350.0,200000.0
8,51.0,6120.0,0.0,0.0,0.0,952.0,952.0,1022.0,752.0,0.0,1774.0,468.0,90.0,0.0,205.0,0.0,0.0,0.0,0.0,129900.0
9,50.0,7420.0,0.0,851.0,0.0,140.0,991.0,1077.0,0.0,0.0,1077.0,205.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,118000.0


### Saving the data

In [23]:
display_scrollable_df(df_numerical.describe().T, title="Numerical DataFrame")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,5.673288,4.225251,2.0,2.0,5.0,7.0,19.0
LotFrontage,1460.0,70.049958,22.024023,21.0,60.0,70.049958,79.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1460.0,103.117123,180.731373,0.0,0.0,0.0,164.25,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [24]:
df_numerical.to_csv("./../data/numerical_train.csv", index=False)