In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('./datasets/train.csv')  # import file as df

df.shape  # starting the EDA


(2051, 81)

In [3]:
df.head()  # getting a sense of what the df looks like ( not pulling 80 columns)


Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


*we observe: categorical features, missing values, and bizarre columns (PID, Misc feature, Fence)*

In [4]:
df.columns  # getting all columns names


Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

*no major observation here just more weird columns (half bath, full bath) and lots of garage information too. The salesPrice is the last feature*

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [7]:
# checking for unique values in each feature
df.nunique()[:39].sort_values(ascending=False)


Id                2051
PID               2051
Lot Area          1476
Bsmt Unf SF        967
BsmtFin SF 1       821
Mas Vnr Area       373
BsmtFin SF 2       205
Lot Frontage       118
Year Built         113
Year Remod/Add      61
Neighborhood        28
MS SubClass         16
Exterior 1st        15
Exterior 2nd        15
Overall Qual        10
Overall Cond         9
Condition 1          9
Condition 2          8
House Style          8
MS Zoning            7
BsmtFin Type 2       6
Roof Style           6
Roof Matl            6
BsmtFin Type 1       6
Foundation           6
Bldg Type            5
Bsmt Qual            5
Lot Config           5
Bsmt Cond            5
Exter Cond           5
Bsmt Exposure        4
Lot Shape            4
Exter Qual           4
Mas Vnr Type         4
Land Contour         4
Land Slope           3
Utilities            3
Alley                2
Street               2
dtype: int64

*lets look at missing values*

In [8]:
# exploring where are the null values, and how many are there
miss_val_features = df.isnull().sum().sort_values(ascending=False)[0:25]
missing_percent = miss_val_features / len(df)
low_qual_columns = missing_percent > 0.8
low_qual_columns.index


Index(['Pool QC', 'Misc Feature', 'Alley', 'Fence', 'Fireplace Qu',
       'Lot Frontage', 'Garage Finish', 'Garage Qual', 'Garage Yr Blt',
       'Garage Cond', 'Garage Type', 'Bsmt Exposure', 'BsmtFin Type 2',
       'BsmtFin Type 1', 'Bsmt Cond', 'Bsmt Qual', 'Mas Vnr Area',
       'Mas Vnr Type', 'Bsmt Half Bath', 'Bsmt Full Bath', 'Garage Area',
       'Total Bsmt SF', 'Bsmt Unf SF', 'BsmtFin SF 2', 'BsmtFin SF 1'],
      dtype='object')

##### we need to make a choice on how we go about these features: *keep or not keep*
*Out of 2'051 records, some features are missing significant amount of data* <br>
*before deciding on keeping or removing them, let's see how they correlate to sales price* 





In [9]:
df["SalePrice"].describe()  # lets observe SalePrice


count      2051.000000
mean     181469.701609
std       79258.659352
min       12789.000000
25%      129825.000000
50%      162500.000000
75%      214000.000000
max      611657.000000
Name: SalePrice, dtype: float64

In [10]:
#sns.histplot(df['SalePrice'], bins = 100)
# plt.show()


Let's observe the relation betwwen missing values fields and sales price to make a decision on keep or drop

In [11]:
# sns.heatmap(df.corr(),cmap='coolwarm')


In [12]:
df.corr() > 0.8


Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
Id,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PID,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
MS SubClass,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Lot Frontage,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Lot Area,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Overall Qual,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
Overall Cond,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Year Built,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
Year Remod/Add,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
Mas Vnr Area,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# how does each feature correlate with sale price?
columns_selection1 = list(df.columns)[0:10]
columns_selection1.append('SalePrice')


In [14]:
columns_selection1


['Id',
 'PID',
 'MS SubClass',
 'MS Zoning',
 'Lot Frontage',
 'Lot Area',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'SalePrice']

In [15]:
sns.pairplot(
    df,
    x_vars=df[columns_selection1],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ece74eac0>

*from this we dont really se any clear correlation. I retain lot area and lot fontage as potential variables for now*<br>
*next 15 features [15-30]*

In [16]:
# how does each feature correlate with sale price?
columns_selection2 = list(df.columns)[16:30]
columns_selection2.append('SalePrice')
columns_selection2


['Bldg Type',
 'House Style',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Mas Vnr Area',
 'Exter Qual',
 'Exter Cond',
 'SalePrice']

In [17]:

sns.pairplot(
    df,
    x_vars=df[columns_selection2],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ecf45e250>

*couple of interesting ones here: building type, house type, overall quality, overall condition,exter quality ( multicolinearity?)*<br>
*next 10*

In [18]:
columns_selection3 = list(df.columns)[31:40]
columns_selection3.append('SalePrice')
columns_selection3


sns.pairplot(
    df,
    x_vars=df[columns_selection3],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ed0be90d0>

In [19]:
columns_selection4 = list(df.columns)[41:50]
columns_selection4


sns.pairplot(
    df,
    x_vars=df[columns_selection4],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ed10b1310>

*variables of interest: living area, 1st floor*

In [20]:
columns_selection5 = list(df.columns)[51:60]
columns_selection5


sns.pairplot(
    df,
    x_vars=df[columns_selection5],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ed0ace880>

*variable of interest: total rooms abv ground*

In [21]:
columns_selection6 = list(df.columns)[61:70]
columns_selection6


sns.pairplot(
    df,
    x_vars=df[columns_selection6],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ed14f4df0>

*garage area, porches*

In [22]:
columns_selection7 = list(df.columns)[71:81]
columns_selection7


sns.pairplot(
    df,
    x_vars=df[columns_selection7],
    y_vars='SalePrice'
)


<seaborn.axisgrid.PairGrid at 0x25ed1d29970>

Screen porch looks interesting

In [23]:
# let's look at the highest correlations with sales price


In [24]:
Correlation_serie = df.corr()["SalePrice"]
Correlation_serie.sort_values(ascending=False)


SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.533922
Mas Vnr Area       0.512230
TotRms AbvGrd      0.504014
Fireplaces         0.471093
BsmtFin SF 1       0.423519
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283662
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190210
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Mo Sold            0.032735
Pool Area          0.023106
BsmtFin SF 2       0.016255
Misc Val          -0.007375
Yr Sold           -0.015203
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045328
Id                -0.051398
MS SubClass       -0.087335
Overall Cond      -0

In [25]:
missing_data_serie = df.isnull().sum()
missing_data_serie.sort_values(ascending=False)

# After considering all columns, We can make our first judgment call to remove some and clear the dataset


Pool QC         2042
Misc Feature    1986
Alley           1911
Fence           1651
Fireplace Qu    1000
                ... 
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
SalePrice          0
Length: 81, dtype: int64

In [26]:
df = pd.read_csv('./datasets/train.csv')  # instantiate file to ease access


In [27]:
df.columns = df.columns.str.replace(' ', '_')  # cleaning column names


In [28]:
df.columns


Index(['Id', 'PID', 'MS_SubClass', 'MS_Zoning', 'Lot_Frontage', 'Lot_Area',
       'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities',
       'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1',
       'Condition_2', 'Bldg_Type', 'House_Style', 'Overall_Qual',
       'Overall_Cond', 'Year_Built', 'Year_Remod/Add', 'Roof_Style',
       'Roof_Matl', 'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type',
       'Mas_Vnr_Area', 'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual',
       'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_SF_1',
       'BsmtFin_Type_2', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Total_Bsmt_SF',
       'Heating', 'Heating_QC', 'Central_Air', 'Electrical', '1st_Flr_SF',
       '2nd_Flr_SF', 'Low_Qual_Fin_SF', 'Gr_Liv_Area', 'Bsmt_Full_Bath',
       'Bsmt_Half_Bath', 'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr',
       'Kitchen_AbvGr', 'Kitchen_Qual', 'TotRms_AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace_Qu', 'Garage_Type', 'Garage_Yr_Blt',
       'G

In [29]:
# Removing columns with more than 80% missing data
# HOW TO AUTOMATE THIS? (If missing_val%>80%, then drop?)
df.drop(columns=['Pool_QC', 'Misc_Feature', 'Alley', 'Fence'], inplace=True)


In [30]:
df.shape  # control


(2051, 77)

In [31]:
# Removing columns  with <20% correlation with Sales price
Low_corr_features = ['Bedroom_AbvGr',
                     'Screen_Porch',
                     '3Ssn_Porch',
                     'Mo_Sold',
                     'Pool_Area',
                     'BsmtFin_SF_2',
                     'Misc_Val',
                     'Yr_Sold',
                     'Low_Qual_Fin_SF',
                     'Bsmt_Half_Bath',
                     'Id',
                     'MS_SubClass',
                     'Overall_Cond',
                     'Kitchen_AbvGr',
                     'Enclosed_Porch',
                     'PID', 'Id']

# HOW TO AUTOMATE THIS? (If low quality %>80%, then drop?)
df.drop(columns=Low_corr_features, inplace=True)


In [32]:
df.shape


(2051, 61)

In [33]:
# identifying colums requiring imputation work
df.isnull().sum().sort_values(ascending=False)[:20]


Fireplace_Qu      1000
Lot_Frontage       330
Garage_Yr_Blt      114
Garage_Cond        114
Garage_Qual        114
Garage_Finish      114
Garage_Type        113
Bsmt_Exposure       58
BsmtFin_Type_2      56
Bsmt_Qual           55
Bsmt_Cond           55
BsmtFin_Type_1      55
Mas_Vnr_Type        22
Mas_Vnr_Area        22
Bsmt_Full_Bath       2
Garage_Cars          1
Garage_Area          1
Bsmt_Unf_SF          1
Total_Bsmt_SF        1
BsmtFin_SF_1         1
dtype: int64

In [None]:
# Bmst and garage look very similar in names and missing values. maybe multicolinearity. Lets check and remove if necessary
Correlation_serie = df.corr()["SalePrice"]
Correlation_serie.sort_values(ascending=False)


In [None]:
df.corr().values > 0.8


In [36]:
# we need to find a way to impute missing data for categorical and numeric features


Identifying the most relevant data using Lasso

In [37]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])
X.shape, df.shape  # controlling the number of columns remaining makes sense


((2051, 60), (2051, 61))

In [41]:
# let sidentify which columns are object and should be OHEncoded
X_object_columns = X.select_dtypes('object')


In [39]:
# lets OHE them inside X
# pd.get_dummy already deletes the object columns
X = pd.get_dummies(data=X, columns=X_object_columns.columns, drop_first=True)


In [42]:
X.shape, X.columns

((2051, 222),
 Index(['Lot_Frontage', 'Lot_Area', 'Overall_Qual', 'Year_Built',
        'Year_Remod/Add', 'Mas_Vnr_Area', 'BsmtFin_SF_1', 'Bsmt_Unf_SF',
        'Total_Bsmt_SF', '1st_Flr_SF',
        ...
        'Paved_Drive_P', 'Paved_Drive_Y', 'Sale_Type_CWD', 'Sale_Type_Con',
        'Sale_Type_ConLD', 'Sale_Type_ConLI', 'Sale_Type_ConLw',
        'Sale_Type_New', 'Sale_Type_Oth', 'Sale_Type_WD '],
       dtype='object', length=222))