# 1- Importing packages

In [3]:
import datetime as dt
import pandas as pd
import numpy as np

# 2- Importing and investigating data

In [14]:
house_price = pd.read_csv('Data/train.csv')

In [16]:
# look at a few rows of the data by printing first 5 rows
house_price.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [28]:
# select categorical features from the data
house_price.select_dtypes(include='object').columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [39]:
# select numerical features from the data
house_price.select_dtypes(include='number').columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

# 3- Feature Engineering and Selection

In [47]:
# Create Date feature as following:
# first we need to combine month and year into date, and setting day =1 rename columns
house_price.rename({'MoSold':'Month','YrSold':'Year'},axis=1,inplace=True)

house_price['Date'] = pd.to_datetime(house_price[['Year','Month']].assign(Day=1))
house_price['Date']

0      2008-02-01
1      2007-05-01
2      2008-09-01
3      2006-02-01
4      2008-12-01
          ...    
1455   2007-08-01
1456   2010-02-01
1457   2010-05-01
1458   2010-04-01
1459   2008-06-01
Name: Date, Length: 1460, dtype: datetime64[ns]

In [61]:
# Select month of the year from Date column
month_of_year = house_price['Date'].dt.month
month_of_year


0        2
1        5
2        9
3        2
4       12
        ..
1455     8
1456     2
1457     5
1458     4
1459     6
Name: Date, Length: 1460, dtype: int32

In [63]:
# copy your data 
data = house_price.copy()

In [73]:
# Create 2 variables, one holds the categorical features and the other holds the numerical ones
object_df = data.select_dtypes(include = ['object'])
numerical_df = data.select_dtypes(include = ['number'])
object_df


Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [153]:
object_df['Heating_Grav'].value_counts(normalize=True).sort_values(ascending = False)[0]

  object_df['Heating_Grav'].value_counts(normalize=True).sort_values(ascending = False)[0]


0.9952054794520548

In [149]:
# Delete the categorical features: distribution imblance single categtory accounts for more than 95%
c= []
for i in object_df.columns:
    x = object_df[i].value_counts(normalize=True).sort_values(ascending=False).iloc[0]
    if x > 0.95:
        c.append(i)
c   

['MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'Street_Grvl',
 'Street_Pave',
 'Alley_Grvl',
 'Alley_Pave',
 'LotShape_IR2',
 'LotShape_IR3',
 'LandContour_Bnk',
 'LandContour_HLS',
 'LandContour_Low',
 'Utilities_AllPub',
 'Utilities_NoSeWa',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LandSlope_Mod',
 'LandSlope_Sev',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_Crawfor',
 'Neighborhood_IDOTRR',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NPkVill',
 'Neighborhood_NoRidge',
 'Neighborhood_SWISU',
 'Neighborhood_SawyerW',
 'Neighborhood_StoneBr',
 'Neighborhood_Timber',
 'Neighborhood_Veenker',
 'Condition1_Artery',
 'Condition1_PosA',
 'Condition1_PosN',
 'Condition1_RRAe',
 'Condition1_RRAn',
 'Condition1_RRNe',
 'Condition1_RRNn',
 'Condition2_Artery',
 'Condition2_Feedr',
 'Condition2_Norm',
 'Condition2_PosA',
 'Condition2_PosN',
 'Condition2_RRAe',
 'Condition2_RRAn

In [159]:
object_df.drop(columns = c ,inplace = True)

In [161]:
object_df

Unnamed: 0,MSZoning_RL,MSZoning_RM,LotShape_IR1,LotShape_Reg,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_Inside,LandSlope_Gtl,Neighborhood_CollgCr,...,GarageQual_TA,GarageCond_TA,PavedDrive_N,PavedDrive_Y,Fence_MnPrv,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_Normal,SaleCondition_Partial
0,True,False,False,True,True,False,False,True,True,True,...,True,True,False,True,False,False,True,False,True,False
1,True,False,False,True,True,False,False,False,True,False,...,True,True,False,True,False,False,True,False,True,False
2,True,False,True,False,True,False,False,True,True,True,...,True,True,False,True,False,False,True,False,True,False
3,True,False,True,False,True,True,False,False,True,False,...,True,True,False,True,False,False,True,True,False,False
4,True,False,True,False,True,False,False,False,True,False,...,True,True,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,True,False,False,True,True,False,False,True,True,False,...,True,True,False,True,False,False,True,False,True,False
1456,True,False,False,True,True,False,False,True,True,False,...,True,True,False,True,True,False,True,False,True,False
1457,True,False,False,True,True,False,False,True,True,False,...,True,True,False,True,False,False,True,False,True,False
1458,True,False,False,True,True,False,False,True,True,False,...,True,True,False,True,False,False,True,False,True,False


In [79]:
# Create numerical feature: Age_house = year - YearBulit
numerical_df['Age_House'] = (numerical_df['Year'] - numerical_df['YearBuilt'])
numerical_df['Age_House']

0        5
1       31
2        7
3       91
4        8
        ..
1455     8
1456    32
1457    69
1458    60
1459    43
Name: Age_House, Length: 1460, dtype: int64

In [83]:
# Create numerical feature: TotalBsmtBath = BsmtFullBath + 0.5 * BsmtHaifBath
numerical_df['TotalBsmtBath'] = numerical_df['BsmtFullBath'] + 0.5 * numerical_df['BsmtHalfBath']
numerical_df['TotalBsmtBath'] 

0       1.0
1       0.5
2       1.0
3       1.0
4       1.0
       ... 
1455    0.0
1456    1.0
1457    0.0
1458    1.0
1459    1.0
Name: TotalBsmtBath, Length: 1460, dtype: float64

In [87]:
# Create numerical feature: TotalBath = FullBath + 0.5 * HaifBath
numerical_df['TotalBath'] = numerical_df['FullBath'] + 0.5 * numerical_df['HalfBath']
numerical_df['TotalBath'] 

0       2.5
1       2.0
2       2.5
3       1.0
4       2.5
       ... 
1455    2.5
1456    2.0
1457    2.0
1458    1.0
1459    1.5
Name: TotalBath, Length: 1460, dtype: float64

In [89]:
#Create numerical feature: TotalSA = TotalBsmtSF + 1stFlrSF + 2ndFlrSF
numerical_df['TotalSA '] = numerical_df['TotalBsmtSF'] + numerical_df['1stFlrSF']+ numerical_df['2ndFlrSF']
numerical_df['TotalSA '] 

0       2566
1       2524
2       2706
3       2473
4       3343
        ... 
1455    2600
1456    3615
1457    3492
1458    2156
1459    2512
Name: TotalSA , Length: 1460, dtype: int64

In [169]:
#one-hot encoding for categorical features
object_ = pd.get_dummies(object_df,columns = object_df.columns)
object_

Unnamed: 0,MSZoning_RL_False,MSZoning_RL_True,MSZoning_RM_False,MSZoning_RM_True,LotShape_IR1_False,LotShape_IR1_True,LotShape_Reg_False,LotShape_Reg_True,LandContour_Lvl_False,LandContour_Lvl_True,...,SaleType_New_False,SaleType_New_True,SaleType_WD_False,SaleType_WD_True,SaleCondition_Abnorml_False,SaleCondition_Abnorml_True,SaleCondition_Normal_False,SaleCondition_Normal_True,SaleCondition_Partial_False,SaleCondition_Partial_True
0,False,True,True,False,True,False,False,True,False,True,...,True,False,False,True,True,False,False,True,True,False
1,False,True,True,False,True,False,False,True,False,True,...,True,False,False,True,True,False,False,True,True,False
2,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,False,True,True,False
3,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,False,True,True,False,True,False
4,False,True,True,False,False,True,True,False,False,True,...,True,False,False,True,True,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,False,True,True,False,True,False,False,True,False,True,...,True,False,False,True,True,False,False,True,True,False
1456,False,True,True,False,True,False,False,True,False,True,...,True,False,False,True,True,False,False,True,True,False
1457,False,True,True,False,True,False,False,True,False,True,...,True,False,False,True,True,False,False,True,True,False
1458,False,True,True,False,True,False,False,True,False,True,...,True,False,False,True,True,False,False,True,True,False


In [115]:
# Concatenate back the categorical and numerical featueres together
pd.concat([object_df, numerical_df], axis = 1)


Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,LotShape_IR1,...,ScreenPorch,PoolArea,MiscVal,Month,Year,SalePrice,Age_House,TotalBsmtBath,TotalBath,TotalSA
0,False,False,False,True,False,False,True,False,False,False,...,0,0,0,2,2008,208500,5,1.0,2.5,2566
1,False,False,False,True,False,False,True,False,False,False,...,0,0,0,5,2007,181500,31,0.5,2.0,2524
2,False,False,False,True,False,False,True,False,False,True,...,0,0,0,9,2008,223500,7,1.0,2.5,2706
3,False,False,False,True,False,False,True,False,False,True,...,0,0,0,2,2006,140000,91,1.0,1.0,2473
4,False,False,False,True,False,False,True,False,False,True,...,0,0,0,12,2008,250000,8,1.0,2.5,3343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,False,False,False,True,False,False,True,False,False,False,...,0,0,0,8,2007,175000,8,0.0,2.5,2600
1456,False,False,False,True,False,False,True,False,False,False,...,0,0,0,2,2010,210000,32,1.0,2.0,3615
1457,False,False,False,True,False,False,True,False,False,False,...,0,0,2500,5,2010,266500,69,0.0,2.0,3492
1458,False,False,False,True,False,False,True,False,False,False,...,0,0,0,4,2010,142125,60,1.0,1.0,2156


In [206]:
# # Create a column for seasons
def get_season(date):
    year = str(date.year)
    seasons = {'Spring': pd.date_range(start='21/03/'+year,end='21/06/'+year),
               'Summer': pd.date_range(start='22/06/'+year,end='22/09/'+year),
               'Autumn': pd.date_range(start='23/09/'+year,end='21/12/'+year),
               'Winter': pd.date_range(start='22/12/'+year,end='20/03/'+year)}
    if date in seasons['Summer']:
        return 'Summer'
    elif date in seasons['Spring']:
        return 'Spring'
    elif date in seasons['Autumn']:
        return 'Autumn'
    else: 
        return 'Winter'


data['seasons'] = data.Date.map(get_season)
data['seasons']

0       Winter
1       Spring
2       Summer
3       Winter
4       Autumn
         ...  
1455    Summer
1456    Winter
1457    Spring
1458    Spring
1459    Spring
Name: seasons, Length: 1460, dtype: object

In [202]:
data


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,Month,Year,SaleType,SaleCondition,SalePrice,Date,seasons
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,0,2,2008,WD,Normal,208500,2008-02-01,
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,0,5,2007,WD,Normal,181500,2007-05-01,Spring
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,0,9,2008,WD,Normal,223500,2008-09-01,Summer
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,0,2,2006,WD,Abnorml,140000,2006-02-01,
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,0,12,2008,WD,Normal,250000,2008-12-01,Autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,,,0,8,2007,WD,Normal,175000,2007-08-01,Summer
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,MnPrv,,0,2,2010,WD,Normal,210000,2010-02-01,
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,GdPrv,Shed,2500,5,2010,WD,Normal,266500,2010-05-01,Spring
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,,,0,4,2010,WD,Normal,142125,2010-04-01,Spring


In [119]:
data.groupby('seasons')['SalePrice'].sum()

seasons
autumn     42170475
spring    104213764
summer     78457064
winter     39303643
Name: SalePrice, dtype: int64

# 3- Feature Selection

In [188]:
# Select only Features that have correlation higher than 90%
mat = numerical_df.corr()
cols = set()
l = len(mat)
for i in range(l):
    for j in range(l):
        v =  mat.iloc[i,j]
        if abs(v)>0.9:
            cols.add((mat.columns[i]))
cols

{'1stFlrSF',
 '2ndFlrSF',
 '3SsnPorch',
 'Age_House',
 'BedroomAbvGr',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtUnfSF',
 'EnclosedPorch',
 'Fireplaces',
 'FullBath',
 'GarageArea',
 'GarageCars',
 'GarageYrBlt',
 'GrLivArea',
 'HalfBath',
 'Id',
 'KitchenAbvGr',
 'LotArea',
 'LotFrontage',
 'LowQualFinSF',
 'MSSubClass',
 'MasVnrArea',
 'MiscVal',
 'Month',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PoolArea',
 'SalePrice',
 'ScreenPorch',
 'TotRmsAbvGrd',
 'TotalBath',
 'TotalBsmtBath',
 'TotalBsmtSF',
 'TotalSA ',
 'WoodDeckSF',
 'Year',
 'YearBuilt',
 'YearRemodAdd'}

In [None]:
# conclusion?