In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, LeaveOneOut

# 1. Load data

In [2]:
# Read the training split; a comma is already read_csv's default delimiter.
train_dataset = pd.read_csv('train.csv')

In [3]:
# Read the test split; a comma is already read_csv's default delimiter.
test_dataset = pd.read_csv('test.csv')

In [4]:
# Show every column label of the training frame.
train_dataset.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Print dtypes and non-null counts to spot the columns that contain missing values.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

From examining the data and the variable descriptions of the dataset, it can be seen that it contains a significant amount of categorical data. For this data to be used to train a machine learning model, these variables need to be encoded. Two different encoding approaches will be followed, depending on the nature of the data.

Variables whose categories are ordered, such as ratings, will be encoded in a way that preserves their ordering. Variables whose categories are unordered will be one-hot encoded, binarizing the data so that no ordering is implied between any of the categories.

In [6]:
# Categorical columns whose missing values are replaced by the explicit label 'No'.
for col in ['Alley', 'BsmtFinType1', 'BsmtFinType2', 'GarageType',
            'GarageFinish', 'Fence', 'MiscFeature']:
    train_dataset[col] = train_dataset[col].fillna('No')

In [7]:
# Rating columns: a missing value becomes 'No', the lowest level of rating_list.
for col in ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    train_dataset[col] = train_dataset[col].fillna('No')

# BsmtExposure is filled with 'Na' instead: its level list (exp_list) already
# contains 'No' as a separate level, so the missing marker has to be distinct.
train_dataset['BsmtExposure'] = train_dataset['BsmtExposure'].fillna('Na')

In [8]:
# Numeric columns: a missing value is replaced by 0.
for col in ['LotFrontage', 'GarageYrBlt', 'MasVnrArea']:
    train_dataset[col] = train_dataset[col].fillna(0)

# 2. Encoding

In [9]:
# Columns to be one-hot encoded.
ohenc_list = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'Electrical', 'Functional', 'GarageType',
    'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]

In [10]:
# Dense (ndarray) output is needed because the result is wrapped in a DataFrame.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    ohenc = OneHotEncoder(sparse_output=False)
except TypeError:
    ohenc = OneHotEncoder(sparse=False)

In [11]:
# Fit the encoder on the nominal columns, then wrap the indicator matrix in a
# frame indexed by the Id column and labelled with the generated feature names.
ohenc_values = ohenc.fit_transform(train_dataset[ohenc_list])
ohenc_dataset = pd.DataFrame(
    ohenc_values,
    index=train_dataset['Id'],
    columns=list(ohenc.get_feature_names_out()),
)

In [13]:
# Level lists for the ordinal columns; a value's position in its list is the
# integer the encoder assigns to it.
rating_list = ['No', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
exp_list = ['Na', 'No', 'Mn', 'Av', 'Gd']
fin_list = ['No', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
yesno_list = ['N', 'Y']
gar_fin_list = ['No', 'Unf', 'RFn', 'Fin']
pav_dr_list = ['N', 'P', 'Y']

# Levels per column. OrdinalEncoder matches `categories` to the input columns
# by position, so the insertion order here must stay identical to the order of
# enc_var_list (the column list the encoder is fitted on).
ordinal_levels = {
    'ExterQual': rating_list,
    'ExterCond': rating_list,
    'BsmtQual': rating_list,
    'BsmtCond': rating_list,
    'BsmtExposure': exp_list,
    'BsmtFinType1': fin_list,
    'BsmtFinType2': fin_list,
    'HeatingQC': rating_list,
    'CentralAir': yesno_list,
    'KitchenQual': rating_list,
    'FireplaceQu': rating_list,
    'GarageFinish': gar_fin_list,
    'GarageQual': rating_list,
    'GarageCond': rating_list,
    'PavedDrive': pav_dr_list,
    'PoolQC': rating_list,
}
ordenc = OrdinalEncoder(categories=list(ordinal_levels.values()))

In [14]:
# Columns to be ordinally encoded. The order is significant: it must line up
# position-by-position with the `categories` list given to the OrdinalEncoder.
enc_var_list = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC',
]

In [16]:
# Encode each ordinal column as the position of its value in the matching
# level list; the result is a plain array that the next cell wraps in a frame.
tmp = ordenc.fit_transform(train_dataset[enc_var_list])

In [17]:
# Wrap the encoded array in a frame indexed by the Id column, one column per
# ordinal variable.
ordenc_dataset = pd.DataFrame(
    data=tmp,
    index=train_dataset['Id'],
    columns=enc_var_list,
)

In [18]:
# Display the ordinally encoded frame.
ordenc_dataset

Unnamed: 0_level_0,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,CentralAir,KitchenQual,FireplaceQu,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,4.0,3.0,4.0,3.0,1.0,6.0,1.0,5.0,1.0,4.0,0.0,2.0,3.0,3.0,2.0,0.0
2,3.0,3.0,4.0,3.0,4.0,5.0,1.0,5.0,1.0,3.0,3.0,2.0,3.0,3.0,2.0,0.0
3,4.0,3.0,4.0,3.0,2.0,6.0,1.0,5.0,1.0,4.0,3.0,2.0,3.0,3.0,2.0,0.0
4,3.0,3.0,3.0,4.0,1.0,5.0,1.0,4.0,1.0,4.0,4.0,1.0,3.0,3.0,2.0,0.0
5,4.0,3.0,4.0,3.0,3.0,6.0,1.0,5.0,1.0,4.0,3.0,2.0,3.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,3.0,3.0,4.0,3.0,1.0,1.0,1.0,5.0,1.0,3.0,3.0,2.0,3.0,3.0,2.0,0.0
1457,3.0,3.0,4.0,3.0,1.0,5.0,3.0,3.0,1.0,3.0,3.0,1.0,3.0,3.0,2.0,0.0
1458,5.0,4.0,3.0,4.0,1.0,6.0,1.0,5.0,1.0,4.0,4.0,2.0,3.0,3.0,2.0,0.0
1459,3.0,3.0,3.0,3.0,2.0,6.0,3.0,4.0,1.0,4.0,0.0,1.0,3.0,3.0,2.0,0.0


# 3. Form final dataset

In [19]:
# Keep only the columns that were not encoded; Id and SalePrice remain in the
# result at this point.
encoded_columns = [*ohenc_list, *enc_var_list]
non_enc_data = train_dataset.drop(columns=encoded_columns)

In [20]:
# Move Id from a column to the index so this frame can be joined with the
# encoded frames. Rebinding instead of inplace=True, guarded on the column
# still being present, lets the cell be re-run without raising a KeyError.
if 'Id' in non_enc_data.columns:
    non_enc_data = non_enc_data.set_index('Id')

In [21]:
# Display the non-encoded columns, now indexed by Id.
non_enc_data

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
2,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
3,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
4,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
5,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,62.0,7917,6,5,1999,2000,0.0,0,0,953,...,0,40,0,0,0,0,0,8,2007,175000
1457,85.0,13175,6,6,1978,1988,119.0,790,163,589,...,349,0,0,0,0,0,0,2,2010,210000
1458,66.0,9042,7,9,1941,2006,0.0,275,0,877,...,0,60,0,0,0,0,2500,5,2010,266500
1459,68.0,9717,5,6,1950,1996,0.0,49,1029,0,...,366,0,112,0,0,0,0,4,2010,142125


In [22]:
# Join the three Id-indexed frames: first the two encoded frames, then the
# non-encoded columns on the left so they come first in the result.
tmp = ohenc_dataset.merge(ordenc_dataset, how='inner', on='Id')
complete_dataset = non_enc_data.merge(tmp, how='inner', on='Id')

In [23]:
# Cast every column to float64 so the whole frame has one uniform numeric dtype.
complete_dataset = complete_dataset.astype('float64')

In [24]:
# Display the assembled modelling frame.
complete_dataset

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,BsmtFinType2,HeatingQC,CentralAir,KitchenQual,FireplaceQu,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,150.0,...,1.0,5.0,1.0,4.0,0.0,2.0,3.0,3.0,2.0,0.0
2,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,284.0,...,1.0,5.0,1.0,3.0,3.0,2.0,3.0,3.0,2.0,0.0
3,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,434.0,...,1.0,5.0,1.0,4.0,3.0,2.0,3.0,3.0,2.0,0.0
4,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,540.0,...,1.0,4.0,1.0,4.0,4.0,1.0,3.0,3.0,2.0,0.0
5,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,490.0,...,1.0,5.0,1.0,4.0,3.0,2.0,3.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,62.0,7917.0,6.0,5.0,1999.0,2000.0,0.0,0.0,0.0,953.0,...,1.0,5.0,1.0,3.0,3.0,2.0,3.0,3.0,2.0,0.0
1457,85.0,13175.0,6.0,6.0,1978.0,1988.0,119.0,790.0,163.0,589.0,...,3.0,3.0,1.0,3.0,3.0,1.0,3.0,3.0,2.0,0.0
1458,66.0,9042.0,7.0,9.0,1941.0,2006.0,0.0,275.0,0.0,877.0,...,1.0,5.0,1.0,4.0,4.0,2.0,3.0,3.0,2.0,0.0
1459,68.0,9717.0,5.0,6.0,1950.0,1996.0,0.0,49.0,1029.0,0.0,...,3.0,4.0,1.0,4.0,0.0,1.0,3.0,3.0,2.0,0.0


# 4. Modeling

In [25]:
# Fix the seed: a random forest bootstraps rows and samples features at random,
# so without random_state the cross-validation score differs on every run.
rfregressor = RandomForestRegressor(random_state=42)

In [26]:
# Separate the target (as a NumPy array) from the predictor columns.
target_column = 'SalePrice'
y = complete_dataset[target_column].values
X = complete_dataset.drop(columns=[target_column])

In [34]:
# Leave-one-out: every row is held out once as a single-sample test fold, so
# the model is refitted as many times as there are rows.
cv = LeaveOneOut()

In [35]:
# Leave-one-out fits one forest per row, so spread the folds over all cores
# (n_jobs=-1); each fold is independent of the others.
# NOTE: every test fold holds a single sample, so each fold's "RMSE" is just
# the absolute error on that sample. Averaging these scores therefore gives a
# mean absolute error, not an RMSE over the whole dataset.
scores = cross_val_score(
    rfregressor,
    X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

In [42]:
# Flip the sign of scikit-learn's negated score to get a positive error.
# Because each leave-one-out fold contains one sample, this mean of per-fold
# RMSE values is effectively the mean absolute error.
-scores.mean()

17355.024465753424

In [46]:
# Smallest target value, for putting the size of the error above in context.
y.min()

34900.0