In [164]:
import pandas as pd
import numpy as np

#Loading Dataset
train = pd.read_csv('train.csv', error_bad_lines=False, delimiter=';', header= 1 )

In [94]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [165]:
# Defining the dependent variable
train["LogSalePrice"] = np.log2(train["SalePrice"])

In [166]:
# Dealing with missing
train.columns[train.isna().any()]

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

In [167]:
# LotFrontage: Replace NAs with Mean
train["LotFrontage"] = train["LotFrontage"].fillna(0)
# Alley: Replace NAs with NoAlley
train["Alley"] = train["Alley"].fillna("NoAlley") 
# MasVnrType: No important variable 
# MasVnrArea: Replace NAs with 0
train["MasVnrArea"] = train["MasVnrArea"].fillna(0)
# BsmtQual: Replace NAs with NoBasement
train["BsmtQual"] = train["BsmtQual"].fillna("NoBasement") 
# BsmtCond: Replace NAs with NoBasement
train["BsmtCond"] = train["BsmtCond"].fillna("NoBasement") 
# BsmtExposure: Replace NAs with NoBasement
train["BsmtExposure"] = train["BsmtExposure"].fillna("NoBasement") 
# BsmtFinType1:Replace NAs with NoBasement
train["BsmtFinType1"] = train["BsmtFinType1"].fillna("NoBasement") 
# BsmtFinType2:Replace NAs with NoBasement
train["BsmtFinType2"] = train["BsmtFinType2"].fillna("NoBasement") 
# Electrical: Replace missing with most common electrical system
train["Electrical"] = train["Electrical"].fillna("SBrkr") 
# FireplaceQu: I don't think this variable is important
# GarageType: Replace NAs with NoGarage
train["GarageType"] = train["GarageType"].fillna("NoGarage") 
# GarageYrBlt: Replace NAs with YearBuilt
train["GarageYrBlt"] = train["GarageYrBlt"].fillna("YearBuilt") 
# GarageFinish: Replace NAs with NoGarage
train["GarageFinish"] = train["GarageFinish"].fillna("NoGarage") 
# GarageQual: Replace NAs with NoGarage
train["GarageQual"] = train["GarageQual"].fillna("NoGarage") 
# GarageCond: Replace NAs with NoGarage
train["GarageCond"] = train["GarageCond"].fillna("NoGarage") 
# PoolQC: No important variable. Just 7 observations 
# MiscFeature: Replace NA with NoFeature
train["MiscFeature"] = train["MiscFeature"].fillna("NoFeature") 

In [None]:
#Dealing with outliers:

In [168]:
# Drop variables
train = train.drop(["Fence","FireplaceQu", "PoolQC"], axis=1)

In [169]:
# Creating variables
# Age
train["Age"]=2011-train["YearBuilt"]
# Age2
train["Age2"]=train["Age"]**2
# GarageYesNo
train["GarageYesNo"]= np.where(train['GarageType'] == 'NoGarage', 0, 1)
# AlleyYesNo
train["AlleyYesNo"]= np.where(train['Alley'] == 'NoAlley', 0, 1)
# BasementYesNo
train["BasementYesNo"]= np.where(train['BsmtQual'] == 'NoBasement', 0, 1)
# Second Garage
train["Gar2"]= np.where(train['MiscFeature'] == 'Gar2', 1, 0)
# Shed
train["ShedYesNo"]= np.where(train['MiscFeature'] == 'Shed', 1, 0)
# NeigborhoodReclassification
MappingNeighboorhood = pd.read_csv('MappingNeighboorhood.csv', error_bad_lines=False, delimiter=';', header= 0 )
train = pd.merge(train, MappingNeighboorhood, how="left", on=["Neighborhood"])

In [182]:
# Create Neigborhood, MsZoning, Condition1, Condition2 dummies
train = pd.concat([train,pd.get_dummies(train.Neighborhood2, prefix='Neighborhood2')],axis=1)
train = pd.concat([train,pd.get_dummies(train.MSZoning, prefix='MSZoning')],axis=1)
train = pd.concat([train,pd.get_dummies(train.Condition1, prefix='Condition1')],axis=1)
train = pd.concat([train,pd.get_dummies(train.Condition1, prefix='Condition2')],axis=1)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNe,Condition2_RRNn
0,1,60,RL,65.0,8450,Pave,NoAlley,Reg,Lvl,AllPub,...,0,0,0,1,0,0,0,0,0,0
1,2,20,RL,80.0,9600,Pave,NoAlley,Reg,Lvl,AllPub,...,0,0,1,0,0,0,0,0,0,0
2,3,60,RL,68.0,11250,Pave,NoAlley,IR1,Lvl,AllPub,...,0,0,0,1,0,0,0,0,0,0
3,4,70,RL,60.0,9550,Pave,NoAlley,IR1,Lvl,AllPub,...,0,0,0,1,0,0,0,0,0,0
4,5,60,RL,84.0,14260,Pave,NoAlley,IR1,Lvl,AllPub,...,0,0,0,1,0,0,0,0,0,0


In [200]:
from sklearn.model_selection import train_test_split
y = train["LogSalePrice"].values
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, random_state=42)

In [201]:
# Linear Regression:
from sklearn.linear_model import LinearRegression
# Finding regression coefficients

x = X_train[['LotFrontage', 'LotArea', 'Age', 'Age2', 'OverallQual', 'OverallCond', 'GrLivArea', 'BedroomAbvGr']].values
y = X_train["LogSalePrice"].values

reg = LinearRegression()
reg.fit(x,y)

LinearRegression()

In [194]:
import statsmodels.api as sm 
X_add_const = sm.add_constant(x)
ols = sm.OLS(y, X_add_const)
ans = ols.fit()
print(ans.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.802
Model:                            OLS   Adj. R-squared:                  0.800
Method:                 Least Squares   F-statistic:                     511.8
Date:                Sun, 28 Feb 2021   Prob (F-statistic):               0.00
Time:                        16:18:32   Log-Likelihood:                -45.391
No. Observations:                1022   AIC:                             108.8
Df Residuals:                    1013   BIC:                             153.1
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         15.5320      0.072    216.584      0.0

In [191]:
ans.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,15.5320,0.072,216.584,0.000,15.391,15.673
x1,0.0001,0.000,0.483,0.629,-0.000,0.001
x2,5.707e-06,7.39e-07,7.719,0.000,4.26e-06,7.16e-06
x3,-0.0070,0.001,-6.527,0.000,-0.009,-0.005
x4,5.835e-06,9.11e-06,0.641,0.522,-1.2e-05,2.37e-05
x5,0.1625,0.010,16.680,0.000,0.143,0.182
x6,0.0794,0.008,9.868,0.000,0.064,0.095
x7,0.0004,2.39e-05,17.061,0.000,0.000,0.000
x8,-0.0150,0.012,-1.236,0.217,-0.039,0.009


In [202]:
# Random Forest
import sklearn.ensemble as skle


RandomForestRegressor(n_estimators=40)

In [235]:
# Number of trees in random forest
n_estimators = [30,40] #[int(x) for x in np.linspace(start=10, stop=80, num=10)]
# Maximum number of levels in tree
max_depth = [3,5]
# Minimum number of samples required to split a node
min_samples_split = [2,5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True]

In [236]:
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [30, 40], 'max_depth': [3, 5], 'min_samples_split': [2, 5], 'bootstrap': [True]}


In [237]:
rfc = skle.RandomForestRegressor()

In [238]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rfc, param_grid=param_grid, cv=10, verbose=2, n_jobs=4)

In [239]:
rf_Grid.fit(x,y)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  16 out of  16 | elapsed:    2.3s finished


GridSearchCV(cv=2, estimator=RandomForestRegressor(), n_jobs=4,
             param_grid={'bootstrap': [True], 'max_depth': [3, 5],
                         'min_samples_split': [2, 5],
                         'n_estimators': [30, 40]},
             verbose=2)