In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import scipy as sp
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 100)

import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy import stats


In [2]:
df = pd.read_csv('Ames_HousePrice.csv')
df.shape

(2580, 82)

In [3]:
df.isna().sum(axis=0).sort_values(ascending=False).head(30)

PoolQC           2571
MiscFeature      2483
Alley            2412
Fence            2055
FireplaceQu      1241
LotFrontage       462
GarageFinish      129
GarageQual        129
GarageYrBlt       129
GarageCond        129
GarageType        127
BsmtExposure       71
BsmtFinType2       70
BsmtFinType1       69
BsmtCond           69
BsmtQual           69
MasVnrArea         14
MasVnrType         14
BsmtHalfBath        2
BsmtFullBath        2
GarageArea          1
GarageCars          1
Electrical          1
BsmtUnfSF           1
BsmtFinSF2          1
BsmtFinSF1          1
TotalBsmtSF         1
Functional          0
EnclosedPorch       0
PavedDrive          0
dtype: int64

In [5]:
df.isnull().sum().sum()

12254

### Convert Supposingly Numerical Cols To Categorical ###

In [9]:
df[['MSSubClass','YrSold','MoSold']].isnull().sum()

MSSubClass    0
YrSold        0
MoSold        0
dtype: int64

In [10]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)

### Object Columns, Fillna with "None" & Dummify! ###

In [13]:
c = df.select_dtypes('object')
c.head(3)

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
0,30,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,Norm,Norm,1Fam,1Story,Gable,CompShg,Wd Sdng,Wd Sdng,,TA,TA,CBlock,TA,TA,No,Rec,Unf,GasA,TA,Y,SBrkr,TA,Typ,Gd,Detchd,Unf,TA,TA,Y,,,,3,2010,WD,Normal
1,120,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1Story,Gable,CompShg,HdBoard,HdBoard,BrkFace,Gd,TA,CBlock,Gd,TA,Mn,GLQ,ALQ,GasA,TA,Y,SBrkr,Gd,Typ,,Attchd,Fin,TA,TA,Y,,,,2,2009,WD,Normal
2,30,C (all),Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,Norm,1Fam,1Story,Hip,CompShg,MetalSd,MetalSd,,Gd,TA,BrkTil,TA,TA,No,ALQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Detchd,Unf,TA,Po,N,,,,11,2007,WD,Normal


In [14]:
c.shape

(2580, 46)

In [15]:
c.isnull().sum().sum()

11639

In [16]:
c = c.fillna("None")
c.isnull().sum().sum()

0

In [17]:
c.shape

(2580, 46)

In [18]:
c = pd.get_dummies(c, drop_first=True)
c.shape

(2580, 262)

### Numeric Columns, Dropna on Rows/Observations, Create Full (Clean) ###

In [19]:
n = df.select_dtypes('number')
n.head(3)

Unnamed: 0.1,Unnamed: 0,PID,GrLivArea,SalePrice,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,1,909176150,856,126000,,7890,6,6,1939,1950,0.0,238.0,0.0,618.0,856.0,856,0,0,1.0,0.0,1,0,2,1,4,1,1939.0,2.0,399.0,0,0,0,0,166,0,0
1,2,905476230,1049,139500,42.0,4235,5,5,1984,1984,149.0,552.0,393.0,104.0,1049.0,1049,0,0,1.0,0.0,2,0,2,1,5,0,1984.0,1.0,266.0,0,105,0,0,0,0,0
2,3,911128020,1001,124900,60.0,6060,5,9,1930,2007,0.0,737.0,0.0,100.0,837.0,1001,0,0,0.0,0.0,1,0,2,1,5,0,1930.0,1.0,216.0,154,0,42,86,0,0,0


In [20]:
n.shape

(2580, 36)

In [21]:
n.isnull().sum().sum()

615

In [22]:
full = pd.concat([c, n], axis=1)
full.shape

(2580, 298)

In [23]:
full = full.dropna()
full.shape

(1988, 298)

In [24]:
full.isnull().sum().sum()

0

### MLR For Base Case ###

In [25]:
y=full.SalePrice
X=full.drop('SalePrice', axis=1)

In [26]:
regressor = LinearRegression()
regressor.fit(X,y)

LinearRegression()

In [27]:
regressor.score(X,y)

0.943375217744169

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [29]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.944666
--------------------------------------------------
R^2 for test  set: 0.919440


### Stats for MLR For Base Case ###

In [30]:
# Obtain and set x=input and y=output
X0 = full.drop('SalePrice', axis=1)
y0 = full.SalePrice

In [31]:
#Setup statsmodels.api, inputs (x, x_constant, and y)
x_constant0 = sm.add_constant(X0)

In [32]:
#Run OLS with statsmodels
lin_reg0 = sm.OLS(y0,x_constant0).fit()
lin_reg0.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.943
Model:,OLS,Adj. R-squared:,0.934
Method:,Least Squares,F-statistic:,104.2
Date:,"Tue, 19 Apr 2022",Prob (F-statistic):,0.0
Time:,07:01:25,Log-Likelihood:,-22364.0
No. Observations:,1988,AIC:,45280.0
Df Residuals:,1713,BIC:,46820.0
Df Model:,274,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.818e+05,1.17e+05,-4.987,0.000,-8.11e+05,-3.53e+05
MSSubClass_150,-0.9474,0.227,-4.167,0.000,-1.393,-0.501
MSSubClass_160,-5139.2016,7149.677,-0.719,0.472,-1.92e+04,8883.815
MSSubClass_180,4458.0725,1.06e+04,0.420,0.675,-1.64e+04,2.53e+04
MSSubClass_190,7961.1017,2.68e+04,0.297,0.766,-4.46e+04,6.05e+04
MSSubClass_20,2.035e+04,1.31e+04,1.554,0.120,-5341.490,4.6e+04
MSSubClass_30,2.384e+04,1.35e+04,1.767,0.077,-2618.955,5.03e+04
MSSubClass_40,3.14e+04,1.7e+04,1.850,0.065,-1897.902,6.47e+04
MSSubClass_45,1.419e+04,2.29e+04,0.619,0.536,-3.08e+04,5.91e+04

0,1,2,3
Omnibus:,580.725,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50335.772
Skew:,-0.344,Prob(JB):,0.0
Kurtosis:,27.641,Cond. No.,4.56e+22
