In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import scipy as sp
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 100)

import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy import stats

In [2]:
df = pd.read_csv('Ames_HousePrice.csv')
df.shape

(2580, 82)

In [3]:
df[['MSSubClass','YrSold','MoSold']].isnull().sum()

MSSubClass    0
YrSold        0
MoSold        0
dtype: int64

In [4]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)

In [5]:
c = df.select_dtypes('object')
c.shape

(2580, 46)

In [7]:
c = c.fillna("None")
c.isnull().sum().sum()

0

In [8]:
c = pd.get_dummies(c, drop_first=True)

#c = pd.get_dummies(c)

c.shape

(2580, 262)

In [9]:
n = df.select_dtypes('number')
n.shape

(2580, 36)

In [10]:
n.isnull().sum().sum()

615

In [11]:
full = pd.concat([c, n], axis=1)
full.shape

(2580, 298)

In [12]:
full = full.dropna()
full.shape

(1988, 298)

In [13]:
full.isnull().sum().sum()

0

### MLR With SFS (n=10) ###

In [14]:
from sklearn.feature_selection import SequentialFeatureSelector

In [15]:
y=full.SalePrice
X=full.drop('SalePrice', axis=1)

In [16]:
regressor = LinearRegression()
regressor.fit(X,y)

LinearRegression()

In [17]:
regressor.score(X,y)

0.943375217744169

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [19]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.955426
--------------------------------------------------
R^2 for test  set: 0.829620


In [20]:
sfs_foreward = SequentialFeatureSelector(
    regressor, n_features_to_select=10, direction="forward"
).fit(X, y)

In [21]:
selected = X.columns[sfs_foreward.get_support()]
selected

Index(['MSZoning_RM', 'Neighborhood_NridgHt', 'BsmtExposure_Gd', 'GrLivArea',
       'OverallQual', 'YearRemodAdd', 'BsmtFinSF1', 'TotalBsmtSF',
       'KitchenAbvGr', 'GarageArea'],
      dtype='object')

In [22]:
X = full[selected]
X

Unnamed: 0,MSZoning_RM,Neighborhood_NridgHt,BsmtExposure_Gd,GrLivArea,OverallQual,YearRemodAdd,BsmtFinSF1,TotalBsmtSF,KitchenAbvGr,GarageArea
1,0,0,0,1049,5,1984,552.0,1049.0,1,266.0
2,0,0,0,1001,5,2007,737.0,837.0,1,216.0
3,0,0,0,1039,4,2003,0.0,405.0,1,281.0
4,0,0,0,1665,8,2001,643.0,810.0,1,528.0
5,0,0,0,1922,7,2003,0.0,0.0,1,672.0
...,...,...,...,...,...,...,...,...,...,...
2572,0,0,0,1242,4,1950,0.0,484.0,1,336.0
2573,0,0,0,816,6,1950,574.0,816.0,1,240.0
2574,0,0,0,1724,7,2004,0.0,796.0,1,616.0
2577,0,0,0,2002,5,1950,284.0,1001.0,2,871.0


In [24]:
y=full.SalePrice
X=full[selected]

In [25]:
regressor2 = LinearRegression()
regressor2.fit(X,y)

LinearRegression()

In [26]:
regressor2.score(X,y)

0.8607360105157563

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [28]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.853608
--------------------------------------------------
R^2 for test  set: 0.878189


### Stats for MLR SFS (n=10) ###

In [29]:
# Obtain and set x=input and y=output
X0 = full[selected]
y0 = full.SalePrice

In [30]:
#Setup statsmodels.api, inputs (x, x_constant, and y)
x_constant0 = sm.add_constant(X0)

In [31]:
#Run OLS with statsmodels
lin_reg0 = sm.OLS(y0,x_constant0).fit()
lin_reg0.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.861
Model:,OLS,Adj. R-squared:,0.86
Method:,Least Squares,F-statistic:,1222.0
Date:,"Tue, 19 Apr 2022",Prob (F-statistic):,0.0
Time:,07:18:40,Log-Likelihood:,-23259.0
No. Observations:,1988,AIC:,46540.0
Df Residuals:,1977,BIC:,46600.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.955e+05,7.54e+04,-9.219,0.000,-8.43e+05,-5.48e+05
MSZoning_RM,-1.244e+04,1851.158,-6.722,0.000,-1.61e+04,-8813.746
Neighborhood_NridgHt,2.788e+04,3081.607,9.049,0.000,2.18e+04,3.39e+04
BsmtExposure_Gd,2.537e+04,2505.160,10.128,0.000,2.05e+04,3.03e+04
GrLivArea,58.2490,1.782,32.690,0.000,54.755,61.743
OverallQual,1.622e+04,755.115,21.484,0.000,1.47e+04,1.77e+04
YearRemodAdd,333.6790,38.668,8.629,0.000,257.844,409.514
BsmtFinSF1,26.5360,1.845,14.380,0.000,22.917,30.155
TotalBsmtSF,22.4902,2.241,10.037,0.000,18.096,26.884

0,1,2,3
Omnibus:,540.225,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48003.263
Skew:,0.072,Prob(JB):,0.0
Kurtosis:,27.073,Cond. No.,322000.0
