In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import scipy as sp
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 100)

import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy import stats

In [2]:
df = pd.read_csv('Ames_HousePrice.csv')
df.shape

(2580, 82)

In [3]:
c = df.select_dtypes('object')
c.shape

(2580, 43)

In [4]:
c = c.fillna("None")
c.isnull().sum().sum()

0

In [5]:
c = pd.get_dummies(c, drop_first=True)

#c = pd.get_dummies(c)

c.shape

(2580, 232)

In [6]:
n = df.select_dtypes('number')
n.shape

(2580, 39)

In [7]:
n.isnull().sum().sum()

615

In [8]:
full = pd.concat([c, n], axis=1)
full.shape

(2580, 271)

In [9]:
full = full.dropna()
full.shape

(1988, 271)

In [10]:
full.isnull().sum().sum()

0

### MLR With SFS (n=10) ###

In [11]:
from sklearn.feature_selection import SequentialFeatureSelector

In [12]:
y=full.SalePrice
X=full.drop('SalePrice', axis=1)

In [13]:
regressor = LinearRegression()
regressor.fit(X,y)

LinearRegression()

In [14]:
regressor.score(X,y)

0.9421659842747901

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [16]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.946307
--------------------------------------------------
R^2 for test  set: 0.898843


In [17]:
sfs_foreward = SequentialFeatureSelector(
    regressor, n_features_to_select=10, direction="forward"
).fit(X, y)

In [18]:
selected = X.columns[sfs_foreward.get_support()]
selected

Index(['Neighborhood_NridgHt', 'Neighborhood_StoneBr', 'BsmtExposure_Gd',
       'GrLivArea', 'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt',
       'BsmtFinSF1', 'GarageArea'],
      dtype='object')

In [19]:
X = full[selected]
X

Unnamed: 0,Neighborhood_NridgHt,Neighborhood_StoneBr,BsmtExposure_Gd,GrLivArea,MSSubClass,OverallQual,OverallCond,YearBuilt,BsmtFinSF1,GarageArea
1,0,0,0,1049,120,5,5,1984,552.0,266.0
2,0,0,0,1001,30,5,9,1930,737.0,216.0
3,0,0,0,1039,70,4,8,1900,0.0,281.0
4,0,0,0,1665,60,8,6,2001,643.0,528.0
5,0,0,0,1922,85,7,5,2003,0.0,672.0
...,...,...,...,...,...,...,...,...,...,...
2572,0,0,0,1242,20,4,5,1946,0.0,336.0
2573,0,0,0,816,30,6,8,1934,574.0,240.0
2574,0,0,0,1724,60,7,5,2003,0.0,616.0
2577,0,0,0,2002,90,5,6,1949,284.0,871.0


In [20]:
y=full.SalePrice
X=full[selected]

In [21]:
regressor2 = LinearRegression()
regressor2.fit(X,y)

LinearRegression()

In [22]:
regressor2.score(X,y)

0.8705364411749056

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [24]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.865488
--------------------------------------------------
R^2 for test  set: 0.879867


### Stats for MLR SFS (n=10) ###

In [25]:
# Obtain and set x=input and y=output
X0 = full[selected]
y0 = full.SalePrice

In [26]:
#Setup statsmodels.api, inputs (x, x_constant, and y)
x_constant0 = sm.add_constant(X0)

In [27]:
#Run OLS with statsmodels
lin_reg0 = sm.OLS(y0,x_constant0).fit()
lin_reg0.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.871
Model:,OLS,Adj. R-squared:,0.87
Method:,Least Squares,F-statistic:,1329.0
Date:,"Tue, 19 Apr 2022",Prob (F-statistic):,0.0
Time:,07:19:25,Log-Likelihood:,-23186.0
No. Observations:,1988,AIC:,46390.0
Df Residuals:,1977,BIC:,46460.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.858e+05,6.03e+04,-14.685,0.000,-1e+06,-7.67e+05
Neighborhood_NridgHt,3.973e+04,2980.350,13.330,0.000,3.39e+04,4.56e+04
Neighborhood_StoneBr,5.109e+04,4773.595,10.703,0.000,4.17e+04,6.05e+04
BsmtExposure_Gd,2.592e+04,2408.538,10.760,0.000,2.12e+04,3.06e+04
GrLivArea,64.9303,1.746,37.195,0.000,61.507,68.354
MSSubClass,-239.7097,15.329,-15.638,0.000,-269.772,-209.648
OverallQual,1.675e+04,748.055,22.386,0.000,1.53e+04,1.82e+04
OverallCond,6021.4027,673.520,8.940,0.000,4700.519,7342.287
YearBuilt,408.9691,30.771,13.291,0.000,348.622,469.317

0,1,2,3
Omnibus:,532.419,Durbin-Watson:,2.039
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42832.103
Skew:,0.13,Prob(JB):,0.0
Kurtosis:,25.738,Cond. No.,246000.0
