In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import scipy as sp
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 100)

import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy import stats

In [2]:
df = pd.read_csv('Ames_HousePrice.csv')
df.shape

(2580, 82)

In [3]:
df[['MSSubClass','YrSold','MoSold']].isnull().sum()

MSSubClass    0
YrSold        0
MoSold        0
dtype: int64

In [4]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)

In [5]:
c = df.select_dtypes('object')
c.shape

(2580, 46)

In [6]:
c = c.fillna("None")
c.isnull().sum().sum()

0

In [7]:
c = pd.get_dummies(c, drop_first=True)
c.shape

(2580, 262)

In [8]:
n = df.select_dtypes('number')
n.shape

(2580, 36)

In [9]:
n.isnull().sum().sum()

615

In [10]:
full = pd.concat([c, n], axis=1)
full.shape

(2580, 298)

In [11]:
full = full.dropna()
full.shape

(1988, 298)

In [12]:
full.isnull().sum().sum()

0

### MLR With SFS (n=5) ###

In [13]:
from sklearn.feature_selection import SequentialFeatureSelector

In [14]:
y=full.SalePrice
X=full.drop('SalePrice', axis=1)

In [15]:
regressor = LinearRegression()
regressor.fit(X,y)

LinearRegression()

In [16]:
regressor.score(X,y)

0.943375217744169

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [18]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.946747
--------------------------------------------------
R^2 for test  set: 0.914719


In [19]:
sfs_foreward = SequentialFeatureSelector(
    regressor, n_features_to_select=5, direction="forward"
).fit(X, y)

In [20]:
selected = X.columns[sfs_foreward.get_support()]
selected

Index(['GrLivArea', 'OverallQual', 'BsmtFinSF1', 'TotalBsmtSF', 'GarageArea'], dtype='object')

In [21]:
X = full[selected]
X

Unnamed: 0,GrLivArea,OverallQual,BsmtFinSF1,TotalBsmtSF,GarageArea
1,1049,5,552.0,1049.0,266.0
2,1001,5,737.0,837.0,216.0
3,1039,4,0.0,405.0,281.0
4,1665,8,643.0,810.0,528.0
5,1922,7,0.0,0.0,672.0
...,...,...,...,...,...
2572,1242,4,0.0,484.0,336.0
2573,816,6,574.0,816.0,240.0
2574,1724,7,0.0,796.0,616.0
2577,2002,5,284.0,1001.0,871.0


In [22]:
y=full.SalePrice
X=full[selected]

In [23]:
regressor2 = LinearRegression()
regressor2.fit(X,y)

LinearRegression()

In [24]:
regressor2.score(X,y)

0.8329698390687386

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [26]:
ols = LinearRegression()
ols.fit(X_train, y_train)
print("R^2 for train set: %f" %ols.score(X_train, y_train))

print('-'*50)

print("R^2 for test  set: %f" %ols.score(X_test, y_test))

R^2 for train set: 0.843798
--------------------------------------------------
R^2 for test  set: 0.801923


### Stats for MLR SFS (n=5) ###

In [27]:
# Obtain and set x=input and y=output
X0 = full[selected]
y0 = full.SalePrice

In [28]:
#Setup statsmodels.api, inputs (x, x_constant, and y)
x_constant0 = sm.add_constant(X0)

In [29]:
#Run OLS with statsmodels
lin_reg0 = sm.OLS(y0,x_constant0).fit()
lin_reg0.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.833
Model:,OLS,Adj. R-squared:,0.833
Method:,Least Squares,F-statistic:,1977.0
Date:,"Tue, 19 Apr 2022",Prob (F-statistic):,0.0
Time:,23:51:47,Log-Likelihood:,-23440.0
No. Observations:,1988,AIC:,46890.0
Df Residuals:,1982,BIC:,46920.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.095e+05,3309.084,-33.081,0.000,-1.16e+05,-1.03e+05
GrLivArea,52.6954,1.906,27.653,0.000,48.958,56.433
OverallQual,2.279e+04,718.219,31.725,0.000,2.14e+04,2.42e+04
BsmtFinSF1,32.9182,1.946,16.914,0.000,29.101,36.735
TotalBsmtSF,26.5340,2.404,11.036,0.000,21.819,31.250
GarageArea,61.8201,4.887,12.651,0.000,52.237,71.404

0,1,2,3
Omnibus:,467.56,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23876.638
Skew:,0.054,Prob(JB):,0.0
Kurtosis:,19.978,Cond. No.,9400.0
