In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

import scipy as sp
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 100)

import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy import stats

In [2]:
# Read the Ames house-price CSV from the working directory; the cell output
# is the (rows, columns) shape of the raw frame.
csv_path = 'Ames_HousePrice.csv'
df = pd.read_csv(csv_path)
df.shape

(2580, 82)

In [3]:
# Split off the object-dtype (text/categorical) columns so they can be
# encoded separately from the numeric ones.
c = df.select_dtypes(include='object')
c.shape

(2580, 43)

In [4]:
# Treat a missing categorical value as its own level, labelled "None"
# (presumably NaN means "feature absent" in this data set -- confirm against
# the data dictionary), then verify that no nulls remain.
c = c.fillna(value="None")
c.isna().sum().sum()

0

In [6]:
# One-hot encode every categorical column. drop_first=True omits one level
# per column so the dummies are not perfectly collinear with an intercept.
c = pd.get_dummies(data=c, drop_first=True)
c.shape

(2580, 232)

In [7]:
# Numeric columns of the raw frame (this includes the target, SalePrice).
n = df.select_dtypes(include='number')
n.shape

(2580, 39)

In [8]:
# Total number of missing cells across all numeric columns.
n.isna().sum().sum()

615

In [9]:
# Join the encoded categoricals and the numeric columns side by side; both
# frames derive from df, so they share the same row index.
full = pd.concat(objs=[c, n], axis="columns")
full.shape

(2580, 271)

In [11]:
# Keep only rows with no missing value in any column. The dummy columns were
# built from a null-free frame, so every dropped row is missing a numeric value.
# NOTE(review): per the displayed shapes this discards 592 of 2580 rows;
# imputing the numeric gaps instead would retain them.
full = full.dropna(axis=0, how="any")
full.shape

(1988, 271)

In [12]:
# Sanity check: the modelling frame must contain no missing cells.
full.isna().sum().sum()

0

### Multiple linear regression with sequential feature selection (SFS, 5 features) ###

In [13]:
from sklearn.feature_selection import SequentialFeatureSelector

In [14]:
# Target is the sale price; predictors are every other column of `full`.
y = full["SalePrice"]
X = full.drop(columns="SalePrice")

In [15]:
# Baseline: ordinary least squares on all predictors, fitted on every row.
# fit() returns the estimator, which is what the cell displays.
regressor = LinearRegression()
regressor.fit(X=X, y=y)

LinearRegression()

In [16]:
# In-sample R^2: scored on the same rows the model was fitted on, so it is an
# optimistic estimate of out-of-sample fit.
regressor.score(X=X, y=y)

0.9421659842747901

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. random_state pins the shuffle so
# the split -- and every R^2 computed from it -- is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [18]:
# Fit OLS on the training split only, then report R^2 on both splits to see
# how much of the fit carries over to held-out rows.
ols = LinearRegression()
ols.fit(X_train, y_train)

train_r2 = ols.score(X_train, y_train)
test_r2 = ols.score(X_test, y_test)

print("R^2 for train set: %f" % train_r2)
print('-'*50)
print("R^2 for test  set: %f" % test_r2)

R^2 for train set: 0.944818
--------------------------------------------------
R^2 for test  set: 0.913814


In [29]:
# Forward sequential feature selection: add one feature at a time until 5
# are chosen, using the linear-regression estimator defined earlier.
# NOTE(review): selection is fitted on all rows of X/y, including rows that
# later land in the test split, so the test R^2 of the 5-feature model is
# not a fully independent estimate. Consider selecting on training rows only.
sfs_foreward = SequentialFeatureSelector(
    estimator=regressor,
    direction="forward",
    n_features_to_select=5,
)
sfs_foreward = sfs_foreward.fit(X, y)

In [33]:
# Translate the selector's boolean support mask into the chosen column names.
support_mask = sfs_foreward.get_support()
selected = X.columns[support_mask]
selected

Index(['GrLivArea', 'MSSubClass', 'OverallQual', 'BsmtFinSF1', 'GarageArea'], dtype='object')

In [38]:
# Rebind X to just the selected features and display the resulting frame.
X = full.loc[:, selected]
X

Unnamed: 0,GrLivArea,MSSubClass,OverallQual,BsmtFinSF1,GarageArea
1,1049,120,5,552.0,266.0
2,1001,30,5,737.0,216.0
3,1039,70,4,0.0,281.0
4,1665,60,8,643.0,528.0
5,1922,85,7,0.0,672.0
...,...,...,...,...,...
2572,1242,20,4,0.0,336.0
2573,816,30,6,574.0,240.0
2574,1724,60,7,0.0,616.0
2577,2002,90,5,284.0,871.0


In [36]:
# Target and the reduced predictor matrix (selected features only).
y = full["SalePrice"]
X = full.loc[:, selected]

In [39]:
# OLS on the selected features only, fitted on every row; the cell displays
# the fitted estimator returned by fit().
regressor2 = LinearRegression()
regressor2.fit(X=X, y=y)

LinearRegression()

In [40]:
# In-sample R^2 of the reduced model (same rows used for fitting).
regressor2.score(X=X, y=y)

0.8350900851530029

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows of the reduced feature matrix. random_state pins
# the shuffle so the train/test R^2 below is reproducible across runs rather
# than depending on a fresh random split each time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [42]:
# Refit OLS on the training split of the reduced feature set and report R^2
# on both splits.
ols = LinearRegression()
ols.fit(X_train, y_train)

train_r2 = ols.score(X_train, y_train)
test_r2 = ols.score(X_test, y_test)

print("R^2 for train set: %f" % train_r2)
print('-'*50)
print("R^2 for test  set: %f" % test_r2)

R^2 for train set: 0.830411
--------------------------------------------------
R^2 for test  set: 0.843985


### Regression statistics for the MLR model on the 5 SFS-selected features ###

In [43]:
# Inputs for the statsmodels fit: X0 holds the selected predictors, y0 the
# sale price, both taken from every row of `full`.
X0 = full.loc[:, selected]
y0 = full["SalePrice"]

In [44]:
# statsmodels' OLS does not add an intercept on its own, so put an explicit
# constant column in front of the predictors.
x_constant0 = sm.add_constant(X0, prepend=True)

In [45]:
# Fit OLS with statsmodels (response first, design matrix second) and show
# the full regression summary: coefficients, standard errors, and residual
# diagnostics.
lin_reg0 = sm.OLS(endog=y0, exog=x_constant0).fit()
lin_reg0.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.835
Model:,OLS,Adj. R-squared:,0.835
Method:,Least Squares,F-statistic:,2007.0
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.0
Time:,16:45:06,Log-Likelihood:,-23427.0
No. Observations:,1988,AIC:,46870.0
Df Residuals:,1982,BIC:,46900.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.007e+05,3364.817,-29.934,0.000,-1.07e+05,-9.41e+04
GrLivArea,56.9266,1.889,30.132,0.000,53.222,60.632
MSSubClass,-208.4209,17.084,-12.200,0.000,-241.926,-174.916
OverallQual,2.596e+04,682.893,38.017,0.000,2.46e+04,2.73e+04
BsmtFinSF1,40.5850,1.742,23.298,0.000,37.169,44.001
GarageArea,65.5671,4.788,13.695,0.000,56.177,74.957

0,1,2,3
Omnibus:,477.468,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21395.198
Skew:,0.263,Prob(JB):,0.0
Kurtosis:,19.063,Cond. No.,8100.0
