In [142]:
import numpy as np
import pandas as pd
import statsmodels.api as smf
import statsmodels.formula.api as sm
import seaborn as sns
import matplotlib.pyplot as plt


In [143]:
housing = pd.read_csv('House.csv')

In [144]:
housing.head(5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1800,7503


In [145]:
fit1 = sm.ols('price~bedrooms+bathrooms+sqft_living+sqft_lot+floors+waterfront+grade', data=housing).fit()

In [146]:
fit1.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.582
Model:,OLS,Adj. R-squared:,0.581
Method:,Least Squares,F-statistic:,4290.0
Date:,"Wed, 22 Mar 2023",Prob (F-statistic):,0.0
Time:,14:44:10,Log-Likelihood:,-298190.0
No. Observations:,21613,AIC:,596400.0
Df Residuals:,21605,BIC:,596500.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.805e+05,1.43e+04,-33.636,0.000,-5.08e+05,-4.52e+05
bedrooms,-3.482e+04,2211.105,-15.748,0.000,-3.92e+04,-3.05e+04
bathrooms,-1.39e+04,3547.867,-3.918,0.000,-2.09e+04,-6945.158
sqft_living,213.1015,3.516,60.605,0.000,206.209,219.994
sqft_lot,-0.3414,0.040,-8.562,0.000,-0.420,-0.263
floors,-3.802e+04,3589.470,-10.593,0.000,-4.51e+04,-3.1e+04
waterfront,7.955e+05,1.88e+04,42.217,0.000,7.59e+05,8.32e+05
grade,1.019e+05,2284.361,44.598,0.000,9.74e+04,1.06e+05

0,1,2,3
Omnibus:,15017.098,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,740935.632
Skew:,2.775,Prob(JB):,0.0
Kurtosis:,31.142,Cond. No.,514000.0


In [147]:
housing.floors = housing['floors'].astype(str)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  float64
 1   bedrooms       21613 non-null  int64  
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  int64  
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  object 
 6   waterfront     21613 non-null  int64  
 7   view           21613 non-null  int64  
 8   condition      21613 non-null  int64  
 9   grade          21613 non-null  int64  
 10  sqft_above     21613 non-null  int64  
 11  sqft_basement  21613 non-null  int64  
 12  sqft_living15  21613 non-null  int64  
 13  sqft_lot15     21613 non-null  int64  
dtypes: float64(2), int64(11), object(1)
memory usage: 2.3+ MB


In [148]:
oneHotEncoding = pd.get_dummies(data=housing, columns=['floors'])

In [149]:
oneHotEncoding

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,floors_1.0,floors_1.5,floors_2.0,floors_2.5,floors_3.0,floors_3.5
0,221900.0,3,1.00,1180,5650,0,0,3,7,1180,0,1340,5650,1,0,0,0,0,0
1,538000.0,3,2.25,2570,7242,0,0,3,7,2170,400,1690,7639,0,0,1,0,0,0
2,180000.0,2,1.00,770,10000,0,0,3,6,770,0,2720,8062,1,0,0,0,0,0
3,604000.0,4,3.00,1960,5000,0,0,5,7,1050,910,1360,5000,1,0,0,0,0,0
4,510000.0,3,2.00,1680,8080,0,0,3,8,1680,0,1800,7503,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,360000.0,3,2.50,1530,1131,0,0,3,8,1530,0,1530,1509,0,0,0,0,1,0
21609,400000.0,4,2.50,2310,5813,0,0,3,8,2310,0,1830,7200,0,0,1,0,0,0
21610,402101.0,2,0.75,1020,1350,0,0,3,7,1020,0,1020,2007,0,0,1,0,0,0
21611,400000.0,3,2.50,1600,2388,0,0,3,8,1600,0,1410,1287,0,0,1,0,0,0


In [150]:
col = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'floors_1_0',
       'floors_1_5', 'floors_2_0', 'floors_2_5', 'floors_3_0', 'floors_3_5']

In [151]:
len(col)

19

In [152]:
oneHotEncoding.columns = col

In [153]:
fit2 = sm.ols('price~bedrooms+bathrooms+sqft_living+sqft_lot+waterfront+view+condition+grade+floors_1_0+floors_1_5+floors_2_0+floors_3_0', data= oneHotEncoding).fit()

In [154]:
fit2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.616
Model:,OLS,Adj. R-squared:,0.616
Method:,Least Squares,F-statistic:,2884.0
Date:,"Wed, 22 Mar 2023",Prob (F-statistic):,0.0
Time:,14:44:12,Log-Likelihood:,-297270.0
No. Observations:,21613,AIC:,594600.0
Df Residuals:,21600,BIC:,594700.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5.364e+05,2.53e+04,-21.185,0.000,-5.86e+05,-4.87e+05
bedrooms,-3.748e+04,2143.212,-17.489,0.000,-4.17e+04,-3.33e+04
bathrooms,2618.9106,3460.265,0.757,0.449,-4163.464,9401.285
sqft_living,196.1768,3.451,56.851,0.000,189.413,202.941
sqft_lot,-0.3658,0.038,-9.564,0.000,-0.441,-0.291
waterfront,5.795e+05,1.96e+04,29.594,0.000,5.41e+05,6.18e+05
view,5.916e+04,2324.793,25.447,0.000,5.46e+04,6.37e+04
condition,4.588e+04,2514.952,18.242,0.000,4.09e+04,5.08e+04
grade,1.048e+05,2211.518,47.407,0.000,1.01e+05,1.09e+05

0,1,2,3
Omnibus:,15392.108,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,869831.071
Skew:,2.838,Prob(JB):,0.0
Kurtosis:,33.556,Cond. No.,1170000.0


In [155]:
df = housing.drop(columns=['price'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       21613 non-null  int64  
 1   bathrooms      21613 non-null  float64
 2   sqft_living    21613 non-null  int64  
 3   sqft_lot       21613 non-null  int64  
 4   floors         21613 non-null  object 
 5   waterfront     21613 non-null  int64  
 6   view           21613 non-null  int64  
 7   condition      21613 non-null  int64  
 8   grade          21613 non-null  int64  
 9   sqft_above     21613 non-null  int64  
 10  sqft_basement  21613 non-null  int64  
 11  sqft_living15  21613 non-null  int64  
 12  sqft_lot15     21613 non-null  int64  
dtypes: float64(1), int64(11), object(1)
memory usage: 2.1+ MB


In [156]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X1 = add_constant(df)
vif = pd.DataFrame()
vif["Feature"] = X1.columns
vif["VIF"] = [variance_inflation_factor(X1.values, i) for i in range(len(X1.columns))]

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
vif

Unnamed: 0,Feature,VIF
0,const,123.312672
1,bedrooms,1.639525
2,bathrooms,2.929191
3,sqft_living,inf
4,sqft_lot,2.087742
5,floors,1.886524
6,waterfront,1.199293
7,view,1.377971
8,condition,1.104086
9,grade,3.153316
