In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV


import statsmodels.api as sm  

from sklearn.linear_model import Lasso

from sklearn.metrics import mean_absolute_error,r2_score

In [2]:
# Read the two CSV files from the working directory into DataFrames.
train, X_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Separate the target column (SalePrice) from the predictor columns.
labels = train['SalePrice']
attributes = train.drop(columns=['SalePrice'])

In [4]:
# Count the missing values per column once, then display only the columns
# that have at least one missing value.
null_counts = attributes.isnull().sum()
more_than_zero_null_values = null_counts.gt(0)

null_counts[more_than_zero_null_values]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
# Names of every predictor column that pandas reports as numeric.
# NOTE(review): the recorded output lists `Id` among these; it looks like a row
# identifier rather than a real predictor -- confirm whether it should be excluded.
numeric_frame = attributes.select_dtypes(include='number')
num_features = list(numeric_frame.columns)
print(f"The numerical features are {len(num_features)}.")
print(f"They are {', '.join(num_features)}.")

The numerical features are 37.
They are Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageYrBlt, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, MoSold, YrSold.


In [6]:
# Names of every predictor column that is not numeric; these are treated as
# categorical by the preprocessing further down.
non_numeric_frame = attributes.select_dtypes(exclude='number')
cat_features = list(non_numeric_frame.columns)
print(f"The categorical features are {len(cat_features)}.")
print(f"They are {', '.join(cat_features)}.")

The categorical features are 43.
They are MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, HeatingQC, CentralAir, Electrical, KitchenQual, Functional, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence, MiscFeature, SaleType, SaleCondition.


In [7]:
# Numeric columns: fill missing values with the column mean, then min-max scale.
numeric_steps = [
    ('Numerical imputer', SimpleImputer(strategy='mean')),
    ('numerical scaler', MinMaxScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Categories not seen during fit are encoded
# as all zeros (handle_unknown='ignore').
#
# The OneHotEncoder keyword `sparse` was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing `sparse=False` raises a
# TypeError on current releases. Prefer the new keyword and fall back to the
# old one so the notebook runs on either side of the rename.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_pipeline = Pipeline(
    steps = [
        ('categorical imputer', SimpleImputer(strategy='most_frequent')),
        ('categorical one-hot', one_hot_encoder)
    ])

In [8]:
# Route each group of columns through its own preprocessing pipeline and
# concatenate the results: numeric columns first, then categorical columns.
column_groups = [
    ('number', numeric_pipeline, num_features),
    ('category', categorical_pipeline, cat_features),
]
full_transformer = ColumnTransformer(transformers=column_groups)

In [9]:
# Fit the preprocessing on the predictors (imputation values, scaling ranges,
# category levels) and encode those same rows into a dense numeric array.
a = full_transformer.fit(attributes).transform(attributes)
a

array([[0.00000000e+00, 2.35294118e-01, 1.50684932e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [6.85400960e-04, 0.00000000e+00, 2.02054795e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.37080192e-03, 2.35294118e-01, 1.60958904e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [9.98629198e-01, 2.94117647e-01, 1.54109589e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [9.99314599e-01, 0.00000000e+00, 1.60958904e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 1.84931507e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [10]:
# Prepend a column of ones so the OLS model gets an intercept term.
#
# The matrix is rebuilt from the already-fitted transformer rather than from the
# previous value of `a`, so re-running this cell cannot stack a second intercept
# column. The number of rows is taken from the data instead of the hard-coded
# 1460, so the cell keeps working if the input size changes.
encoded_features = full_transformer.transform(attributes)
a = np.hstack((np.ones((encoded_features.shape[0], 1)), encoded_features))


In [14]:
# Ordinary least squares: regress the sale prices (`labels`) on the design matrix `a`.
ols_model = sm.OLS(labels, a)
regressor_OLS = ols_model.fit()

In [15]:
# Display the fit statistics and the per-coefficient table of the OLS model.
ols_summary = regressor_OLS.summary()
ols_summary

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.932
Model:,OLS,Adj. R-squared:,0.919
Method:,Least Squares,F-statistic:,68.77
Date:,"Sat, 03 Dec 2022",Prob (F-statistic):,0.0
Time:,12:27:23,Log-Likelihood:,-16580.0
No. Observations:,1460,AIC:,33650.0
Df Residuals:,1216,BIC:,34940.0
Df Model:,243,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.671e+04,3420.255,-4.884,0.000,-2.34e+04,-9995.319
x1,1353.5218,2251.953,0.601,0.548,-3064.622,5771.666
x2,-1.02e+04,1.41e+04,-0.725,0.469,-3.78e+04,1.74e+04
x3,1.165e+04,1.28e+04,0.909,0.364,-1.35e+04,3.68e+04
x4,1.561e+05,2.33e+04,6.687,0.000,1.1e+05,2.02e+05
x5,5.827e+04,9110.819,6.396,0.000,4.04e+04,7.61e+04
x6,4.632e+04,6972.130,6.644,0.000,3.26e+04,6e+04
x7,4.283e+04,1.05e+04,4.074,0.000,2.22e+04,6.35e+04
x8,5826.4378,3336.864,1.746,0.081,-720.211,1.24e+04

0,1,2,3
Omnibus:,402.113,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14666.286
Skew:,0.568,Prob(JB):,0.0
Kurtosis:,18.485,Cond. No.,2.54e+16


In [None]:
# Backward-elimination step: keep only the columns of `a` whose coefficient in
# the most recent OLS fit is statistically significant.
#
# The original cell (`a = [:,[]]`) was an unfinished column selection and a
# SyntaxError. NOTE(review): selecting by p-value is an interpretation of the
# intended selection -- confirm the threshold and the strategy.
SIGNIFICANCE_LEVEL = 0.05

p_values = np.asarray(regressor_OLS.pvalues)
# `regressor_OLS` must have been fitted on the current `a`; otherwise the mask
# below would not line up with the columns.
assert p_values.shape[0] == a.shape[1], "regressor_OLS was not fitted on the current `a`"

keep_columns = p_values < SIGNIFICANCE_LEVEL  # NaN p-values compare False and are dropped
keep_columns[0] = True  # column 0 is the intercept (ones column); always retain it
a = a[:, keep_columns]

In [None]:
# Refit OLS on whatever `a` currently holds; this overwrites the earlier `regressor_OLS`.
ols_model = sm.OLS(labels, a)
regressor_OLS = ols_model.fit()