In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn import set_config
set_config(display='diagram')
from lazypredict.Supervised import LazyRegressor

In [2]:
data = pd.read_csv("data/train.csv")
data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
X = data.drop(columns = ["Id", "SalePrice"])
y = data.SalePrice

In [4]:
X.replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1, 2, 3, 4, 5], inplace=True)
X.replace(['No', 'Mn','Av'], [1, 2, 3], inplace=True)
X[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']] = X[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']].fillna(value=0)

In [5]:
null = {}
drop = []
for i in range(0,X.shape[1]):
    if 0 < X.iloc[:,i].isna().sum() <= X.shape[0]*(0.3):
        null[X.columns[i]] = X.iloc[:,i].isna().sum()
    elif X.iloc[:,i].isna().sum() > X.shape[0]*(0.3):# If the null percentage is greater than 30% we will drop that columns. 
        drop.append(X.columns[i])
X.drop(columns = drop, inplace=True)

In [6]:
drop

['Alley', 'Fence', 'MiscFeature']

In [7]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X['GarageYrBlt'] = imputer.fit_transform(X[['GarageYrBlt']])

In [8]:
X_cat = X.select_dtypes(include = "object")
X_num = X.select_dtypes(include = ["int64", "float64"])

In [9]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(X_num)
df_scaled = pd.DataFrame(scaled)

In [10]:
X_cat = X_cat.fillna(value="None")

In [11]:
encoder = OneHotEncoder(sparse = False)
encoded = encoder.fit_transform(X_cat)
df_encoded = pd.DataFrame(encoded)

In [12]:
df_concat = pd.concat([df_scaled, df_encoded], axis=1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df_concat, y, test_size=0.30, random_state=42)

In [14]:
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:17<00:00,  2.43it/s]


In [20]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PoissonRegressor,0.85,0.93,21497.98,0.05
GammaRegressor,0.81,0.92,24234.45,0.03
GradientBoostingRegressor,0.80,0.91,24864.92,0.85
LGBMRegressor,0.77,0.90,26450.91,0.2
HistGradientBoostingRegressor,0.77,0.90,26571.15,3.51
RandomForestRegressor,0.77,0.90,26600.90,2.44
XGBRegressor,0.75,0.89,27806.68,0.38
BayesianRidge,0.74,0.89,27966.18,0.1
RidgeCV,0.74,0.89,28044.19,0.06
BaggingRegressor,0.74,0.89,28096.34,0.27


In [22]:
poisson_reg = PoissonRegressor(alpha = 2, max_iter = 2500)

In [23]:
poisson_reg.fit(X_train, y_train)

In [24]:
poisson_reg.score(X_train, y_train)

0.9490391712344209

In [25]:
poisson_reg.score(X_test, y_test)

0.9296673419175238

In [31]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_test, y_test, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [32]:
rmse_cv(poisson_reg).min()

28126.700421932583