In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import yeojohnson
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
import dill
import os
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from helper_functions import *

In [2]:
# Read the training set from 'train.csv' into `df`.
# `load_data` is not defined in this notebook — presumably it comes from the
# star import of `helper_functions`; confirm there for parsing details.
df = load_data('train.csv')

In [3]:
# Display the freshly loaded frame (pandas truncates the rendering of wide/long frames).
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:

# Columns removed up front; in the preview above they are mostly empty.
columns_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
df = drop_features(df, features_to_drop=columns_to_drop)

In [5]:
# Run the project cleaning helper with 'SalePrice' named as the target column.
# The next cell reports zero remaining nulls, so this step presumably imputes
# or removes missing values — confirm in helper_functions.
df = clean_data(df, target='SalePrice')

In [6]:
# Total number of missing cells left in the whole frame (expected: 0 after cleaning).
df.isna().sum().sum()

np.int64(0)

In [7]:
# Alternative kept for reference: load a pickled copy of the encoding helper.
# with open('encode_data.pickle', 'rb') as f:
#     encode_data = dill.load(f)

# Categorical columns, grouped by the encoding scheme requested for each.
Target_Encoding_list = [
    'MSZoning', 'Street', 'Utilities', 'LotConfig', 'Neighborhood',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'CentralAir', 'Electrical', 'GarageType', 'SaleType',
]
Ordinal_Encoding_list = [
    'LotShape', 'LandContour', 'LandSlope', 'Condition1', 'Condition2',
    'BldgType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleCondition',
]

# Column -> scheme name. Target-encoded columns come first, then the ordinal
# ones are merged in (later keys would win on a name clash, as with dict.update).
encoding_methods = {
    **dict.fromkeys(Target_Encoding_list, 'target'),
    **dict.fromkeys(Ordinal_Encoding_list, 'ordinal'),
}

# NOTE(review): this runs on the full frame before the train/test split in a
# later cell. If `encode_data` fits target statistics on 'SalePrice', held-out
# targets leak into the features — confirm against the helper.
df = encode_data(df, encoding_methods, train=True, target=['SalePrice'])


In [8]:
# Check that all categorical columns were encoded: no object-dtype column should remain.
categorical_cols = list(df.select_dtypes(include='object').columns)
categorical_cols

[]

In [9]:
# Display the frame after encoding to eyeball the now-numeric categorical columns.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,60.0,0.0,65.0,8450.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,0.000869,4.0,208500
1,2.0,20.0,0.0,80.0,9600.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,0.000869,4.0,181500
2,3.0,60.0,0.0,68.0,11250.0,0.0,0.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,0.000869,4.0,223500
3,4.0,70.0,0.0,60.0,9550.0,0.0,0.0,3.0,0.0,0.000869,...,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,0.000869,0.0,140000
4,5.0,60.0,0.0,84.0,14260.0,0.0,0.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,0.000869,4.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,0.0,62.0,7917.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,0.0,8.0,2007.0,0.000869,4.0,175000
1456,1457.0,20.0,0.0,85.0,13175.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,0.0,2.0,2010.0,0.000869,4.0,210000
1457,1458.0,70.0,0.0,66.0,9042.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,2500.0,5.0,2010.0,0.000869,4.0,266500
1458,1459.0,20.0,0.0,68.0,9717.0,0.0,3.0,3.0,0.0,0.000869,...,112.0,0.0,0.0,0.0,0.0,4.0,2010.0,0.000869,4.0,142125


In [10]:
# Alternative kept for reference: load a pickled copy of the transform helper.
# with open('transform_data.pickle', 'rb') as f:
#     transform_data = dill.load(f)

# Apply the project transform helper to the 'SalePrice' target. The frame
# displayed afterwards carries an extra `transform_target` column next to the
# untouched 'SalePrice'.
# NOTE(review): this runs before the train/test split, so any fitted transform
# parameters would be estimated on the full data — confirm against the helper.
df = transform_data(df, 'SalePrice')


In [11]:
# Inspect the pickled transformer object stored in 'pt.pickle'.
# (dill/pickle can execute arbitrary code on load — only open files you produced.)
with Path('pt.pickle').open('rb') as pt_file:
    pt = dill.load(pt_file)

# Fitted lambda parameter(s) of the transformer.
pt.lambdas_

array([-0.0769321])

In [12]:
# Smallest value of the transformed target.
min(df['transform_target'])

-4.150030202740451

In [13]:
# Display the frame again; `transform_target` now appears as the last column.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,transform_target
0,1.0,60.0,0.0,65.0,8450.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,2.0,2008.0,0.000869,4.0,208500,0.571155
1,2.0,20.0,0.0,80.0,9600.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,5.0,2007.0,0.000869,4.0,181500,0.227627
2,3.0,60.0,0.0,68.0,11250.0,0.0,0.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,9.0,2008.0,0.000869,4.0,223500,0.741869
3,4.0,70.0,0.0,60.0,9550.0,0.0,0.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,2.0,2006.0,0.000869,0.0,140000,-0.425386
4,5.0,60.0,0.0,84.0,14260.0,0.0,0.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,12.0,2008.0,0.000869,4.0,250000,1.015293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,0.0,62.0,7917.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,8.0,2007.0,0.000869,4.0,175000,0.136679
1456,1457.0,20.0,0.0,85.0,13175.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,2.0,2010.0,0.000869,4.0,210000,0.588812
1457,1458.0,70.0,0.0,66.0,9042.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,2500.0,5.0,2010.0,0.000869,4.0,266500,1.170202
1458,1459.0,20.0,0.0,68.0,9717.0,0.0,3.0,3.0,0.0,0.000869,...,0.0,0.0,0.0,0.0,4.0,2010.0,0.000869,4.0,142125,-0.387136


In [14]:
# Alternative kept for reference: load a pickled copy of the split helper.
# with open('split_data.pickle', 'rb') as f:
#     split_data = dill.load(f)

# Split into train/test sets with the transformed target as the label.
# The raw price and the row identifier are passed as columns to drop, and no
# explicit feature subset is requested.
X_train, X_test, y_train, y_test = split_data(
    df,
    target='transform_target',
    col_dropped=['SalePrice', 'Id'],
    feature_selected=None,
)

In [15]:
# with open('train_model.pickle', 'rb') as f:
#     train_model = dill.load(f)

# model = train_model(LinearRegression, xtrain=X_train, ytrain=y_train) 
# # if i need to add **args i add it at the end eg. train_model = train_model(LinearRegression, xtrain=X_train, ytrain=y_train, fit_intercept = False, etc.)


# with open('trained_model.pickle', 'wb') as f: 
#     dill.dump(model, f)

In [16]:
# Alternative kept for reference: load a pickled copy of the training helper.
# with open('train_model.pickle', 'rb') as f:
#     train_model = dill.load(f)

# Hyper-parameter grid handed to `train_model`. Every candidate is currently
# commented out, so the grid is an empty dict.
param_grid = {
    # 'max_depth': [3, 5, 7, 9],
    # 'learning_rate': [0.1, 0.05, 0.01],
    # 'n_estimators': [50, 100, 200, 300],
    # 'gamma': [0, 0.1, 0.5],
    # 'subsample': [0.5, 0.8, 1]
}


# Train an XGBoost regressor on the training split through the project helper.
# NOTE(review): how `param_grid` and `best_combination=False` are interpreted is
# defined in helper_functions, not here — confirm whether a grid search runs
# when the grid is empty. No random seed is passed at this call site.
model = train_model(xgb.XGBRegressor, xtrain=X_train, ytrain=y_train, param_grid=param_grid, best_combination=False) 

In [17]:
# Persist the trained model to disk with dill.
with Path('trained_model1.pickle').open('wb') as model_file:
    dill.dump(model, model_file)

In [18]:
# Persist the training feature names, in column order, alongside the model.
feature_names = X_train.columns.tolist()
with Path('feature_list.pickle').open('wb') as features_file:
    dill.dump(feature_names, features_file)

In [19]:
# Show the repr of the object returned by `train_model`.
model

### Check whether the model is overfitting

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
# Sanity check: number of predictions produced for the training split.
len(model.predict(X_train))

1168

In [22]:
# Keep the training-split predictions for the RMSE computation below and
# report how many values they contain once flattened.
y_pred = model.predict(X_train)
y_pred.size

1168

In [23]:
# Sanity check: number of training labels (should match the prediction count).
len(y_train)

1168

In [24]:
# Same count taken from the underlying array of `y_train`.
len(y_train.values)

1168

In [25]:
# RMSE on the training split, in the units of the label used for the split
# (`transform_target`), not in sale-price units.
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6, so the square root is taken explicitly; this works on
# every scikit-learn version. Arguments follow the documented (y_true, y_pred)
# order, and `np.ravel` replaces the redundant reshape(-1, 1).flatten().
train_rmse = np.sqrt(mean_squared_error(y_train.values, np.ravel(y_pred)))



In [26]:
# RMSE on the held-out split, in the same transformed-target units as the
# training RMSE so the two are directly comparable.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root is taken explicitly; arguments follow the documented
# (y_true, y_pred) order.
test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))



In [27]:
# Training error. Compare with `test_rmse` in the next cell.
train_rmse # if train is way better than test this is overfitting

np.float64(0.012922542079198344)

In [28]:
# Held-out error. In the recorded run this is ~0.390 against a training RMSE of
# ~0.013 (roughly 30x larger), i.e. the model overfits the training split.
test_rmse

np.float64(0.389974626347761)