In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import yeojohnson
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
import dill
import os
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from helper_functions import *

In [23]:
# Read train.csv through load_data (presumably provided by the
# `from helper_functions import *` star import — confirm).
df = load_data('train.csv')

In [3]:
# Display column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:

# Remove four columns up front; the df.info() output above shows Alley with
# only 91 non-null values out of 1460.
# NOTE(review): this rebinds df, so re-running the cell relies on
# drop_features tolerating already-missing columns — confirm.
df = drop_features(
    df,
    features_to_drop=[
        'Alley',
        'PoolQC',
        'Fence',
        'MiscFeature',
    ],
)

In [5]:
# Run the project's clean_data helper with SalePrice named as the target.
# The null count in the following cell is 0, so no missing values remain after this step.
df = clean_data(df, target='SalePrice')

In [6]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
df.isna().sum().sum()

np.int64(0)

In [7]:
# with open('encode_data.pickle', 'rb') as f:
#     encode_data = dill.load(f)

# Categorical columns, grouped by the encoding each one is assigned.
Target_Encoding_list = [
    'MSZoning', 'Street', 'Utilities', 'LotConfig', 'Neighborhood',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'Electrical', 'GarageType', 'SaleType',
]
Ordinal_Encoding_list = [
    'LotShape', 'LandContour', 'LandSlope', 'Condition1', 'Condition2',
    'BldgType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleCondition',
]

# Column -> method name. Target-encoded columns come first, ordinal ones after,
# matching the insertion order of the original dict-then-update construction.
encoding_methods = {
    **dict.fromkeys(Target_Encoding_list, 'target'),
    **dict.fromkeys(Ordinal_Encoding_list, 'ordinal'),
}

# Apply the per-column encodings chosen in encoding_methods.
# NOTE(review): this runs on the full frame before split_data is called later
# in the notebook; if the 'target' method fits on SalePrice here, held-out rows
# leak into the encoding — confirm against encode_data.
df = encode_data(
    df,
    encoding_methods,
    train=True,
    target=['SalePrice'],
)


In [10]:
# Binary-encode CentralAir: 'N' -> 0, every other value -> 1.
# The dtype guard makes the cell idempotent. Once the column is numeric,
# `== 'N'` is False for every row, so an unguarded re-run would silently
# overwrite the whole column with 1.
if not pd.api.types.is_numeric_dtype(df['CentralAir']):
    # Bracket assignment is the explicit way to write a column.
    df['CentralAir'] = np.where(df['CentralAir'] == 'N', 0, 1)

In [11]:
# Check that all categorical columns were encoded: an empty list means no
# object-dtype column is left in the frame.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
categorical_cols

[]

In [12]:
# with open('transform_data.pickle', 'rb') as f:
#     transform_data = dill.load(f)

# Run the project's transform_data helper on the SalePrice column.
# NOTE(review): later cells read a `transform_target` column and a pt.pickle
# file, so this call presumably creates both — confirm in helper_functions.
# NOTE(review): this runs before the train/test split; confirm that fitting on
# the full frame is intended.
df = transform_data(df, 'SalePrice')


In [13]:
# Sanity check: reload the object stored in pt.pickle and show its lambdas_.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load files this project produced itself.
with Path('pt.pickle').open('rb') as f:
    pt = dill.load(f)

pt.lambdas_

array([-0.0769321])

In [14]:
# Smallest value of the transformed target.
# Series.min() is vectorised and skips NaN, whereas the builtin min() walks the
# Series element by element and gives an order-dependent answer if a NaN is
# present. float() keeps the plain-float display the cell had before.
float(df['transform_target'].min())

-4.150030202740451

In [15]:
# Re-inspect the frame after encoding and transformation: the output below
# lists 78 columns, with the columns shown all float64 and fully non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 78 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                1460 non-null   float64
 1   MSSubClass        1460 non-null   float64
 2   MSZoning          1460 non-null   float64
 3   LotFrontage       1460 non-null   float64
 4   LotArea           1460 non-null   float64
 5   Street            1460 non-null   float64
 6   LotShape          1460 non-null   float64
 7   LandContour       1460 non-null   float64
 8   Utilities         1460 non-null   float64
 9   LotConfig         1460 non-null   float64
 10  LandSlope         1460 non-null   float64
 11  Neighborhood      1460 non-null   float64
 12  Condition1        1460 non-null   float64
 13  Condition2        1460 non-null   float64
 14  BldgType          1460 non-null   float64
 15  HouseStyle        1460 non-null   float64
 16  OverallQual       1460 non-null   float64


In [16]:
# Feature selection for the Streamlit project: train on five columns only.
# SalePrice and Id are dropped, and the transformed target is what gets predicted.
# NOTE(review): no seed is passed here; confirm split_data fixes one internally,
# otherwise the split is not reproducible.
X_train, X_test, y_train, y_test = split_data(
    df,
    target='transform_target',
    col_dropped=['SalePrice', 'Id'],
    feature_selected=['BedroomAbvGr', 'FullBath', 'LotArea', 'GarageCars', 'CentralAir'],
)

In [14]:
# X_train, X_test, y_train, y_test = split_data(df, target ='transform_target', col_dropped = ['SalePrice','Id'], feature_selected= None)

In [17]:
# Persist the four split parts together as a single tuple.
with Path('splitted_fdataset.pickle').open('wb') as f:
    dill.dump((X_train, X_test, y_train, y_test), f)

In [18]:
# Restore the train_model function from its dill pickle.
# NOTE(review): the other helpers come from helper_functions; loading this one
# from a pickle makes the notebook depend on an artifact built elsewhere.
# Confirm whether it can be imported instead. dill.load also executes
# arbitrary code, so only load trusted files.
with Path('train_model.pickle').open('rb') as f:
    train_model = dill.load(f)

# Extra estimator keyword arguments go at the end of the call, e.g.
#   train_model(LinearRegression, xtrain=X_train, ytrain=y_train, fit_intercept=False)
# Bind the result to a new name (not `train_model`) so the function stays usable.
lr_model = train_model(LinearRegression, xtrain=X_train, ytrain=y_train)

# Save the fitted linear model.
with Path('trained_fmodel.pickle').open('wb') as f:
    dill.dump(lr_model, f)

In [19]:
# with open('train_model.pickle', 'rb') as f:
#     train_model = dill.load(f)

# Hyper-parameter grid passed to train_model. It is currently empty; the
# commented entries are candidate values that can be switched back on.
param_grid = {
    # 'max_depth': [3, 5, 7, 9],
    # 'learning_rate': [0.1, 0.05, 0.01],
    # 'n_estimators': [50, 100, 200, 300],
    # 'gamma': [0, 0.1, 0.5],
    # 'subsample': [0.5, 0.8, 1]
}

# Fit an XGBoost regressor on the same training split as the linear model.
xgb_model = train_model(
    xgb.XGBRegressor,
    xtrain=X_train,
    ytrain=y_train,
    param_grid=param_grid,
    best_combination=False,
)

In [20]:
# Save the fitted XGBoost model next to the linear one.
with Path('trained_fmodel1.pickle').open('wb') as f:
    dill.dump(xgb_model, f)

In [21]:
# Store the training column names, in order, as a plain list.
with Path('ffeature_list.pickle').open('wb') as f:
    dill.dump(list(X_train.columns), f)

### Check if the model is overfitting

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
# len(model.predict(X_train))

In [22]:
# y_pred= model.predict(X_train)
# len(y_pred.reshape(-1, 1).flatten())

In [23]:
# len(y_train)

In [24]:
# len(y_train.values)

In [25]:
# train_rmse = mean_squared_error(y_pred.reshape(-1, 1).flatten(), y_train.values, squared=False)

In [26]:
# test_rmse = mean_squared_error(model.predict(X_test), y_test, squared=False)

In [27]:
# train_rmse # if the training RMSE is much lower than the test RMSE, the model is overfitting

In [28]:
# test_rmse