In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import yeojohnson
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import dill
import os

In [2]:
# Restore the serialized `load_data` helper.
# NOTE: dill.load can execute arbitrary code stored in the file, so only
# load artifacts that come from a trusted source.
with Path('read_file.pickle').open('rb') as helper_file:
    load_data = dill.load(helper_file)

# Read the training file through the restored helper.
df = load_data('train.csv')

In [3]:
# Preview the frame returned by load_data('train.csv'); the notebook renders a truncated view.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
# Restore the serialized `drop_features` helper (trusted artifacts only:
# dill.load runs code contained in the file).
with Path('drop_features.pickle').open('rb') as helper_file:
    drop_features = dill.load(helper_file)

# Columns handed to the helper for removal.
unwanted_columns = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
df = drop_features(df, features_to_drop=unwanted_columns)


In [5]:
# Restore the serialized `clean_data` helper (trusted artifacts only:
# dill.load runs code contained in the file).
with Path('clean_data.pickle').open('rb') as helper_file:
    clean_data = dill.load(helper_file)

# Clean the frame, telling the helper which column is the target.
# NOTE(review): the helper's exact cleaning steps are not visible here;
# the null count checked in the next cell is 0 after this call.
df = clean_data(df, target='SalePrice')

In [6]:
# Total number of missing cells across the whole frame (expected to be 0 after cleaning).
df.isna().sum().sum()

np.int64(0)

In [7]:
# Restore the serialized `encode_data` helper (trusted artifacts only:
# dill.load runs code contained in the file).
with Path('encode_data.pickle').open('rb') as helper_file:
    encode_data = dill.load(helper_file)

# Categorical columns, grouped by the encoding strategy assigned to each.
Target_Encoding_list = [
    'MSZoning', 'Street', 'Utilities', 'LotConfig', 'Neighborhood',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'CentralAir', 'Electrical', 'GarageType', 'SaleType',
]
Ordinal_Encoding_list = [
    'LotShape', 'LandContour', 'LandSlope', 'Condition1', 'Condition2',
    'BldgType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleCondition',
]

# Column -> strategy name. Target entries come first; ordinal entries are
# merged in afterwards (and would win if a column appeared in both lists).
encoding_methods = {
    **dict.fromkeys(Target_Encoding_list, 'target'),
    **dict.fromkeys(Ordinal_Encoding_list, 'ordinal'),
}

# NOTE(review): this runs on the full frame, with the target, before the
# train/test split further down. If encode_data fits a target encoder here,
# test rows influence the encoding -- confirm and consider encoding after the split.
df = encode_data(df, encoding_methods, train=True, target='SalePrice')


In [8]:
# Sanity check: names of any columns still stored as object dtype after encoding.
object_frame = df.select_dtypes(include=['object'])
categorical_cols = list(object_frame.columns)
categorical_cols


[]

In [9]:
# Inspect the frame after encoding; the previous cell found no object-dtype columns left.
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60.0,0.0,65.0,8450.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,0.000789,4.0,208500
1,20.0,0.0,80.0,9600.0,0.000688,3.0,3.0,0.000685,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,0.000789,4.0,181500
2,60.0,0.0,68.0,11250.0,0.000688,0.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,0.000789,4.0,223500
3,70.0,0.0,60.0,9550.0,0.000688,0.0,3.0,0.000685,0.00000,0.0,...,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,0.000789,0.0,140000
4,60.0,0.0,84.0,14260.0,0.000688,0.0,3.0,0.000685,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,0.000789,4.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60.0,0.0,62.0,7917.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2007.0,0.000789,4.0,175000
1456,20.0,0.0,85.0,13175.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2010.0,0.000789,4.0,210000
1457,70.0,0.0,66.0,9042.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,2500.0,5.0,2010.0,0.000789,4.0,266500
1458,20.0,0.0,68.0,9717.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,112.0,0.0,0.0,0.0,0.0,4.0,2010.0,0.000789,4.0,142125


In [10]:
# Restore the serialized `transform_data` helper (trusted artifacts only:
# dill.load runs code contained in the file).
with Path('transform_data.pickle').open('rb') as helper_file:
    transform_data = dill.load(helper_file)

# Apply the helper with 'SalePrice' as its second argument.
# NOTE(review): the transformation itself is not visible here; the frame
# displayed in the next cell has an extra 'transform_target' column.
df = transform_data(df, 'SalePrice')


In [11]:
# Inspect the frame after transform_data; it now shows a 'transform_target' column next to 'SalePrice'.
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,transform_target
0,60.0,0.0,65.0,8450.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,2.0,2008.0,0.000789,4.0,208500,0.571155
1,20.0,0.0,80.0,9600.0,0.000688,3.0,3.0,0.000685,0.00000,0.0,...,0.0,0.0,0.0,0.0,5.0,2007.0,0.000789,4.0,181500,0.227627
2,60.0,0.0,68.0,11250.0,0.000688,0.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,9.0,2008.0,0.000789,4.0,223500,0.741869
3,70.0,0.0,60.0,9550.0,0.000688,0.0,3.0,0.000685,0.00000,0.0,...,0.0,0.0,0.0,0.0,2.0,2006.0,0.000789,0.0,140000,-0.425386
4,60.0,0.0,84.0,14260.0,0.000688,0.0,3.0,0.000685,0.00000,0.0,...,0.0,0.0,0.0,0.0,12.0,2008.0,0.000789,4.0,250000,1.015293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60.0,0.0,62.0,7917.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,8.0,2007.0,0.000789,4.0,175000,0.136679
1456,20.0,0.0,85.0,13175.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,2.0,2010.0,0.000789,4.0,210000,0.588812
1457,70.0,0.0,66.0,9042.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,2500.0,5.0,2010.0,0.000789,4.0,266500,1.170202
1458,20.0,0.0,68.0,9717.0,0.000688,3.0,3.0,0.000685,0.00095,0.0,...,0.0,0.0,0.0,0.0,4.0,2010.0,0.000789,4.0,142125,-0.387136


In [12]:
# Restore the serialized `split_data` helper (trusted artifacts only:
# dill.load runs code contained in the file).
with Path('split_data.pickle').open('rb') as helper_file:
    split_data = dill.load(helper_file)

# Split into train/test features and targets, modelling the transformed target.
# NOTE(review): presumably `target_dropped` removes the raw 'SalePrice' from the
# features and `feature_selected=None` keeps every remaining column -- verify
# against the helper, along with whether it fixes a random seed.
X_train, X_test, y_train, y_test = split_data(
    df,
    target='transform_target',
    target_dropped=['SalePrice'],
    feature_selected=None,
)

In [13]:
# Restore the serialized `train_model` helper (trusted artifacts only:
# dill.load runs code contained in the file).
with Path('train_model.pickle').open('rb') as helper_file:
    train_model = dill.load(helper_file)

# Extra estimator options go at the end as keyword arguments
# (e.g. fit_intercept=False). Always assign the result to `model`, not to
# `train_model`, so the helper itself is not overwritten.
model = train_model(LinearRegression, xtrain=X_train, ytrain=y_train)

# Persist the returned model for later reuse.
with Path('trained_model.pickle').open('wb') as model_file:
    dill.dump(model, model_file)

In [18]:
# Keep the training feature columns (names and order) and persist them
# alongside the model artifact.
train_columns = X_train.columns

with Path('train_columns.pickle').open('wb') as columns_file:
    dill.dump(train_columns, columns_file)

In [19]:
# Show the feature column index taken from X_train.
train_columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF'