In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import yeojohnson
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import dill
import os

In [2]:
# Restore the serialized `load_data` callable and use it to read the training CSV.
# NOTE: dill.load executes code embedded in the pickle -- only load files produced
# by this project.
with Path('read_file.pickle').open('rb') as f:
    load_data = dill.load(f)

df = load_data('train.csv')

In [3]:
# Restore the serialized `clean_data` callable and rebind `df` to its result.
# NOTE: dill.load executes code embedded in the pickle -- only load files produced
# by this project.
with Path('clean_data.pickle').open('rb') as f:
    clean_data = dill.load(f)

df = clean_data(df)

In [4]:
# Total number of missing cells across the whole frame (column sums, then grand total).
df.isna().sum().sum()

np.int64(0)

In [5]:
# Restore the serialized `encode_data` callable.
# NOTE: dill.load executes code embedded in the pickle -- only load files produced
# by this project.
with Path('encode_data.pickle').open('rb') as f:
    encode_data = dill.load(f)

# Names of every object-dtype column; these are the ones handed to the encoder.
categorical_cols = (
    df.select_dtypes(include=['object'])
    .columns
    .tolist()
)

# Encode those columns, passing 'SalePrice' and train=True through to the helper.
df = encode_data(
    df,
    'SalePrice',
    categorical_cols,
    train=True,
)


In [6]:
# Display the frame returned by encode_data (the rendering is truncated by pandas).
# NOTE(review): in the rows rendered below, every encoded column shows a single
# repeated value (0.0, 0.000869 or 0.001738) -- verify that encode_data really
# distinguishes categories before trusting the model.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,0.0,65.0,8450,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,2,2008,0.001738,0.0,208500
1,2,20,0.0,80.0,9600,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,5,2007,0.001738,0.0,181500
2,3,60,0.0,68.0,11250,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,9,2008,0.001738,0.0,223500
3,4,70,0.0,60.0,9550,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,2,2006,0.001738,0.0,140000
4,5,60,0.0,84.0,14260,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,12,2008,0.001738,0.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,0.0,62.0,7917,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,8,2007,0.001738,0.0,175000
1456,1457,20,0.0,85.0,13175,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,2,2010,0.001738,0.0,210000
1457,1458,70,0.0,66.0,9042,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,2500,5,2010,0.001738,0.0,266500
1458,1459,20,0.0,68.0,9717,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,4,2010,0.001738,0.0,142125


In [7]:
# Restore the serialized `transform_data` callable.
# NOTE: dill.load executes code embedded in the pickle -- only load files produced
# by this project.
with Path('transform_data.pickle').open('rb') as f:
    transform_data = dill.load(f)

# Rebind `df` to the returned frame, the same way the clean and encode stages do.
# Previously the return value was only displayed and then discarded, so unless the
# helper mutates its argument in place the transformation never reached the
# split/train steps.
df = transform_data(df, 'SalePrice')

# Keep showing the transformed frame as the cell output.
df


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,0.0,65.0,8450,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,2,2008,0.001738,0.0,208500
1,2,20,0.0,80.0,9600,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,5,2007,0.001738,0.0,181500
2,3,60,0.0,68.0,11250,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,9,2008,0.001738,0.0,223500
3,4,70,0.0,60.0,9550,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,2,2006,0.001738,0.0,140000
4,5,60,0.0,84.0,14260,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,12,2008,0.001738,0.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,0.0,62.0,7917,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,8,2007,0.001738,0.0,175000
1456,1457,20,0.0,85.0,13175,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,2,2010,0.001738,0.0,210000
1457,1458,70,0.0,66.0,9042,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,2500,5,2010,0.001738,0.0,266500
1458,1459,20,0.0,68.0,9717,0.0,0.0,0.000869,0.0,0.000869,...,0,0.000869,0.0,0.000869,0,4,2010,0.001738,0.0,142125


In [8]:
# Restore the serialized `split_data` callable.
# NOTE: dill.load executes code embedded in the pickle -- only load files produced
# by this project.
with Path('split_data.pickle').open('rb') as f:
    split_data = dill.load(f)

# The helper returns four objects, unpacked here as train/test features and targets.
# NOTE(review): features=['Id'] presumably lists columns to exclude -- confirm
# against the split_data implementation.
X_train, X_test, y_train, y_test = split_data(
    df,
    'SalePrice',
    feature_selected=None,
    features=['Id'],
)

In [10]:
# Restore the serialized `train_model` callable.
# NOTE: dill.load executes code embedded in the pickle -- only load files produced
# by this project.
with Path('train_model.pickle').open('rb') as f:
    train_model = dill.load(f)

# Call the helper with the LinearRegression class and the training split.
# Extra estimator keyword arguments go after `ytrain`, e.g.
#   train_model(LinearRegression, xtrain=X_train, ytrain=y_train, fit_intercept=False)
# NOTE(review): the result is bound to the same name as the loaded callable, so
# after this line `train_model` no longer refers to the helper.
train_model = train_model(
    LinearRegression,
    xtrain=X_train,
    ytrain=y_train,
)

# Persist the object returned above for later use.
with Path('trained_model.pickle').open('wb') as f:
    dill.dump(train_model, f)