In this notebook, I performed data preprocessing, feature engineering, and model training on a housing dataset to predict house prices. I started by loading and cleaning the data, followed by encoding categorical features and transforming the target variable. I then trained multiple models, including Linear Regression, XGBoost, Random Forest, and a Neural Network, to compare their performance in predicting house prices.

# Libraries

In [1]:
# !python -m pip install scikit-learn==1.3.1
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, OneHotEncoder, power_transform, PowerTransformer
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import yeojohnson
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
import dill
import os
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input
from define_function import *
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Read the training set from 'train.csv' via the project helper `load_data`
# (star-imported from define_function); the result is displayed as a DataFrame in the next cell.
df = load_data('train.csv')

In [3]:
# Display the freshly loaded data (a cell's last expression is rendered as its output).
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


# Drop Features

In [4]:
# Drop the features that are not needed for modelling.
df = drop_features(
    df,
    features_to_drop=['Alley', 'PoolQC', 'Fence', 'MiscFeature'],
)

# Clean Data

In [5]:
# Fill in missing values using the project helper `clean_data`.
# The next cell confirms that no nulls remain afterwards.
# NOTE(review): how `target` is treated by clean_data is not visible here — confirm in define_function.
df = clean_data(df, target='SalePrice')

In [None]:
# Total number of missing entries across every column (expected to be 0 after cleaning).
df.isna().sum().sum()

0

# Encode Data

In [7]:
# Convert categorical columns to numeric ones.
# Columns in the first list are target-encoded; columns in the second are ordinal-encoded.
Target_Encoding_list = [
    'MSZoning', 'Street', 'Utilities', 'LotConfig', 'Neighborhood',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'CentralAir', 'Electrical', 'GarageType', 'SaleType',
]
Ordinal_Encoding_list = [
    'LotShape', 'LandContour', 'LandSlope', 'Condition1', 'Condition2',
    'BldgType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleCondition',
]

# Map every column name to its encoding method: target columns first, then ordinal ones.
encoding_methods = {
    **dict.fromkeys(Target_Encoding_list, 'target'),
    **dict.fromkeys(Ordinal_Encoding_list, 'ordinal'),
}

# NOTE(review): encoding happens before the train/test split in a later cell, so the
# target-encoded features may carry information from rows that end up in the test set.
# Confirm how encode_data fits its encoders.
df = encode_data(df, encoding_methods, train=True, target=['SalePrice'])


In [8]:
# Verify that all categorical columns were encoded: no object-dtype columns should remain.
categorical_cols = list(df.select_dtypes(include='object').columns)
categorical_cols

[]

In [9]:
# Display the DataFrame after encoding; the categorical columns now hold numeric values.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,60.0,190995.949254,65.0,8450.0,181130.394622,3.0,3.0,180950.936426,176941.547778,...,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,173406.590852,4.0,208500
1,2.0,20.0,190995.949254,80.0,9600.0,181130.394622,3.0,3.0,180950.936426,177972.952552,...,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,173406.590852,4.0,181500
2,3.0,60.0,190995.949254,68.0,11250.0,181130.394622,0.0,3.0,180950.936426,176941.547778,...,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,173406.590852,4.0,223500
3,4.0,70.0,190995.949254,60.0,9550.0,181130.394622,0.0,3.0,180950.936426,181620.429673,...,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,173406.590852,0.0,140000
4,5.0,60.0,190995.949254,84.0,14260.0,181130.394622,0.0,3.0,180950.936426,177972.952552,...,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,173406.590852,4.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,60.0,190995.949254,62.0,7917.0,181130.394622,3.0,3.0,180950.936426,176941.547778,...,0.0,0.0,0.0,0.0,0.0,8.0,2007.0,173406.590852,4.0,175000
1456,1457.0,20.0,190995.949254,85.0,13175.0,181130.394622,3.0,3.0,180950.936426,176941.547778,...,0.0,0.0,0.0,0.0,0.0,2.0,2010.0,173406.590852,4.0,210000
1457,1458.0,70.0,190995.949254,66.0,9042.0,181130.394622,3.0,3.0,180950.936426,176941.547778,...,0.0,0.0,0.0,0.0,2500.0,5.0,2010.0,173406.590852,4.0,266500
1458,1459.0,20.0,190995.949254,68.0,9717.0,181130.394622,3.0,3.0,180950.936426,176941.547778,...,112.0,0.0,0.0,0.0,0.0,4.0,2010.0,173406.590852,4.0,142125


# Transform Target

In [10]:
# Transform the target ('SalePrice') to normalize its distribution.
# NOTE(review): presumably this adds the 'transform_target' column that the split cell below
# uses as its target — confirm in define_function.transform_data.
df = transform_data(df, 'SalePrice')

# Train/Test Split

In [11]:
# Split into training and testing sets.
# The transformed target is predicted; the raw price and the row Id are dropped.
X_train, X_test, y_train, y_test = split_data(
    df,
    target='transform_target',
    col_dropped=['SalePrice', 'Id'],
    feature_selected=None,
)

# Train Model

### Linear Regression

In [12]:
# Fit a Linear Regression model on the training split.
model_lr = train_model(LinearRegression, xtrain=X_train, ytrain=y_train)

# Persist the fitted model to disk with dill.
with Path('trained_model_LR.pickle').open('wb') as lr_file:
    dill.dump(model_lr, lr_file)

### XGBRegressor 

In [13]:
# Hyper-parameter grid for the XGBoost regressor.
param_grid_xg = dict(
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.1, 0.05, 0.01],
    n_estimators=[50, 100, 200, 300],
    gamma=[0, 0.1, 0.5],
    subsample=[0.5, 0.8, 1],
)

# Train XGBoost, passing the grid to the project helper `train_model`.
model_XG = train_model(
    XGBRegressor,
    xtrain=X_train,
    ytrain=y_train,
    param_grid=param_grid_xg,
    best_combination=True,
)

In [14]:
# Persist the trained XGBoost model to disk with dill.
with Path('trained_model_XG.pickle').open('wb') as xg_file:
    dill.dump(model_XG, xg_file)

### Random Forest

In [15]:
# Train a Random Forest regressor using the hyper-parameter grid below.
#
# NOTE: 'auto' is no longer a valid value for `max_features`; it was removed in
# scikit-learn 1.3, the version this notebook targets. Any candidate using it fails to fit,
# and that failure is hidden by the global warnings filter. For regressors 'auto' meant
# "use all features", which is spelled 1.0 here.
param_grid_rf = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


model_rf = train_model(RandomForestRegressor, xtrain=X_train, ytrain=y_train, param_grid=param_grid_rf, best_combination=True)

In [16]:
# Persist the trained Random Forest model to disk with dill.
with Path('trained_model_rf.pickle').open('wb') as rf_file:
    dill.dump(model_rf, rf_file)

In [17]:
# Persist the names of the training features, in column order, as a plain list.
feature_names = X_train.columns.tolist()
with Path('feature_list.pickle').open('wb') as features_file:
    dill.dump(feature_names, features_file)

# Build Neural Network Model

In [18]:
# Separate split for the neural network: it predicts the raw 'SalePrice',
# so the transformed target and the row Id are dropped instead.
X_train_nn, X_test_nn, y_train_nn, y_test_nn = split_data(
    df,
    target='SalePrice',
    col_dropped=['transform_target', 'Id'],
    feature_selected=None,
)

In [19]:
# Fit the neural network on the untransformed target via the project helper.
# The helper returns both the model and its training history.
model_nn, history = neural_network_model(
    X=X_train_nn,
    y=y_train_nn,
    loss='mse',
    metrics='mse',
    activations='relu',
    widths=[64],
    num_layers=2,
    epochs=100,
    learning_rate=0.0001,
    validation_split=0.3333,
)

# Persist the trained network to disk with dill.
with Path('trained_nn_model.pickle').open('wb') as nn_file:
    dill.dump(model_nn, nn_file)

Epoch 1/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 18690977792.0000 - mse: 18690977792.0000 - val_loss: 10750921728.0000 - val_mse: 10750921728.0000
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 10631590912.0000 - mse: 10631590912.0000 - val_loss: 5460333056.0000 - val_mse: 5460333056.0000
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6016963584.0000 - mse: 6016963584.0000 - val_loss: 4361133568.0000 - val_mse: 4361133568.0000
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4077316096.0000 - mse: 4077316096.0000 - val_loss: 4187310080.0000 - val_mse: 4187310080.0000
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 5252156416.0000 - mse: 5252156416.0000 - val_loss: 3973964800.0000 - val_mse: 3973964800.0000
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [20]:
# Show a summary of the trained network.
# NOTE(review): presumably a Keras model, given the keras imports at the top — confirm.
model_nn.summary()