In this notebook, I perform data preprocessing, feature engineering, and model prediction for housing prices using several machine learning models. The purpose of the notebook is to predict the sale prices of houses in the test set from the features given in the dataset. I use RandomForest, XGBRegressor, and Neural Network models that were trained beforehand and are loaded here from disk, demonstrating how different regression techniques can be applied to improve the accuracy of the predictions.

# Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
import dill
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder
import os
from define_function import *
import warnings
warnings.filterwarnings('ignore')

# Load data

In [2]:
# Read the raw test split through the project's load_data helper
test_csv = 'test.csv'
df_test = load_data(test_csv)

In [3]:
# Keep the Id column aside now; it is needed again when the submission files are written
id_test = df_test.loc[:, 'Id']

# Drop Features

In [4]:
# Drop the features that are not needed for prediction
unneeded_features = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
df_test = drop_features(df_test, features_to_drop=unneeded_features)

# Clean Data

In [5]:
# Impute data to fill missing values using the project's clean_data helper.
# NOTE(review): train=False presumably makes the helper reuse imputation
# statistics fitted on the training data instead of refitting — confirm in define_function.
df_test = clean_data(df_test, train=False)

In [6]:
# Sanity check after imputation: count NaNs per column, then total them (expect 0)
missing_per_column = df_test.isna().sum()
missing_per_column.sum()

0

# Encode Data

In [7]:
# Convert the categorical features to numerical values.
# Columns that are target-encoded:
Target_Encoding_list = [
    'MSZoning', 'Street', 'Utilities', 'LotConfig', 'Neighborhood',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'CentralAir', 'Electrical', 'GarageType', 'SaleType',
]
# Columns that are ordinal-encoded:
Ordinal_Encoding_list = [
    'LotShape', 'LandContour', 'LandSlope', 'Condition1', 'Condition2',
    'BldgType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleCondition',
]

# Map every column name to the name of its encoding method (target columns first)
encoding_methods = {
    **dict.fromkeys(Target_Encoding_list, 'target'),
    **dict.fromkeys(Ordinal_Encoding_list, 'ordinal'),
}

# train=False: run the helper's non-training path on the test frame
df_test = encode_data(df_test, encoding_methods, train=False, target=['SalePrice'])


{'target': TargetEncoder(target_type='continuous'), 'ordinal': OrdinalEncoder()}


# Predictions

### RandomForest Model Predictions

In [8]:
# Load the trained random forest model and the list of feature columns saved with it.
# NOTE(security): dill.load executes arbitrary code while unpickling; only load
# artifacts produced by this project's own training run.
with open('trained_model_rf.pickle', 'rb') as f:
    trained_model_rf = dill.load(f)

with open('feature_list.pickle', 'rb') as f:
    train_columns = dill.load(f)


# Keep only the features used in training (in the order stored in feature_list.pickle)
# and cast them to float in a single step. Calling .astype on the selection returns a
# new frame, which avoids assigning column-by-column into a slice of the previous
# df_test (the pattern pandas reports as SettingWithCopyWarning).
df_test = df_test[train_columns].astype(float)


# Predict the target variable and flatten the result to 1-D so it can be used
# directly as the SalePrice column of the submission file.
y_new_pred_rf = predict_model(df_test, trained_model_rf).flatten()

Predictions after inverse transform (if applicable):[123611.20879274 149861.00548954 183461.9437456  ... 158309.59441399
 123880.77147012 218220.25182775]


### XGBRegressor Model Predictions

In [9]:
# Load the trained XGBRegressor model and the list of feature columns saved with it.
# NOTE(security): dill.load executes arbitrary code while unpickling; only load
# artifacts produced by this project's own training run.
with open('trained_model_XG.pickle', 'rb') as f:
    trained_model_XG = dill.load(f)

# The feature list is loaded again here so this cell does not depend on another
# model's prediction cell having been run first.
with open('feature_list.pickle', 'rb') as f:
    train_columns = dill.load(f)

# Keep only the features used in training and cast them to float in a single step.
# Calling .astype on the selection returns a new frame, which avoids assigning
# column-by-column into a slice of the previous df_test (the pattern pandas
# reports as SettingWithCopyWarning). Re-running this on an already prepared
# frame is harmless.
df_test = df_test[train_columns].astype(float)


# Predict the target variable and flatten the result to 1-D so it can be used
# directly as the SalePrice column of the submission file.
y_new_pred_XG = predict_model(df_test, trained_model_XG).flatten()

Predictions after inverse transform (if applicable):[124631.555 154569.89  183071.38  ... 161633.47  122293.664 203357.47 ]


### Neural Network Model Predictions

In [10]:
# Load the pickled neural network model.
# NOTE(review): dill.load runs arbitrary code while unpickling — only open trusted files.
with open('trained_nn_model.pickle', 'rb') as model_file:
    trained_nn_model = dill.load(model_file)


# Run the network on the prepared test features.
# NOTE(review): the recorded run printed "Inverse transform produced NaNs. Returning raw
# predictions." for this model — confirm the values are on the SalePrice scale.
y_new_pred_nn = predict_model(df_test, trained_nn_model)

# Display the predictions (still 2-D here; they are flattened when the CSV is written)
y_new_pred_nn

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Inverse transform produced NaNs. Returning raw predictions.
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step
Predictions after inverse transform (if applicable):[[142586.19]
 [162550.73]
 [217901.25]
 ...
 [154674.  ]
 [180399.66]
 [159724.95]]


array([[142586.19],
       [162550.73],
       [217901.25],
       ...,
       [154674.  ],
       [180399.66],
       [159724.95]], dtype=float32)

In [11]:
# Build the random forest submission: one row per test Id with its predicted SalePrice
rf_submission_columns = {'Id': id_test, 'SalePrice': y_new_pred_rf}
model_rf = pd.DataFrame(data=rf_submission_columns)
# index=False keeps the DataFrame index out of the CSV
model_rf.to_csv(path_or_buf='prediction_rf.csv', index=False)

In [12]:
# Build the XGBRegressor submission: one row per test Id with its predicted SalePrice
xg_submission_columns = {'Id': id_test, 'SalePrice': y_new_pred_XG}
model_xg = pd.DataFrame(data=xg_submission_columns)
# index=False keeps the DataFrame index out of the CSV
model_xg.to_csv(path_or_buf='prediction_xg.csv', index=False)

In [13]:
# Build the neural network submission: one row per test Id with its predicted SalePrice.
# The network output is a column vector, so it is flattened to 1-D first.
nn_submission_columns = {'Id': id_test, 'SalePrice': y_new_pred_nn.flatten()}
model_nn = pd.DataFrame(data=nn_submission_columns)
# index=False keeps the DataFrame index out of the CSV
model_nn.to_csv(path_or_buf='prediction_nn.csv', index=False)