# Model Training

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Display all the columns of the dataframe

pd.pandas.set_option('display.max_columns',None)

In [2]:
# load the data

# Set the path to the raw data folder
processed_data_path = 'C:\\Users\\prath\\Advanced-House-Price-Prediction\\data\\proccessed\\'


# Load the train.csv file into a pandas DataFrame
x_train = pd.read_csv(processed_data_path + 'x_train.csv')
y_train = pd.read_csv(processed_data_path + 'y_train.csv')
x_test = pd.read_csv(processed_data_path + 'x_test.csv')

In [3]:
# Split the training set into training and validation sets
x_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Print the shapes of the training, validation, and testing sets
print(f'Training set shape: {x_train.shape}')
print(f'Validation set shape: {X_val.shape}')


Training set shape: (1168, 21)
Validation set shape: (292, 21)


In [4]:
# Initialize the models
lr = LinearRegression()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
svm = SVR()
knn = KNeighborsRegressor()
nb = GaussianNB()
mlp = MLPRegressor()
lasso = Lasso(random_state=42)
ridge = Ridge(random_state=42)

In [6]:
import pickle

# Train and evaluate the models
models = [lr,lasso,ridge, dt, rf, gb, svm, knn]
model_names = ['linear_regression', 'lasso', 'ridge', 'decision_tree', 'random_forest', 'gradient_boosting', 'svm', 'knn']

for model, name in zip(models, model_names):
    model.fit(x_train, y_train)
    
    # predict train data 
    y_train_pred = model.predict(x_train)

    # predict validation data
    y_val_pred = model.predict(X_val)

    # evaluation metrics for train data
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train,y_train_pred)
    train_r2 = r2_score(y_train,y_train_pred)
    train_rmse = np.sqrt(train_mse)

    # evaluation metrics for validation data
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    val_rmse = np.sqrt(val_mse)

    # print all
    print(f'{model.__class__.__name__} Train MSE: {train_mse:.4f} | Validation MSE: {val_mse:.4f} | Train MAE: {train_mae:.4f} | Validation MAE: {val_mae:.4f} | Train R2 Score: {train_r2:.4f} | Validation R2 Score: {val_r2:.4f} | Train RMSE: {train_rmse:.4f} | Validation RMSE: {val_rmse:.4f}')

    # save the trained model as a pickle file
    with open(f'{name}.pickle', 'wb') as f:
        pickle.dump(model, f)


LinearRegression Train MSE: 0.0180 | Validation MSE: 0.0192 | Train MAE: 0.0941 | Validation MAE: 0.0985 | Train R2 Score: 0.8822 | Validation R2 Score: 0.8971 | Train RMSE: 0.1340 | Validation RMSE: 0.1386
Lasso Train MSE: 0.1524 | Validation MSE: 0.1877 | Train MAE: 0.3034 | Validation MAE: 0.3371 | Train R2 Score: 0.0000 | Validation R2 Score: -0.0058 | Train RMSE: 0.3904 | Validation RMSE: 0.4332
Ridge Train MSE: 0.0181 | Validation MSE: 0.0194 | Train MAE: 0.0944 | Validation MAE: 0.0986 | Train R2 Score: 0.8816 | Validation R2 Score: 0.8960 | Train RMSE: 0.1344 | Validation RMSE: 0.1393
DecisionTreeRegressor Train MSE: 0.0000 | Validation MSE: 0.0406 | Train MAE: 0.0001 | Validation MAE: 0.1424 | Train R2 Score: 1.0000 | Validation R2 Score: 0.7823 | Train RMSE: 0.0024 | Validation RMSE: 0.2016


  model.fit(x_train, y_train)


RandomForestRegressor Train MSE: 0.0029 | Validation MSE: 0.0197 | Train MAE: 0.0370 | Validation MAE: 0.0946 | Train R2 Score: 0.9812 | Validation R2 Score: 0.8943 | Train RMSE: 0.0535 | Validation RMSE: 0.1405


  y = column_or_1d(y, warn=True)


GradientBoostingRegressor Train MSE: 0.0087 | Validation MSE: 0.0186 | Train MAE: 0.0694 | Validation MAE: 0.0953 | Train R2 Score: 0.9429 | Validation R2 Score: 0.9003 | Train RMSE: 0.0933 | Validation RMSE: 0.1364


  y = column_or_1d(y, warn=True)


SVR Train MSE: 0.0107 | Validation MSE: 0.0211 | Train MAE: 0.0776 | Validation MAE: 0.1001 | Train R2 Score: 0.9296 | Validation R2 Score: 0.8871 | Train RMSE: 0.1036 | Validation RMSE: 0.1451
KNeighborsRegressor Train MSE: 0.0213 | Validation MSE: 0.0353 | Train MAE: 0.1057 | Validation MAE: 0.1323 | Train R2 Score: 0.8600 | Validation R2 Score: 0.8106 | Train RMSE: 0.1461 | Validation RMSE: 0.1880


**Models and their corresponding evaluation metrics**

- Linear Regression: Train R2 Score: 0.8822 | Validation R2 Score: 0.8971
- Lasso: Train R2 Score: 0.0000 | Validation R2 Score: -0.0058
- Ridge: Train R2 Score: 0.8816 | Validation R2 Score: 0.8960
- Decision Tree Regressor: Train R2 Score: 1.0000 | Validation R2 Score: 0.7805
- Random Forest Regressor: Train R2 Score: 0.9801 | Validation R2 Score: 0.8932
- Gradient Boosting Regressor: Train R2 Score: 0.9429 | Validation R2 Score: 0.9004
- Support Vector Regressor: Train R2 Score: 0.9296 | Validation R2 Score: 0.8871
- K-Neighbors Regressor: Train R2 Score: 0.8600 | Validation R2 Score: 0.8106