# Modeling
This notebook handles:
1. Model Training
2. Model Evaluation
3. Model Comparison

In [1]:
import pandas as pd

# Feature matrices: read with pandas' default header handling.
X_train, X_test = (
    pd.read_csv(features_csv)
    for features_csv in ("../data/split_data/X_train.csv", "../data/split_data/X_test.csv")
)

# Targets: read without a header row, then flattened into 1-D NumPy arrays.
y_train, y_test = (
    pd.read_csv(target_csv, header=None).to_numpy().ravel()
    for target_csv in ("../data/split_data/y_train.csv", "../data/split_data/y_test.csv")
)

# Show the shape of every split as the cell output.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((21797, 36), (5450, 36), (21797,), (5450,))

## Model Training and Evaluation

In [2]:
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score, \
    mean_absolute_percentage_error
import joblib

# Shared results table mutated by train_model: one row per model class name, one column per metric.
evaluation = pd.DataFrame(columns=["mae", "mse", "rmse", "r2", "mape"])


def train_model(model: BaseEstimator, evaluate: bool = False, save: bool = False) -> pd.DataFrame:
    """Fit ``model`` on the training split, optionally scoring and persisting it.

    The function works on the notebook-level ``X_train``/``y_train``/``X_test``/``y_test``
    splits and records scores in the notebook-level ``evaluation`` table.

    Args:
        model: Unfitted estimator; it is fitted in place on ``X_train``/``y_train``.
        evaluate: When true, predict on ``X_test`` and write MAE, MSE, RMSE, R² and MAPE
            into the ``evaluation`` row labelled with the model's class name.
        save: When true, dump the fitted model with joblib to ``../utils/<ClassName>``.

    Returns:
        The shared ``evaluation`` DataFrame (the same object on every call, not a copy).

    Note:
        The class name is used both as the row label and as the file name, so training two
        differently configured instances of the same class overwrites the earlier row/file.
    """
    model.fit(X_train, y_train)
    model_name = model.__class__.__name__

    if evaluate:
        # Predict only when scores are requested: predictions are used for nothing else.
        y_pred = model.predict(X_test)
        scores = {
            "mae": mean_absolute_error(y_test, y_pred),
            "mse": mean_squared_error(y_test, y_pred),
            "rmse": root_mean_squared_error(y_test, y_pred),
            "r2": r2_score(y_test, y_pred),
            "mape": mean_absolute_percentage_error(y_test, y_pred),
        }
        # .at adds the row on first write and overwrites it on re-runs.
        for metric, value in scores.items():
            evaluation.at[model_name, metric] = value

    if save:
        joblib.dump(model, f"../utils/{model_name}")

    return evaluation

### Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares model; n_jobs=-1 as in the other parallel-capable estimators here.
linear_regression = LinearRegression(n_jobs=-1)
train_model(linear_regression, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435


### Ridge Regression

In [4]:
from sklearn.linear_model import Ridge

# Ridge regression with a fixed seed.
ridge = Ridge(random_state=42)
train_model(ridge, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407


### ElasticNet Regression

In [5]:
from sklearn.linear_model import ElasticNet

# Elastic net with regularisation strength alpha=0.1.
elastic_net = ElasticNet(alpha=0.1)
train_model(elastic_net, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236


### Support Vector Regression (SVR)

In [6]:
from sklearn.svm import SVR

# NOTE(review): the recorded output for this cell shows r2 of about -0.003, i.e. no better than
# predicting a constant. Only the kernel is set here, so C and epsilon are left at their defaults;
# presumably those are far too small for a target on this scale (MAE is in the hundreds of
# thousands) — TODO confirm, and consider scaling the target before fitting.
train_model(SVR(kernel="linear"), evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304


### Decision Tree Regression

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so the fit (and the saved model) is reproducible across runs, matching the
# random_state=42 used for the other seeded estimators in this notebook.
train_model(DecisionTreeRegressor(random_state=42), evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304
DecisionTreeRegressor,230835.14318,162282051591.0258,402842.46498,0.620224,0.21385


### Random Forest Regression

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Seeded random forest, built in parallel.
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)
train_model(random_forest, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304
DecisionTreeRegressor,230835.14318,162282051591.0258,402842.46498,0.620224,0.21385
RandomForestRegressor,170012.897366,95298301578.59706,308704.229933,0.776981,0.159316


### Gradient Boosting Regression

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

# Seeded gradient boosting with learning_rate=0.2.
gradient_boosting = GradientBoostingRegressor(learning_rate=0.2, random_state=42)
train_model(gradient_boosting, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304
DecisionTreeRegressor,230835.14318,162282051591.0258,402842.46498,0.620224,0.21385
RandomForestRegressor,170012.897366,95298301578.59706,308704.229933,0.776981,0.159316
GradientBoostingRegressor,182006.501028,102202198546.06413,319690.785832,0.760824,0.173901


### K-Nearest Neighbors Regression

In [10]:
from sklearn.neighbors import KNeighborsRegressor

# 9 neighbours, Manhattan metric, distance-based weighting.
knn = KNeighborsRegressor(
    n_neighbors=9,
    weights="distance",
    metric="manhattan",
    n_jobs=-1,
)
train_model(knn, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304
DecisionTreeRegressor,230835.14318,162282051591.0258,402842.46498,0.620224,0.21385
RandomForestRegressor,170012.897366,95298301578.59706,308704.229933,0.776981,0.159316
GradientBoostingRegressor,182006.501028,102202198546.06413,319690.785832,0.760824,0.173901
KNeighborsRegressor,376939.158967,335732738294.4423,579424.488863,0.214311,0.417282


### Bayesian Ridge Regression

In [11]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with library-default hyperparameters.
bayesian_ridge = BayesianRidge()
train_model(bayesian_ridge, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304
DecisionTreeRegressor,230835.14318,162282051591.0258,402842.46498,0.620224,0.21385
RandomForestRegressor,170012.897366,95298301578.59706,308704.229933,0.776981,0.159316
GradientBoostingRegressor,182006.501028,102202198546.06413,319690.785832,0.760824,0.173901
KNeighborsRegressor,376939.158967,335732738294.4423,579424.488863,0.214311,0.417282
BayesianRidge,272983.422333,192231160532.1688,438441.741321,0.550136,0.289446


### ARD (Automatic Relevance Determination) Regression

In [12]:
from sklearn.linear_model import ARDRegression

# ARD regression with library-default hyperparameters.
ard_regression = ARDRegression()
train_model(ard_regression, evaluate=True, save=True)

Unnamed: 0,mae,mse,rmse,r2,mape
LinearRegression,272905.762985,192136418851.99692,438333.684368,0.550358,0.289435
Ridge,272902.623572,192140028387.84012,438337.801687,0.550349,0.289407
ElasticNet,271504.451304,196379811432.29736,443147.618105,0.540427,0.282236
SVR,406144.434228,428789873771.283,654820.489731,-0.003464,0.388304
DecisionTreeRegressor,230835.14318,162282051591.0258,402842.46498,0.620224,0.21385
RandomForestRegressor,170012.897366,95298301578.59706,308704.229933,0.776981,0.159316
GradientBoostingRegressor,182006.501028,102202198546.06413,319690.785832,0.760824,0.173901
KNeighborsRegressor,376939.158967,335732738294.4423,579424.488863,0.214311,0.417282
BayesianRidge,272983.422333,192231160532.1688,438441.741321,0.550136,0.289446
ARDRegression,273310.778348,192841095929.42905,439136.762216,0.548709,0.289549
