# Model Training

### Import all libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Import data

In [2]:
df = pd.read_csv('data/stud.csv')

### Prepare the data

In [3]:
# Seperate X and Y
X = df.drop(columns=['test_score'])
y = df['test_score']

### Create a Pipeline

In [4]:
# Select numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ]
)

# Create a pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X = pipe.fit_transform(X)

### Train test split

In [5]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Evaluation function that return all metrics of a model

In [6]:
def evaluate(y_pred, y_test):
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

In [11]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'SVM': SVR(),
    'CatBoost': CatBoostRegressor(verbose=False),
    'XGBoost': XGBRegressor()
}
model_list = []
r2_list = []
mse_list = []
rmse_list = []
mae_list = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores = evaluate(y_pred, y_test)
    model_list.append(name)
    r2_list.append(scores['r2'])
    mse_list.append(scores['mse'])
    rmse_list.append(scores['rmse'])
    mae_list.append(scores['mae'])

results = pd.DataFrame({
    'model': model_list,
    'r2': r2_list,
    'mse': mse_list,
    'rmse': rmse_list,
    'mae': mae_list
})

results = results.sort_values(by='r2', ascending=False)
results

Unnamed: 0,model,r2,mse,rmse,mae
1,Ridge,0.880451,29.090767,5.393586,4.212515
0,Linear Regression,0.880433,29.09517,5.393994,4.214763
6,AdaBoost,0.856915,34.818053,5.900682,4.57581
5,Random Forest,0.853014,35.767298,5.980577,4.680917
8,CatBoost,0.849801,36.549149,6.045589,4.633506
2,Lasso,0.82532,42.506332,6.519688,5.157879
9,XGBoost,0.820924,43.576168,6.601225,5.13119
3,KNN,0.776335,54.4262,7.377411,5.711
4,Decision Tree,0.730889,65.485,8.09228,6.405
7,SVM,0.713014,69.834692,8.356715,5.504566


### Evaluating each model through cross validation

In [12]:
models_name = []
cross_val_score_list = []

for name, model in models.items():
    scores = cross_val_score(model, X_test, y_test, cv=10, scoring='r2')
    models_name.append(name)
    cross_val_score_list.append(scores.mean())

cross_val_results = pd.DataFrame({
    'model': models_name,
    'cross_val_score': cross_val_score_list
})
cross_val_score_results = cross_val_results.sort_values(by='cross_val_score', ascending=False)
cross_val_score_results

Unnamed: 0,model,cross_val_score
1,Ridge,0.857042
0,Linear Regression,0.856716
2,Lasso,0.808731
8,CatBoost,0.805396
5,Random Forest,0.797177
6,AdaBoost,0.786574
9,XGBoost,0.741542
3,KNN,0.670647
4,Decision Tree,0.644654
7,SVM,0.550523


In [14]:
performance = pd.merge(results, cross_val_score_results, on='model')
performance['diff'] = performance['r2'] - performance['cross_val_score']
performance

Unnamed: 0,model,r2,mse,rmse,mae,cross_val_score,diff
0,Ridge,0.880451,29.090767,5.393586,4.212515,0.857042,0.023409
1,Linear Regression,0.880433,29.09517,5.393994,4.214763,0.856716,0.023717
2,AdaBoost,0.856915,34.818053,5.900682,4.57581,0.786574,0.070341
3,Random Forest,0.853014,35.767298,5.980577,4.680917,0.797177,0.055838
4,CatBoost,0.849801,36.549149,6.045589,4.633506,0.805396,0.044405
5,Lasso,0.82532,42.506332,6.519688,5.157879,0.808731,0.016589
6,XGBoost,0.820924,43.576168,6.601225,5.13119,0.741542,0.079382
7,KNN,0.776335,54.4262,7.377411,5.711,0.670647,0.105688
8,Decision Tree,0.730889,65.485,8.09228,6.405,0.644654,0.086235
9,SVM,0.713014,69.834692,8.356715,5.504566,0.550523,0.162491
