# Model Training

### Import all libraries

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Import data

In [51]:
df = pd.read_csv('data/stud.csv')

### Prepare the data

In [52]:
# Seperate X and Y
X = df.drop(columns=['math_score'])
y = df['math_score']

In [59]:
X.select_dtypes(include=['int64', 'float64']).columns.values
X.select_dtypes(include=['object']).columns.values

array(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'], dtype=object)

In [65]:
# Select numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ]
)

# Create a pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X = pipe.fit_transform(X)

### Train test split

In [67]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Evaluation function that return all metrics of a model

In [68]:
def evaluate(y_pred, y_test):
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

In [71]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'SVM': SVR(),
    'CatBoost': CatBoostRegressor(verbose=False),
    'XGBoost': XGBRegressor()
}
model_list = []
r2_list = []
mse_list = []
rmse_list = []
mae_list = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores = evaluate(y_pred, y_test)
    model_list.append(name)
    r2_list.append(scores['r2'])
    mse_list.append(scores['mse'])
    rmse_list.append(scores['rmse'])
    mae_list.append(scores['mae'])

results = pd.DataFrame({
    'model': model_list,
    'r2': r2_list,
    'mse': mse_list,
    'rmse': rmse_list,
    'mae': mae_list
})

results

Unnamed: 0,model,r2,mse,rmse,mae
0,Linear Regression,0.880433,29.09517,5.393994,4.214763
1,Ridge,0.880451,29.090767,5.393586,4.212515
2,Lasso,0.82532,42.506332,6.519688,5.157879
3,KNN,0.776335,54.4262,7.377411,5.711
4,Decision Tree,0.739088,63.49,7.968061,6.31
5,Random Forest,0.855042,35.273917,5.939185,4.615102
6,AdaBoost,0.84978,36.554289,6.046014,4.781104
7,SVM,0.713014,69.834692,8.356715,5.504566
8,CatBoost,0.849801,36.549149,6.045589,4.633506
9,XGBoost,0.820924,43.576168,6.601225,5.13119
