Model Training

In [41]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings

In [42]:
df = pd.read_csv('data/stud.csv')

In [43]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [44]:
X = df.drop(columns=['math_score'],axis=1)
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [45]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  ",end=" ")
print(df['race_ethnicity'].unique())

print("Categories in'parental level of education' variable:",end=" " )
print(df['parental_level_of_education'].unique())

print("Categories in 'lunch' variable:     ",end=" " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     ",end=" " )
print(df['test_preparation_course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [46]:
y = df['math_score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [47]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [48]:
X = preprocessor.fit_transform(X)

In [49]:
X.shape

(1000, 19)

In [50]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [51]:
# Function to evaluate model
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

Linear Regressioon

In [52]:
# Linear Regression
model = LinearRegression()

In [53]:
# Train model
model.fit(X_train, y_train)

In [54]:
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate model
train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

print("\nLinear Regression Performance:")
print("Training Set:")
print(f"MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}")
print("Test Set:")
print(f"MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")


Linear Regression Performance:
Training Set:
MAE: 4.2717, RMSE: 5.3285, R2: 0.8741
Test Set:
MAE: 4.2242, RMSE: 5.4176, R2: 0.8794


Ridge Regression

In [55]:
# Ridge Regression
ridge = Ridge()

In [56]:
# Define hyperparameter grid
param_grid = {
    "alpha": [0.1, 1, 10, 100]
}

In [57]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2', cv=5, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [58]:
# Best Ridge model
best_ridge = grid_search.best_estimator_

In [59]:
# Make predictions
y_train_pred_ridge = best_ridge.predict(X_train)
y_test_pred_ridge = best_ridge.predict(X_test)

# Evaluate Ridge model
ridge_train_mae, ridge_train_rmse, ridge_train_r2 = evaluate_model(y_train, y_train_pred_ridge)
ridge_test_mae, ridge_test_rmse, ridge_test_r2 = evaluate_model(y_test, y_test_pred_ridge)

print("\nRidge Regression Performance (Tuned):")
print("Training Set:")
print(f"MAE: {ridge_train_mae:.4f}, RMSE: {ridge_train_rmse:.4f}, R2: {ridge_train_r2:.4f}")
print("Test Set:")
print(f"MAE: {ridge_test_mae:.4f}, RMSE: {ridge_test_rmse:.4f}, R2: {ridge_test_r2:.4f}")



Ridge Regression Performance (Tuned):
Training Set:
MAE: 4.2650, RMSE: 5.3233, R2: 0.8743
Test Set:
MAE: 4.2111, RMSE: 5.3904, R2: 0.8806


In [60]:
# Function to train and evaluate a model
def train_and_evaluate_model(name, model, param_grid=None):
    print(f"\n{name}")
    print("=" * 40)
    
    if param_grid:  # If hyperparameter tuning is required
        print("Performing Grid Search for Hyperparameter Tuning...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=5, verbose=1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best Parameters: {grid_search.best_params_}")
    else:  # No hyperparameter tuning
        best_model = model
        best_model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    # Evaluation
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)
    
    print("\nPerformance on Training Set:")
    print(f"MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}")
    
    print("\nPerformance on Test Set:")
    print(f"MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")

In [61]:
# Lasso Regression with Hyperparameter Tuning
lasso_params = {"alpha": [0.1, 1, 10, 100]}
train_and_evaluate_model("Lasso Regression", Lasso(), param_grid=lasso_params)


Lasso Regression
Performing Grid Search for Hyperparameter Tuning...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'alpha': 0.1}

Performance on Training Set:
MAE: 4.3349, RMSE: 5.3985, R2: 0.8707

Performance on Test Set:
MAE: 4.1546, RMSE: 5.3685, R2: 0.8816


In [62]:
# Decision Tree Regressor with Hyperparameter Tuning
dt_params = {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]}
train_and_evaluate_model("Decision Tree", DecisionTreeRegressor(), param_grid=dt_params)


Decision Tree
Performing Grid Search for Hyperparameter Tuning...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_depth': 5, 'min_samples_split': 10}

Performance on Training Set:
MAE: 4.6465, RMSE: 5.7195, R2: 0.8549

Performance on Test Set:
MAE: 4.9361, RMSE: 6.5462, R2: 0.8239


In [63]:
# Random Forest Regressor with Hyperparameter Tuning
rf_params = {"n_estimators": [50, 100, 200], "max_depth": [5, 10, None]}
train_and_evaluate_model("Random Forest Regressor", RandomForestRegressor(random_state=42), param_grid=rf_params)


Random Forest Regressor
Performing Grid Search for Hyperparameter Tuning...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'max_depth': 10, 'n_estimators': 200}

Performance on Training Set:
MAE: 2.2567, RMSE: 2.7747, R2: 0.9659

Performance on Test Set:
MAE: 4.5928, RMSE: 5.9364, R2: 0.8552


In [64]:
# K-Neighbors Regressor with Hyperparameter Tuning
knn_params = {"n_neighbors": [3, 5, 10], "weights": ["uniform", "distance"]}
train_and_evaluate_model("K-Neighbors Regressor", KNeighborsRegressor(), param_grid=knn_params)


K-Neighbors Regressor
Performing Grid Search for Hyperparameter Tuning...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'n_neighbors': 10, 'weights': 'distance'}

Performance on Training Set:
MAE: 0.0188, RMSE: 0.2795, R2: 0.9997

Performance on Test Set:
MAE: 5.5987, RMSE: 7.1784, R2: 0.7882


In [65]:
# XGBRegressor with Hyperparameter Tuning
xgb_params = {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}
train_and_evaluate_model("XGBRegressor", XGBRegressor(random_state=42), param_grid=xgb_params)


XGBRegressor
Performing Grid Search for Hyperparameter Tuning...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}

Performance on Training Set:
MAE: 2.7651, RMSE: 3.4983, R2: 0.9457

Performance on Test Set:
MAE: 4.5879, RMSE: 5.8661, R2: 0.8586


In [66]:
# AdaBoost Regressor with Hyperparameter Tuning
adaboost_params = {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 1]}
train_and_evaluate_model("AdaBoost Regressor", AdaBoostRegressor(random_state=42), param_grid=adaboost_params)


AdaBoost Regressor
Performing Grid Search for Hyperparameter Tuning...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'learning_rate': 1, 'n_estimators': 200}

Performance on Training Set:
MAE: 4.7132, RMSE: 5.7206, R2: 0.8548

Performance on Test Set:
MAE: 4.6283, RMSE: 5.9801, R2: 0.8530
