In [52]:
# data manipulation
import numpy as np
import pandas as pd

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# machine learning models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# cross validation
from sklearn.model_selection import RandomizedSearchCV

# hyperparameter tuning
import optuna

# evaluation metrics
from tabulate import tabulate
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# damp warnings
import logging

In [15]:
dataframe= pd.read_csv('./StudentsPerformance.csv')

In [16]:
dataframe.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Problem Statement
#### Here we'll try to predict the maths score given the other features for each student.

In [17]:
X= dataframe.drop('math score', axis= 1)
y= dataframe['math score']

In the section of EDA we have seen that besides test scores, all the other columns should be treated as categorical columns.

In [18]:
cat_col= X.select_dtypes(include= 'O').columns
num_col= X.select_dtypes(exclude= 'O').columns

Transforming the columns for training
- StandardScaler: To bring the entire numerical data at the origin region
- OneHotEncoder: As there are not a lot of categories in each column, we can use OneHotEncoder

In [19]:
num_trans= StandardScaler()
cat_trans= OneHotEncoder()

preprocessor= ColumnTransformer([('OneHotEncoder', cat_trans, cat_col),
                                 ('StandardScaler', num_trans, num_col)])

In [20]:
X= preprocessor.fit_transform(X)

In [21]:
X.shape

(1000, 19)

##### Making train-test split for model training

In [22]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

##### Making an evaluation metrics function

In [34]:
def evaluation_metrics(true, predicted):
    MAE= f'{mean_absolute_error(true, predicted):.3f}'
    RMSE= f'{mean_squared_error(true, predicted, squared= False):.3f}'
    R2= f'{r2_score(true, predicted):.3f}'

    metrics_data= {'Metric': ['MAE', 'RMSE', 'R2'],
                   'Value': [MAE, RMSE, R2]}
    
    metrics_dataframe= pd.DataFrame(metrics_data)
    table= tabulate(metrics_dataframe, headers= 'keys', tablefmt= 'pretty', showindex= False)

    return table

##### Evaluating the listed regression models

In [35]:
def evaluate_model(model, model_name, train_features, test_features, train_target, test_target):
    # fit model on train data
    model.fit(train_features, train_target)
    
    # find predictions for train and test data
    train_prediction= model.predict(train_features)
    test_prediction= model.predict(test_features)

    # evalute model performance
    train_metrics= evaluation_metrics(train_target, train_prediction)
    test_metrics= evaluation_metrics(test_target, test_prediction)

    print('===================================================================================')
    print(f'MODEL : {model_name}')
    print('-----------------------------------------------------------------------------------')
    print('TRAIN DATA')
    print(train_metrics)
    print('-----------------------------------------------------------------------------------')
    print('TEST DATA')
    print(test_metrics)

In [36]:
# listed models
models= {'Liner Regression': LinearRegression(),
         'Lasso': Lasso(),
         'Ridge': Ridge(),
         'Random Forest Regressor': RandomForestRegressor(),
         'XGBoost': XGBRegressor(verbosity= 0),
         'LGBM Regressor': LGBMRegressor(verbosity= -1),
         'CatBoost Regressor': CatBoostRegressor(verbose= False),
         'AdaBoost Regressor': AdaBoostRegressor()}

In [37]:
for model_name, model_object in models.items():
    evaluate_model(model_object, model_name, X_train, X_test, y_train, y_test)

MODEL : Liner Regression
-----------------------------------------------------------------------------------
TRAIN DATA
+--------+-------+
| Metric | Value |
+--------+-------+
|  MAE   | 4.272 |
|  RMSE  | 5.340 |
|   R2   | 0.874 |
+--------+-------+
-----------------------------------------------------------------------------------
TEST DATA
+--------+-------+
| Metric | Value |
+--------+-------+
|  MAE   | 4.225 |
|  RMSE  | 5.421 |
|   R2   | 0.879 |
+--------+-------+
MODEL : Lasso
-----------------------------------------------------------------------------------
TRAIN DATA
+--------+-------+
| Metric | Value |
+--------+-------+
|  MAE   | 5.206 |
|  RMSE  | 6.594 |
|   R2   | 0.807 |
+--------+-------+
-----------------------------------------------------------------------------------
TEST DATA
+--------+-------+
| Metric | Value |
+--------+-------+
|  MAE   | 5.158 |
|  RMSE  | 6.520 |
|   R2   | 0.825 |
+--------+-------+
MODEL : Ridge
-------------------------------------

##### Observations:
Best performing models:
- Linear Regression
- Random Forest Regressor
- CatBoost Regressor
- AdaBoost Regressor

Now, we can take each screened-in model and perform a hyperparameter tuning using optuna to find the best model out of them all.

### Hyperparameter tuning of selected models

In [45]:
# making the objective function for optuna
def objective(trial):
    # listing down the selected models
    model_name= trial.suggest_categorical('model', ['LR', 'RFR', 'CAT', 'ADA'])

    # defining hyperparameter range for selected model
    if model_name == 'LR':
        model= LinearRegression()
        
    elif model_name == 'RFR':
        parameters= {"n_estimators": trial.suggest_int("n_estimators_rf", 50, 200),
                     "max_depth": trial.suggest_int("max_depth_rf", 5, 15),
                     "min_samples_split": trial.suggest_int("min_samples_split_rf", 2, 10),
                     "min_samples_leaf": trial.suggest_int("min_samples_leaf_rf", 1, 5)}
        model= RandomForestRegressor(**parameters)

    elif model_name == 'CAT':
        parameters= {"n_estimators": trial.suggest_int("n_estimators_cb", 50, 200),
                     "learning_rate": trial.suggest_float("learning_rate_cb", 0.01, 0.2),
                     "depth": trial.suggest_int("depth_cb", 3, 10),
                     "subsample": trial.suggest_float("subsample_cb", 0.5, 1.0),
                     "colsample_bylevel": trial.suggest_float("colsample_bylevel_cb", 0.5, 1.0),
                     "border_count": trial.suggest_int("border_count_cb", 32, 255)}
        model= CatBoostRegressor(**parameters, verbose= False)

    elif model_name == 'ADA':
        parameters= {"n_estimators": trial.suggest_int("n_estimators_ab", 50, 200),
                     "learning_rate": trial.suggest_float("learning_rate_ab", 0.01, 0.2),
                     "loss": trial.suggest_categorical("loss_ab", ["linear", "square", "exponential"])}
        model= AdaBoostRegressor(**parameters)

    # train model
    model.fit(X_train, y_train)
    
    # making predictions
    y_prediction= model.predict(X_test)

    # loss function
    MSE= mean_squared_error(y_test, y_prediction)
    return MSE

##### Creating optuna study for best model selection

In [54]:
study= optuna.create_study(direction= 'minimize')
study.optimize(objective, n_trials= 100)

[I 2024-01-16 09:47:21,975] A new study created in memory with name: no-name-cdf52fca-6941-4eee-a091-b6020b88e8d0
[I 2024-01-16 09:47:22,521] Trial 0 finished with value: 36.01640059416855 and parameters: {'model': 'RFR', 'n_estimators_rf': 152, 'max_depth_rf': 12, 'min_samples_split_rf': 9, 'min_samples_leaf_rf': 5}. Best is trial 0 with value: 36.01640059416855.
[I 2024-01-16 09:47:22,528] Trial 1 finished with value: 29.39126953125 and parameters: {'model': 'LR'}. Best is trial 1 with value: 29.39126953125.
[I 2024-01-16 09:47:22,960] Trial 2 finished with value: 37.2147692615302 and parameters: {'model': 'ADA', 'n_estimators_ab': 170, 'learning_rate_ab': 0.1607887790854484, 'loss_ab': 'exponential'}. Best is trial 1 with value: 29.39126953125.
[I 2024-01-16 09:47:22,966] Trial 3 finished with value: 29.39126953125 and parameters: {'model': 'LR'}. Best is trial 1 with value: 29.39126953125.
[I 2024-01-16 09:47:23,262] Trial 4 finished with value: 38.18929303357922 and parameters: {'

In [55]:
best_parameters= study.best_params
best_parameters

{'model': 'LR'}

##### Observation:
- Linear Regression model is performing the best, as the loss function is the least.

#### Linear Regression Model Training

In [74]:
LR= LinearRegression()
LR.fit(X_train, y_train)
y_prediction= LR.predict(X_test)

In [75]:
print(evaluation_metrics(y_test, y_prediction))

+--------+-------+
| Metric | Value |
+--------+-------+
|  MAE   | 4.225 |
|  RMSE  | 5.421 |
|   R2   | 0.879 |
+--------+-------+


#### Difference between actual and predicted values

In [76]:
analysis= pd.DataFrame({'True Value': y_test,
                        'Predicted Value': y_prediction,
                        'Value Difference': y_test - y_prediction})

Calculating the standard deviation of the predictions.

In [77]:
round(analysis['Value Difference'].std(), 3)

5.431

##### The final model has an accuracy of 87.9% with a standard deviation of 5.431, which is satisfactory.