In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
student_df = pd.read_csv('data/StudentsPerformance.csv')

In [4]:
student_df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [5]:
X = student_df.drop(columns =['math score'], axis = 1)
y= student_df['math score']

In [6]:
columns = X.columns
cat_features, num_features =[],[]
for col in columns:
    if X[col].dtype == 'O':
        cat_features.append(col)
    else:
        num_features.append(col)
        
print(f"Categorical features: {cat_features}")
print(f"Numerical featues: {num_features}")

Categorical features: ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
Numerical featues: ['reading score', 'writing score']


In [7]:
X[cat_features].nunique()

gender                         2
race/ethnicity                 5
parental level of education    6
lunch                          2
test preparation course        2
dtype: int64

In [8]:
numerical_transformers = StandardScaler()
categorical_transformers = OneHotEncoder()

prepocessor = ColumnTransformer(
    [
        ('OneHotEncoder',categorical_transformers,cat_features ),
        ('StandardScaler', numerical_transformers, num_features)
    ]
)


In [9]:
X_transformed = prepocessor.fit_transform(X)

In [10]:
X_transformed.shape


(1000, 19)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed,y, test_size = 0.2, shuffle = True)

In [12]:
X_train

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.55979316, -0.13523784],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.35894946,  0.91822145],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.22189642,  1.11574507],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.46810467, -0.06939663],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
         0.53663119,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -1.10800534, -1.64958556]], shape=(800, 19))

In [13]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(800, 19)
(200, 19)
(800,)
(200,)


In [14]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r_squared_score = r2_score(true, predicted)
    
    return mae, mse, r_squared_score

In [15]:
models ={
    "Linear Regression": LinearRegression(),
    "k-neighbours Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [16]:
model_list =[]
r2_list= []

for key, values in models.items():
    model = models[key]
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    #Evaluting training
    model_train_mae, model_train_mse, model_train_r2_score = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_r2_score = evaluate_model(y_test, y_test_pred)
    
    print(key)
    model_list.append(key)
    
    print("Model performance for Training Set")
    print("-  Mean Squared Error:{:.4f}".format(model_train_mse))
    print("-  Mean absolute Error:{:.4f}".format(model_train_mae))
    print("-  R2 score:{:.4f}".format(model_train_r2_score))
    
    print('---------------------')
    print("Model performance for Test Set")
    print("-  Mean Squared Error:{:.4f}".format(model_test_mse))
    print("-  Mean absolute Error:{:.4f}".format(model_test_mae))
    print("-  R2 score:{:.4f}".format(model_test_r2_score))
    
    r2_list.append(model_test_r2_score)
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training Set
-  Mean Squared Error:28.4694
-  Mean absolute Error:4.2213
-  R2 score:0.8787
---------------------
Model performance for Test Set
-  Mean Squared Error:28.6171
-  Mean absolute Error:4.3681
-  R2 score:0.8631




found 0 physical cores < 1
  File "c:\Users\ankita\Desktop\Personal_Project\Student_performance_prdiction\venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


k-neighbours Regressor
Model performance for Training Set
-  Mean Squared Error:34.4879
-  Mean absolute Error:4.5953
-  R2 score:0.8531
---------------------
Model performance for Test Set
-  Mean Squared Error:51.2038
-  Mean absolute Error:5.8730
-  R2 score:0.7550


Decision Tree
Model performance for Training Set
-  Mean Squared Error:0.2031
-  Mean absolute Error:0.0387
-  R2 score:0.9991
---------------------
Model performance for Test Set
-  Mean Squared Error:68.5900
-  Mean absolute Error:6.5700
-  R2 score:0.6718


Random Forest Regressor
Model performance for Training Set
-  Mean Squared Error:5.4424
-  Mean absolute Error:1.8449
-  R2 score:0.9768
---------------------
Model performance for Test Set
-  Mean Squared Error:32.2555
-  Mean absolute Error:4.6086
-  R2 score:0.8457


XGBRegressor
Model performance for Training Set
-  Mean Squared Error:1.1961
-  Mean absolute Error:0.7087
-  R2 score:0.9949
---------------------
Model performance for Test Set
-  Mean Squared Er

In [17]:
model_list

['Linear Regression',
 'k-neighbours Regressor',
 'Decision Tree',
 'Random Forest Regressor',
 'XGBRegressor',
 'AdaBoost Regressor']

In [18]:
pd.DataFrame(data = list(zip(model_list, r2_list)), columns = ['Model_name','R2_score']).sort_values(by = ["R2_score"], ascending =False)

Unnamed: 0,Model_name,R2_score
0,Linear Regression,0.863075
3,Random Forest Regressor,0.845667
5,AdaBoost Regressor,0.836348
4,XGBRegressor,0.798446
1,k-neighbours Regressor,0.755004
2,Decision Tree,0.671816
