In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb

In [2]:
# data = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'

# !wget $data

In [3]:
%%bash

head jamb_exam_results.csv

JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,None,1
182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1
202,25,85,2,13.6,Public,Urban,Yes,No,Medium,Low,6,15,Male,Low,Tertiary,1
251,35,85,4,2.6,Public,Urban,No,Yes,Low,Medium,7,16,Female,Medium,Primary,4
129,27,75,3,9.4,Public,Urban,No,Yes,Low,Medium,8,19,Female,Low,Tertiary,3
220,23,85,3,4.6,Public,Rural,No,No,Low,Medium,9,19,Female,Medium,Tertiary,1


In [4]:
df = pd.read_csv('jamb_exam_results.csv')
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [5]:
df.shape

(5000, 17)

In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [7]:
df.columns

Index(['jamb_score', 'study_hours_per_week', 'attendance_rate',
       'teacher_quality', 'distance_to_school', 'school_type',
       'school_location', 'extra_tutorials', 'access_to_learning_materials',
       'parent_involvement', 'it_knowledge', 'student_id', 'age', 'gender',
       'socioeconomic_status', 'parent_education_level',
       'assignments_completed'],
      dtype='object')

In [8]:
del df['student_id']

In [9]:
df.shape

(5000, 16)

In [10]:
df.isna().sum()

jamb_score                        0
study_hours_per_week              0
attendance_rate                   0
teacher_quality                   0
distance_to_school                0
school_type                       0
school_location                   0
extra_tutorials                   0
access_to_learning_materials      0
parent_involvement                0
it_knowledge                      0
age                               0
gender                            0
socioeconomic_status              0
parent_education_level          891
assignments_completed             0
dtype: int64

### Splitting Data

In [11]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

print('The size of train set is %s' %len(df_train))
print('The size of Validation set is %s' %len(df_val))
print('The size of test set is %s' %len(df_test))

The size of train set is 3000
The size of Validation set is 1000
The size of test set is 1000


In [12]:
df_full_train = df_full_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [13]:
y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

In [14]:
del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

#### DictVectorizer

In [15]:
dv = DictVectorizer(sparse = False)

In [16]:
X_train = df_train.to_dict(orient = 'records')
X_val = df_val.to_dict(orient = 'records')
X_test = df_test.to_dict(orient = 'records')

In [17]:
#Fit_transform the training, but only transform the val and test

X_train = dv.fit_transform(X_train)

X_val = dv.transform(X_val)

X_test = dv.transform(X_test)

In [18]:
dv.feature_names_

['access_to_learning_materials=No',
 'access_to_learning_materials=Yes',
 'age',
 'assignments_completed',
 'attendance_rate',
 'distance_to_school',
 'extra_tutorials=No',
 'extra_tutorials=Yes',
 'gender=Female',
 'gender=Male',
 'it_knowledge=High',
 'it_knowledge=Low',
 'it_knowledge=Medium',
 'parent_education_level',
 'parent_education_level=Primary',
 'parent_education_level=Secondary',
 'parent_education_level=Tertiary',
 'parent_involvement=High',
 'parent_involvement=Low',
 'parent_involvement=Medium',
 'school_location=Rural',
 'school_location=Urban',
 'school_type=Private',
 'school_type=Public',
 'socioeconomic_status=High',
 'socioeconomic_status=Low',
 'socioeconomic_status=Medium',
 'study_hours_per_week',
 'teacher_quality']

### Training DecisionTreeRegressor

In [19]:
dtr = DecisionTreeRegressor(max_depth=1) 

dtr.fit(X_train, y_train)

In [20]:
y_pred = dtr.predict(X_val)
#y_pred

In [21]:
print(export_text(dtr, feature_names = dv.feature_names_))

|--- study_hours_per_week <= 21.50
|   |--- value: [159.96]
|--- study_hours_per_week >  21.50
|   |--- value: [193.76]



#### Observation study_hours_per_week is the deciding variable (Q1

### Random Forest Regressor

In [22]:
rfr = RandomForestRegressor(n_estimators=10,
                            random_state=1,
                            n_jobs=-1,
                           )

rfr.fit(X_train, y_train)

In [23]:
y_pred = rfr.predict(X_val)

In [24]:
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
rmse.round()

42.0

#### Observation rmse = 41 (Q2)

In [88]:
#score = {}

estimators = list(np.arange(10,201,10))

for i in estimators:
    rfr = RandomForestRegressor(n_estimators=i,
                            random_state=1,
                            n_jobs=-1,
                               )

    rfr.fit(X_train, y_train)
    
    #Eval
    y_pred = rfr.predict(X_val)
    
    #rmse
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse).round(3)
    
    print('%s -> %s' % (i, rmse))
    
    
    

10 -> 41.396
20 -> 40.369
30 -> 40.288
40 -> 40.213
50 -> 40.046
60 -> 39.954
70 -> 39.903
80 -> 39.904
90 -> 39.856
100 -> 39.835
110 -> 39.734
120 -> 39.652
130 -> 39.648
140 -> 39.615
150 -> 39.597
160 -> 39.576
170 -> 39.56
180 -> 39.512
190 -> 39.512
200 -> 39.481


#### Observation: At 80 it stops improving (Q3)

In [89]:
depth = [10, 15, 20, 25]

estimators = list(np.arange(10,201,10))

for d in depth:
    for e in estimators:
        
        rfr = RandomForestRegressor(n_estimators=e,
                                    max_depth = d,
                                    random_state=1,
                                    n_jobs=-1,
                                   )

        rfr.fit(X_train, y_train)

        #Eval
        y_pred = rfr.predict(X_val)

        #rmse
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse).round(3)

        print('(%s %3s) -> %s' % (d, e, rmse))


(10  10) -> 40.842
(10  20) -> 39.982
(10  30) -> 39.934
(10  40) -> 39.848
(10  50) -> 39.699
(10  60) -> 39.623
(10  70) -> 39.643
(10  80) -> 39.649
(10  90) -> 39.628
(10 100) -> 39.58
(10 110) -> 39.496
(10 120) -> 39.481
(10 130) -> 39.47
(10 140) -> 39.428
(10 150) -> 39.415
(10 160) -> 39.407
(10 170) -> 39.389
(10 180) -> 39.34
(10 190) -> 39.338
(10 200) -> 39.309
(15  10) -> 41.601
(15  20) -> 40.55
(15  30) -> 40.379
(15  40) -> 40.252
(15  50) -> 40.052
(15  60) -> 39.913
(15  70) -> 39.896
(15  80) -> 39.871
(15  90) -> 39.858
(15 100) -> 39.843
(15 110) -> 39.729
(15 120) -> 39.652
(15 130) -> 39.629
(15 140) -> 39.598
(15 150) -> 39.605
(15 160) -> 39.586
(15 170) -> 39.543
(15 180) -> 39.482
(15 190) -> 39.474
(15 200) -> 39.449
(20  10) -> 41.431
(20  20) -> 40.517
(20  30) -> 40.398
(20  40) -> 40.247
(20  50) -> 40.021
(20  60) -> 39.891
(20  70) -> 39.87
(20  80) -> 39.863
(20  90) -> 39.824
(20 100) -> 39.812
(20 110) -> 39.747
(20 120) -> 39.677
(20 130) -> 39.68

#### Depth 15, Estimator 10 is best at 41.6 rmse

In [28]:
rfr = RandomForestRegressor(n_estimators=10,
                           max_depth=20,
                           random_state=1,
                           n_jobs=-1
                           ) 

rfr.fit(X_train, y_train)

In [29]:
rfr.feature_importances_

array([0.00915206, 0.00969821, 0.06333469, 0.03686602, 0.14641126,
       0.14392933, 0.01199014, 0.00871744, 0.01010811, 0.01135039,
       0.02075289, 0.01306686, 0.01097987, 0.01287561, 0.01240107,
       0.01363259, 0.01288401, 0.02188263, 0.01389837, 0.01132737,
       0.00955292, 0.00834019, 0.01031942, 0.01204265, 0.01698614,
       0.0130963 , 0.01056672, 0.24629572, 0.07754101])

In [30]:
feature_importance = rfr.feature_importances_

In [33]:
df_feat_imp = pd.DataFrame({'feature':dv.feature_names_, 'feature_importance': feature_importance})
df_feat_imp

Unnamed: 0,feature,feature_importance
0,access_to_learning_materials=No,0.009152
1,access_to_learning_materials=Yes,0.009698
2,age,0.063335
3,assignments_completed,0.036866
4,attendance_rate,0.146411
5,distance_to_school,0.143929
6,extra_tutorials=No,0.01199
7,extra_tutorials=Yes,0.008717
8,gender=Female,0.010108
9,gender=Male,0.01135


#### Study hours is the most important feature. 0.24 score

In [25]:
import xgboost as xgb

In [28]:
#Create an xgb matrix for training set

features = dv.feature_names_
dtrain = xgb.DMatrix(X_train, label = y_train, feature_names = features)
dval = xgb.DMatrix(X_val, label = y_val, feature_names = features)

In [29]:
dtrain

<xgboost.core.DMatrix at 0x1957fe796d0>

In [30]:
dval = xgb.DMatrix(X_val, label = y_val, feature_names = features)

In [35]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round = 200)

In [33]:
# d = 'https://github.com/dafriedman97/mlbook/blob/master/book.pdf'
# !wget $d

In [36]:
y_pred = model.predict(dval)
y_pred

array([149.54027 , 288.29657 , 191.76028 , 249.38756 , 160.1788  ,
       226.8718  , 157.39159 , 133.40932 , 239.23856 , 163.12344 ,
       164.764   , 166.9097  , 182.88322 , 196.18097 , 171.07779 ,
       160.49368 , 180.7611  , 172.13115 , 175.5134  , 181.626   ,
       228.63069 , 148.26729 , 165.25511 , 181.92169 , 198.5632  ,
       201.87936 , 151.94286 , 156.21945 , 144.42862 , 136.41873 ,
       205.93178 , 269.56174 , 206.74501 , 166.4127  , 138.94382 ,
       178.2859  , 130.01555 , 169.0604  , 191.8803  , 158.73224 ,
       150.37746 , 129.32857 , 245.34892 , 157.60133 , 149.68877 ,
       224.0331  , 169.9342  , 138.79428 , 167.10315 , 179.38301 ,
       247.57878 , 139.12692 , 154.7045  , 159.33148 , 162.79164 ,
       131.25282 , 153.47707 , 241.00966 , 251.015   , 129.74443 ,
       166.09198 , 206.2988  , 217.02829 , 165.037   , 202.2927  ,
       151.76225 , 179.56802 , 199.77797 , 207.03047 , 176.14453 ,
       196.91605 , 252.47313 , 159.86366 , 222.06673 , 163.651

In [39]:
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse).round(3)
rmse

43.373

In [40]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round = 200)

In [41]:
y_pred = model.predict(dval)
y_pred

array([133.42189 , 242.84396 , 155.85428 , 245.87595 , 163.65271 ,
       213.54332 , 152.23294 , 166.60893 , 206.79015 , 176.09958 ,
       182.01715 , 159.24297 , 168.0592  , 203.86656 , 171.60265 ,
       191.53282 , 182.17274 , 149.71452 , 183.22668 , 179.3352  ,
       232.01515 , 145.20316 , 143.91298 , 172.48969 , 181.02925 ,
       175.98999 , 178.57753 , 132.30173 , 166.03706 , 146.59172 ,
       196.1733  , 282.0722  , 205.7309  , 163.2765  , 143.21854 ,
       173.3828  , 138.04163 , 181.23463 , 202.04272 , 145.78899 ,
       126.644   , 131.68402 , 214.67265 , 156.8211  , 155.37808 ,
       215.14911 , 193.32275 , 150.9679  , 176.99591 , 194.40266 ,
       242.86462 , 150.52896 , 171.76321 , 149.39839 , 162.02774 ,
       120.42929 , 174.61394 , 229.66551 , 260.7071  , 131.54614 ,
       175.03532 , 189.28178 , 217.05548 , 158.57167 , 202.79895 ,
       160.51001 , 190.29997 , 191.13812 , 192.67441 , 160.71564 ,
       188.5156  , 242.35751 , 152.47104 , 183.36115 , 181.700

In [42]:
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse).round(3)
rmse

40.316