In [11]:
import numpy as np
import pandas as pd
import requests
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text

In [12]:
# Getting the data 1
url = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'
dataset_path = '../datasets/'
response = requests.get(url)
with open(f'{dataset_path}jamb_exam_results.csv', 'wb') as file:
    for chunk in response.iter_content(chunk_size=1024):
        file.write(chunk)

In [13]:
# Preparing the dataset
df = pd.read_csv(f'{dataset_path}jamb_exam_results.csv')
# First, let's make the names lowercase:
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Remove the student_id column.
del df['student_id']
# Fill missing values with zeros.
df = df.fillna(0)

In [14]:
# Use the train_test_split function and set the random_state parameter to 1.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df), len(df_full_train), len(df_train), len(df_test), len(df_val)

(5000, 4000, 3000, 1000, 1000)

In [15]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.jamb_score.astype('int').values
y_val = df_val.jamb_score.astype('int').values
y_test = df_test.jamb_score.astype('int').values

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [16]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
dv = DictVectorizer(sparse=True)

In [17]:
# Question 1
# Let's train a decision tree regressor to predict the jamb_score variable.
# 
# Train a model with max_depth=1.
# Which feature is used for splitting the data?
# 
# study_hours_per_week
# attendance_rate
# teacher_quality
# distance_to_school

In [18]:
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [19]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [20]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [21]:
y_pred = dt.predict(X_val)
y_pred

array([188.59301587, 155.24421053, 188.59301587, 155.24421053,
       188.59301587, 188.59301587, 188.59301587, 155.24421053,
       155.24421053, 188.59301587, 188.59301587, 188.59301587,
       155.24421053, 188.59301587, 155.24421053, 155.24421053,
       155.24421053, 155.24421053, 188.59301587, 155.24421053,
       188.59301587, 155.24421053, 188.59301587, 188.59301587,
       188.59301587, 188.59301587, 155.24421053, 155.24421053,
       155.24421053, 188.59301587, 188.59301587, 188.59301587,
       155.24421053, 188.59301587, 188.59301587, 155.24421053,
       155.24421053, 188.59301587, 155.24421053, 155.24421053,
       155.24421053, 155.24421053, 188.59301587, 155.24421053,
       155.24421053, 188.59301587, 188.59301587, 188.59301587,
       155.24421053, 188.59301587, 155.24421053, 155.24421053,
       155.24421053, 188.59301587, 188.59301587, 188.59301587,
       188.59301587, 155.24421053, 188.59301587, 155.24421053,
       188.59301587, 188.59301587, 155.24421053, 155.24

In [22]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]


In [23]:
# Question 2
# Train a random forest regressor with these parameters:
# 
# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on the validation data?
# 
# 22.13
# 42.13
# 62.13
# 82.12

In [24]:
def train_random_forest_regressor(X_train, y_train, X_val, n_estimators=100, random_state=1, max_depth=None):
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, max_depth=max_depth)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    return y_pred

In [25]:
y_pred = train_random_forest_regressor(X_train, y_train, X_val, n_estimators=10, random_state=1, max_depth=None)
y_pred

array([234.7, 145. , 195.2, 139.4, 196.8, 249.9, 222.5, 211.1, 164.3,
       222.9, 201.2, 172.5, 189. , 202.7, 169.9, 138.8, 160.9, 131.7,
       179.5, 128.3, 189.1, 139.1, 198.2, 194.2, 215.5, 183.6, 160.2,
       154.9, 140.9, 147.4, 189.8, 218.6, 140.6, 163.9, 222. , 140. ,
       150.4, 191.1, 165.1, 168.1, 180.4, 142.7, 191.9, 150.8, 121.9,
       154.8, 182.1, 194.1, 148.9, 197.3, 145.2, 157.5, 158.4, 226.7,
       262.6, 230.2, 177.6, 152.2, 157.5, 150.5, 199.2, 160.1, 143.5,
       157.2, 141.2, 209.1, 260.3, 178.4, 198.5, 149.3, 186.3, 148.4,
       199.4, 160.4, 163.3, 149.3, 280.7, 156.5, 167.4, 197.2, 220.4,
       189.2, 141.1, 179.7, 201.8, 145. , 186.3, 131.9, 162.8, 160.7,
       160.6, 160.6, 162.1, 158.4, 151.3, 126.2, 147.6, 176.8, 180.7,
       171.8, 134. , 204.4, 192.5, 195.5, 164.9, 174.8, 177.3, 132.1,
       207.6, 126.8, 186.4, 175.9, 184.8, 155.3, 253.3, 145.3, 220.3,
       168.4, 204.2, 160.7, 137.7, 144.7, 155.4, 133.7, 228.5, 171.5,
       170. , 202.2,

In [26]:
def rmse(y, y_p):
    se = (y - y_p) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [27]:
rmse(y_val, y_pred)

np.float64(42.13724207871227)

In [28]:
# Question 3
# Now let's experiment with the n_estimators parameter
# 
# Try different values of this parameter from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate the model on the validation dataset.
# After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.
# 
# 10
# 25
# 80
# 200

In [29]:
for n in range(10, 201, 10):
    y_pred = train_random_forest_regressor(X_train, y_train, X_val, n_estimators=n, random_state=1, max_depth=None)
    rmse_val = rmse(y_val, y_pred)
    print(n, rmse_val.round(3))

10 42.137
20 41.461
30 41.106
40 40.917
50 40.852
60 40.784
70 40.677
80 40.539
90 40.504
100 40.517
110 40.593
120 40.625
130 40.651
140 40.595
150 40.597
160 40.604
170 40.628
180 40.641
190 40.631
200 40.601


In [30]:
# Question 4
# Let's select the best max_depth:
# 
# Try different values of max_depth: [10, 15, 20, 25]
# For each of these values,
# try different values of n_estimators from 10 till 200 (with step 10)
# calculate the mean RMSE
# Fix the random seed: random_state=1
# What's the best max_depth, using the mean RMSE?
# 
# 10
# 15
# 20
# 25

In [31]:
scores = []
for depth in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        y_pred = train_random_forest_regressor(X_train, y_train, X_val, n_estimators=n, random_state=1, max_depth=depth)
        rmse_val = rmse(y_val, y_pred).round(3)
        scores.append((depth, n, rmse_val))

In [32]:
columns = ['max_depth', 'n_estimators', 'rmse']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores

Unnamed: 0,max_depth,n_estimators,rmse
0,10,10,41.258
1,10,20,40.881
2,10,30,40.625
3,10,40,40.270
4,10,50,40.317
...,...,...,...
75,25,160,40.600
76,25,170,40.624
77,25,180,40.639
78,25,190,40.629


In [33]:
df_scores.loc[df_scores['max_depth'] == 10].rmse.mean()

np.float64(40.3924)

In [34]:
df_scores.groupby('max_depth')['rmse'].mean()

max_depth
10    40.39240
15    40.73535
20    40.73970
25    40.78785
Name: rmse, dtype: float64

In [35]:
# Question 5
# We can extract feature importance information from tree-based models.
# 
# At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.
# 
# In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.
# 
# For this homework question, we'll find the most important feature:
# 
# Train the model with these parameters:
# n_estimators=10,
# max_depth=20,
# random_state=1,
# n_jobs=-1 (optional)
# Get the feature importance information from this model
# What's the most important feature (among these 4)?
# 
# study_hours_per_week
# attendance_rate
# distance_to_school
# teacher_quality

In [36]:
rfc = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=1)
rfc.fit(X_train, y_train)

In [37]:
columns = ['feature_importances', 'feature_names']
df_f = pd.DataFrame(rfc.feature_importances_, columns=['feature_importances'], index=dv.get_feature_names_out())
df_f = df_f.feature_importances.sort_values(ascending=False)
df_f.iloc[:1]

distance_to_school    0.116631
Name: feature_importances, dtype: float64

In [38]:
# Question 6
# Now let's train an XGBoost model! For this question, we'll tune the eta parameter:
# 
# Install XGBoost
# Create DMatrix for train and validation
# Create a watchlist
# Train a model with these parameters for 100 rounds:
# xgb_params = {
#     'eta': 0.3, 
#     'max_depth': 6,
#     'min_child_weight': 1,
#     
#     'objective': 'reg:squarederror',
#     'nthread': 8,
#     
#     'seed': 1,
#     'verbosity': 1,
# }
# Now change eta from 0.3 to 0.1.
# 
# Which eta leads to the best RMSE score on the validation dataset?
# 
# 0.3
# 0.1
# Both give equal value

In [39]:
import xgboost as xgb

In [40]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [41]:
watchlist = [(dval, 'val')]

In [42]:
%%capture captured1

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
model1 = xgb.train(xgb_params, dval, num_boost_round=100,evals=watchlist)

In [43]:
# noinspection PyUnresolvedReferences
print(captured1)

[0]	val-rmse:42.22092
[1]	val-rmse:38.00098
[2]	val-rmse:35.30160
[3]	val-rmse:33.14341
[4]	val-rmse:31.17431
[5]	val-rmse:29.87183
[6]	val-rmse:28.47100
[7]	val-rmse:27.04806
[8]	val-rmse:26.08545
[9]	val-rmse:24.83442
[10]	val-rmse:23.79271
[11]	val-rmse:23.09088
[12]	val-rmse:22.18666
[13]	val-rmse:21.73843
[14]	val-rmse:20.98967
[15]	val-rmse:19.93035
[16]	val-rmse:18.99987
[17]	val-rmse:18.40591
[18]	val-rmse:17.93142
[19]	val-rmse:17.16781
[20]	val-rmse:16.60290
[21]	val-rmse:15.74102
[22]	val-rmse:15.52199
[23]	val-rmse:15.17426
[24]	val-rmse:14.70306
[25]	val-rmse:14.14268
[26]	val-rmse:13.70990
[27]	val-rmse:13.42489
[28]	val-rmse:12.97467
[29]	val-rmse:12.86120
[30]	val-rmse:12.44184
[31]	val-rmse:11.78380
[32]	val-rmse:11.58079
[33]	val-rmse:11.22204
[34]	val-rmse:10.94894
[35]	val-rmse:10.67196
[36]	val-rmse:10.45297
[37]	val-rmse:10.35956
[38]	val-rmse:10.07197
[39]	val-rmse:9.80441
[40]	val-rmse:9.42840
[41]	val-rmse:9.16857
[42]	val-rmse:9.03478
[43]	val-rmse:8.86470
[44

In [44]:
%%capture captured2

xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
model2 = xgb.train(xgb_params, dtrain, num_boost_round=100,evals=watchlist)

In [45]:
# noinspection PyUnresolvedReferences
print(captured2)

[0]	val-rmse:47.00533
[1]	val-rmse:45.92344
[2]	val-rmse:44.98366
[3]	val-rmse:44.25755
[4]	val-rmse:43.57339
[5]	val-rmse:43.11181
[6]	val-rmse:42.61054
[7]	val-rmse:42.18883
[8]	val-rmse:41.86754
[9]	val-rmse:41.64338
[10]	val-rmse:41.39235
[11]	val-rmse:41.14265
[12]	val-rmse:40.95201
[13]	val-rmse:40.81778
[14]	val-rmse:40.75008
[15]	val-rmse:40.61341
[16]	val-rmse:40.51800
[17]	val-rmse:40.41659
[18]	val-rmse:40.33546
[19]	val-rmse:40.25632
[20]	val-rmse:40.25010
[21]	val-rmse:40.19826
[22]	val-rmse:40.21101
[23]	val-rmse:40.14758
[24]	val-rmse:40.15079
[25]	val-rmse:40.12003
[26]	val-rmse:40.08693
[27]	val-rmse:40.08567
[28]	val-rmse:40.07263
[29]	val-rmse:40.14000
[30]	val-rmse:40.13806
[31]	val-rmse:40.11982
[32]	val-rmse:40.13489
[33]	val-rmse:40.16486
[34]	val-rmse:40.17118
[35]	val-rmse:40.16103
[36]	val-rmse:40.19895
[37]	val-rmse:40.19394
[38]	val-rmse:40.20019
[39]	val-rmse:40.18898
[40]	val-rmse:40.17753
[41]	val-rmse:40.19907
[42]	val-rmse:40.23107
[43]	val-rmse:40.2442