In [None]:
from typing import Optional

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv -P '../data'

--2024-11-04 21:36:11--  https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv
Résolution de github.com (github.com)… 140.82.121.4
Connexion à github.com (github.com)|140.82.121.4|:443… connecté.
requête HTTP transmise, en attente de la réponse… 302 Found
Emplacement : https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv [suivant]
--2024-11-04 21:36:11--  https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv
Résolution de raw.githubusercontent.com (raw.githubusercontent.com)… 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connexion à raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 391501 (382K) [text/plain]
Sauvegarde en : « ../data/jamb_exam_results.csv.1 »


2024-11-04 21:36:11 (6,15 MB/s) — « ../data/jamb_exam_results.csv.1 » sauvegardé 

In [3]:
# Load the raw results, normalise the column labels to snake_case and drop the
# identifier column.
df = pd.read_csv('../data/jamb_exam_results.csv')
df = df.rename(columns=lambda label: label.lower().replace(' ', '_'))
df = df.drop(columns='student_id')

# Report which columns hold missing values before and after filling them with 0.
# NOTE(review): the saved output lists parent_education_level as the NA column and
# later output shows string categories for it, so 0 is mixed into a text column -
# confirm this is intended.
na_columns = df.columns[df.isna().any()].values
print(f'Contains NA: {na_columns}')
df = df.fillna(0)
na_columns = df.columns[df.isna().any()].values
print(f'Contains NA: {na_columns}')


# 60/20/20 split: hold out 20% for test, then carve a validation set out of the
# remainder whose size equals 20% of the full frame.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
val_fraction = len(df) * 0.2 / len(df_full_train)
df_train, df_val = train_test_split(df_full_train, test_size=val_fraction, random_state=1)

# Detach the target from each split so only features remain in the frames.
y_train = df_train.pop('jamb_score')
y_val = df_val.pop('jamb_score')
y_test = df_test.pop('jamb_score')

# One-hot encode via DictVectorizer; the vocabulary is learned on train only.
dv = DictVectorizer(sparse=True)
train_serie_dict, val_serie_dict, test_serie_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_val, df_test)
)

X_train = dv.fit_transform(train_serie_dict)
X_val = dv.transform(val_serie_dict)
X_test = dv.transform(test_serie_dict)

Contains NA: ['parent_education_level']
Contains NA: []


### Question 1

In [4]:
# Fit a depth-1 tree (a single split) and print its rules to see which feature
# the split uses.
model = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)

tree_rules = export_text(model, feature_names=list(dv.get_feature_names_out()))
print(tree_rules)

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



### Question 2

In [5]:
def train_model(estimators: int, max_depth: Optional[int] = None) -> RandomForestRegressor:
    """Fit a random forest on the training split and print its validation RMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``,
    so the data-preparation cell must have been executed first.

    Args:
        estimators: Number of trees, forwarded as ``n_estimators``.
        max_depth: Depth limit forwarded to the forest unchanged; ``None``
            (the default) leaves the estimator's own default in place.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # random_state is pinned so repeated runs give the same forest.
    model = RandomForestRegressor(n_estimators=estimators, random_state=1, n_jobs=-1, max_depth=max_depth)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    rmse = root_mean_squared_error(y_val, y_pred)
    print(f'Estimators: {estimators} \nMax depth: {max_depth} \nRMSE: {round(rmse, 3)} \n---------------------------------')

    return model
    
# Baseline forest: 10 trees, no explicit depth limit.
train_model(estimators=10)

Estimators: 10 
Max depth: None 
RMSE: 42.137 
---------------------------------


### Question 3

In [6]:
# Sweep the forest size over 10, 20, ..., 200 trees; each call prints its own
# validation RMSE.
estimator_grid = range(10, 201, 10)
for n_trees in estimator_grid:
    train_model(estimators=n_trees)

Estimators: 10 
Max depth: None 
RMSE: 42.137 
---------------------------------
Estimators: 20 
Max depth: None 
RMSE: 41.461 
---------------------------------
Estimators: 30 
Max depth: None 
RMSE: 41.106 
---------------------------------
Estimators: 40 
Max depth: None 
RMSE: 40.917 
---------------------------------
Estimators: 50 
Max depth: None 
RMSE: 40.852 
---------------------------------
Estimators: 60 
Max depth: None 
RMSE: 40.784 
---------------------------------
Estimators: 70 
Max depth: None 
RMSE: 40.677 
---------------------------------
Estimators: 80 
Max depth: None 
RMSE: 40.539 
---------------------------------
Estimators: 90 
Max depth: None 
RMSE: 40.504 
---------------------------------
Estimators: 100 
Max depth: None 
RMSE: 40.517 
---------------------------------
Estimators: 110 
Max depth: None 
RMSE: 40.593 
---------------------------------
Estimators: 120 
Max depth: None 
RMSE: 40.625 
---------------------------------
Estimators: 130 
Max dept

### Question 4

In [7]:
# Compare depth limits 10, 15, 20 and 25 with the forest size held at 10 trees.
for depth_limit in (10, 15, 20, 25):
    train_model(10, depth_limit)

Estimators: 10 
Max depth: 10 
RMSE: 41.258 
---------------------------------
Estimators: 10 
Max depth: 15 
RMSE: 42.004 
---------------------------------
Estimators: 10 
Max depth: 20 
RMSE: 42.025 
---------------------------------
Estimators: 10 
Max depth: 25 
RMSE: 42.16 
---------------------------------


### Question 5

In [8]:
# Refit with 10 trees and depth 20, then rank the encoded features by the
# forest's importance scores (highest first).
model = train_model(estimators=10, max_depth=20)
feature_importances = model.feature_importances_

importance_columns = {
    'Feature': list(dv.feature_names_),
    'Importance': feature_importances,
}
feature_importances_df = pd.DataFrame(importance_columns).sort_values(by='Importance', ascending=False)
feature_importances_df

Estimators: 10 
Max depth: 20 
RMSE: 42.025 
---------------------------------


Unnamed: 0,Feature,Importance
27,study_hours_per_week,0.248354
4,attendance_rate,0.149729
5,distance_to_school,0.136486
28,teacher_quality,0.082682
2,age,0.069311
3,assignments_completed,0.031517
24,socioeconomic_status=High,0.025714
17,parent_involvement=High,0.022919
10,it_knowledge=High,0.017719
15,parent_education_level=Secondary,0.016957


### Question 6

In [None]:
def train_xgboost_model(eta: float) -> xgb.Booster:
    """Train an XGBoost regressor for 100 rounds and log train/val RMSE.

    Reads the notebook-level ``dv``, ``X_train``, ``y_train``, ``X_val`` and
    ``y_val``, so the data-preparation cell must have been executed first.

    Args:
        eta: Value passed to XGBoost as the ``eta`` parameter.

    Returns:
        The trained booster.
    """
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,

        'objective': 'reg:squarederror',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }

    features = list(dv.get_feature_names_out())
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

    # Both sets are evaluated every 5 rounds so the val curve can be compared
    # against the train curve.
    watchlist = [(dtrain, 'train'), (dval, 'val')]
    xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=100, verbose_eval=5, evals=watchlist)
    print('-------------------------------------------------')

    # Return the booster as the signature promises; previously the function
    # fell off the end and returned None, discarding the trained model.
    return xgb_model
    
# Train once per candidate eta; each run prints its own RMSE log.
for learning_rate in (0.3, 0.1):
    train_xgboost_model(eta=learning_rate)

[0]	train-rmse:42.69384	val-rmse:44.89114
[5]	train-rmse:34.57756	val-rmse:40.69096
[10]	train-rmse:31.63404	val-rmse:40.48319
[15]	train-rmse:29.41497	val-rmse:40.86107
[20]	train-rmse:27.49658	val-rmse:41.27921
[25]	train-rmse:26.34353	val-rmse:41.57975
[30]	train-rmse:24.21076	val-rmse:41.72928
[35]	train-rmse:22.46394	val-rmse:42.03417
[40]	train-rmse:21.35340	val-rmse:42.24363
[45]	train-rmse:20.24355	val-rmse:42.27966
[50]	train-rmse:19.25157	val-rmse:42.43824
[55]	train-rmse:18.28398	val-rmse:42.54750
[60]	train-rmse:17.12178	val-rmse:42.64446
[65]	train-rmse:16.41573	val-rmse:42.77416
[70]	train-rmse:15.78314	val-rmse:42.84909
[75]	train-rmse:14.80007	val-rmse:43.00760
[80]	train-rmse:13.96907	val-rmse:43.08250
[85]	train-rmse:13.39102	val-rmse:43.16297
[90]	train-rmse:12.46485	val-rmse:43.25161
[95]	train-rmse:11.95568	val-rmse:43.37919
[99]	train-rmse:11.39140	val-rmse:43.41882
-------------------------------------------------
[0]	train-rmse:45.49999	val-rmse:47.00533
[5]	tra