In [86]:
import numpy as np
import pandas as pd
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text

In [87]:
# Getting the data 1
url = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'
dataset_path = '../datasets/'
response = requests.get(url)
with open(f'{dataset_path}jamb_exam_results.csv', 'wb') as file:
    for chunk in response.iter_content(chunk_size=1024):
        file.write(chunk)

In [88]:
# Preparing the dataset
df = pd.read_csv(f'{dataset_path}jamb_exam_results.csv')
# First, let's make the names lowercase:
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Remove the student_id column.
del df['student_id']
# Fill missing values with zeros.
df = df.fillna(0)

In [89]:
# Use the train_test_split function and set the random_state parameter to 1.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df), len(df_full_train), len(df_train), len(df_test), len(df_val)

(5000, 4000, 3000, 1000, 1000)

In [90]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
dv = DictVectorizer(sparse=True)

In [91]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [92]:
y_train = df_train.jamb_score.astype('int').values
y_val = df_val.jamb_score.astype('int').values
y_test = df_test.jamb_score.astype('int').values

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [93]:
df_train

Unnamed: 0,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,20,72,3,4.4,Public,Urban,No,Yes,Medium,Low,21,Female,Low,0,3
1,11,80,2,3.3,Public,Urban,Yes,Yes,Medium,High,22,Female,Medium,Secondary,1
2,31,82,1,8.3,Public,Urban,Yes,Yes,Low,High,19,Female,High,Tertiary,2
3,29,79,1,15.8,Public,Rural,Yes,Yes,Low,Low,19,Male,Low,Primary,2
4,28,96,2,8.9,Private,Rural,Yes,Yes,Medium,Low,19,Male,High,Secondary,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1,79,3,12.2,Public,Urban,No,No,Low,High,17,Male,High,Secondary,1
2996,3,87,1,9.6,Public,Urban,No,No,Medium,Medium,17,Male,Medium,Primary,1
2997,17,96,4,13.7,Private,Urban,No,Yes,High,Medium,16,Male,Medium,Primary,2
2998,25,74,2,3.4,Public,Rural,No,No,High,High,21,Male,Low,Secondary,1


In [94]:
# Question 1
# Let's train a decision tree regressor to predict the jamb_score variable.
# 
# Train a model with max_depth=1.
# Which feature is used for splitting the data?
# 
# study_hours_per_week
# attendance_rate
# teacher_quality
# distance_to_school

In [95]:
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [96]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [97]:
dt = DecisionTreeRegressor() 
dt.fit(X_train, y_train)

In [98]:
y_pred = dt.predict(X_val)
y_pred

array([292., 103., 207., 108., 237., 287., 208., 149., 118., 140., 237.,
       170., 153., 166., 116., 100., 161., 133., 217., 121., 175., 142.,
       215., 244., 216., 167., 188., 105., 187., 193., 151., 184., 133.,
       194., 241., 142., 188., 227., 245., 124., 148., 179., 246., 138.,
       194., 110., 185., 215., 125., 220., 187., 132., 143., 226., 248.,
       217., 249., 133., 119., 148., 219., 219., 128., 203., 273., 162.,
       272., 200., 156., 150., 227., 118., 263., 202., 149., 135., 230.,
       174., 100., 114., 221., 210., 121., 214., 196., 102., 148., 117.,
       100., 183., 129., 168., 225., 185., 101., 115., 148., 198., 208.,
       160., 101., 233., 135., 250., 118., 189., 205., 175., 231., 136.,
       193., 200., 218., 152., 288., 100., 227., 226., 263., 102., 176.,
       109., 218., 191., 245., 231., 149., 248., 115., 152., 103., 158.,
       116., 231., 159., 157., 132., 240., 214., 182., 142., 205., 165.,
       242., 184., 155., 199., 234., 191., 158., 17

In [99]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- study_hours_per_week <= 18.50
|   |--- teacher_quality <= 2.50
|   |   |--- attendance_rate <= 89.50
|   |   |   |--- study_hours_per_week <= 14.50
|   |   |   |   |--- distance_to_school <= 6.75
|   |   |   |   |   |--- attendance_rate <= 83.50
|   |   |   |   |   |   |--- age <= 21.50
|   |   |   |   |   |   |   |--- gender=Male <= 0.50
|   |   |   |   |   |   |   |   |--- distance_to_school <= 0.45
|   |   |   |   |   |   |   |   |   |--- study_hours_per_week <= 5.50
|   |   |   |   |   |   |   |   |   |   |--- value: [101.00]
|   |   |   |   |   |   |   |   |   |--- study_hours_per_week >  5.50
|   |   |   |   |   |   |   |   |   |   |--- extra_tutorials=No <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- value: [223.00]
|   |   |   |   |   |   |   |   |   |   |--- extra_tutorials=No >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- value: [167.00]
|   |   |   |   |   |   |   |   |--- distance_to_school >  0.45
|   |   |   |   |   |   |   |   |   |--- distance_t

In [100]:
# Question 2
# Train a random forest regressor with these parameters:
# 
# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on the validation data?
# 
# 22.13
# 42.13
# 62.13
# 82.12

In [101]:
rf = RandomForestClassifier(n_estimators=10, random_state=1)
rf.fit(X_train, y_train)

In [102]:
y_pred = rf.predict(X_val)
y_pred

array([116, 193, 150, 149, 101, 118, 119, 149, 196, 107, 132, 168, 116,
       160, 108, 158, 192, 104, 110, 100, 150, 102, 114, 181, 168, 163,
       103, 197, 138, 138, 118, 181, 153, 168, 160, 118, 118, 221, 189,
       191, 151, 117, 231, 119, 118, 101, 212, 178, 116, 226, 103, 100,
       134, 242, 103, 134, 102, 263, 144, 181, 194, 219, 115, 167, 143,
       289, 132, 103, 107, 107, 139, 161, 120, 130, 182, 130, 135, 111,
       100, 195, 100, 156, 101, 124, 153, 101, 114, 122, 197, 118, 117,
       108, 110, 119, 107, 102, 103, 243, 107, 201, 136, 170, 117, 201,
       225, 298, 124, 101, 139, 102, 128, 138, 158, 152, 104, 100, 116,
       134, 173, 154, 127, 138, 200, 102, 242, 157, 128, 227, 224, 168,
       145, 136, 199, 100, 112, 104, 102, 102, 139, 241, 215, 171, 129,
       171, 241, 130, 108, 165, 191, 103, 134, 227, 170, 164, 129, 184,
       110, 123, 112, 112, 244, 106, 138, 183, 131, 144, 106, 169, 115,
       176, 138, 213, 246, 156, 129, 184, 120, 111, 102, 101, 11

In [103]:
def rmse(y, y_p):
    se = (y - y_p) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [104]:
rmse(y_val, y_pred)

np.float64(64.60024767754378)