In [232]:
import pandas as pd
import requests
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text

In [233]:
# Getting the data 1
url = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'
dataset_path = '../datasets/'
response = requests.get(url)
with open(f'{dataset_path}jamb_exam_results.csv', 'wb') as file:
    for chunk in response.iter_content(chunk_size=1024):
        file.write(chunk)

In [234]:
# Preparing the dataset
df = pd.read_csv(f'{dataset_path}jamb_exam_results.csv')
# First, let's make the names lowercase:
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Remove the student_id column.
del df['student_id']
# Fill missing values with zeros.
df = df.fillna(0)

In [235]:
# Use the train_test_split function and set the random_state parameter to 1.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df), len(df_full_train), len(df_train), len(df_test), len(df_val)

(5000, 4000, 3000, 1000, 1000)

In [236]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
dv = DictVectorizer(sparse=True)

In [237]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [238]:
y_train = df_train.jamb_score.astype('int').values
y_val = df_val.jamb_score.astype('int').values
y_test = df_test.jamb_score.astype('int').values

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [239]:
df_train

Unnamed: 0,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,20,72,3,4.4,Public,Urban,No,Yes,Medium,Low,21,Female,Low,0,3
1,11,80,2,3.3,Public,Urban,Yes,Yes,Medium,High,22,Female,Medium,Secondary,1
2,31,82,1,8.3,Public,Urban,Yes,Yes,Low,High,19,Female,High,Tertiary,2
3,29,79,1,15.8,Public,Rural,Yes,Yes,Low,Low,19,Male,Low,Primary,2
4,28,96,2,8.9,Private,Rural,Yes,Yes,Medium,Low,19,Male,High,Secondary,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1,79,3,12.2,Public,Urban,No,No,Low,High,17,Male,High,Secondary,1
2996,3,87,1,9.6,Public,Urban,No,No,Medium,Medium,17,Male,Medium,Primary,1
2997,17,96,4,13.7,Private,Urban,No,Yes,High,Medium,16,Male,Medium,Primary,2
2998,25,74,2,3.4,Public,Rural,No,No,High,High,21,Male,Low,Secondary,1


In [240]:
# Question 1
# Let's train a decision tree regressor to predict the jamb_score variable.
# 
# Train a model with max_depth=1.
# Which feature is used for splitting the data?
# 
# study_hours_per_week
# attendance_rate
# teacher_quality
# distance_to_school

In [241]:
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [242]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [243]:
dt = DecisionTreeRegressor() 

In [244]:
dt.fit(X_train, y_train)

In [245]:
y_pred = dt.predict(X_val)

In [246]:
y_pred

array([292., 125., 207., 117., 237., 287., 214., 149., 118., 120., 237.,
       170., 155., 166., 116., 100., 161., 133., 216., 121., 175., 218.,
       215., 205., 216., 167., 188., 156., 187., 152., 151., 208., 135.,
       194., 242., 137., 188., 227., 234., 141., 148., 167., 246., 185.,
       176., 108., 185., 176., 125., 220., 187., 239., 143., 246., 248.,
       217., 219., 134., 119., 148., 215., 219., 130., 203., 273., 130.,
       272., 210., 139., 116., 186., 118., 263., 190., 174., 150., 273.,
       203., 100., 114., 221., 210., 115., 226., 172., 104., 148., 111.,
       100., 183., 129., 177., 225., 176., 115., 119., 148., 193., 199.,
       106., 101., 233., 133., 250., 118., 189., 260., 192., 231., 136.,
       148., 207., 218., 116., 288., 100., 227., 226., 235., 118., 172.,
       115., 210., 191., 253., 184., 124., 247., 164., 191., 103., 139.,
       115., 231., 172., 157., 132., 243., 224., 184., 142., 194., 183.,
       246., 184., 150., 199., 236., 191., 158., 16

In [251]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- study_hours_per_week <= 18.50
|   |--- teacher_quality <= 2.50
|   |   |--- attendance_rate <= 89.50
|   |   |   |--- study_hours_per_week <= 14.50
|   |   |   |   |--- distance_to_school <= 6.75
|   |   |   |   |   |--- attendance_rate <= 83.50
|   |   |   |   |   |   |--- age <= 21.50
|   |   |   |   |   |   |   |--- gender=Male <= 0.50
|   |   |   |   |   |   |   |   |--- distance_to_school <= 0.45
|   |   |   |   |   |   |   |   |   |--- study_hours_per_week <= 5.50
|   |   |   |   |   |   |   |   |   |   |--- value: [101.00]
|   |   |   |   |   |   |   |   |   |--- study_hours_per_week >  5.50
|   |   |   |   |   |   |   |   |   |   |--- socioeconomic_status=Medium <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- value: [223.00]
|   |   |   |   |   |   |   |   |   |   |--- socioeconomic_status=Medium >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- value: [167.00]
|   |   |   |   |   |   |   |   |--- distance_to_school >  0.45
|   |   |   |   |   |   |   |   |

In [ ]:
# Question 2
# Train a random forest regressor with these parameters:
# 
# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on the validation data?
# 
# 22.13
# 42.13
# 62.13
# 82.12