In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

In [2]:
# !curl -O https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

In [3]:
df = pd.read_csv("jamb_exam_results.csv")
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [6]:
df.drop(columns=["student_id"]).head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


In [14]:
df.fillna(0).head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [9]:
train_full, test = train_test_split(df, test_size=0.2, random_state=1)
train, val = train_test_split(train_full, test_size=0.25, random_state=1)

In [10]:
dv = DictVectorizer(sparse=True)

In [None]:
train_dict = train.to_dict(orient="records")
val_dict = val.to_dict(orient="records")
test_dict = test.to_dict(orient="records")

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

In [15]:
X_train_dict = train.drop(columns=["jamb_score"]).to_dict(orient="records")
X_train_encoded = dv.fit_transform(X_train_dict)

y_train = train["jamb_score"]

dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train_encoded, y_train)

feature_index = dt.tree_.feature[0]
splitting_feature = dv.feature_names_[feature_index]

splitting_feature

'study_hours_per_week'

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

rf.fit(X_train_encoded, y_train)

X_val_dict = val.drop(columns=["jamb_score"]).to_dict(orient="records")
X_val_encoded = dv.transform(X_val_dict)
y_val = val["jamb_score"]
y_pred = rf.predict(X_val_encoded)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))

rmse


41.29965036171614

In [17]:
rmse_values = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train_encoded, y_train)  
    
    y_pred = rf.predict(X_val_encoded)
    
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values.append((n, rmse))
    print(f"n_estimators={n}, RMSE={rmse:.3f}")

for i in range(1, len(rmse_values)):
    if abs(rmse_values[i][1] - rmse_values[i-1][1]) < 0.001:
        {rmse_values[i-1][0]}
        break


n_estimators=10, RMSE=41.300
n_estimators=20, RMSE=41.124
n_estimators=30, RMSE=40.775
n_estimators=40, RMSE=40.487
n_estimators=50, RMSE=40.419
n_estimators=60, RMSE=40.513
n_estimators=70, RMSE=40.574
n_estimators=80, RMSE=40.538
n_estimators=90, RMSE=40.548
n_estimators=100, RMSE=40.580
n_estimators=110, RMSE=40.574
n_estimators=120, RMSE=40.562
n_estimators=130, RMSE=40.551
n_estimators=140, RMSE=40.558
n_estimators=150, RMSE=40.542
n_estimators=160, RMSE=40.546
n_estimators=170, RMSE=40.565
n_estimators=180, RMSE=40.544
n_estimators=190, RMSE=40.480
n_estimators=200, RMSE=40.465


In [20]:
max_depth_values = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)

mean_rmse_per_depth = {}

for max_depth in max_depth_values:
    rmses = []
    for n_estimators in n_estimators_range:
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        
        y_pred = rf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        rmses.append(rmse)
    
    mean_rmse_per_depth[max_depth] = np.mean(rmses)

best_max_depth = min(mean_rmse_per_depth, key=mean_rmse_per_depth.get)
best_rmse = mean_rmse_per_depth[best_max_depth]

best_max_depth
best_rmse


ValueError: could not convert string to float: 'Public'