## Homework - Unit 6

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import wget
# import zipfile

In [2]:
data = wget.download('https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv')

In [54]:
df = pd.read_csv(data)
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [55]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [56]:
df = df.drop('student_id', axis=1)

In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [58]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [59]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

In [60]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [61]:
y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

In [62]:
del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [63]:
train_dicts = df_train.fillna(0).to_dict(orient='records')

In [64]:
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

In [65]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

### Question 1

Let's train a decision tree regressor to predict the jamb_score variable.
- Train a model with max_depth=1.

Which feature is used for splitting the data?

In [66]:
dt = DecisionTreeClassifier(max_depth=1)
dt.fit(X_train, y_train)

In [67]:
# Check the feature used for splitting
feature_importance = dt.tree_.feature[0]
feature_name = dv.feature_names_[feature_importance]
print(f"The feature used for the first split is: {feature_name}")

The feature used for the first split is: study_hours_per_week


### Question 2
Train a random forest regressor with these parameters:

- n_estimators=10
- random_state=1
- n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on the validation data?

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

In [69]:
rf = RandomForestClassifier(n_estimators=10,
                            random_state=1,
                            n_jobs=-1)
rf.fit(X_train, y_train)

In [70]:
# Step 2: Predict on the validation data
y_pred = rf.predict(X_val)

# Step 3: Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"The RMSE of the model on the validation data is: {rmse}")

The RMSE of the model on the validation data is: 66.56616257529046


### Question 3
Now let's experiment with the n_estimators parameter

- Try different values of this parameter from 10 to 200 with step 10.
- Set random_state to 1.
- Evaluate the model on the validation dataset.

After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

In [71]:
from sklearn.ensemble import RandomForestRegressor

In [72]:
rmse_values = []

for n in range(10, 201, 10):
    rf_model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    
    rf_model.fit(X_train, y_train)
    
    y_pred = rf_model.predict(X_val)
    
    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)
    rmse_values.append((n, rmse))
    
    print(f"n_estimators: {n}, RMSE: {rmse}")

n_estimators: 10, RMSE: 41.825
n_estimators: 20, RMSE: 41.196
n_estimators: 30, RMSE: 40.857
n_estimators: 40, RMSE: 40.666
n_estimators: 50, RMSE: 40.678
n_estimators: 60, RMSE: 40.604
n_estimators: 70, RMSE: 40.535
n_estimators: 80, RMSE: 40.404
n_estimators: 90, RMSE: 40.377
n_estimators: 100, RMSE: 40.43
n_estimators: 110, RMSE: 40.472
n_estimators: 120, RMSE: 40.522
n_estimators: 130, RMSE: 40.53
n_estimators: 140, RMSE: 40.465
n_estimators: 150, RMSE: 40.436
n_estimators: 160, RMSE: 40.462
n_estimators: 170, RMSE: 40.479
n_estimators: 180, RMSE: 40.482
n_estimators: 190, RMSE: 40.468
n_estimators: 200, RMSE: 40.454


### Question 4
Let's select the best max_depth:

- Try different values of max_depth: [10, 15, 20, 25]
- For each of these values,
    - try different values of n_estimators from 10 till 200 (with step 10)
    - calculate the mean RMSE
- Fix the random seed: random_state=1

What's the best max_depth, using the mean RMSE?

In [82]:
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

mean_rmse_per_depth = {}

for max_depth in max_depth_values:
    rmse_values = []
    
    for n_estimators in n_estimators_values:
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf_model.fit(X_train, y_train)
        
        y_pred = rf_model.predict(X_val)
        
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_values.append(rmse)
    
    # Calculamos el RMSE promedio para este max_depth
    mean_rmse = np.mean(rmse_values)
    mean_rmse_per_depth[max_depth] = mean_rmse
    print(f"max_depth: {max_depth}, Mean RMSE: {mean_rmse}")

max_depth: 10, Mean RMSE: 40.36502295763947
max_depth: 15, Mean RMSE: 40.63976646924758
max_depth: 20, Mean RMSE: 40.68456298423325
max_depth: 25, Mean RMSE: 40.6113622859957


### Question 5
We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

- Train the model with these parameters:
    - n_estimators=10,
    - max_depth=20,
    - random_state=1,
    - n_jobs=-1 (optional)
- Get the feature importance information from this model

What's the most important feature (among these 4)?

In [83]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

feature_importances = rf_model.feature_importances_

# Asociar la importancia con los nombres de las características
feature_importance_dict = dict(zip(dv.feature_names_, feature_importances))

# Ordenar características por importancia en orden descendente
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Mostrar la importancia de las características
print("Feature Importances (sorted):")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.4f}")

Feature Importances (sorted):
study_hours_per_week: 0.2463
attendance_rate: 0.1490
distance_to_school: 0.1349
teacher_quality: 0.0822
age: 0.0698
assignments_completed: 0.0305
socioeconomic_status=High: 0.0257
parent_involvement=High: 0.0220
it_knowledge=High: 0.0176
parent_education_level=Primary: 0.0150
parent_education_level=Secondary: 0.0150
parent_education_level=Tertiary: 0.0142
parent_involvement=Low: 0.0140
it_knowledge=Low: 0.0122
extra_tutorials=Yes: 0.0117
parent_education_level=None: 0.0116
parent_involvement=Medium: 0.0111
it_knowledge=Medium: 0.0110
access_to_learning_materials=No: 0.0109
gender=Female: 0.0106
extra_tutorials=No: 0.0102
access_to_learning_materials=Yes: 0.0101
school_location=Rural: 0.0100
socioeconomic_status=Medium: 0.0099
socioeconomic_status=Low: 0.0098
school_type=Private: 0.0095
gender=Male: 0.0089
school_location=Urban: 0.0083
school_type=Public: 0.0080


### Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

- Install XGBoost
- Create DMatrix for train and validation
- Create a watchlist
- Train a model with these parameters for 100 rounds: [code]

Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

In [84]:
import xgboost as xgb

In [85]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [86]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [88]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)
np.sqrt(mean_squared_error(y_val, y_pred))

43.418817345871766

In [89]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [90]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)
np.sqrt(mean_squared_error(y_val, y_pred))

41.05034017683498