In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [25]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv

--2024-11-04 14:04:54--  https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 391501 (382K) [text/plain]
Saving to: 'jamb_exam_results.csv.3'

     0K .......... .......... .......... .......... .......... 13% 4.75M 0s
    50K .......... .......... .......... .......... .......... 26% 5.01M 0s
   100K .......... .......... .......... .......... .......... 39% 12.5M 0s
   150K .......... .......... .......... .......... .......... 52% 11.2M 0s
   200K .......... .......... .......... .......... .......... 65% 9.92M 0s
   250K .......... .......... .......... .......... .......... 78% 10.4M 0s
   300K .......... .......... .......... .......... .......... 91% 8.38M 0s


### Q1

In [26]:
# Read the exam-results CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='jamb_exam_results.csv')

In [27]:
# Normalise the column names: lowercase, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Drop the 'student_id' column. errors='ignore' keeps this cell re-runnable:
# on a second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

# Fill missing values with zeros
df = df.fillna(0)

In [28]:
# 60/20/20 split: hold out 40% of the rows first, then cut that hold-out in half
# to obtain the validation and test sets (both calls use random_state=1)
df_train, df_temp = train_test_split(
    df,
    random_state=1,
    test_size=0.4,
)
df_val, df_test = train_test_split(
    df_temp,
    random_state=1,
    test_size=0.5,
)

In [29]:
# Target arrays: the jamb_score column of each split
y_train, y_val, y_test = (
    part['jamb_score'].values for part in (df_train, df_val, df_test)
)

# Feature frames: every column of each split except jamb_score
X_train, X_val, X_test = (
    part.drop(columns=['jamb_score']) for part in (df_train, df_val, df_test)
)

In [30]:
# Create the DictVectorizer used to turn record dicts into a sparse feature matrix
dv = DictVectorizer(
    sparse=True,
)

In [31]:
# Build the record dicts straight from the split dataframes rather than from
# X_train/X_val/X_test: those names are rebound to matrices below, so reading them
# here would make the cell fail on a second run (the matrices have no .to_dict()).
train_dicts = df_train.drop(columns=['jamb_score']).to_dict(orient='records')
val_dicts = df_val.drop(columns=['jamb_score']).to_dict(orient='records')
test_dicts = df_test.drop(columns=['jamb_score']).to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse it for val/test
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

In [32]:
# Fit a regression tree limited to depth 1 on the training matrix
dt = DecisionTreeRegressor(
    random_state=1,
    max_depth=1,
)
dt.fit(X_train, y_train)

In [33]:
# Feature index stored for node 0 of the fitted tree, translated to its
# column name through the vectorizer's feature list
root_feature_index = dt.tree_.feature[0]
split_feature = dv.feature_names_[root_feature_index]
split_feature

'study_hours_per_week'

### Q2

In [34]:
# Fit a 10-tree random forest on the training matrix, using all available cores
rf = RandomForestRegressor(
    n_estimators=10,
    n_jobs=-1,
    random_state=1,
)
rf.fit(X_train, y_train)

In [35]:
# Score the validation matrix with the fitted forest
y_pred = rf.predict(X=X_val)

In [36]:
# RMSE on the validation set: square root of the mean squared error
rmse = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_pred)
)
rmse

np.float64(43.157758977963624)

### Q3

In [37]:
# Empty container that will collect one RMSE entry per n_estimators setting
rmse_values = list()

In [38]:
# Start from an empty list inside this cell so that re-running it does not
# append a second set of results after the previous run's values.
rmse_values = []

# Sweep n_estimators from 10 to 200 in steps of 10
for n in range(10, 201, 10):
    # Train a Random Forest Regressor with the current n_estimators
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = rf.predict(X_val)

    # Record (n_estimators, validation RMSE) for this setting
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values.append((n, rmse))

In [39]:
# Format one report line per (n_estimators, RMSE) pair, then print them in order
report_lines = (f"n_estimators={n}, RMSE={rmse:.3f}" for n, rmse in rmse_values)
for report_line in report_lines:
    print(report_line)

n_estimators=10, RMSE=43.158
n_estimators=20, RMSE=41.790
n_estimators=30, RMSE=41.556
n_estimators=40, RMSE=41.076
n_estimators=50, RMSE=40.957
n_estimators=60, RMSE=40.774
n_estimators=70, RMSE=40.588
n_estimators=80, RMSE=40.503
n_estimators=90, RMSE=40.435
n_estimators=100, RMSE=40.365
n_estimators=110, RMSE=40.348
n_estimators=120, RMSE=40.302
n_estimators=130, RMSE=40.286
n_estimators=140, RMSE=40.263
n_estimators=150, RMSE=40.254
n_estimators=160, RMSE=40.200
n_estimators=170, RMSE=40.187
n_estimators=180, RMSE=40.136
n_estimators=190, RMSE=40.152
n_estimators=200, RMSE=40.138


### Q4

In [40]:
# Empty mapping that will hold max_depth -> mean validation RMSE
mean_rmse_per_depth = dict()

In [41]:
# For each max_depth, average the validation RMSE over n_estimators = 10, 20, ..., 200
for depth in [10, 15, 20, 25]:
    # Use a name distinct from `rmse_values`: that global holds the (n, rmse) tuples
    # of the n_estimators sweep, and overwriting it with bare floats would break the
    # cell that prints those tuples if it is run again after this one.
    depth_rmse_values = []

    # Loop over n_estimators from 10 to 200 in increments of 10
    for n in range(10, 201, 10):
        # Train the Random Forest Regressor with the current max_depth and n_estimators
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        # Predict on the validation set
        y_pred = rf.predict(X_val)

        # Calculate RMSE and add it to the list for the current depth
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        depth_rmse_values.append(rmse)

    # Calculate mean RMSE for the current max_depth and store it
    mean_rmse = np.mean(depth_rmse_values)
    mean_rmse_per_depth[depth] = mean_rmse
    print(f"max_depth={depth}, mean RMSE={mean_rmse:.3f}")


max_depth=10, mean RMSE=40.138
max_depth=15, mean RMSE=40.644
max_depth=20, mean RMSE=40.610
max_depth=25, mean RMSE=40.688


In [42]:
# Pick the (depth, mean RMSE) pair with the smallest mean RMSE and keep its depth
best_max_depth = min(
    mean_rmse_per_depth.items(),
    key=lambda depth_and_rmse: depth_and_rmse[1],
)[0]
best_max_depth

10

### Q5

In [43]:
# Fit a forest of 10 trees capped at depth 20 on the training matrix
rf = RandomForestRegressor(
    max_depth=20,
    n_estimators=10,
    n_jobs=-1,
    random_state=1,
)
rf.fit(X_train, y_train)

In [44]:
# Importance scores exposed by the fitted forest
# (paired with dv.feature_names_ further down)
feature_importances = rf.feature_importances_

In [45]:
# Pair each vectorizer feature name with its importance score
feature_importance_dict = {
    name: score
    for name, score in zip(dv.feature_names_, feature_importances)
}

In [46]:
# Report the importance of the four features of interest (0 when a name is absent)
features_of_interest = (
    'study_hours_per_week',
    'attendance_rate',
    'distance_to_school',
    'teacher_quality',
)
for feature in features_of_interest:
    print(f"{feature}: {feature_importance_dict.get(feature, 0):.4f}")

study_hours_per_week: 0.2541
attendance_rate: 0.1521
distance_to_school: 0.1358
teacher_quality: 0.0817


In [47]:
# Identify the most important feature among the specified features.
# A missing name is scored as 0, the same default the printing cell uses; a bare
# `.get` would return None for it and make max() raise TypeError on comparison.
candidate_features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
most_important_feature = max(
    candidate_features,
    key=lambda name: feature_importance_dict.get(name, 0),
)
most_important_feature

'study_hours_per_week'

### Q6

In [48]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
    --------------------------------------- 2.9/124.9 MB 13.9 MB/s eta 0:00:09
   - -------------------------------------- 5.5/124.9 MB 14.0 MB/s eta 0:00:09
   -- ------------------------------------- 9.2/124.9 MB 15.4 MB/s eta 0:00:08
   --- ------------------------------------ 12.3/124.9 MB 15.4 MB/s eta 0:00:08
   ---- ----------------------------------- 15.5/124.9 MB 15.4 MB/s eta 0:00:08
   ------ --------------------------------- 19.4/124.9 MB 16.1 MB/s eta 0:00:07
   ------- -------------------------------- 22.5/124.9 MB 16.4 MB/s eta 0:00:07
   ------- -------------------------------- 22.5/124.9 MB 16.4 MB/s eta 0:00:07
   ------- -------------------------------- 23.1/124.9 MB 12.8 MB/s eta 0:00:08
   -------- ------------------------------- 25.4/124.9 MB 12.5 M

In [49]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

In [50]:
# Wrap the training and validation matrices, with their targets, as DMatrix objects
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

In [51]:
# Datasets evaluated during training, each paired with the label it is reported under
watchlist = [
    (dtrain, 'train'),
    (dval, 'eval'),
]

In [52]:
# Booster parameters for the run with learning rate eta=0.3
xgb_params_03 = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

In [53]:
# Boost for up to 100 rounds with the eta=0.3 parameters, monitoring the watchlist
# with early_stopping_rounds=10 and per-round logging switched off
model_03 = xgb.train(
    params=xgb_params_03,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=False,
)

In [54]:
# Validation predictions and RMSE for the eta=0.3 model
y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_pred_03)
)

In [55]:
# Same parameters as the eta=0.3 run, with only the learning rate changed to 0.1
xgb_params_01 = {**xgb_params_03, 'eta': 0.1}

In [56]:
# Boost for up to 100 rounds with the eta=0.1 parameters, monitoring the watchlist
# with early_stopping_rounds=10 and per-round logging switched off
model_01 = xgb.train(
    params=xgb_params_01,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=False,
)

In [57]:
# Validation predictions and RMSE for the eta=0.1 model
y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_pred_01)
)

In [58]:
# Show the validation RMSE of both learning rates, one per line
print(
    f"RMSE with eta=0.3: {rmse_03:.3f}",
    f"RMSE with eta=0.1: {rmse_01:.3f}",
    sep="\n",
)

RMSE with eta=0.3: 41.160
RMSE with eta=0.1: 40.257
