In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
import optuna
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature


In [19]:
data = pd.read_csv("data/train.csv")
data2 = pd.read_csv("data/Exam_Score_Prediction.csv")
print(f"data = {data.shape}")
print(f"data2 = {data2.shape}")

data = (630000, 13)
data2 = (20000, 13)


In [20]:
data.head()

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


In [21]:
data2.head()

Unnamed: 0,student_id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,1,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,58.9
1,2,23,other,bca,3.37,64.8,yes,4.6,average,online videos,medium,moderate,54.8
2,3,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,90.3
3,4,20,other,diploma,0.67,48.4,yes,5.8,average,online videos,low,moderate,29.7
4,5,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,43.7


In [22]:
data.drop("id", axis=1, inplace=True)
data2.drop("student_id", axis=1, inplace=True)

In [23]:
df = pd.concat([data, data2], axis=0)
print(f"df = {df.shape}")

df = (650000, 12)


In [24]:
df.head()

Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


# Encoding 

In [25]:
category_columns = [col for col in df.columns if df[col].dtype == "O"]

ohe_encode = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
encode_columns = ohe_encode.fit_transform(df[category_columns])
encode_columns = pd.DataFrame(encode_columns, columns=ohe_encode.get_feature_names_out(category_columns), index=df.index)
encode_columns.head()

Unnamed: 0,gender_male,gender_other,course_b.sc,course_b.tech,course_ba,course_bba,course_bca,course_diploma,internet_access_yes,sleep_quality_good,sleep_quality_poor,study_method_group study,study_method_mixed,study_method_online videos,study_method_self-study,facility_rating_low,facility_rating_medium,exam_difficulty_hard,exam_difficulty_moderate
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [26]:
df = df.drop(category_columns, axis=1)
df = pd.concat([df, encode_columns], axis=1)
df.head()

Unnamed: 0,age,study_hours,class_attendance,sleep_hours,exam_score,gender_male,gender_other,course_b.sc,course_b.tech,course_ba,...,sleep_quality_good,sleep_quality_poor,study_method_group study,study_method_mixed,study_method_online videos,study_method_self-study,facility_rating_low,facility_rating_medium,exam_difficulty_hard,exam_difficulty_moderate
0,21,7.91,98.8,4.9,78.3,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,18,4.95,94.8,4.7,46.7,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,20,4.68,92.6,5.8,99.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,19,2.0,49.5,8.3,63.9,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,23,7.65,86.9,9.6,100.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Modelling

In [27]:
X = df.drop("exam_score", axis=1)
y = df["exam_score"]

In [28]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2 , random_state=15)

## Model Comparison

In [29]:
result = []
random_s = 15
models = {
    "Decision Tree Regressor" : DecisionTreeRegressor(random_state=random_s),
    "Random Forest Regressor" : RandomForestRegressor(n_jobs=-1, random_state=random_s),
    "LightGBM Regressor": LGBMRegressor(random_state=random_s),
    "XGBoost Regressor": XGBRegressor(eval_metric="rmse", objective="reg:squarederror")
}

for model_name, model in models.items():
    
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring="neg_root_mean_squared_error")
    
    rmse = - scores
    result.append({"Model":model_name,
                   "RMSE mean": rmse.mean(),
                   "RMSE std": rmse.std()})

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 609
[LightGBM] [Info] Number of data points in the train set: 416000, number of used features: 23
[LightGBM] [Info] Start training from score 62.492317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 607
[LightGBM] [Info] Number of data points in the train set: 416000, number of used features: 23
[LightGBM] [Info] Start training from score 62.524603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003677 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [30]:
pd.DataFrame(result).sort_values(by="RMSE mean")

Unnamed: 0,Model,RMSE mean,RMSE std
2,LightGBM Regressor,8.847872,0.011802
3,XGBoost Regressor,8.851408,0.00987
1,Random Forest Regressor,9.198193,0.011645
0,Decision Tree Regressor,12.95329,0.015254


## Hyperparameter Tuning (LightGBM)

In [None]:
def objective_lgbm(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'num_leaves': trial.suggest_int('num_leaves',30, 255 ),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'metric' : 'rmse',
        'verbosity': -1,
        'random_state' : 15
        
    }

    model= LGBMRegressor(**params)

    scores = cross_val_score(model,
                             X,
                             y,
                             cv=3,
                             scoring="neg_root_mean_squared_error")
    
    rmse = - scores
    mean_rmse = rmse.mean()
    rmse_cv_std = rmse.std()

    trial.set_user_attr("rmse_std", rmse_cv_std)
    

    return mean_rmse


study_lgbm = optuna.create_study(direction='minimize')
optuna.logging.set_verbosity(optuna.logging.INFO)
study_lgbm.optimize(objective_lgbm, n_trials=30)


[32m[I 2026-01-24 15:02:20,269][0m A new study created in memory with name: no-name-6bfdff8d-f908-42cd-a8d1-1d799cae200a[0m
[32m[I 2026-01-24 15:03:21,295][0m Trial 0 finished with value: 8.812125947701395 and parameters: {'max_depth': 12, 'learning_rate': 0.014426360804159263, 'n_estimators': 1101, 'num_leaves': 76, 'min_child_samples': 19}. Best is trial 0 with value: 8.812125947701395.[0m
[32m[I 2026-01-24 15:04:57,230][0m Trial 1 finished with value: 8.792144579124008 and parameters: {'max_depth': 6, 'learning_rate': 0.033222102442464076, 'n_estimators': 1629, 'num_leaves': 182, 'min_child_samples': 44}. Best is trial 1 with value: 8.792144579124008.[0m
[32m[I 2026-01-24 15:05:27,880][0m Trial 2 finished with value: 8.799562486605305 and parameters: {'max_depth': 11, 'learning_rate': 0.052742864907281264, 'n_estimators': 727, 'num_leaves': 69, 'min_child_samples': 48}. Best is trial 1 with value: 8.792144579124008.[0m
[32m[I 2026-01-24 15:06:45,454][0m Trial 3 finishe

In [None]:
print(f"best trial: {study_lgbm.best_trial}")
print(f"best value: {study_lgbm.best_value}")
print(f"best params: {study_lgbm.best_params}")

best trial: FrozenTrial(number=24, state=<TrialState.COMPLETE: 1>, values=[8.78818034127626], datetime_start=datetime.datetime(2026, 1, 24, 15, 35, 3, 54261), datetime_complete=datetime.datetime(2026, 1, 24, 15, 36, 14, 67430), params={'max_depth': 4, 'learning_rate': 0.07039468177158607, 'n_estimators': 1982, 'num_leaves': 211, 'min_child_samples': 57}, user_attrs={'rmse_std': np.float64(0.04673255522689491)}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.005, step=None), 'n_estimators': IntDistribution(high=2000, log=False, low=500, step=1), 'num_leaves': IntDistribution(high=255, log=False, low=30, step=1), 'min_child_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=24, value=None)
best value: 8.78818034127626
best params: {'max_depth': 4, 'learning_rate': 0.07039468177158607, 'n_estimators': 1982, 'num_leaves': 211, 'min_chi

### Test Data Enoding

In [33]:
test = pd.read_csv("data/test.csv")
test_id = test["id"]
test.drop("id", axis=1, inplace=True)


test_cat_columns = [col for col in test.columns if test[col].dtype == "O"]

test_encode_cat_col =ohe_encode.transform(test[test_cat_columns])
test_encode_cat_col = pd.DataFrame(test_encode_cat_col, columns=ohe_encode.get_feature_names_out(test_cat_columns), index=test.index)


test.drop(test_cat_columns, axis=1, inplace=True)
test_data = pd.concat([test, test_encode_cat_col], axis=1)

X_features = X.columns
test_data.reindex(columns=X_features, fill_value=0)
test_data.head()

Unnamed: 0,age,study_hours,class_attendance,sleep_hours,gender_male,gender_other,course_b.sc,course_b.tech,course_ba,course_bba,...,sleep_quality_good,sleep_quality_poor,study_method_group study,study_method_mixed,study_method_online videos,study_method_self-study,facility_rating_low,facility_rating_medium,exam_difficulty_hard,exam_difficulty_moderate
0,24,6.85,65.2,5.2,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18,6.61,45.0,9.3,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,24,6.6,98.5,6.2,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,24,3.03,66.3,5.7,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,20,2.03,42.4,9.2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


## Predict

In [44]:
final_model = LGBMRegressor(**study.best_params,
    random_state=15).fit(X,y)

y_pred = final_model.predict(test_data)

In [45]:
mlflow.set_experiment("Kaggle_exam_score_regression")
signature = infer_signature(X[:1], final_model.predict(X[:1]))


with mlflow.start_run(run_name="Student exam Score pred - LightGBM"):
    mlflow.log_param("n_trials", len(study.trials))
    mlflow.log_params(study.best_params)
    mlflow.log_metric("rmse_cv_mean",study.best_value)
    mlflow.log_metric(
        "rmse_cv_std",
        study.best_trial.user_attrs["rmse_std"]
    )
    mlflow.set_tag("Model_type", final_model.__class__.__name__)
    mlflow.set_tag("tuning", "optuna")
    mlflow.sklearn.log_model(sk_model=final_model, artifact_path="LightGBM_Regressor", signature=signature)




## Submission File

In [47]:
submission = pd.DataFrame({
    "id": test_id,
    "exam_score": y_pred
})

submission.to_csv("submission3.csv", index=False)