In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline

import sys
from pathlib import Path

ROOT_DIR = Path.cwd().parents[0]
sys.path.append(str(ROOT_DIR))

from src.preprocessing import processing_data



# --- Load the raw train/test CSVs; the `id` column is used as the index ---
train_data = pd.read_csv("../data/raw/train.csv", index_col='id')
X_test = pd.read_csv("../data/raw/test.csv", index_col='id')

# Split the training frame into the regression target and the predictors.
y = train_data["exam_score"]
X = train_data.drop(columns="exam_score")

# Bucket predictor names by storage dtype:
#   object columns        -> cat_col
#   int64/float64 columns -> num_col
feature_dtypes = X.dtypes
cat_col = [name for name, dtype in feature_dtypes.items() if dtype == 'object']
num_col = [
    name
    for name, dtype in feature_dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]

# Hand-maintained column groups handed to the project's preprocessing builder.
# NOTE(review): how `processing_data` treats each group is defined in
# src.preprocessing, not in this notebook — confirm there.
ordinal_col = ["internet_access", "facility_rating", "exam_difficulty"]
nominal_col = ["gender", "course", "sleep_quality", "study_method"]

# 80/20 hold-out split with a fixed seed.
# NOTE(review): none of the cells visible here read X_train/X_valid/y_train/
# y_valid (nor cat_col); the models below are evaluated with `kf` on the full
# X/y — confirm whether this split is still needed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Preprocessing transformer built by the project helper from the column groups.
preprocess_trf = processing_data(num_col, ordinal_col, nominal_col)

# Shuffled 5-fold splitter with a fixed seed, reused by the model cells so that
# their out-of-fold predictions come from the same partitioning.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Ridge regression evaluated with K-fold cross-validation.
#
# oof_ridge  : out-of-fold prediction for every training row (each row is
#              predicted by the one fold model that did not train on it).
# test_ridge : test-set prediction averaged over the fold models.

oof_ridge = np.zeros(len(X))
test_ridge = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    # Positional slices for this fold's training and validation rows.
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # A fresh Ridge model per fold; the preprocessing step is the shared
    # `preprocess_trf` object, which the pipeline re-fits on this fold's
    # training rows.
    ridge = Ridge(alpha=0.05, random_state=42)
    ridge_pipe = Pipeline(
        steps=[("preprocess", preprocess_trf), ("model", ridge)]
    )
    ridge_pipe.fit(X_tr, y_tr)

    # Fill in this fold's validation rows, then add this fold's share of the
    # test prediction (1 / n_splits of it) to the running average.
    oof_ridge[val_idx] = ridge_pipe.predict(X_val)
    test_ridge += ridge_pipe.predict(X_test) / kf.n_splits

# Single RMSE over all out-of-fold predictions.
print("OOF Ridge RMSE:", root_mean_squared_error(y, oof_ridge))

OOF Ridge RMSE: 8.894825663262274


In [None]:
# HistGradientBoostingRegressor evaluated with the same K-fold splitter.
#
# oof_hgb  : out-of-fold prediction for every training row.
# test_hgb : test-set prediction averaged over the fold models.

# Hyper-parameters shared by every fold's model. Early stopping is enabled:
# the estimator holds out 10% of the data it is fitted on and stops after
# 50 iterations without improvement (capped at 1200 iterations).
hgb_params = dict(
    learning_rate=0.03,
    max_iter=1200,
    max_leaf_nodes=64,
    max_depth=6,
    min_samples_leaf=20,
    max_features=0.7,
    l2_regularization=0.3,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=50,
    random_state=7,
)

oof_hgb = np.zeros(len(X))
test_hgb = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    # Positional slices for this fold's training and validation rows.
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # New estimator per fold, wrapped with the shared preprocessing step.
    hgb = HistGradientBoostingRegressor(**hgb_params)
    hgb_pipe = Pipeline(
        steps=[("preprocess", preprocess_trf), ("model", hgb)]
    )
    hgb_pipe.fit(X_tr, y_tr)

    # Store the out-of-fold predictions and accumulate this fold's share of
    # the averaged test prediction.
    oof_hgb[val_idx] = hgb_pipe.predict(X_val)
    test_hgb += hgb_pipe.predict(X_test) / kf.n_splits

# Single RMSE over all out-of-fold predictions.
print("OOF HGB RMSE:", root_mean_squared_error(y, oof_hgb))

OOF HGB RMSE: 8.760338247300833


In [None]:
# Sweep a few Ridge/HGB blend weights and report the out-of-fold RMSE of each
# convex combination: w is the Ridge share, (1 - w) the HGB share.
ridge_weights = [0.1, 0.2, 0.25, 0.3, 0.35]

for w in ridge_weights:
    oof_ens = w * oof_ridge + (1 - w) * oof_hgb
    rmse = root_mean_squared_error(y, oof_ens)
    print(f"Ridge weight {w:.2f} → OOF RMSE {rmse:.5f}")

Ridge weight 0.10 → OOF RMSE 8.76262
Ridge weight 0.20 → OOF RMSE 8.76656
Ridge weight 0.25 → OOF RMSE 8.76955
Ridge weight 0.30 → OOF RMSE 8.77321
Ridge weight 0.35 → OOF RMSE 8.77755


In [3]:
# Build the submission file from the fold-averaged HGB test predictions.
final_test_pred = test_hgb

# The sample file supplies the submission layout; the predictions are attached
# positionally as the `exam_score` column.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as test.csv — confirm.
sample_submission = pd.read_csv("../data/raw/sample_submission.csv")
submission = sample_submission.assign(exam_score=final_test_pred)

# NOTE(review): the output directory is spelled "sumissions" — confirm this
# matches the folder name on disk (to_csv does not create missing directories).
submission.to_csv("../sumissions/submission_oof_hgb.csv", index=False)