In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline

import sys
from pathlib import Path

ROOT_DIR = Path.cwd().parents[0]
sys.path.append(str(ROOT_DIR))

from src.preprocessing import processing_data


# Read the raw competition tables; the 'id' column becomes the index of both frames.
train_data = pd.read_csv("../data/raw/train.csv", index_col='id')
X_test = pd.read_csv("../data/raw/test.csv", index_col='id')

# Separate the features from the regression target.
X = train_data.drop(columns="exam_score")
y = train_data["exam_score"]

# Feature names grouped by storage dtype:
#   cat_col -> object-typed columns, num_col -> int64 / float64 columns.
cat_col = [name for name, dtype in X.dtypes.items() if dtype == 'object']
num_col = [
    name
    for name, dtype in X.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]

# Hand-picked split of the categorical features into ordered and unordered groups.
nominal_col = ["gender", "course", "sleep_quality", "study_method"]
ordinal_col = ["internet_access", "facility_rating", "exam_difficulty"]

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Project-local helper (src.preprocessing); presumably returns an unfitted
# transformer built from the three column groups — verify against its source.
preprocess_trf = processing_data(num_col, ordinal_col, nominal_col)



In [None]:
# Multi-seed out-of-fold (OOF) training of the HistGradientBoosting model.
#
# For every seed, a 5-fold CV pass produces:
#   * oof_hgb  - one prediction per training row, made by the fold model that
#                did not see that row;
#   * test_hgb - test-set predictions averaged over the five fold models.
# The per-seed arrays are collected in oof_hgb_seeds / test_hgb_seeds.

seeds = [1, 7, 42]

# The fold assignment is fixed (random_state=42) and shared by every seed;
# only the model's own random_state changes between seeds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameters common to every fold and seed.
hgb_params = dict(
    learning_rate=0.03,
    max_iter=1200,
    max_leaf_nodes=64,
    max_depth=6,
    min_samples_leaf=20,
    max_features=0.7,
    l2_regularization=0.3,
    # Early stopping holds out 10% of each training fold internally.
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=50,
)

oof_hgb_seeds = []
test_hgb_seeds = []

for seed in seeds:
    print(f"\nRunning OOF HGB with seed {seed}")

    oof_hgb = np.zeros(len(X))
    test_hgb = np.zeros(len(X_test))

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        hgb = HistGradientBoostingRegressor(**hgb_params, random_state=seed)

        # The preprocessor is refit on the current training fold as part of
        # Pipeline.fit, so validation rows never influence its statistics.
        steps = [
            ("preprocess", preprocess_trf),
            ("model", hgb),
        ]
        hgb_pipe = Pipeline(steps)
        hgb_pipe.fit(X_tr, y_tr)

        # Fill this fold's slot of the OOF vector.
        oof_hgb[val_idx] = hgb_pipe.predict(X_val)

        # Accumulate this fold's share of the fold-averaged test prediction.
        test_hgb += hgb_pipe.predict(X_test) / kf.n_splits

    rmse = root_mean_squared_error(y, oof_hgb)
    print(f"Seed {seed} OOF RMSE: {rmse:.5f}")

    oof_hgb_seeds.append(oof_hgb)
    test_hgb_seeds.append(test_hgb)



Running OOF HGB with seed 1
Seed 1 OOF RMSE: 8.76140

Running OOF HGB with seed 7
Seed 7 OOF RMSE: 8.76034

Running OOF HGB with seed 42
Seed 42 OOF RMSE: 8.76196


In [None]:
# Blend the seeds: element-wise mean of the per-seed OOF prediction vectors,
# then score the blended vector against the true target.
oof_hgb_final = np.asarray(oof_hgb_seeds).mean(axis=0)

final_oof_rmse = root_mean_squared_error(y, oof_hgb_final)
print("Final multi-seed OOF RMSE:", final_oof_rmse)


Final multi-seed OOF RMSE: 8.752358336772186


In [9]:
# Build the submission file from the seed-averaged test predictions.
sample_submission = pd.read_csv("../data/raw/sample_submission.csv")

# Element-wise mean of the per-seed test prediction vectors.
test_hgb_final = np.mean(test_hgb_seeds, axis=0)

submission = sample_submission.copy()
# Positional assignment: assumes the rows of sample_submission.csv are in the
# same order as test.csv — TODO confirm.
submission["exam_score"] = test_hgb_final

# NOTE(review): "sumissions" looks like a typo for "submissions"; the path is
# left unchanged here because the directory may really be named this way.
submission_path = Path("../sumissions/submission_hgb_oof_multiseed.csv")

# to_csv does not create missing directories and raises OSError if the target
# folder is absent, so make sure it exists before writing.
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission.to_csv(submission_path, index=False)