In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [None]:
# https://docs.google.com/spreadsheets/d/1CAzsKw0GvI7la2iWbsViBGUTJILOjTYagwxvAvGtatg/edit?usp=sharing

import os
import json
import random
import re
from tqdm.auto import tqdm
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import numpy as np
import torch

import category_encoders as ce
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, accuracy_score
import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import optuna

In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed shared by Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).

    Side effects:
        Writes the ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG``
        environment variables, puts cuDNN into deterministic / non-benchmark
        mode, leaves ``torch.use_deterministic_algorithms`` switched off, and
        prints a confirmation line.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding entry point takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Fixed cuDNN kernels, no auto-tuner picking algorithms per run.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Strict determinism checks stay disabled; only the cuBLAS workspace is pinned.
    torch.use_deterministic_algorithms(False)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    print(f"Random seed set to {seed}")

In [None]:
# Single seed reused for RNG seeding, the XGBoost models and the Optuna sampler.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

# Hide UserWarning messages and keep Optuna from logging every trial.
warnings.filterwarnings("ignore", category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Torch device: CUDA when available, CPU otherwise.
# NOTE(review): the XGBoost parameters visible in this notebook leave "device"
# commented out, so DEVICE does not appear to drive model training — confirm.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")

Random seed set to 42
Device: cuda


In [None]:
# Root folders on the mounted Google Drive: raw inputs and generated artifacts.
INPUT_ROOT = "/content/drive/MyDrive/DataFlow2026/data"
WORK_DIR = "/content/drive/MyDrive/DataFlow2026/working"

# Input CSV files read by the loading cell.
ACADEMIC_CSV = f"{INPUT_ROOT}/academic_records.csv"
ADMISSION_CSV = f"{INPUT_ROOT}/admission.csv"
TEST_CSV = f"{INPUT_ROOT}/test.csv"

# Destination of the final prediction file.
SUBMISSION_CSV = f"{WORK_DIR}/submission.csv"

In [None]:
# Mount Google Drive so the /content/drive paths in INPUT_ROOT and WORK_DIR resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raw inputs: one admission row per student, one academic row per student-semester.
admission_df = pd.read_csv(ADMISSION_CSV)
academic_df = pd.read_csv(ACADEMIC_CSV)

# Chronological list of semesters; the position in this list is the semester index.
semester_order = [
    "HK1 2020-2021", "HK2 2020-2021",
    "HK1 2021-2022", "HK2 2021-2022",
    "HK1 2022-2023", "HK2 2022-2023",
    "HK1 2023-2024", "HK2 2023-2024",
    "HK1 2024-2025"
]
semester_mapping = dict(zip(semester_order, range(len(semester_order))))

# Attach admission data, index the semesters, then keep one row per
# (student, semester): after the first sort the surviving duplicate is the one
# with the lowest TC_DANGKY, ties broken by highest TC_HOANTHANH and then GPA.
student_df = (
    academic_df
    .merge(admission_df, on="MA_SO_SV", how="left")
    .assign(SEMESTER_INDEX=lambda frame: frame["HOC_KY"].map(semester_mapping))
    .sort_values(
        by=["MA_SO_SV", "HOC_KY", "TC_DANGKY", "TC_HOANTHANH", "GPA"],
        ascending=[True, True, True, False, False]
    )
    .drop_duplicates(subset=["MA_SO_SV", "HOC_KY"], keep="first")
    .sort_values(["MA_SO_SV", "SEMESTER_INDEX"])
    .reset_index(drop=True)
)

# Hold out the listed semesters for validation; everything else is training data.
valid_semesters = ["HK2 2023-2024"]
train_df_raw = student_df.loc[~student_df["HOC_KY"].isin(valid_semesters)].copy()
val_df_raw = student_df.loc[student_df["HOC_KY"].isin(valid_semesters)].copy()

# Test rows get the same admission join and semester index as the training data.
test_df_raw = pd.read_csv(TEST_CSV).merge(admission_df, on="MA_SO_SV", how="left")
test_df_raw = test_df_raw.assign(
    SEMESTER_INDEX=test_df_raw["HOC_KY"].map(semester_mapping)
)

print(f"Raw Train shape: {train_df_raw.shape}")
print(f"Raw Val shape: {val_df_raw.shape}")
print(f"Raw Test shape: {test_df_raw.shape}")

Raw Train shape: (90122, 12)
Raw Val shape: (15144, 12)
Raw Test shape: (16502, 9)


In [None]:
# Lookup table of score statistics: admission year -> subject-combination code
# -> {"mean", "std"}. get_features() uses it to turn DIEM_TRUNGTUYEN into a
# z-score; codes that are not listed for a year use that year's "OTHER" entry
# (note that 2022 has no D24 / D29 entries).
# NOTE(review): presumably these are exam score distributions per subject
# combination — the source of the numbers is not visible here; confirm.
EXAM_STATS_DETAILED = {
    2020: {
        "A00": {"mean": 21.4471, "std": 3.3425},
        "A01": {"mean": 20.0417, "std": 3.3084},
        "B00": {"mean": 20.3388, "std": 3.0812},
        "D01": {"mean": 18.1417, "std": 3.7811},
        "D07": {"mean": 20.0117, "std": 3.1333},
        "D24": {"mean": 22.8503, "std": 2.8891},
        "D29": {"mean": 22.3681, "std": 2.8403},
        "OTHER": {"mean": 19.5364, "std": 3.6661}
    },
    2021: {
        "A00": {"mean": 21.0262, "std": 3.1863},
        "A01": {"mean": 21.1029, "std": 3.4455},
        "B00": {"mean": 19.9892, "std": 3.0890},
        "D01": {"mean": 19.2666, "std": 4.1166},
        "D07": {"mean": 21.1338, "std": 3.2841},
        "D24": {"mean": 21.9896, "std": 3.0011},
        "D29": {"mean": 21.4177, "std": 3.0370},
        "OTHER": {"mean": 20.2071, "std": 3.7104}
    },
    2022: {
        "A00": {"mean": 21.0955, "std": 3.2378},
        "A01": {"mean": 20.2909, "std": 3.3396},
        "B00": {"mean": 19.4039, "std": 3.1555},
        "D01": {"mean": 18.4381, "std": 3.8846},
        "D07": {"mean": 20.2397, "std": 3.2064},
        "OTHER": {"mean": 19.5196, "std": 3.6548}
    },
    2023: {
        "A00": {"mean": 20.7745, "std": 3.0941},
        "A01": {"mean": 20.2743, "std": 3.3399},
        "B00": {"mean": 20.6047, "std": 2.7763},
        "D01": {"mean": 18.8891, "std": 3.8137},
        "D07": {"mean": 20.4216, "std": 3.0619},
        "D24": {"mean": 20.8364, "std": 3.3689},
        "D29": {"mean": 20.2285, "std": 3.5744},
        "OTHER": {"mean": 19.8594, "std": 3.4893}
    },
    2024: {
        "A00": {"mean": 20.9046, "std": 3.3804},
        "A01": {"mean": 20.4724, "std": 3.3509},
        "B00": {"mean": 20.5311, "std": 2.9818},
        "D01": {"mean": 19.4939, "std": 3.6232},
        "D07": {"mean": 20.4510, "std": 3.1120},
        "D24": {"mean": 21.8162, "std": 3.2930},
        "D29": {"mean": 20.9934, "std": 3.5598},
        "OTHER": {"mean": 20.1512, "std": 3.4281}
    },
}

In [None]:
def get_features(input_df):
    """Build admission and history features for every row of ``input_df``.

    The rows of ``input_df`` are combined with all other rows of the global
    ``student_df`` so that each student's complete semester sequence is
    available, lag / expanding / rolling statistics are computed per student
    from earlier rows only, and the result is merged back onto the
    (MA_SO_SV, HOC_KY) keys of ``input_df``.

    Args:
        input_df: DataFrame holding at least MA_SO_SV, HOC_KY, SEMESTER_INDEX,
            TC_DANGKY and the admission columns read below. TC_HOANTHANH and
            GPA may be missing on these rows (they end up zero-filled).

    Returns:
        DataFrame with one row per matching key of ``input_df`` (in
        ``input_df`` order) containing the original columns plus all
        engineered features, with remaining NaN / inf values replaced by 0.0.

    Note:
        Reads the module-level ``student_df``, ``academic_df`` and
        ``EXAM_STATS_DETAILED``.
    """
    df = input_df.copy()

    # History rows = everything in student_df that is NOT one of the input
    # rows, so concatenating below never duplicates a (student, semester) pair.
    target_keys = set(zip(df["MA_SO_SV"], df["HOC_KY"]))
    student_filtered = student_df[
        ~student_df.set_index(["MA_SO_SV", "HOC_KY"]).index.isin(target_keys)
    ].copy()

    df = pd.concat([student_filtered, df], axis=0, ignore_index=True)
    # Chronological order within each student is required by the shift/rolling logic.
    df = df.sort_values(["MA_SO_SV", "SEMESTER_INDEX"])

    # Recode three PTXT values onto other codes.
    # NOTE(review): the keys are strings, so this only matches if PTXT holds
    # string values — confirm the column dtype.
    df["PTXT"] = df["PTXT"].replace({
        "5": "100",
        "3": "200",
        "1": "303"
    })

    def calculate_z_score(row):
        """Standardise the row's DIEM_TRUNGTUYEN with EXAM_STATS_DETAILED.

        Returns NaN when the admission year is not in the table; subject
        combinations missing for that year use the year's "OTHER" entry.
        """
        year = row["NAM_TUYENSINH"]
        score = row["DIEM_TRUNGTUYEN"]
        block = str(row["TOHOP_XT"]).upper().strip()

        if year not in EXAM_STATS_DETAILED:
            return np.nan

        year_stats = EXAM_STATS_DETAILED[year]
        if block in year_stats:
            stats = year_stats[block]
        else:
            stats = year_stats["OTHER"]

        mean_val = stats["mean"]
        std_val = stats["std"]

        # Guard against a (near-)zero standard deviation.
        if std_val < 0.1:
            std_val = 1.0

        return (score - mean_val) / std_val

    df["Z_SCORE"] = df.apply(calculate_z_score, axis=1)

    # Target: credits registered but not completed.
    if "TC_HOANTHANH" in df.columns:
        df["FAIL_CREDITS"] = df["TC_DANGKY"] - df["TC_HOANTHANH"]
    else:
        df["FAIL_CREDITS"] = 0

    # Admission score relative to the DIEM_CHUAN column (+1.0 avoids dividing by zero).
    df["SCORE_GAP"] = df["DIEM_TRUNGTUYEN"] - df["DIEM_CHUAN"]
    df["GAP_RATIO"] = df["SCORE_GAP"] / (df["DIEM_CHUAN"] + 1.0)

    # Percentile ranks within each admission year. They are computed over all
    # student-semester rows of the combined frame, not over distinct students.
    df["ENTRY_RANK"] = df.groupby("NAM_TUYENSINH")["DIEM_TRUNGTUYEN"].transform(
        lambda x: x.rank(pct=True, method='average')
    )
    df["BENCHMARK_TIER"] = df.groupby("NAM_TUYENSINH")["DIEM_CHUAN"].transform(
        lambda x: x.rank(pct=True)
    )

    # Previous-row values per student (shift(1) keeps the current row out).
    grouped = df.groupby("MA_SO_SV")
    df["LAST_GPA"] = grouped["GPA"].shift(1)
    df["LAST_FAIL"] = grouped["FAIL_CREDITS"].shift(1)
    df["LAST_PASSED"] = grouped["TC_HOANTHANH"].shift(1)
    df["LAST_DANGKY"] = grouped["TC_DANGKY"].shift(1)
    df["LAST_PASS_RATIO"] = df["LAST_PASSED"] / df["LAST_DANGKY"]

    # Expanding statistics over all earlier rows of the same student.
    df["HIST_AVG_GPA"] = grouped["GPA"].transform(lambda x: x.shift(1).expanding().mean())
    df["TOTAL_EARNED"] = grouped["TC_HOANTHANH"].transform(lambda x: x.shift(1).fillna(0).cumsum())
    df["HIST_MAX_PASSED"] = grouped["TC_HOANTHANH"].transform(lambda x: x.shift(1).expanding().max())
    df["HIST_MAX_GPA"] = grouped["GPA"].transform(lambda x: x.shift(1).expanding().max())
    df["HIST_STD_GPA"] = grouped["GPA"].transform(lambda x: x.shift(1).expanding().std())
    df["OVERLOAD_VS_MAX"] = df["TC_DANGKY"] - df["HIST_MAX_PASSED"]

    # Rolling window over the two previous rows; closed='left' excludes the
    # current row. reset_index(0, drop=True) drops the group level so the
    # result aligns with df's own index on assignment.
    window2 = grouped.rolling(window=2, min_periods=1, closed='left')
    df["R2_AVG_GPA"] = window2["GPA"].mean().reset_index(0, drop=True)
    df["R2_SUM_FAIL"] = window2["FAIL_CREDITS"].sum().reset_index(0, drop=True)
    df["R2_AVG_PASSED"] = window2["TC_HOANTHANH"].mean().reset_index(0, drop=True)
    df["R2_SUM_DANGKY"] = window2["TC_DANGKY"].sum().reset_index(0, drop=True)
    df["R2_SUM_PASSED"] = window2["TC_HOANTHANH"].sum().reset_index(0, drop=True)
    df["R2_PASS_RATE"] = df["R2_SUM_PASSED"] / df["R2_SUM_DANGKY"]
    df["PRESSURE_VS_R2"] = df["TC_DANGKY"] / df["R2_AVG_PASSED"]
    df["GPA_TREND_R2"] = df["R2_AVG_GPA"] - df["HIST_AVG_GPA"]
    # R2_SUM_FAIL is always halved here, even when min_periods=1 let the window hold one row.
    df["FAIL_TREND_R2"] = df["LAST_FAIL"] - (df["R2_SUM_FAIL"] / 2)

    # Same idea over the three previous rows.
    window3 = grouped.rolling(window=3, min_periods=1, closed='left')
    df["R3_AVG_GPA"] = window3["GPA"].mean().reset_index(0, drop=True)
    df["R3_SUM_FAIL"] = window3["FAIL_CREDITS"].sum().reset_index(0, drop=True)
    df["R3_AVG_PASSED"] = window3["TC_HOANTHANH"].mean().reset_index(0, drop=True)
    df["PRESSURE_VS_R3"] = df["TC_DANGKY"] / df["R3_AVG_PASSED"]
    df["OVERLOAD_R3"] = df["TC_DANGKY"] - df["R3_AVG_PASSED"]

    def parse_hoc_ky(hk_str):
        """Return the first 4-digit number found in the semester label (2024 if none)."""
        hk_str = str(hk_str)
        year_match = re.search(r"(\d{4})", hk_str)
        year = int(year_match.group(1)) if year_match else 2024
        return year

    # Year of study = semester start year - admission year + 1, floored at 1.
    df["YEAR_START"] = df["HOC_KY"].apply(parse_hoc_ky)
    df["SV_NAM_THU"] = df["YEAR_START"] - df["NAM_TUYENSINH"] + 1
    df.loc[df["SV_NAM_THU"] < 1, "SV_NAM_THU"] = 1

    # Defaults for rows without history: 0 for counts, differences and trends...
    fill_0 = [
        "LAST_FAIL", "R2_SUM_FAIL", "R3_SUM_FAIL", "FAIL_TREND_R2", "OVERLOAD_R3",
        "TOTAL_EARNED", "OVERLOAD_VS_MAX", "HIST_STD_GPA", "GPA_TREND_R2"
    ]
    df[fill_0] = df[fill_0].fillna(0.0)

    # ...1.0 for ratios...
    fill_1 = ["LAST_PASS_RATIO", "R2_PASS_RATE", "PRESSURE_VS_R2", "PRESSURE_VS_R3"]
    df[fill_1] = df[fill_1].fillna(1.0)

    # ...the mean GPA of academic_df for GPA-like features...
    fill_mean_gpa = ["LAST_GPA", "R2_AVG_GPA", "R3_AVG_GPA", "HIST_AVG_GPA", "HIST_MAX_GPA"]
    df[fill_mean_gpa] = df[fill_mean_gpa].fillna(academic_df["GPA"].mean())

    # ...and a fixed 15.0 for completed-credit features.
    fill_15 = ["LAST_PASSED", "R2_AVG_PASSED", "R3_AVG_PASSED", "HIST_MAX_PASSED"]
    df[fill_15] = df[fill_15].fillna(15.0)

    # Catch-all over EVERY column: division results of +/-inf become NaN and
    # all remaining NaN (including any in non-feature columns such as the
    # categorical or target columns) become 0.0.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Keep only the requested rows, in input_df order.
    final_df = pd.merge(
        input_df[["MA_SO_SV", "HOC_KY"]],
        df,
        on=["MA_SO_SV", "HOC_KY"],
        how="left"
    )

    return final_df

# Engineer features separately for the train, validation and test row sets.
train_final = get_features(train_df_raw)
val_final = get_features(val_df_raw)
test_final = get_features(test_df_raw)

In [None]:
# Sanity check: list the columns of each engineered frame.
print(train_final.columns)
print(val_final.columns)
print(test_final.columns)

Index(['MA_SO_SV', 'HOC_KY', 'CPA', 'GPA', 'TC_DANGKY', 'TC_HOANTHANH',
       'NAM_TUYENSINH', 'PTXT', 'TOHOP_XT', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN',
       'SEMESTER_INDEX', 'Z_SCORE', 'FAIL_CREDITS', 'SCORE_GAP', 'GAP_RATIO',
       'ENTRY_RANK', 'BENCHMARK_TIER', 'LAST_GPA', 'LAST_FAIL', 'LAST_PASSED',
       'LAST_DANGKY', 'LAST_PASS_RATIO', 'HIST_AVG_GPA', 'TOTAL_EARNED',
       'HIST_MAX_PASSED', 'HIST_MAX_GPA', 'HIST_STD_GPA', 'OVERLOAD_VS_MAX',
       'R2_AVG_GPA', 'R2_SUM_FAIL', 'R2_AVG_PASSED', 'R2_SUM_DANGKY',
       'R2_SUM_PASSED', 'R2_PASS_RATE', 'PRESSURE_VS_R2', 'GPA_TREND_R2',
       'FAIL_TREND_R2', 'R3_AVG_GPA', 'R3_SUM_FAIL', 'R3_AVG_PASSED',
       'PRESSURE_VS_R3', 'OVERLOAD_R3', 'YEAR_START', 'SV_NAM_THU'],
      dtype='object')
Index(['MA_SO_SV', 'HOC_KY', 'CPA', 'GPA', 'TC_DANGKY', 'TC_HOANTHANH',
       'NAM_TUYENSINH', 'PTXT', 'TOHOP_XT', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN',
       'SEMESTER_INDEX', 'Z_SCORE', 'FAIL_CREDITS', 'SCORE_GAP', 'GAP_RATIO',
       'ENTR

In [None]:
# Regression target and the categorical columns that get target-encoded.
target = "FAIL_CREDITS"
categorical_cols = ["PTXT", "TOHOP_XT"]

# CatBoost encoder configured to return NaN for missing category values.
cbe = ce.CatBoostEncoder(
    cols=categorical_cols,
    handle_missing="return_nan"
)

# Fit on the training rows only (using the target), replacing the columns in place.
train_final[categorical_cols] = cbe.fit_transform(
    train_final[categorical_cols],
    train_final[target]
)

# Validation and test reuse the fitted encoder without seeing any target.
val_final[categorical_cols] = cbe.transform(val_final[categorical_cols])
test_final[categorical_cols] = cbe.transform(test_final[categorical_cols])

print(f"Final Train shape: {train_final.shape}")
print(f"Final Val shape: {val_final.shape}")
print(f"Final Test shape: {test_final.shape}")

Final Train shape: (90122, 45)
Final Val shape: (15144, 45)
Final Test shape: (16502, 45)


In [None]:
def split_by_year(df):
    """Partition rows into first-year ("fresh") and later-year ("senior") frames.

    The split is exhaustive: every row lands in exactly one of the two frames.
    Rows with ``SV_NAM_THU > 1`` are seniors; everything else goes to the
    fresh frame. That includes a year of study of 0 (what a missing admission
    year turns into once NaN values are zero-filled) and NaN, so no row, and
    in particular no test row, is silently dropped before prediction.

    Args:
        df: DataFrame with a numeric ``SV_NAM_THU`` (year of study) column.

    Returns:
        Tuple ``(df_fresh, df_senior)`` of independent copies.
    """
    is_senior = df["SV_NAM_THU"] > 1

    df_fresh = df[~is_senior].copy()
    df_senior = df[is_senior].copy()

    return df_fresh, df_senior

# Separate fresh / senior subsets for each of train, validation and test.
train_fresh, train_senior = split_by_year(train_final)
val_fresh, val_senior = split_by_year(val_final)
test_fresh, test_senior = split_by_year(test_final)

In [None]:
# Features for the senior model: current load plus lag, rolling and
# whole-history statistics. No admission columns are used.
feats_senior = [
    "TC_DANGKY", "SEMESTER_INDEX", "SV_NAM_THU",

    "LAST_GPA", "LAST_FAIL", "LAST_PASS_RATIO",

    "R2_AVG_GPA", "R2_SUM_FAIL", "R2_PASS_RATE",
    "R3_AVG_GPA", "R3_SUM_FAIL",

    "PRESSURE_VS_R2", "PRESSURE_VS_R3", "OVERLOAD_R3",
    "FAIL_TREND_R2", "GPA_TREND_R2",

    "TOTAL_EARNED", "HIST_AVG_GPA",
    "HIST_MAX_PASSED",
    "HIST_MAX_GPA",
    "HIST_STD_GPA",
    "OVERLOAD_VS_MAX"
]

# Features for the fresh model: current load, admission information and the
# few history features listed at the end.
feats_fresh = [
    "TC_DANGKY", "SEMESTER_INDEX",
    "PTXT", "TOHOP_XT",

    "DIEM_TRUNGTUYEN", "DIEM_CHUAN",
    "SCORE_GAP", "ENTRY_RANK", "BENCHMARK_TIER",
    "Z_SCORE", "GAP_RATIO",

    "LAST_GPA", "LAST_FAIL", "LAST_PASS_RATIO",
    "PRESSURE_VS_R2"
]

# Identifier and target columns kept alongside the features by filter_cols().
meta_cols = ["MA_SO_SV", "HOC_KY", "FAIL_CREDITS"]

def filter_cols(df, features):
    """Return a copy of ``df`` limited to the feature and meta columns.

    Args:
        df: Source DataFrame.
        features: List of feature column names to keep.

    Returns:
        Copy of ``df`` holding every column named in ``features`` or in the
        module-level ``meta_cols``, in ``df``'s own column order. Requested
        names that ``df`` does not have are skipped without an error.
    """
    wanted = set(features + meta_cols)
    kept = []
    for column in df.columns:
        if column in wanted:
            kept.append(column)
    return df[kept].copy()

# Reduce every subset to the columns its model needs (plus ids and target).
train_fresh = filter_cols(train_fresh, feats_fresh)
val_fresh = filter_cols(val_fresh, feats_fresh)
test_fresh = filter_cols(test_fresh, feats_fresh)

train_senior = filter_cols(train_senior, feats_senior)
val_senior = filter_cols(val_senior, feats_senior)
test_senior = filter_cols(test_senior, feats_senior)

# Train + validation combined, used for the final refit before predicting on test.
full_train_fresh = pd.concat([train_fresh, val_fresh], axis=0, ignore_index=True)
full_train_senior = pd.concat([train_senior, val_senior], axis=0, ignore_index=True)

In [None]:
# Row / column counts of every subset after column filtering.
print(f"Train Fresher: {train_fresh.shape} | Train Senior: {train_senior.shape}")
print(f"Val Fresher: {val_fresh.shape} | Val Senior: {val_senior.shape}")
print(f"Test Fresher: {test_fresh.shape} | Test Senior: {test_senior.shape}")

Train Fresher: (24996, 18) | Train Senior: (65126, 25)
Val Fresher: (3504, 18) | Val Senior: (11640, 25)
Test Fresher: (4326, 18) | Test Senior: (12176, 25)


In [None]:
import xgboost as xgb
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import warnings

# NOTE(review): this silences every warning category from here on, not only
# the UserWarning filter installed earlier in the notebook.
warnings.filterwarnings('ignore')

# XGBoost settings shared by all trials and final models; the tuned
# hyper-parameters are merged on top of these.
FIXED_XGB_PARAMS = {
    "objective": "reg:tweedie",
    "eval_metric": "rmse",
    # Upper bound on boosting rounds; the fits in this notebook use early stopping.
    "n_estimators": 4000,
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
    "tree_method": "hist",
    # "device": "cuda",
}

In [None]:
def get_xgb_optuna_params(trial):
    """Sample one set of tunable XGBoost hyper-parameters from an Optuna trial.

    Args:
        trial: Optuna trial (any object offering ``suggest_float``,
            ``suggest_int``, ``suggest_categorical`` and a ``params`` mapping).

    Returns:
        Dict of XGBoost keyword arguments. ``max_leaves`` is sampled only when
        the trial picked the ``lossguide`` grow policy; otherwise it is 0.
    """
    space = {}

    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    space["tweedie_variance_power"] = trial.suggest_float("tweedie_variance_power", 1.1, 1.9)

    space["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    space["max_depth"] = trial.suggest_int("max_depth", 3, 12)

    # The leaf budget is only part of the search space for loss-guided growth.
    if trial.params.get("grow_policy") == "lossguide":
        space["max_leaves"] = trial.suggest_int("max_leaves", 20, 300)
    else:
        space["max_leaves"] = 0

    space["min_child_weight"] = trial.suggest_float("min_child_weight", 1e-3, 100, log=True)
    space["gamma"] = trial.suggest_float("gamma", 0.0, 10.0)

    space["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    space["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)

    space["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)
    space["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1.0)

    space["max_bin"] = trial.suggest_categorical("max_bin", [255, 512])

    return space

def optimize_xgboost(train_df, val_df, feats, target_col, n_trial, optim_type=""):
    """Tune XGBoost hyper-parameters with Optuna against a validation set.

    Each trial fits an ``XGBRegressor`` (fixed parameters + sampled ones,
    early stopping after 100 rounds) on ``train_df`` and is scored by the RMSE
    of its validation predictions, after clipping them to at least 0 and, when
    ``val_df`` has a ``TC_DANGKY`` column, to at most that column's value.

    Args:
        train_df: Training DataFrame.
        val_df: Validation DataFrame (also used for early stopping).
        feats: Feature column names.
        target_col: Name of the target column.
        n_trial: Number of Optuna trials.
        optim_type: Label used in progress and log messages.

    Returns:
        Tuple ``(best_params, best_rmse)`` where ``best_params`` is the fixed
        parameters overlaid with the best trial's parameters plus
        ``early_stopping_rounds=100``.
    """
    X_train, y_train = train_df[feats], train_df[target_col]
    X_val, y_val = val_df[feats], val_df[target_col]

    def objective_xgb(trial):
        """Fit one candidate model and return its clipped validation RMSE."""
        params = dict(FIXED_XGB_PARAMS)
        params.update(get_xgb_optuna_params(trial))
        params["early_stopping_rounds"] = 100

        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # Predict with the trees up to (and including) the best iteration.
        best_iter = getattr(model, "best_iteration", None)
        iteration_range = (0, best_iter + 1) if best_iter else None
        preds = model.predict(X_val, iteration_range=iteration_range)

        # Failed credits cannot be negative or exceed the registered credits.
        preds = np.maximum(preds, 0)
        if "TC_DANGKY" in val_df.columns:
            preds = np.minimum(preds, val_df["TC_DANGKY"].values)

        return np.sqrt(mean_squared_error(y_val, preds))

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(direction="minimize", sampler=sampler)

    print(f" Tuning XGBoost {optim_type} with {n_trial} trials...")
    with tqdm(total=n_trial, desc=f"XGB {optim_type}") as pbar:
        def tqdm_callback(running_study, _finished_trial):
            """Advance the bar and show the best RMSE seen so far."""
            pbar.update(1)
            if running_study.best_value:
                pbar.set_postfix({"Best RMSE": f"{running_study.best_value:.4f}"})

        study.optimize(objective_xgb, n_trials=n_trial, callbacks=[tqdm_callback])

    print(f"\n Best RMSE: {study.best_value:.4f}")

    best_params = dict(FIXED_XGB_PARAMS)
    best_params.update(study.best_params)
    best_params["early_stopping_rounds"] = 100

    return best_params, study.best_value

In [None]:
# Number of Optuna trials per model.
N_TRIALS = 300

# Tune the fresh and senior models independently on their own subsets.
best_fresh_params, best_fresh_rmse = optimize_xgboost(
    train_fresh, val_fresh, feats_fresh, "FAIL_CREDITS", N_TRIALS, "Fresher"
)

best_senior_params, best_senior_rmse = optimize_xgboost(
    train_senior, val_senior, feats_senior,"FAIL_CREDITS", N_TRIALS, "Senior"
)

 Tuning XGBoost Fresher with 300 trials...


XGB Fresher:   0%|          | 0/300 [00:00<?, ?it/s]


 Best RMSE: 3.9380
 Tuning XGBoost Senior with 300 trials...


XGB Senior:   0%|          | 0/300 [00:00<?, ?it/s]


 Best RMSE: 3.6983


In [None]:
# Best validation RMSE and the matching parameter set for each model.
print(f"Best Fresher RMSE: {best_fresh_rmse:.4f}")
print(f"Best Senior RMSE: {best_senior_rmse:.4f}")

print(f"Best Fresher Params: {best_fresh_params}")
print(f"Best Senior Params: {best_senior_params}")

Best Fresher RMSE: 3.9380
Best Senior RMSE: 3.6983
Best Fresher Params: {'objective': 'reg:tweedie', 'eval_metric': 'rmse', 'n_estimators': 4000, 'random_state': 42, 'n_jobs': -1, 'tree_method': 'hist', 'learning_rate': 0.01944896778397786, 'tweedie_variance_power': 1.5207836072687866, 'grow_policy': 'lossguide', 'max_depth': 12, 'max_leaves': 20, 'min_child_weight': 0.5897167678263666, 'gamma': 7.452454748599296, 'reg_alpha': 7.180310277904978e-07, 'reg_lambda': 2.47047919086806e-08, 'subsample': 0.5198053278268525, 'colsample_bytree': 0.9331663462711142, 'max_bin': 255, 'early_stopping_rounds': 100}
Best Senior Params: {'objective': 'reg:tweedie', 'eval_metric': 'rmse', 'n_estimators': 4000, 'random_state': 42, 'n_jobs': -1, 'tree_method': 'hist', 'learning_rate': 0.0016170574741031355, 'tweedie_variance_power': 1.1009599769416711, 'grow_policy': 'lossguide', 'max_depth': 12, 'max_leaves': 60, 'min_child_weight': 0.03870839176393461, 'gamma': 3.1465018534191898, 'reg_alpha': 0.173810

In [None]:
import xgboost as xgb
import numpy as np

def train_xgboost(params, train_df, val_df, feats, model_type, target_col="FAIL_CREDITS"):
    """Fit an XGBoost regressor with early stopping and predict the validation set.

    Args:
        params: ``XGBRegressor`` keyword arguments; ``early_stopping_rounds``
            defaults to 100 when absent. The dict is not modified.
        train_df: Training DataFrame.
        val_df: Validation DataFrame, used as the early-stopping eval set.
        feats: Feature column names.
        model_type: Label used in the log message.
        target_col: Name of the target column.

    Returns:
        Tuple ``(best_iter, preds)``: the model's ``best_iteration`` (``None``
        if the attribute is unavailable) and the validation predictions,
        clipped to ``[0, TC_DANGKY]`` when ``val_df`` has that column and to
        ``>= 0`` otherwise.
    """
    print(f"Training {model_type} Model (XGBoost)...")

    xgb_params = dict(params)
    xgb_params.setdefault("early_stopping_rounds", 100)

    X_val = val_df[feats]
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        train_df[feats],
        train_df[target_col],
        eval_set=[(X_val, val_df[target_col])],
        verbose=100
    )

    # Use only the trees up to (and including) the best iteration.
    best_iter = getattr(model, "best_iteration", None)
    preds = model.predict(
        X_val,
        iteration_range=(0, best_iter + 1) if best_iter else None
    )

    # Failed credits are bounded below by 0 and above by the registered credits.
    if "TC_DANGKY" not in val_df.columns:
        preds = np.maximum(preds, 0)
    else:
        preds = np.clip(preds, 0, val_df["TC_DANGKY"].values)

    return best_iter, preds


# Target column passed explicitly to the training helper.
TARGET_COL = "FAIL_CREDITS"

# Refit each model with its tuned parameters; keep the best iteration and the
# clipped validation predictions.
best_iter_fresh, preds_fresh = train_xgboost(
    best_fresh_params,
    train_fresh,
    val_fresh,
    feats_fresh,
    "Fresher",
    target_col=TARGET_COL
)

best_iter_senior, preds_senior = train_xgboost(
    best_senior_params,
    train_senior,
    val_senior,
    feats_senior,
    "Senior",
    target_col=TARGET_COL
)

Training Fresher Model (XGBoost)...
[0]	validation_0-rmse:5.55873
[100]	validation_0-rmse:4.22169
[200]	validation_0-rmse:3.99191
[300]	validation_0-rmse:3.94898
[400]	validation_0-rmse:3.95318
[433]	validation_0-rmse:3.95791
Training Senior Model (XGBoost)...
[0]	validation_0-rmse:5.30419
[100]	validation_0-rmse:4.97275
[200]	validation_0-rmse:4.70113
[300]	validation_0-rmse:4.48243
[400]	validation_0-rmse:4.31179
[500]	validation_0-rmse:4.17669
[600]	validation_0-rmse:4.07202
[700]	validation_0-rmse:3.98894
[800]	validation_0-rmse:3.92626
[900]	validation_0-rmse:3.87706
[1000]	validation_0-rmse:3.83908
[1100]	validation_0-rmse:3.81019
[1200]	validation_0-rmse:3.78660
[1300]	validation_0-rmse:3.76839
[1400]	validation_0-rmse:3.75371
[1500]	validation_0-rmse:3.74359
[1600]	validation_0-rmse:3.73434
[1700]	validation_0-rmse:3.72638
[1800]	validation_0-rmse:3.72092
[1900]	validation_0-rmse:3.71646
[2000]	validation_0-rmse:3.71279
[2100]	validation_0-rmse:3.70940
[2200]	validation_0-rmse:

In [None]:
# Stack fresh then senior for both targets and predictions so the rows stay aligned.
val_targets = np.concatenate([val_fresh[target].values, val_senior[target].values])
val_preds = np.concatenate([preds_fresh, preds_senior])

def calculate_wmape(y_true, y_pred):
    """Weighted MAPE: total absolute error divided by the total of the true values.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        ``sum(|y_true - y_pred|) / sum(y_true)``, or 0 when the true values sum to zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    denominator = np.sum(actual)
    # Avoid dividing by zero when there is nothing to weight by.
    if denominator == 0:
        return 0

    total_abs_error = np.sum(np.abs(actual - predicted))
    return total_abs_error / denominator

# Metrics over the combined (fresh + senior) validation predictions.
rmse = np.sqrt(mean_squared_error(val_targets, val_preds))
mse = mean_squared_error(val_targets, val_preds)
r2 = r2_score(val_targets, val_preds)
wmape = calculate_wmape(val_targets, val_preds)

print(f"RMSE : {rmse:.4f}")
print(f"MSE : {mse:.4f}")
print(f"R^2 : {r2:.4f}")
print(f"wMAPE : {wmape:.4f}")

RMSE : 3.5074
MSE : 12.3020
R^2 : 0.5659
wMAPE : 0.5881


In [None]:

def test_xgboost(params, best_iter, full_train_df, train_df, test_df, feats, model_type, target_col="FAIL_CREDITS"):
    """Refit on train + validation data, save the model and predict the test set.

    The number of boosting rounds is taken from the early-stopped model and
    scaled up by the ratio of full-training rows to original training rows.

    Args:
        params: Tuned ``XGBRegressor`` keyword arguments (not modified).
        best_iter: 0-based best iteration of the early-stopped model, or
            ``None`` to scale ``params["n_estimators"]`` instead.
        full_train_df: Train + validation DataFrame used for the final fit.
        train_df: Training DataFrame that ``best_iter`` was obtained on.
        test_df: Test DataFrame to predict.
        feats: Feature column names.
        model_type: Label used in the log message and the model file name.
        target_col: Name of the target column.

    Returns:
        NumPy array of predictions. When ``test_df`` has ``TC_DANGKY`` these
        are predicted COMPLETED credits (``TC_DANGKY`` minus the clipped
        failed-credit prediction); otherwise they are the failed-credit
        predictions clipped at 0.
    """
    print(f"Testing {model_type} Model (XGBoost)...")
    scale_ratio = len(full_train_df) / len(train_df)

    final_params = params.copy()
    # The final fit has no eval_set, so early stopping must be switched off.
    final_params.pop("early_stopping_rounds", None)

    if best_iter is None:
        base_rounds = final_params.get("n_estimators", 1000)
    else:
        # best_iter is a 0-based index: the early-stopped model uses
        # best_iter + 1 trees (the same range used for validation predictions).
        base_rounds = best_iter + 1
    # Never ask for zero trees, even for tiny round counts.
    final_params["n_estimators"] = max(1, int(base_rounds * scale_ratio))

    model = xgb.XGBRegressor(**final_params)

    model.fit(
        full_train_df[feats],
        full_train_df[target_col],
        verbose=False
    )

    os.makedirs(WORK_DIR, exist_ok=True)
    model.save_model(f"{WORK_DIR}/xgb_{model_type.lower()}.json")

    preds = model.predict(test_df[feats])

    if "TC_DANGKY" in test_df.columns:
        limit = test_df["TC_DANGKY"].values
        # Clip predicted failed credits to [0, registered], then convert to
        # completed credits.
        preds = np.clip(preds, 0, limit)
        preds = limit - preds
    else:
        # Without the registered credits no conversion is possible; the
        # clipped failed-credit predictions are returned as-is.
        preds = np.maximum(preds, 0)

    return preds

# Target column passed explicitly to the final-fit helper.
TARGET_COL = "FAIL_CREDITS"

# Final fit on train + validation and prediction on the test rows.
# preds_fresh / preds_senior are reassigned here and now hold TEST predictions.
preds_fresh = test_xgboost(
    best_fresh_params,
    best_iter_fresh,
    full_train_fresh,
    train_fresh,
    test_fresh,
    feats_fresh,
    "Fresher",
    target_col=TARGET_COL
)

preds_senior = test_xgboost(
    best_senior_params,
    best_iter_senior,
    full_train_senior,
    train_senior,
    test_senior,
    feats_senior,
    "Senior",
    target_col=TARGET_COL
)

print("Sample Output (Fresher):", preds_fresh[:5])

Testing Fresher Model (XGBoost)...
Testing Senior Model (XGBoost)...
Sample Output (Fresher): [18.89827776 18.73236442 16.09582877 17.98536408 16.55190873]


In [None]:
# Tie each prediction to its exact test row. HOC_KY is carried next to
# MA_SO_SV so that the merge below stays one-to-one even if a student has
# more than one test row; matching on MA_SO_SV alone would then produce every
# combination of that student's rows and predictions.
sub_fresh_df = pd.DataFrame({
    "MA_SO_SV": test_fresh["MA_SO_SV"],
    "HOC_KY": test_fresh["HOC_KY"],
    "PRED_TC_HOANTHANH": preds_fresh
})

sub_senior_df = pd.DataFrame({
    "MA_SO_SV": test_senior["MA_SO_SV"],
    "HOC_KY": test_senior["HOC_KY"],
    "PRED_TC_HOANTHANH": preds_senior
})

preds_df = pd.concat([sub_fresh_df, sub_senior_df], axis=0)

# Restore the original test order, then keep only the two submission columns.
submission = pd.merge(
    test_df_raw[["MA_SO_SV", "HOC_KY"]],
    preds_df,
    on=["MA_SO_SV", "HOC_KY"],
    how="left"
)[["MA_SO_SV", "PRED_TC_HOANTHANH"]]

submission.to_csv(SUBMISSION_CSV, index=False)

In [None]:
# Quick look at the submission frame (pandas truncates the printed rows).
print(submission)

           MA_SO_SV  PRED_TC_HOANTHANH
0      481436e2064d           1.743736
1      6c8a97d22131           2.571168
2      e87f62beabbb           4.654152
3      438aff5ef524           0.163784
4      ad172a9b0722          15.396597
...             ...                ...
16497  9e803a0d26f0          42.946918
16498  dbc819721795          51.765213
16499  9e1c8deafb70          44.250534
16500  ffecfc70f83a          46.845641
16501  dc7b37953745          52.166172

[16502 rows x 2 columns]
