In [1]:
!pip install category_encoders



In [2]:
# https://docs.google.com/spreadsheets/d/1CAzsKw0GvI7la2iWbsViBGUTJILOjTYagwxvAvGtatg/edit?usp=sharing

import os
import json
import random
import re
from tqdm.auto import tqdm
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import numpy as np 
import torch

import category_encoders as ce
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, accuracy_score
import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import optuna

In [3]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, the current CUDA device and all CUDA devices), exports
    ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG``, and puts cuDNN into
    deterministic, non-benchmarking mode. Prints a confirmation line.

    Args:
        seed: Seed value handed to every generator.
    """
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
    })

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # PyTorch's strict deterministic-algorithm enforcement is explicitly left off.
    torch.use_deterministic_algorithms(False)

    print(f"Random seed set to {seed}")

In [4]:
# Fix the global seed before anything stochastic runs.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

# Keep the notebook output quiet: Optuna logs only WARNING and above,
# and UserWarning messages are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {DEVICE}")

Random seed set to 42
Device: cuda


In [5]:
# Dataset directory and working directory used by this notebook.
INPUT_ROOT = "/kaggle/input/df2026"
WORK_DIR = "/kaggle/working"

# The three input tables all sit directly under INPUT_ROOT.
ACADEMIC_CSV, ADMISSION_CSV, TEST_CSV = (
    f"{INPUT_ROOT}/{file_name}"
    for file_name in ("academic_records.csv", "admission.csv", "test.csv")
)

# Submission path inside the working directory.
SUBMISSION_CSV = WORK_DIR + "/submission.csv"

In [6]:
# Load the admission table and the per-semester academic records.
admission_df = pd.read_csv(ADMISSION_CSV)
academic_df = pd.read_csv(ACADEMIC_CSV)

# Semesters in chronological order; the position in this list becomes
# SEMESTER_INDEX. A HOC_KY value not listed here maps to NaN.
semester_order = [
    "HK1 2020-2021", "HK2 2020-2021",
    "HK1 2021-2022", "HK2 2021-2022",
    "HK1 2022-2023", "HK2 2022-2023",
    "HK1 2023-2024", "HK2 2023-2024",
    "HK1 2024-2025"
]
semester_mapping = {sem: i for i, sem in enumerate(semester_order)}

# Attach each student's admission attributes to every academic record.
student_df = pd.merge(academic_df, admission_df, on="MA_SO_SV", how="left")
student_df["SEMESTER_INDEX"] = student_df["HOC_KY"].map(semester_mapping)

# De-duplicate (student, semester) pairs. Within a pair the surviving row is
# the one that sorts first by TC_DANGKY ascending, then TC_HOANTHANH
# descending, then GPA descending.
# NOTE(review): keeping the row with the smallest TC_DANGKY looks deliberate
# but the reason is not visible here — confirm.
student_df = student_df.sort_values(
    by=["MA_SO_SV", "HOC_KY", "TC_DANGKY", "TC_HOANTHANH", "GPA"],
    ascending=[True, True, True, False, False]
)
student_df = student_df.drop_duplicates(subset=["MA_SO_SV", "HOC_KY"], keep="first")
# Restore chronological order per student (HOC_KY strings do not sort chronologically).
student_df = student_df.sort_values(["MA_SO_SV", "SEMESTER_INDEX"]).reset_index(drop=True)

# Hold out one semester for validation; every other semester is training data.
valid_semesters = ["HK2 2023-2024"]
train_df_raw = student_df[~student_df["HOC_KY"].isin(valid_semesters)].copy()
val_df_raw = student_df[student_df["HOC_KY"].isin(valid_semesters)].copy()

# Test rows get the same admission join and semester index.
# Unlike student_df, they are not de-duplicated here.
test_df_raw = pd.read_csv(TEST_CSV)
test_df_raw = pd.merge(test_df_raw, admission_df, on="MA_SO_SV", how="left")
test_df_raw["SEMESTER_INDEX"] = test_df_raw["HOC_KY"].map(semester_mapping)

print(f"Raw Train shape: {train_df_raw.shape}")
print(f"Raw Val shape: {val_df_raw.shape}")
print(f"Raw Test shape: {test_df_raw.shape}")

Raw Train shape: (90122, 12)
Raw Val shape: (15144, 12)
Raw Test shape: (16502, 9)


In [7]:
# Score statistics keyed by admission year, then by subject-combination code.
# calculate_z_score (inside get_features) uses them to standardise
# DIEM_TRUNGTUYEN as (score - mean) / std. A code with no entry for its year
# falls back to that year's "OTHER" entry; note that 2022 has no "D24" or
# "D29" entry, so those codes use "OTHER" for that year.
# NOTE(review): the origin of these numbers is not shown in this notebook —
# presumably published exam score distributions; confirm the source.
EXAM_STATS_DETAILED = {
    2020: {
        "A00": {"mean": 21.4471, "std": 3.3425},
        "A01": {"mean": 20.0417, "std": 3.3084},
        "B00": {"mean": 20.3388, "std": 3.0812},
        "D01": {"mean": 18.1417, "std": 3.7811},
        "D07": {"mean": 20.0117, "std": 3.1333},
        "D24": {"mean": 22.8503, "std": 2.8891},
        "D29": {"mean": 22.3681, "std": 2.8403},
        "OTHER": {"mean": 19.5364, "std": 3.6661}
    },
    2021: {
        "A00": {"mean": 21.0262, "std": 3.1863},
        "A01": {"mean": 21.1029, "std": 3.4455},
        "B00": {"mean": 19.9892, "std": 3.0890},
        "D01": {"mean": 19.2666, "std": 4.1166},
        "D07": {"mean": 21.1338, "std": 3.2841},
        "D24": {"mean": 21.9896, "std": 3.0011},
        "D29": {"mean": 21.4177, "std": 3.0370},
        "OTHER": {"mean": 20.2071, "std": 3.7104}
    },
    2022: {
        "A00": {"mean": 21.0955, "std": 3.2378},
        "A01": {"mean": 20.2909, "std": 3.3396},
        "B00": {"mean": 19.4039, "std": 3.1555},
        "D01": {"mean": 18.4381, "std": 3.8846},
        "D07": {"mean": 20.2397, "std": 3.2064},
        "OTHER": {"mean": 19.5196, "std": 3.6548}
    },
    2023: {
        "A00": {"mean": 20.7745, "std": 3.0941},
        "A01": {"mean": 20.2743, "std": 3.3399},
        "B00": {"mean": 20.6047, "std": 2.7763},
        "D01": {"mean": 18.8891, "std": 3.8137},
        "D07": {"mean": 20.4216, "std": 3.0619},
        "D24": {"mean": 20.8364, "std": 3.3689},
        "D29": {"mean": 20.2285, "std": 3.5744},
        "OTHER": {"mean": 19.8594, "std": 3.4893}
    },
    2024: {
        "A00": {"mean": 20.9046, "std": 3.3804},
        "A01": {"mean": 20.4724, "std": 3.3509},
        "B00": {"mean": 20.5311, "std": 2.9818},
        "D01": {"mean": 19.4939, "std": 3.6232},
        "D07": {"mean": 20.4510, "std": 3.1120},
        "D24": {"mean": 21.8162, "std": 3.2930},
        "D29": {"mean": 20.9934, "std": 3.5598},
        "OTHER": {"mean": 20.1512, "std": 3.4281}
    },
}

In [8]:
def get_features(input_df):
    """Build the feature table for the (student, semester) rows of ``input_df``.

    The rows of ``input_df`` are combined with every row of the module-level
    ``student_df`` that has a different (MA_SO_SV, HOC_KY) key, so history
    features can look at semesters that are not part of ``input_df`` itself.
    All history features are lagged (``shift(1)`` or ``closed='left'``
    windows), so the current row's own GPA / credits never enter them.

    Reads the module-level ``student_df``, ``academic_df`` and
    ``EXAM_STATS_DETAILED``.

    Args:
        input_df: Frame with at least MA_SO_SV, HOC_KY, SEMESTER_INDEX,
            TC_DANGKY and the admission columns used below. GPA and
            TC_HOANTHANH may be absent from it (they then come only from the
            history rows).

    Returns:
        A frame with the key columns of ``input_df`` left-joined to all
        original and engineered columns; remaining NaN / inf values in every
        column are replaced by 0.0. One output row per input row, provided
        the (MA_SO_SV, HOC_KY) keys are unique — TODO confirm for test data.
    """
    df = input_df.copy()
    
    # History rows = everything in student_df whose key is not already in input_df.
    target_keys = set(zip(df["MA_SO_SV"], df["HOC_KY"]))
    student_filtered = student_df[
        ~student_df.set_index(["MA_SO_SV", "HOC_KY"]).index.isin(target_keys)
    ].copy()
    
    # ignore_index gives a unique RangeIndex, which the index-aligned
    # assignments of the rolling features below rely on.
    df = pd.concat([student_filtered, df], axis=0, ignore_index=True)
    df = df.sort_values(["MA_SO_SV", "SEMESTER_INDEX"])

    # Recode three admission-method codes.
    # NOTE(review): the keys are strings, so this only matches if PTXT holds
    # strings; on a numeric column it would change nothing — confirm dtype.
    df["PTXT"] = df["PTXT"].replace({
        "5": "100",
        "3": "200",
        "1": "303"
    })

    def calculate_z_score(row):
        """Standardise the admission score against EXAM_STATS_DETAILED.

        Returns NaN when the admission year has no statistics; an unknown
        subject combination uses the year's "OTHER" entry.
        """
        year = row["NAM_TUYENSINH"]
        score = row["DIEM_TRUNGTUYEN"]
        block = str(row["TOHOP_XT"]).upper().strip()
        
        if year not in EXAM_STATS_DETAILED:
            return np.nan
            
        year_stats = EXAM_STATS_DETAILED[year]
        if block in year_stats:
            stats = year_stats[block]
        else:
            stats = year_stats["OTHER"]
            
        mean_val = stats["mean"]
        std_val = stats["std"]
        
        # Guard against a degenerate spread.
        if std_val < 0.1: 
            std_val = 1.0
            
        return (score - mean_val) / std_val

    df["Z_SCORE"] = df.apply(calculate_z_score, axis=1)
    
    # Credits registered but not completed in the same semester.
    if "TC_HOANTHANH" in df.columns:
        df["FAIL_CREDITS"] = df["TC_DANGKY"] - df["TC_HOANTHANH"]
    else:
        df["FAIL_CREDITS"] = 0 

    # Admission score relative to the cut-off score.
    df["SCORE_GAP"] = df["DIEM_TRUNGTUYEN"] - df["DIEM_CHUAN"]
    df["GAP_RATIO"] = df["SCORE_GAP"] / (df["DIEM_CHUAN"] + 1.0)
    
    # Percentile ranks within each admission year. They are computed over the
    # semester rows present in df, so a student contributes once per row.
    df["ENTRY_RANK"] = df.groupby("NAM_TUYENSINH")["DIEM_TRUNGTUYEN"].transform(
        lambda x: x.rank(pct=True, method='average')
    )
    df["BENCHMARK_TIER"] = df.groupby("NAM_TUYENSINH")["DIEM_CHUAN"].transform(
        lambda x: x.rank(pct=True)
    )
    
    # Values of the previous recorded semester of the same student.
    grouped = df.groupby("MA_SO_SV")
    df["LAST_GPA"] = grouped["GPA"].shift(1)
    df["LAST_FAIL"] = grouped["FAIL_CREDITS"].shift(1)
    df["LAST_PASSED"] = grouped["TC_HOANTHANH"].shift(1)
    df["LAST_DANGKY"] = grouped["TC_DANGKY"].shift(1)
    df["LAST_PASS_RATIO"] = df["LAST_PASSED"] / df["LAST_DANGKY"]
    
    # Expanding statistics over all earlier semesters (shift(1) drops the current one).
    df["HIST_AVG_GPA"] = grouped["GPA"].transform(lambda x: x.shift(1).expanding().mean())
    df["TOTAL_EARNED"] = grouped["TC_HOANTHANH"].transform(lambda x: x.shift(1).fillna(0).cumsum())
    df["HIST_MAX_PASSED"] = grouped["TC_HOANTHANH"].transform(lambda x: x.shift(1).expanding().max())
    df["HIST_MAX_GPA"] = grouped["GPA"].transform(lambda x: x.shift(1).expanding().max())
    df["HIST_STD_GPA"] = grouped["GPA"].transform(lambda x: x.shift(1).expanding().std())
    df["OVERLOAD_VS_MAX"] = df["TC_DANGKY"] - df["HIST_MAX_PASSED"]
    
    # Window over the two previous semesters; closed='left' excludes the current row.
    # reset_index(0, drop=True) removes the group level so values align on df's index.
    window2 = grouped.rolling(window=2, min_periods=1, closed='left')
    df["R2_AVG_GPA"] = window2["GPA"].mean().reset_index(0, drop=True)
    df["R2_SUM_FAIL"] = window2["FAIL_CREDITS"].sum().reset_index(0, drop=True)
    df["R2_AVG_PASSED"] = window2["TC_HOANTHANH"].mean().reset_index(0, drop=True)
    df["R2_SUM_DANGKY"] = window2["TC_DANGKY"].sum().reset_index(0, drop=True)
    df["R2_SUM_PASSED"] = window2["TC_HOANTHANH"].sum().reset_index(0, drop=True)
    df["R2_PASS_RATE"] = df["R2_SUM_PASSED"] / df["R2_SUM_DANGKY"]
    df["PRESSURE_VS_R2"] = df["TC_DANGKY"] / df["R2_AVG_PASSED"]
    df["GPA_TREND_R2"] = df["R2_AVG_GPA"] - df["HIST_AVG_GPA"]
    df["FAIL_TREND_R2"] = df["LAST_FAIL"] - (df["R2_SUM_FAIL"] / 2)

    # Same idea over the three previous semesters.
    window3 = grouped.rolling(window=3, min_periods=1, closed='left')
    df["R3_AVG_GPA"] = window3["GPA"].mean().reset_index(0, drop=True)
    df["R3_SUM_FAIL"] = window3["FAIL_CREDITS"].sum().reset_index(0, drop=True)
    df["R3_AVG_PASSED"] = window3["TC_HOANTHANH"].mean().reset_index(0, drop=True)
    df["PRESSURE_VS_R3"] = df["TC_DANGKY"] / df["R3_AVG_PASSED"]
    df["OVERLOAD_R3"] = df["TC_DANGKY"] - df["R3_AVG_PASSED"]
    
    def parse_hoc_ky(hk_str):
        """Return the first four-digit number found in the semester label (2024 if none)."""
        hk_str = str(hk_str)
        year_match = re.search(r"(\d{4})", hk_str)
        year = int(year_match.group(1)) if year_match else 2024
        return year

    # Year of study: 1 in the academic year that starts in the admission year; floored at 1.
    df["YEAR_START"] = df["HOC_KY"].apply(parse_hoc_ky)
    df["SV_NAM_THU"] = df["YEAR_START"] - df["NAM_TUYENSINH"] + 1
    df.loc[df["SV_NAM_THU"] < 1, "SV_NAM_THU"] = 1 
    
    # Defaults for rows without (enough) history: "no failures / no trend" -> 0.
    fill_0 = [
        "LAST_FAIL", "R2_SUM_FAIL", "R3_SUM_FAIL", "FAIL_TREND_R2", "OVERLOAD_R3", 
        "TOTAL_EARNED", "OVERLOAD_VS_MAX", "HIST_STD_GPA", "GPA_TREND_R2"
    ]
    df[fill_0] = df[fill_0].fillna(0.0)

    # Ratio-type features default to 1.
    fill_1 = ["LAST_PASS_RATIO", "R2_PASS_RATE", "PRESSURE_VS_R2", "PRESSURE_VS_R3"]
    df[fill_1] = df[fill_1].fillna(1.0)
    
    # GPA-type features default to the mean GPA of the whole academic_df table.
    fill_mean_gpa = ["LAST_GPA", "R2_AVG_GPA", "R3_AVG_GPA", "HIST_AVG_GPA", "HIST_MAX_GPA"]
    df[fill_mean_gpa] = df[fill_mean_gpa].fillna(academic_df["GPA"].mean())
        
    # Completed-credit features default to 15.
    fill_15 = ["LAST_PASSED", "R2_AVG_PASSED", "R3_AVG_PASSED", "HIST_MAX_PASSED"]
    df[fill_15] = df[fill_15].fillna(15.0)
        
    # Catch-all: infinities (e.g. division by zero above) and any NaN left in
    # ANY column become 0.0. This includes a NaN SV_NAM_THU, which slipped past
    # the "< 1" floor above and therefore ends up as 0.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    
    # Keep only the requested rows; a left join preserves input_df's row order.
    final_df = pd.merge(
        input_df[["MA_SO_SV", "HOC_KY"]], 
        df, 
        on=["MA_SO_SV", "HOC_KY"], 
        how="left"
    )
    
    return final_df

# Build features separately for each split; each call re-reads the history
# rows it needs from student_df.
train_final = get_features(train_df_raw)
val_final = get_features(val_df_raw)
test_final = get_features(test_df_raw)

In [9]:
# Inspect the engineered train / validation / test frames.
print(train_final)
print(val_final)
print(test_final)

           MA_SO_SV         HOC_KY   CPA   GPA  TC_DANGKY  TC_HOANTHANH  \
0      00003e092652  HK1 2023-2024  1.64  1.97         18            15   
1      000e15519006  HK1 2021-2022  3.85  3.85          9             9   
2      000e15519006  HK2 2021-2022  2.77  3.12         19            19   
3      000e15519006  HK1 2022-2023  2.83  2.98         21            21   
4      000e15519006  HK2 2022-2023  2.68  2.92         18            18   
...             ...            ...   ...   ...        ...           ...   
90117  fffd51317dd2  HK1 2022-2023  1.69  1.75         20            17   
90118  fffd51317dd2  HK2 2022-2023  0.61  1.78         15             5   
90119  ffff4d891f10  HK1 2022-2023  3.04  3.04         18            18   
90120  ffff4d891f10  HK2 2022-2023  3.16  3.12         18            18   
90121  ffff4d891f10  HK1 2023-2024  2.88  3.00         21            21   

       NAM_TUYENSINH PTXT TOHOP_XT  DIEM_TRUNGTUYEN  ...  PRESSURE_VS_R2  \
0               2023  1

In [10]:
# Training target: share of the registered credits that were completed.

target = "PASS_RATIO"
categorical_cols = ["PTXT", "TOHOP_XT"]

# Add the target to both labelled frames in place. Infinite or undefined
# ratios become 0 and the result is bounded to [0, 1].
for labelled_df in (train_final, val_final):
    raw_ratio = labelled_df["TC_HOANTHANH"] / labelled_df["TC_DANGKY"]
    labelled_df["PASS_RATIO"] = (
        raw_ratio.replace([np.inf, -np.inf], 0).fillna(0).clip(0, 1)
    )

In [11]:
def split_by_year(df):
    """Partition rows into first-year ("fresh") and later-year ("senior") students.

    Rows with ``SV_NAM_THU > 1`` are seniors. Every other row goes to the
    fresher frame, including rows whose ``SV_NAM_THU`` is missing or below 1,
    so the two outputs always cover the whole input and no row is silently
    dropped before prediction.

    Args:
        df: Frame containing an ``SV_NAM_THU`` (year of study) column.

    Returns:
        Tuple ``(df_fresh, df_senior)`` of independent copies that keep the
        original index labels.
    """
    is_senior = df["SV_NAM_THU"] > 1

    # NaN compares False against "> 1", so unknown years land in the fresher frame.
    df_fresh = df[~is_senior].copy()
    df_senior = df[is_senior].copy()

    return df_fresh, df_senior

In [12]:
# Split every feature frame by year of study; separate models are trained for
# the two groups.
train_fresh, train_senior = split_by_year(train_final)
val_fresh, val_senior = split_by_year(val_final)
test_fresh, test_senior = split_by_year(test_final)

In [13]:
# Model inputs for first-year rows: registered credits, semester index,
# admission attributes and a few previous-semester features.
feats_fresh = [
    "TC_DANGKY", "SEMESTER_INDEX", "PTXT", "TOHOP_XT",
    "DIEM_TRUNGTUYEN", "DIEM_CHUAN", "SCORE_GAP", "ENTRY_RANK", 
    "BENCHMARK_TIER", "Z_SCORE", "GAP_RATIO",
    "LAST_GPA", "LAST_FAIL", "LAST_PASS_RATIO", "PRESSURE_VS_R2" 
]

# Model inputs for later-year rows: the two categorical admission columns
# plus the lagged / rolling / expanding history features.
feats_senior = [
    "TC_DANGKY", "SEMESTER_INDEX", "SV_NAM_THU", "PTXT", "TOHOP_XT",
    "LAST_GPA", "LAST_FAIL", "LAST_PASS_RATIO", "R2_AVG_GPA", "R2_SUM_FAIL", 
    "R2_PASS_RATE", "R3_AVG_GPA", "R3_SUM_FAIL", "PRESSURE_VS_R2", 
    "PRESSURE_VS_R3", "OVERLOAD_R3", "FAIL_TREND_R2", "GPA_TREND_R2",
    "TOTAL_EARNED", "HIST_AVG_GPA", "HIST_MAX_PASSED", "HIST_MAX_GPA", 
    "HIST_STD_GPA", "OVERLOAD_VS_MAX"
]

# Identifier, target and bookkeeping columns that are always kept next to the features.
meta_cols = ["MA_SO_SV", "HOC_KY", "TC_HOANTHANH", "TC_DANGKY", "PASS_RATIO", "PTXT", "TOHOP_XT"]

def filter_cols(df, features):
    """Return a copy of ``df`` restricted to ``features`` plus ``meta_cols``.

    Requested columns that do not exist in ``df`` are skipped, and the
    surviving columns keep the order they have in ``df``.
    """
    wanted = {*features, *meta_cols}
    keep_mask = df.columns.isin(list(wanted))
    return df.loc[:, keep_mask].copy()

# Trim every split to its model's features plus the meta columns.
train_fresh = filter_cols(train_fresh, feats_fresh)
val_fresh = filter_cols(val_fresh, feats_fresh)
test_fresh = filter_cols(test_fresh, feats_fresh)

train_senior = filter_cols(train_senior, feats_senior)
val_senior = filter_cols(val_senior, feats_senior)
test_senior = filter_cols(test_senior, feats_senior)

# Train + validation rows stacked together.
# NOTE(review): the same concatenation is rebuilt later as
# full_train_fresh_cv / full_train_senior_cv, and the later cells use those names.
full_train_fresh = pd.concat([train_fresh, val_fresh], axis=0, ignore_index=True)
full_train_senior = pd.concat([train_senior, val_senior], axis=0, ignore_index=True)

In [14]:
# Row and column counts per split and student group (test frames have no PASS_RATIO column).
print(f"Train Fresher: {train_fresh.shape} | Train Senior: {train_senior.shape}")
print(f"Val Fresher: {val_fresh.shape} | Val Senior: {val_senior.shape}")
print(f"Test Fresher: {test_fresh.shape} | Test Senior: {test_senior.shape}")

Train Fresher: (24996, 19) | Train Senior: (65126, 28)
Val Fresher: (3504, 19) | Val Senior: (11640, 28)
Test Fresher: (4326, 18) | Test Senior: (12176, 27)


In [15]:
from sklearn.model_selection import GroupKFold

# Grouped K-fold keyed on the student id, so one student's rows are never
# split between the training and validation side of a fold.
N_FOLDS = 5
gkf = GroupKFold(n_splits=N_FOLDS)

# NOTE(review): the cells shown after this one build their own GroupKFold
# objects and group arrays; N_FOLDS, gkf, groups_fresh and groups_senior are
# not referenced again in the visible notebook.
groups_fresh = train_fresh["MA_SO_SV"].values
groups_senior = train_senior["MA_SO_SV"].values

In [16]:
# LightGBM settings shared by every model; optimize_lgb merges the
# Optuna-sampled values on top of these. n_estimators acts as an upper bound
# because every fit in this notebook uses an early-stopping callback.
FIXED_PARAMS = {
    "objective": "tweedie",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "random_state": RANDOM_SEED,
    "n_estimators": 4000,
    "device": "cpu",
    "verbosity": -1,
    "boost_from_average": True
}

In [17]:
def get_optuna_params(trial):
    """Sample one set of LightGBM hyper-parameters from an Optuna trial.

    Args:
        trial: Object exposing Optuna's ``suggest_float``, ``suggest_int``
            and ``suggest_categorical`` methods.

    Returns:
        dict mapping each LightGBM parameter name to its suggested value, in
        the order the parameters are sampled.
    """
    # (parameter name, sampler kind, sampler arguments), listed in sampling order.
    search_space = [
        ("learning_rate", "log_float", (1e-3, 0.3)),
        ("tweedie_variance_power", "float", (1.01, 1.99)),
        # tree shape
        ("num_leaves", "int", (20, 300)),
        ("max_depth", "int", (3, 12)),
        # split constraints
        ("min_child_weight", "log_float", (1e-3, 100)),
        ("min_split_gain", "float", (0.0, 10.0)),
        ("min_child_samples", "int", (5, 100)),
        # regularisation
        ("reg_alpha", "log_float", (1e-8, 100.0)),
        ("reg_lambda", "log_float", (1e-8, 100.0)),
        ("path_smooth", "float", (0.0, 10.0)),
        # row / column sampling
        ("subsample", "float", (0.5, 1.0)),
        ("subsample_freq", "int", (1, 7)),
        ("colsample_bytree", "float", (0.5, 1.0)),
        ("extra_trees", "choice", ([True, False],)),
        # histogram resolution
        ("max_bin", "choice", ([255, 512],)),
    ]

    samplers = {
        "float": lambda name, low, high: trial.suggest_float(name, low, high),
        "log_float": lambda name, low, high: trial.suggest_float(name, low, high, log=True),
        "int": lambda name, low, high: trial.suggest_int(name, low, high),
        "choice": lambda name, options: trial.suggest_categorical(name, options),
    }

    return {name: samplers[kind](name, *args) for name, kind, args in search_space}


def optimize_lgb(train_df, feats, n_trial, optim_type, categorical_cols):
    """Tune LightGBM hyper-parameters with Optuna using grouped 3-fold CV.

    Each trial trains one model per fold on the ratio target (module-level
    ``target``) and is scored by the mean RMSE in credits, i.e. after the
    predicted ratio is multiplied by TC_DANGKY and clipped to [0, TC_DANGKY].
    Reads the module-level ``FIXED_PARAMS``, ``target`` and ``RANDOM_SEED``.

    Args:
        train_df: Labelled frame containing ``feats``, ``target``, MA_SO_SV,
            TC_DANGKY and TC_HOANTHANH.
        feats: Names of the model input columns.
        n_trial: Number of Optuna trials to run.
        optim_type: Label shown in the progress-bar description.
        categorical_cols: Columns encoded with CatBoostEncoder inside each fold.

    Returns:
        Tuple ``(params, best_value)``: ``FIXED_PARAMS`` merged with the best
        trial's parameters, and that trial's mean CV RMSE in credits.
    """
    # Folds are grouped by student id.
    groups = train_df["MA_SO_SV"].values 
    
    def objective_lgb(trial):
        """Return the mean credit-RMSE over the 3 folds for one sampled parameter set."""
        params = {**FIXED_PARAMS, **get_optuna_params(trial)}
        
        gkf_optuna = GroupKFold(n_splits=3)
        cv_scores = []

        for train_idx, val_idx in gkf_optuna.split(train_df, groups=groups):
            df_tr = train_df.iloc[train_idx].copy()
            df_val = train_df.iloc[val_idx].copy()
            
            # The target encoder is fitted on the training fold only and then
            # applied to the validation fold.
            encoder = ce.CatBoostEncoder(cols=categorical_cols, handle_missing="return_nan")
            df_tr[categorical_cols] = encoder.fit_transform(df_tr[categorical_cols], df_tr[target])
            df_val[categorical_cols] = encoder.transform(df_val[categorical_cols])
            
            X_tr, y_tr = df_tr[feats], df_tr[target]
            X_val, y_val = df_val[feats], df_val[target]
            
            tc_dangky_val = df_val["TC_DANGKY"].values
            y_true_tc = df_val["TC_HOANTHANH"].values

            model = LGBMRegressor(**params)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric="rmse", # metric is computed on the ratio target
                callbacks=[early_stopping(50, verbose=False)]
            )
            
            # Convert the predicted ratio back to credits before scoring.
            preds_ratio = model.predict(X_val, num_iteration=model.best_iteration_)
            preds_tc = np.clip(preds_ratio * tc_dangky_val, 0, tc_dangky_val)
            
            rmse = np.sqrt(mean_squared_error(y_true_tc, preds_tc))
            cv_scores.append(rmse)
            
        return np.mean(cv_scores)

    # Seeded sampler, so the sequence of suggested parameters is repeatable.
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
    
    with tqdm(total=n_trial, desc=f"CV Tuning {optim_type}") as pbar:
        def tqdm_callback(s, t):
            """Advance the progress bar after each trial and show the best score so far."""
            pbar.update(1)
            if s.best_value is not None:
                pbar.set_postfix({"Best CV RMSE": f"{s.best_value:.4f}"})
        study.optimize(objective_lgb, n_trials=n_trial, callbacks=[tqdm_callback])

    return {**FIXED_PARAMS, **study.best_params}, study.best_value

In [18]:
# Tuning and all later cross-validation run on train + validation rows combined.
full_train_fresh_cv = pd.concat([train_fresh, val_fresh], axis=0).reset_index(drop=True)
full_train_senior_cv = pd.concat([train_senior, val_senior], axis=0).reset_index(drop=True)

# Number of Optuna trials per model; each trial fits three LightGBM models.
N_TRIALS = 80

best_senior_params, best_senior_rmse = optimize_lgb(
    full_train_senior_cv, feats_senior, N_TRIALS, "Senior", categorical_cols
)

best_fresh_params, best_fresh_rmse = optimize_lgb(
    full_train_fresh_cv, feats_fresh, N_TRIALS, "Fresher", categorical_cols
)

CV Tuning Senior:   0%|          | 0/80 [00:00<?, ?it/s]

CV Tuning Fresher:   0%|          | 0/80 [00:00<?, ?it/s]

In [19]:
# Best mean 3-fold CV RMSE (in credits) and the matching parameter sets.
print(f"Best Fresher RMSE: {best_fresh_rmse:.4f}")
print(f"Best Senior RMSE: {best_senior_rmse:.4f}")

print(f"Best Fresher Params: {best_fresh_params}")
print(f"Best Senior Params: {best_senior_params}")

Best Fresher RMSE: 3.7232
Best Senior RMSE: 3.7126
Best Fresher Params: {'objective': 'tweedie', 'metric': 'rmse', 'boosting_type': 'gbdt', 'random_state': 42, 'n_estimators': 4000, 'device': 'cpu', 'verbosity': -1, 'boost_from_average': True, 'learning_rate': 0.06822282118196367, 'tweedie_variance_power': 1.4526723894681812, 'num_leaves': 252, 'max_depth': 8, 'min_child_weight': 7.341831158430957, 'min_split_gain': 0.0029435331764019745, 'min_child_samples': 20, 'reg_alpha': 5.387459226596861, 'reg_lambda': 2.2657420080316053e-05, 'path_smooth': 4.185453899149557, 'subsample': 0.8721335698880228, 'subsample_freq': 1, 'colsample_bytree': 0.7708021463084817, 'extra_trees': False, 'max_bin': 512}
Best Senior Params: {'objective': 'tweedie', 'metric': 'rmse', 'boosting_type': 'gbdt', 'random_state': 42, 'n_estimators': 4000, 'device': 'cpu', 'verbosity': -1, 'boost_from_average': True, 'learning_rate': 0.03263204396642612, 'tweedie_variance_power': 1.196972605615063, 'num_leaves': 250, 'm

In [20]:
import category_encoders as ce

def train_lgb_cv(params, train_df, groups, feats, categorical_cols, model_type, n_splits=5):
    """Train one LightGBM model per grouped CV fold and collect out-of-fold predictions.

    For every fold a CatBoostEncoder is fitted on the training part only, a
    model is trained on the ratio target (module-level ``target``) with early
    stopping, and the validation rows receive a prediction in credits
    (ratio * TC_DANGKY, clipped to [0, TC_DANGKY]).

    Args:
        params: Keyword arguments for ``LGBMRegressor``.
        train_df: Labelled frame with ``feats``, ``target`` and TC_DANGKY,
            indexed 0..n-1 so positional fold indices match ``oof_preds``.
        groups: Group label per row of ``train_df`` (the student id); rows of
            one group never appear on both sides of a fold.
        feats: Names of the model input columns.
        categorical_cols: Columns to target-encode inside each fold.
        model_type: Label used in the progress messages.
        n_splits: Number of grouped folds, and therefore of returned models.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        Tuple ``(models, encoders, oof_preds)``: the fitted models, their
        matching encoders (same order), and an array of out-of-fold credit
        predictions aligned with the rows of ``train_df``.
    """
    print(f"Training {model_type}...")
    models = []
    encoders = []
    oof_preds = np.zeros(len(train_df))

    gkf = GroupKFold(n_splits=n_splits)

    for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=groups)):
        df_train_fold = train_df.iloc[train_idx].copy()
        df_val_fold = train_df.iloc[val_idx].copy()

        # Fit the target encoder on the training fold only, then apply it to
        # the validation fold.
        encoder = ce.CatBoostEncoder(cols=categorical_cols, handle_missing="return_nan")
        df_train_fold[categorical_cols] = encoder.fit_transform(df_train_fold[categorical_cols], df_train_fold[target])
        df_val_fold[categorical_cols] = encoder.transform(df_val_fold[categorical_cols])

        X_tr, y_tr = df_train_fold[feats], df_train_fold[target]
        X_val, y_val = df_val_fold[feats], df_val_fold[target]
        tc_dangky_val = df_val_fold["TC_DANGKY"].values

        def feval_rmse_credits(labels, preds):
            """Custom eval metric: RMSE after converting ratios back to credits.

            Closes over this fold's ``tc_dangky_val``. The final ``False``
            tells LightGBM that lower values are better.
            """
            true_tc = labels * tc_dangky_val
            pred_tc = np.clip(preds * tc_dangky_val, 0, tc_dangky_val)
            rmse = np.sqrt(mean_squared_error(true_tc, pred_tc))
            return "rmse_credits", rmse, False

        model = LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric=feval_rmse_credits,
            callbacks=[early_stopping(100), log_evaluation(0)]
        )

        preds_ratio = model.predict(X_val, num_iteration=model.best_iteration_)
        oof_preds[val_idx] = np.clip(preds_ratio * tc_dangky_val, 0, tc_dangky_val)
        models.append(model)
        encoders.append(encoder)

        current_rmse = model.best_score_['valid_0']['rmse_credits']
        print(f"Fold {fold+1} completed. RMSE credits: {current_rmse:.4f}")

    return models, encoders, oof_preds

# First-stage models: grouped CV on train + validation rows, using the tuned
# parameters of each student group.
groups_fresh_full = full_train_fresh_cv["MA_SO_SV"].values
groups_senior_full = full_train_senior_cv["MA_SO_SV"].values

models_fresh, encoders_fresh, oof_fresh = train_lgb_cv(
    best_fresh_params, full_train_fresh_cv, groups_fresh_full, feats_fresh, categorical_cols, "Fresher"
)
models_senior, encoders_senior, oof_senior = train_lgb_cv(
    best_senior_params, full_train_senior_cv, groups_senior_full, feats_senior, categorical_cols, "Senior"
)

Training Fresher...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[102]	valid_0's rmse: 0.20795	valid_0's rmse_credits: 3.6625
Fold 1 completed. RMSE credits: 3.6625
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[121]	valid_0's rmse: 0.202952	valid_0's rmse_credits: 3.68191
Fold 2 completed. RMSE credits: 3.6819
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[158]	valid_0's rmse: 0.209931	valid_0's rmse_credits: 3.75178
Fold 3 completed. RMSE credits: 3.7518
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[109]	valid_0's rmse: 0.213337	valid_0's rmse_credits: 3.83696
Fold 4 completed. RMSE credits: 3.8370
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[98]	valid_0's rmse: 0.205723	valid_0's rmse_credits: 3.66652
Fold 5 completed. RMSE cred

In [21]:
# Pool the out-of-fold results of both models (fresher rows first) and score
# them against the completed credits.
all_y_true = np.concatenate(
    [frame["TC_HOANTHANH"].values for frame in (full_train_fresh_cv, full_train_senior_cv)]
)
all_oof_preds = np.concatenate([oof_fresh, oof_senior])

mse = mean_squared_error(all_y_true, all_oof_preds)
rmse = np.sqrt(mse)
r2 = r2_score(all_y_true, all_oof_preds)
# Weighted MAPE: total absolute error divided by total completed credits.
wmape = np.abs(all_y_true - all_oof_preds).sum() / all_y_true.sum()

print("=== FINAL OOF METRICS ===")
print(f"RMSE: {rmse:.4f}")
print(f"MSE  : {mse:.4f}")
print(f"R^2  : {r2:.4f}")
print(f"wMAPE: {wmape:.4f}")

=== FINAL OOF METRICS ===
RMSE: 3.7090
MSE  : 13.7566
R^2  : 0.6844
wMAPE: 0.1649


In [22]:
def predict_ensemble(models, encoders, test_df, feats, categorical_cols):
    """Average the credit predictions of several (model, encoder) pairs.

    Each model sees ``test_df[feats]`` with ``categorical_cols`` transformed
    by its own encoder. Its predicted ratio is multiplied by TC_DANGKY and
    clipped to [0, TC_DANGKY] before averaging. ``test_df`` is not modified.

    Args:
        models: Fitted models exposing ``predict``.
        encoders: Fitted encoders exposing ``transform``, paired with
            ``models`` by position.
        test_df: Frame containing ``feats`` and TC_DANGKY.
        feats: Names of the model input columns.
        categorical_cols: Columns passed through the encoder.

    Returns:
        Array with the mean predicted completed credits per row of ``test_df``.
    """
    registered = test_df["TC_DANGKY"].values
    credit_sum = np.zeros(len(test_df))

    for fold_model, fold_encoder in zip(models, encoders):
        features = test_df[feats].copy()
        encoded = fold_encoder.transform(features[categorical_cols])
        features[categorical_cols] = encoded

        ratio = fold_model.predict(features)
        credit_sum += np.clip(ratio * registered, 0, registered)

    return credit_sum / len(models)

# First-stage test predictions (in credits), averaged over the fold models of each group.
preds_test_fresh = predict_ensemble(models_fresh, encoders_fresh, test_fresh, feats_fresh, categorical_cols)
preds_test_senior = predict_ensemble(models_senior, encoders_senior, test_senior, feats_senior, categorical_cols)

In [23]:
def _attach_pseudo_labels(test_part, preds_tc):
    """Return a copy of ``test_part`` labelled with the ensemble's own predictions.

    The pseudo target is ``preds_tc / TC_DANGKY`` and is sanitised the same
    way as the real PASS_RATIO target: undefined or infinite ratios (rows
    with ``TC_DANGKY == 0``) become 0, and the result is bounded to [0, 1].
    Without this, such rows would enter training with a NaN label.

    Args:
        test_part: Test frame containing a TC_DANGKY column.
        preds_tc: Predicted completed credits, one value per row of ``test_part``.

    Returns:
        Copy of ``test_part`` with ``target`` and TC_HOANTHANH filled in.
    """
    pseudo = test_part.copy()
    registered = test_part["TC_DANGKY"].to_numpy(dtype=float)

    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = np.asarray(preds_tc, dtype=float) / registered
    ratio[~np.isfinite(ratio)] = 0.0

    pseudo[target] = np.clip(ratio, 0.0, 1.0)
    pseudo["TC_HOANTHANH"] = preds_tc
    return pseudo


# Pseudo-labelling: treat the first-stage test predictions as labels.
test_fresh_pseudo = _attach_pseudo_labels(test_fresh, preds_test_fresh)
test_senior_pseudo = _attach_pseudo_labels(test_senior, preds_test_senior)

# Second-stage training sets = labelled rows + pseudo-labelled test rows.
train_fresh_final = pd.concat([full_train_fresh_cv, test_fresh_pseudo], axis=0).reset_index(drop=True)
train_senior_final = pd.concat([full_train_senior_cv, test_senior_pseudo], axis=0).reset_index(drop=True)

groups_fresh_final = train_fresh_final["MA_SO_SV"].values
groups_senior_final = train_senior_final["MA_SO_SV"].values

print(f"Training data after pseudo-labeling: Fresh {train_fresh_final.shape}, Senior {train_senior_final.shape}")

Training data after pseudo-labeling: Fresh (32826, 19), Senior (88942, 28)


In [24]:
# Second-stage models, trained on labelled + pseudo-labelled rows.
# NOTE(review): the CV folds are drawn from train_*_final, which contains the
# pseudo-labelled test rows, so part of each validation fold is scored against
# the first-stage models' own predictions. The RMSE values logged here are
# therefore not comparable with the first-stage fold scores.
models_fresh_pl, encoders_fresh_pl, oof_fresh_pl = train_lgb_cv(
    best_fresh_params, train_fresh_final, groups_fresh_final, feats_fresh, categorical_cols, "Fresher_PL"
)

models_senior_pl, encoders_senior_pl, oof_senior_pl = train_lgb_cv(
    best_senior_params, train_senior_final, groups_senior_final, feats_senior, categorical_cols, "Senior_PL"
)

Training Fresher_PL...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[169]	valid_0's rmse: 0.187127	valid_0's rmse_credits: 3.35374
Fold 1 completed. RMSE credits: 3.3537
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[90]	valid_0's rmse: 0.19626	valid_0's rmse_credits: 3.47892
Fold 2 completed. RMSE credits: 3.4789
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[98]	valid_0's rmse: 0.194283	valid_0's rmse_credits: 3.47129
Fold 3 completed. RMSE credits: 3.4713
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[127]	valid_0's rmse: 0.195283	valid_0's rmse_credits: 3.48972
Fold 4 completed. RMSE credits: 3.4897
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[135]	valid_0's rmse: 0.193209	valid_0's rmse_credits: 3.49533
Fold 5 completed. RMSE c

In [25]:
# Final test predictions from the second-stage (pseudo-label) models.
preds_test_fresh_final = predict_ensemble(models_fresh_pl, encoders_fresh_pl, test_fresh, feats_fresh, categorical_cols)
preds_test_senior_final = predict_ensemble(models_senior_pl, encoders_senior_pl, test_senior, feats_senior, categorical_cols)

# Carry HOC_KY along so predictions are joined back on the full row key
# (student, semester), the same key used when de-duplicating and building
# features. Joining on MA_SO_SV alone would multiply rows for any student
# that appears more than once.
submission_key = ["MA_SO_SV", "HOC_KY"]

sub_fresh_pl = pd.DataFrame({
    "MA_SO_SV": test_fresh["MA_SO_SV"],
    "HOC_KY": test_fresh["HOC_KY"],
    "PRED_TC_HOANTHANH": preds_test_fresh_final,
})
sub_senior_pl = pd.DataFrame({
    "MA_SO_SV": test_senior["MA_SO_SV"],
    "HOC_KY": test_senior["HOC_KY"],
    "PRED_TC_HOANTHANH": preds_test_senior_final,
})

final_preds_df = pd.concat([sub_fresh_pl, sub_senior_pl], axis=0)

# The left join keeps the row order of the raw test file.
final_submission = pd.merge(test_df_raw[submission_key], final_preds_df, on=submission_key, how="left")
assert len(final_submission) == len(test_df_raw), (
    "submission row count differs from the test file; check for duplicate (MA_SO_SV, HOC_KY) keys"
)

# Rows that belong to neither student group have no prediction; they keep the
# 0 fallback, but the count is reported so it does not go unnoticed.
n_unpredicted = int(final_submission["PRED_TC_HOANTHANH"].isna().sum())
if n_unpredicted:
    print(f"{n_unpredicted} test rows had no model prediction and were filled with 0")
final_submission["PRED_TC_HOANTHANH"] = final_submission["PRED_TC_HOANTHANH"].fillna(0)

# Same two output columns as before.
# NOTE(review): SUBMISSION_CSV from the path configuration cell is not used;
# the file is written relative to the current working directory.
final_submission = final_submission[["MA_SO_SV", "PRED_TC_HOANTHANH"]]
final_submission.to_csv("submission_pseudo_label.csv", index=False)

In [26]:
# Show the submission frame: one predicted completed-credit value per test row.
print(final_submission)

           MA_SO_SV  PRED_TC_HOANTHANH
0      481436e2064d           1.811577
1      6c8a97d22131           2.500039
2      e87f62beabbb           5.993558
3      438aff5ef524           0.494194
4      ad172a9b0722          15.456939
...             ...                ...
16497  9e803a0d26f0          41.641614
16498  dbc819721795          51.826244
16499  9e1c8deafb70          41.291326
16500  ffecfc70f83a          43.034942
16501  dc7b37953745          51.263348

[16502 rows x 2 columns]
