In [1]:
!pip install category_encoders



In [2]:
# Clone the `trustee` project repository into the current working directory.
# Presumably this is the source of the helper modules imported further down
# (config, preprocess, utils, model) — the notebook adds
# /kaggle/working/trustee to sys.path before importing them.
# NOTE(review): the clone is not pinned to a commit or tag, and re-running this
# cell once the directory exists makes git report an error instead of updating.
!git clone https://github.com/CryAndRRich/trustee.git

Cloning into 'trustee'...
remote: Enumerating objects: 135, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 135 (delta 41), reused 109 (delta 31), pack-reused 14 (from 2)[K
Receiving objects: 100% (135/135), 83.07 MiB | 16.59 MiB/s, done.
Resolving deltas: 100% (41/41), done.
Updating files: 100% (53/53), done.


In [3]:
import numpy as np
import pandas as pd
import category_encoders as ce

In [4]:
import sys

# Make the cloned `trustee` checkout importable.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if "/kaggle/working/trustee" not in sys.path:
    sys.path.append("/kaggle/working/trustee")

In [5]:
from config import *
from preprocess import *
from utils import *
from model import *

In [6]:
# Fix the random seed for the run. set_seed is not defined in this notebook;
# it comes from the star-imported project modules (presumably it seeds the
# RNGs used downstream — confirm in the repository).
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

# Device label for the run.
# NOTE(review): DEVICE is not referenced again in the visible cells — confirm
# whether the project helpers read it or whether it is unused.
DEVICE = "cpu"

Random seed set to 42


In [7]:
# ---- Experiment knobs -------------------------------------------------------

# MODEL_NAME is one of "Decision Tree", "Random Forest", "XGBoost" or "LightGBM".
MODEL_NAME = "LightGBM"

# APPROACH_TYPE is one of "Credits", "Gap" or "Ratio".
APPROACH_TYPE = "Ratio"

# Number of trials passed to optimize_params (as n_trial) for each model.
N_TRIALS = 300

# ---- Output location --------------------------------------------------------
WORK_DIR = "/kaggle/working"
SUBMISSION_CSV = f"{WORK_DIR}/submission.csv"

# ---- Input files ------------------------------------------------------------
INPUT_ROOT = "/kaggle/input/df2026/DATA"
TEST_CSV = f"{INPUT_ROOT}/test.csv"
ADMISSION_CSV = f"{INPUT_ROOT}/admission.csv"
ACADEMIC_CSV = f"{INPUT_ROOT}/academic_records.csv"

In [8]:
# Load everything through the project helper get_data, which returns the
# academic table, the student table and the raw train / val / test frames.
# The train/validation split appears to be made inside get_data (only three
# CSV paths go in, five frames come out).
# NOTE(review): the paths are passed as (admission, academic, test) while the
# first two results are named (academic_df, student_df) — confirm that this
# ordering matches get_data's parameters and return order.
academic_df, student_df, train_df_raw, val_df_raw, test_df_raw = get_data(
    ADMISSION_CSV, ACADEMIC_CSV, TEST_CSV
)

In [9]:
# Run every raw split through the same get_features call, in
# train -> val -> test order, each paired with the shared academic and
# student tables.
train_final, val_final, test_final = (
    get_features(raw_split, academic_df, student_df)
    for raw_split in (train_df_raw, val_df_raw, test_df_raw)
)

In [10]:
# target is one of "TC_HOANTHANH", "FAIL_CREDITS" or "PASS_RATIO".
target = "PASS_RATIO"
categorical_cols = ["PTXT", "TOHOP_XT"]

# CatBoost-style target encoder restricted to the categorical columns;
# handle_missing="return_nan" leaves missing categories as NaN.
cbe = ce.CatBoostEncoder(handle_missing="return_nan", cols=categorical_cols)

# Fit on the training split only — this is the single place where the target
# is handed to the encoder.
train_final[categorical_cols] = cbe.fit_transform(
    train_final[categorical_cols], train_final[target]
)

# The held-out splits are only transformed with the already-fitted encoder.
val_final[categorical_cols], test_final[categorical_cols] = (
    cbe.transform(holdout[categorical_cols]) for holdout in (val_final, test_final)
)

print(
    f"Final Train shape: {train_final.shape}",
    f"Final Val shape: {val_final.shape}",
    f"Final Test shape: {test_final.shape}",
    sep="\n",
)

Final Train shape: (90122, 46)
Final Val shape: (15144, 46)
Final Test shape: (16502, 46)


In [11]:
# Partition each split into its (fresher, senior) pair with the project helper
# split_by_year, handling train, val and test in that order.
(
    (train_fresh, train_senior),
    (val_fresh, val_senior),
    (test_fresh, test_senior),
) = map(split_by_year, (train_final, val_final, test_final))

In [12]:
# Feature columns for the "Senior" model.
# NOTE(review): the group labels below are taken from the column-name prefixes
# only — confirm the exact definitions against get_features.
feats_senior = [
    # current registration
    "TC_DANGKY",
    "SEMESTER_INDEX",
    "SV_NAM_THU",
    # LAST_* columns
    "LAST_GPA",
    "LAST_FAIL",
    "LAST_PASS_RATIO",
    # R2_* columns and *_TREND_R2
    "R2_AVG_GPA",
    "R2_SUM_FAIL",
    "R2_PASS_RATE",
    "FAIL_TREND_R2",
    "GPA_TREND_R2",
    # R3_* columns, PRESSURE_* and OVERLOAD_R3
    "R3_AVG_GPA",
    "R3_SUM_FAIL",
    "PRESSURE_VS_R2",
    "PRESSURE_VS_R3",
    "OVERLOAD_R3",
    # totals and HIST_* columns
    "TOTAL_EARNED",
    "OVERLOAD_VS_MAX",
    "HIST_AVG_GPA",
    "HIST_MAX_PASSED",
    "HIST_MAX_GPA",
    "HIST_STD_GPA",
]

# Feature columns for the "Fresher" model; unlike the senior list it includes
# the two encoded categorical columns (PTXT, TOHOP_XT) and the admission
# score columns.
feats_fresh = [
    # current registration + encoded categoricals
    "TC_DANGKY",
    "SEMESTER_INDEX",
    "PTXT",
    "TOHOP_XT",
    # admission score columns
    "DIEM_TRUNGTUYEN",
    "DIEM_CHUAN",
    "SCORE_GAP",
    "ENTRY_RANK",
    "BENCHMARK_TIER",
    "Z_SCORE",
    "GAP_RATIO",
    # LAST_* columns and PRESSURE_VS_R2
    "LAST_GPA",
    "LAST_FAIL",
    "LAST_PASS_RATIO",
    "PRESSURE_VS_R2",
]

# Bookkeeping columns carried alongside the features. If the target is
# "TC_HOANTHANH" it is already in the list and must not be added again;
# any other target column is appended at the end.
meta_cols = ["MA_SO_SV", "HOC_KY", "TC_HOANTHANH"] + (
    [] if target == "TC_HOANTHANH" else [target]
)

In [13]:
# Reduce every split with filter_cols, passing the model's feature list and
# the bookkeeping columns — fresher splits first, then senior splits.
train_fresh, val_fresh, test_fresh = (
    filter_cols(split_df, feats_fresh, meta_cols)
    for split_df in (train_fresh, val_fresh, test_fresh)
)
train_senior, val_senior, test_senior = (
    filter_cols(split_df, feats_senior, meta_cols)
    for split_df in (train_senior, val_senior, test_senior)
)

# Train and validation rows stacked into one frame per model, with a fresh
# 0..n-1 index; these are later passed to test_model as full_train_df.
full_train_fresh, full_train_senior = (
    pd.concat(list(train_val_pair), axis=0, ignore_index=True)
    for train_val_pair in ((train_fresh, val_fresh), (train_senior, val_senior))
)

In [14]:
# Sanity check: one line per split with the fresher and senior frame shapes.
print(
    f"Train Fresher: {train_fresh.shape} | Train Senior: {train_senior.shape}",
    f"Val Fresher: {val_fresh.shape} | Val Senior: {val_senior.shape}",
    f"Test Fresher: {test_fresh.shape} | Test Senior: {test_senior.shape}",
    sep="\n",
)

Train Fresher: (24996, 19) | Train Senior: (65126, 26)
Val Fresher: (3504, 19) | Val Senior: (11640, 26)
Test Fresher: (4326, 19) | Test Senior: (12176, 26)


In [15]:
# Hyper-parameter search for the Fresher model: N_TRIALS trials using the
# fresher train and validation splits. The helper returns the best parameter
# set together with its RMSE.
# The two searches are kept as separate statements so the fresher result is
# already bound if the (long) senior search is interrupted.
best_fresh_params, best_fresh_rmse = optimize_params(
    model_name=MODEL_NAME, 
    train_df=train_fresh,    
    val_df=val_fresh,
    feats=feats_fresh, 
    target_col=target,       
    n_trial=N_TRIALS, 
    model_type="Fresher", 
    approach_type=APPROACH_TYPE
)

# Same search for the Senior model, using the senior splits and feature list.
# NOTE(review): the RMSE reported by these searches is on a different scale
# from the RMSE LightGBM logs during training — presumably the helper scores
# in credits while the training log is on the ratio target; confirm in
# optimize_params.
best_senior_params, best_senior_rmse = optimize_params(
    model_name=MODEL_NAME,
    train_df=train_senior,   
    val_df=val_senior,
    feats=feats_senior, 
    target_col=target,
    n_trial=N_TRIALS, 
    model_type="Senior", 
    approach_type=APPROACH_TYPE
)

LightGBM Fresher Tuning - Mode: Ratio:   0%|          | 0/300 [00:00<?, ?it/s]

LightGBM Senior Tuning - Mode: Ratio:   0%|          | 0/300 [00:00<?, ?it/s]

In [16]:
print(f"Best Fresher RMSE: {best_fresh_rmse:.4f}")
print(f"Best Senior RMSE: {best_senior_rmse:.4f}")

print(f"Best Fresher Params: {best_fresh_params}")
print(f"Best Senior Params: {best_senior_params}")

Best Fresher RMSE: 3.9451
Best Senior RMSE: 3.6727
Best Fresher Params: {'objective': 'tweedie', 'boosting_type': 'gbdt', 'boost_from_average': True, 'metric': 'rmse', 'n_estimators': 4000, 'device': 'cpu', 'verbosity': -1, 'random_state': 42, 'learning_rate': 0.03678245094798871, 'tweedie_variance_power': 1.2428485448170394, 'num_leaves': 151, 'max_depth': 8, 'min_child_samples': 54, 'min_child_weight': 16.27422147848218, 'reg_alpha': 7.183764545963089e-07, 'reg_lambda': 1.063740975416394, 'min_split_gain': 0.010613940764911123, 'path_smooth': 8.421582164084194, 'subsample': 0.7396104434868679, 'subsample_freq': 1, 'colsample_bytree': 0.9771600392700039, 'extra_trees': False, 'max_bin': 255}
Best Senior Params: {'objective': 'tweedie', 'boosting_type': 'gbdt', 'boost_from_average': True, 'metric': 'rmse', 'n_estimators': 4000, 'device': 'cpu', 'verbosity': -1, 'random_state': 42, 'learning_rate': 0.007363986790141222, 'tweedie_variance_power': 1.3941702178222326, 'num_leaves': 89, 'ma

In [17]:
# Train each model with its tuned parameters, Fresher first and Senior second.
# Every call yields (best iteration, validation predictions).
# Note: train_model takes the target as `target_cols` (plural), unlike the
# `target_col` keyword used by optimize_params and test_model.
(best_iter_fresh, val_preds_fresh), (best_iter_senior, val_preds_senior) = (
    train_model(
        model_name=MODEL_NAME,
        params=tuned_params,
        train_df=segment_train,
        val_df=segment_val,
        feats=segment_feats,
        target_cols=target,
        model_type=segment_label,
        approach_type=APPROACH_TYPE,
    )
    for segment_label, tuned_params, segment_train, segment_val, segment_feats in (
        ("Fresher", best_fresh_params, train_fresh, val_fresh, feats_fresh),
        ("Senior", best_senior_params, train_senior, val_senior, feats_senior),
    )
)

Training LightGBM Fresher - Mode: Ratio...
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.207051
Early stopping, best iteration is:
[82]	valid_0's rmse: 0.206838
Training LightGBM Senior - Mode: Ratio...
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.234718
[200]	valid_0's rmse: 0.220915
[300]	valid_0's rmse: 0.216923
[400]	valid_0's rmse: 0.215558
[500]	valid_0's rmse: 0.214972
[600]	valid_0's rmse: 0.214703
[700]	valid_0's rmse: 0.21461
[800]	valid_0's rmse: 0.214533
[900]	valid_0's rmse: 0.214437
[1000]	valid_0's rmse: 0.214372
Early stopping, best iteration is:
[986]	valid_0's rmse: 0.214362


In [18]:
# Score both models together on the validation split. Targets and predictions
# are stacked in the same fresher-then-senior order so the rows stay aligned.
# The reference values are always the completed credits ("TC_HOANTHANH"),
# whichever column was used as the training target.
val_targets = np.concatenate(
    [segment["TC_HOANTHANH"].values for segment in (val_fresh, val_senior)]
)
val_preds = np.concatenate((val_preds_fresh, val_preds_senior))

metrics = evaluate_model_performance(val_targets, val_preds)

=== Performance Metrics [Validation Set] ===
RMSE  : 3.7375
MSE   : 13.9687
R^2   : 0.7166
wMAPE : 0.1800


In [19]:
# Produce test-set predictions for each model, Fresher first and Senior second.
# Each call receives the tuned parameters, the best iteration found during
# training, the stacked train+val frame, the train-only frame and the test
# frame; save_dir is WORK_DIR for both.
test_preds_fresh, test_preds_senior = (
    test_model(
        model_name=MODEL_NAME,
        params=tuned_params,
        best_iter=tuned_iter,
        full_train_df=segment_full_train,
        train_df=segment_train,
        test_df=segment_test,
        feats=segment_feats,
        target_col=target,
        save_dir=WORK_DIR,
        model_type=segment_label,
        approach_type=APPROACH_TYPE,
    )
    for (
        segment_label,
        tuned_params,
        tuned_iter,
        segment_full_train,
        segment_train,
        segment_test,
        segment_feats,
    ) in (
        (
            "Fresher",
            best_fresh_params,
            best_iter_fresh,
            full_train_fresh,
            train_fresh,
            test_fresh,
            feats_fresh,
        ),
        (
            "Senior",
            best_senior_params,
            best_iter_senior,
            full_train_senior,
            train_senior,
            test_senior,
            feats_senior,
        ),
    )
)

Testing LightGBM Fresher - Mode: Ratio...
Saved Model: /kaggle/working/lgbm_fresher.txt
Testing LightGBM Senior - Mode: Ratio...
Saved Model: /kaggle/working/lgbm_senior.txt


In [20]:
# Combine the fresher and senior test predictions and write the submission
# file to SUBMISSION_CSV via the project helper save_predictions; the frame
# it returns is kept for inspection.
# test_df_raw is passed as well — presumably so the helper can restore the
# original test row order (confirm in save_predictions).
submission_df = save_predictions(
    test_fresh, test_senior,
    test_preds_fresh, test_preds_senior,
    test_df_raw,
    SUBMISSION_CSV
)

Saved Submission: /kaggle/working/submission.csv
Shape: (16502, 2)


In [21]:
# Preview the submission frame; pandas truncates the printed rows, so only the
# head and tail of the frame appear in the output.
print(submission_df)

           MA_SO_SV  PRED_TC_HOANTHANH
0      481436e2064d           1.691754
1      6c8a97d22131           2.588500
2      e87f62beabbb           7.083268
3      438aff5ef524           0.553436
4      ad172a9b0722          14.988260
...             ...                ...
16497  9e803a0d26f0          41.403311
16498  dbc819721795          53.489412
16499  9e1c8deafb70          42.266773
16500  ffecfc70f83a          45.357704
16501  dc7b37953745          52.677259

[16502 rows x 2 columns]
