# Hanoi Weather Data - Model Training

This section focuses on training and evaluating predictive models for temperature forecasting using the engineered features from the previous step. The goal is to identify the most robust and generalizable model through proper validation, hyperparameter tuning, and performance benchmarking.

## Objectives
1. Split data into train / validation / test sets — ensuring no temporal data leakage
2. Train baseline and advanced ML models (e.g., Linear Regression, Random Forest, XGBoost, LightGBM)
3. Perform hyperparameter tuning using frameworks like Optuna or GridSearchCV
4. Evaluate model performance using regression metrics: RMSE, MAE, MAPE, R²
5. Analyze overfitting / underfitting behavior using learning curves
6. Select and save the best-performing model for deployment or forecasting pipeline

## 1. Setup and Import

In [52]:
#Import library for model training
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import numpy as np
import seaborn as sns
from xgboost import XGBRegressor
from scipy.stats import uniform, randint
import optuna
pd.set_option('display.max_columns', None)

# Load the engineered daily features, parsing the index as datetimes.
df = pd.read_csv(
    '../data/processed/feature_engineering_daily_data.csv',
    index_col='datetime',
    parse_dates=True,
)
# Every TimeSeriesSplit below splits by row position, so the rows must already
# be in chronological order; fail loudly here instead of silently training on
# rows that come after the validation rows.
assert df.index.is_monotonic_increasing, "rows must be sorted by datetime"

## 2. Baseline model

In [53]:
# Features are every column except the target.
X = df.drop(columns=['target'])
y = df['target']

# Column groups routed to the categorical / numeric branches of the preprocessor.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# ---------- 1) Lock the one-hot schema globally ----------
# The categorical pipeline replaces NaN with the string 'missing' before the
# encoder runs, so the fixed category lists are built from values imputed the
# same way. Building them from the raw frame would leave 'missing' outside the
# schema and every missing value would be encoded as an all-zero row.
ohe_template = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
if cat_cols:
    cat_values_imputed = SimpleImputer(
        strategy='constant', fill_value='missing'
    ).fit_transform(X[cat_cols])
    ohe_template.fit(cat_values_imputed)
    fixed_categories = ohe_template.categories_
else:
    # No categorical columns: nothing to encode, keep an empty schema.
    fixed_categories = []

def make_preprocessor():
    """Build a fresh, unfitted ColumnTransformer for the feature matrix.

    Columns listed in ``num_cols`` are median-imputed and then standardized.
    Columns listed in ``cat_cols`` are filled with the constant ``'missing'``
    and one-hot encoded against ``fixed_categories`` (unknown values are
    ignored, output is dense). Columns in neither list are dropped.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    encoder = OneHotEncoder(
        categories=fixed_categories,
        handle_unknown='ignore',
        sparse_output=False,
    )
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', encoder),
    ]

    branches = [
        ('num', Pipeline(numeric_steps), num_cols),
        ('cat', Pipeline(categorical_steps), cat_cols),
    ]
    return ColumnTransformer(branches, remainder='drop')

# ---------- 2) Freeze the VarianceThreshold mask from the FIRST fold ----------
tscv = TimeSeriesSplit(n_splits=5)
first_train_idx, first_test_idx = next(iter(tscv.split(X)))

# Training rows of the first fold: the only rows the mask is derived from
X_first_train = X.iloc[first_train_idx]
y_first_train = y.iloc[first_train_idx]

# Fit the preprocessor on those rows, then transform the same rows
preprocessor = make_preprocessor().fit(X_first_train, y_first_train)
X_first_train_trans = preprocessor.transform(X_first_train)

# Boolean mask of the transformed columns retained by VarianceThreshold(threshold=0.0)
vt = VarianceThreshold(threshold=0.0)
vt.fit(X_first_train_trans)
vt_support_mask = vt.get_support()

# Transformed feature names, before and after applying the mask
feat_names_all = preprocessor.get_feature_names_out()
feat_names_after_vt = feat_names_all[vt_support_mask]

def apply_vt_mask(X_mat, mask=vt_support_mask):
    """Return only the columns of ``X_mat`` selected by ``mask``.

    ``mask`` defaults to the ``vt_support_mask`` object that existed when this
    function was defined (default values are bound once, at definition time).
    """
    kept_columns = X_mat[:, mask]
    return kept_columns

In [54]:
print("BẮT ĐẦU TUNING XGBoost với Optuna...")

# Tune on the earliest 80% of the rows only
train_ratio = 0.8
split_idx = int(train_ratio * len(X))
X_tune = X.iloc[:split_idx]
y_tune = y.iloc[:split_idx]

# Preprocessor fitted once on the whole tuning slice, plus the masked
# transformed matrix of that slice
prep_tune = make_preprocessor().fit(X_tune, y_tune)
X_tune_t = apply_vt_mask(prep_tune.transform(X_tune))

# Hàm objective cho Optuna
def objective(trial):
    """Optuna objective: mean out-of-fold R² of an XGBoost regressor on the tuning slice.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean R² over a 5-split ``TimeSeriesSplit`` of ``X_tune`` / ``y_tune``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist'
    }

    tscv = TimeSeriesSplit(n_splits=5)
    all_r2 = []
    for tr_idx, te_idx in tscv.split(X_tune):
        X_train, X_test = X_tune.iloc[tr_idx], X_tune.iloc[te_idx]
        y_train, y_test = y_tune.iloc[tr_idx], y_tune.iloc[te_idx]

        # Fit the preprocessor on the training part of the fold only, so the
        # imputation medians and scaling statistics never see the validation
        # rows (a preprocessor fitted on all of X_tune would leak them).
        fold_prep = make_preprocessor()
        fold_prep.fit(X_train, y_train)
        X_train_t = apply_vt_mask(fold_prep.transform(X_train))
        X_test_t = apply_vt_mask(fold_prep.transform(X_test))

        model = XGBRegressor(**params)
        model.fit(X_train_t, y_train)
        y_pred = model.predict(X_test_t)
        all_r2.append(r2_score(y_test, y_pred))

    return np.mean(all_r2)

# Create the study and run the search. The sampler is seeded so the sequence
# of suggested hyperparameters is repeatable (given a deterministic objective).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)  # 25 trials

# Best hyperparameters found by the study
best_params = study.best_params
print("HOÀN THÀNH TUNING! R2 tốt nhất:", study.best_value)
print("Tham số tốt nhất:", best_params)

BẮT ĐẦU TUNING XGBoost với Optuna...


[I 2025-10-27 00:09:08,723] A new study created in memory with name: no-name-c62e1665-c252-4dd5-ab55-b6106d5dd1d6
[W 2025-10-27 00:09:13,001] Trial 0 failed with parameters: {'n_estimators': 364, 'max_depth': 4, 'learning_rate': 0.010390403084233219, 'subsample': 0.5631350185983126, 'colsample_bytree': 0.6332026967287642, 'min_child_weight': 2, 'reg_alpha': 7.406517216782404, 'reg_lambda': 6.451307621434087} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\ADMIN\AppData\Local\Temp\ipykernel_11056\4098447385.py", line 39, in objective
    model.fit(X_train_t, y_train)
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^

KeyboardInterrupt: 

In [3]:
# Hard-coded hyperparameters (presumably copied from an earlier tuning run —
# TODO confirm). This assignment replaces any `best_params` set by the tuning cell.
best_params = {
    'n_estimators': 50,
    'max_depth': 3,
    'learning_rate': 0.08207254032851698,
    'subsample': 0.9160744838218801,
    'colsample_bytree': 0.9125188448842537,
    'min_child_weight': 9,
    'reg_alpha': 7.734783954873539,
    'reg_lambda': 3.1988045223999775,
}

In [55]:
# ---------- 3) CV round 1: score the ALREADY-TUNED model (no re-tuning) ----------
tscv = TimeSeriesSplit(n_splits=5)
all_rmse, all_mae, all_r2 = [], [], []
perm_importances_list = []

print("\n=== CV VÒNG 1: DÙNG MODEL ĐÃ TUNED + Permutation Importance ===")
for fold, (tr_idx, te_idx) in enumerate(tscv.split(X), start=1):
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    X_test, y_test = X.iloc[te_idx], y.iloc[te_idx]

    # Preprocessing statistics come from this fold's training rows only
    preprocessor = make_preprocessor().fit(X_train, y_train)
    X_train_t = apply_vt_mask(preprocessor.transform(X_train))
    X_test_t = apply_vt_mask(preprocessor.transform(X_test))

    # Fresh model built from the tuned hyperparameters
    model = XGBRegressor(tree_method='hist', n_jobs=-1, random_state=42, **best_params)
    model.fit(X_train_t, y_train)
    y_pred = model.predict(X_test_t)

    # Hold-out metrics for this fold
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    all_rmse.append(rmse)
    all_mae.append(mae)
    all_r2.append(r2)

    # Permutation importance, measured on the hold-out part of the fold.
    # NOTE(review): these hold-out importances are collected in
    # perm_importances_list; if they drive feature selection and the same folds
    # are scored again afterwards, those later scores are optimistic — confirm.
    perm = permutation_importance(
        model, X_test_t, y_test,
        n_repeats=3, random_state=42, n_jobs=-1,
    )
    perm_importances_list.append(perm.importances_mean)

    print(f"Fold {fold}: RMSE={rmse:.3f}  R2={r2:.3f}")

# Summary of CV round 1: mean ± standard deviation across folds
print("\n=== KẾT QUẢ TỔNG KẾT CV VÒNG 1 ===")
print(f"Avg RMSE: {np.mean(all_rmse):.3f} ± {np.std(all_rmse):.3f}")
print(f"Avg MAE: {np.mean(all_mae):.3f} ± {np.std(all_mae):.3f}")
print(f"Avg R2: {np.mean(all_r2):.3f} ± {np.std(all_r2):.3f}")


=== CV VÒNG 1: DÙNG MODEL ĐÃ TUNED + Permutation Importance ===
Fold 1: RMSE=2.785  R2=0.702
Fold 2: RMSE=2.523  R2=0.721
Fold 3: RMSE=2.368  R2=0.796
Fold 4: RMSE=2.550  R2=0.750
Fold 5: RMSE=2.196  R2=0.798

=== KẾT QUẢ TỔNG KẾT CV VÒNG 1 ===
Avg RMSE: 2.484 ± 0.196
Avg MAE: 1.981 ± 0.139
Avg R2: 0.753 ± 0.039


Fold 1: RMSE=2.894  R2=0.678
Fold 2: RMSE=2.580  R2=0.708
Fold 3: RMSE=2.512  R2=0.770
Fold 4: RMSE=2.653  R2=0.730
Fold 5: RMSE=2.224  R2=0.793

=== CV VÒNG 1: DÙNG MODEL ĐÃ TUNED + Permutation Importance ===
Fold 1: RMSE=2.841  R2=0.690
Fold 2: RMSE=2.507  R2=0.724
Fold 3: RMSE=2.385  R2=0.793
Fold 4: RMSE=2.611  R2=0.738
Fold 5: RMSE=2.191  R2=0.799

## 3. Selecting top K features

In [49]:
# === SELECT TOP-K FEATURES FROM THE FOLD-AVERAGED PERMUTATION IMPORTANCE ===
avg_perm = np.mean(perm_importances_list, axis=0)
order = np.argsort(avg_perm)[::-1]  # feature positions, most important first

# Permutation importance can be negative (shuffling a feature happened to
# improve the score). Negative values count as zero contribution, so the
# cumulative share is non-decreasing and the 95% cut-off is well defined.
gains = np.clip(avg_perm[order], 0.0, None)
total_gain = gains.sum()
if total_gain > 0:
    cumsum = np.cumsum(gains) / total_gain
    # Smallest prefix of the ranking that reaches 95% of the total importance
    top_k = int(np.argmax(cumsum >= 0.95) + 1)
else:
    # No feature has positive importance (or the total is NaN): there is
    # nothing to rank by, so keep every feature.
    cumsum = np.ones(len(order))
    top_k = len(order)
top_idx = order[:top_k]

print(f"\nTop-{top_k} features được chọn từ model đã tuning.")


Top-54 features được chọn từ model đã tuning.


## 4. Training model

In [50]:
import optuna
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Assumes make_preprocessor, apply_vt_mask, X, y, top_idx and top_k were defined by earlier cells (select_topk is defined just below)

def select_topk(X_transformed, top_idx=top_idx):
    """Keep only the columns at positions ``top_idx``.

    NumPy arrays are sliced directly; any other input is sliced through its
    ``.iloc`` indexer. The default for ``top_idx`` is the object bound to that
    name when this function was defined.
    """
    if isinstance(X_transformed, np.ndarray):
        return X_transformed[:, top_idx]
    return X_transformed.iloc[:, top_idx]

# Hàm objective cho Optuna
def objective(trial):
    """Optuna objective: mean out-of-fold R² of XGBoost on the selected top-k features.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters. The fold-averaged RMSE is
        stored on it under the user attribute ``'mean_rmse'``.

    Returns
    -------
    float
        Mean R² over a 5-split ``TimeSeriesSplit`` of ``X`` / ``y``.
    """
    # Hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist'
    }

    tscv = TimeSeriesSplit(n_splits=5)
    all_rmse, all_r2 = [], []

    # Time-ordered cross-validation
    for tr_idx, te_idx in tscv.split(X):
        X_train, X_test = X.iloc[tr_idx], X.iloc[te_idx]
        y_train, y_test = y.iloc[tr_idx], y.iloc[te_idx]

        # Preprocess with statistics from the training part of the fold only,
        # then apply the variance mask and the top-k column selection
        preprocessor = make_preprocessor()
        preprocessor.fit(X_train, y_train)
        X_train_t = select_topk(apply_vt_mask(preprocessor.transform(X_train)), top_idx)
        X_test_t = select_topk(apply_vt_mask(preprocessor.transform(X_test)), top_idx)

        # Train and predict
        model = XGBRegressor(**params)
        model.fit(X_train_t, y_train)
        y_pred = model.predict(X_test_t)

        # Per-fold metrics
        all_rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        all_r2.append(r2_score(y_test, y_pred))

    # Expose the fold-averaged RMSE next to the optimized R² so it can be read
    # back later from the trial's user attributes instead of being discarded.
    trial.set_user_attr('mean_rmse', float(np.mean(all_rmse)))

    # The study maximizes this value
    return np.mean(all_r2)

# Create the study (maximizing R²) and run the search. The sampler is seeded so
# the sequence of suggested hyperparameters is repeatable (given a
# deterministic objective).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)  # 100 trials

# Best hyperparameters found by the study
best_params = study.best_params
print("Best parameters:", best_params)
print(f"Best R2: {study.best_value:.3f}")

[I 2025-10-26 22:57:08,554] A new study created in memory with name: no-name-45814b0f-b27c-4051-86b8-e2c40deac0a8
[I 2025-10-26 22:57:10,105] Trial 0 finished with value: 0.7384317769252629 and parameters: {'n_estimators': 201, 'max_depth': 3, 'learning_rate': 0.13275274087292344, 'subsample': 0.9834444641994783, 'colsample_bytree': 0.7979757207590283, 'min_child_weight': 9, 'reg_alpha': 4.118050818285707, 'reg_lambda': 0.7057656565877779}. Best is trial 0 with value: 0.7384317769252629.
[I 2025-10-26 22:57:13,647] Trial 1 finished with value: 0.7382287647247276 and parameters: {'n_estimators': 417, 'max_depth': 7, 'learning_rate': 0.11363134494163409, 'subsample': 0.5821708951975614, 'colsample_bytree': 0.5243539960101882, 'min_child_weight': 9, 'reg_alpha': 2.8034329538873126, 'reg_lambda': 3.7972325348045466}. Best is trial 0 with value: 0.7384317769252629.
[I 2025-10-26 22:57:16,471] Trial 2 finished with value: 0.7263245905990934 and parameters: {'n_estimators': 850, 'max_depth': 

Best parameters: {'n_estimators': 410, 'max_depth': 3, 'learning_rate': 0.015045552102080887, 'subsample': 0.5494946668756909, 'colsample_bytree': 0.5915983274819723, 'min_child_weight': 7, 'reg_alpha': 6.1774878685238495, 'reg_lambda': 0.008500682384123714}
Best R2: 0.766


In [23]:
# Hard-coded hyperparameters (presumably copied from an earlier tuning run —
# TODO confirm).
# NOTE(review): the following cell assigns best_params again, so in a
# top-to-bottom run this value is never used — confirm which set is intended.
best_params = {
    'n_estimators': 748,
    'max_depth': 3,
    'learning_rate': 0.011587939821652413,
    'subsample': 0.7734953360178061,
    'colsample_bytree': 0.5835833834497329,
    'min_child_weight': 9,
    'reg_alpha': 1.719826025944296,
    'reg_lambda': 5.592049232867018,
}

In [41]:
# Hard-coded hyperparameters (presumably copied from an earlier tuning run —
# TODO confirm). This assignment replaces any `best_params` set earlier.
best_params = {
    'n_estimators': 374,
    'max_depth': 4,
    'learning_rate': 0.012485536016134191,
    'subsample': 0.6097062712048895,
    'colsample_bytree': 0.6769721953560869,
    'min_child_weight': 4,
    'reg_alpha': 0.02541725541718065,
    'reg_lambda': 2.518986025206981,
}

In [51]:
# ---------- 4) CV round 2: refit with the top-k features + tuned hyperparameters ----------
# One estimator object, refitted on every fold; `tscv` is the splitter already
# defined by an earlier cell.
xgb_final = XGBRegressor(tree_method='hist', n_jobs=-1, random_state=42, **best_params)
all_rmse2, all_r2_2 = [], []

print(f"\n=== CV VÒNG 2: Top-{top_k} + Tham số tối ưu ===")
for fold, (tr_idx, te_idx) in enumerate(tscv.split(X), start=1):
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    X_test, y_test = X.iloc[te_idx], y.iloc[te_idx]

    # Preprocess on the fold's training rows, then mask and keep the top-k columns
    preprocessor = make_preprocessor().fit(X_train, y_train)
    X_train_t = select_topk(apply_vt_mask(preprocessor.transform(X_train)))
    X_test_t = select_topk(apply_vt_mask(preprocessor.transform(X_test)))

    xgb_final.fit(X_train_t, y_train)
    y_pred = xgb_final.predict(X_test_t)

    # Hold-out metrics for this fold
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    all_rmse2.append(rmse)
    all_r2_2.append(r2)
    print(f"Fold {fold}: RMSE={rmse:.3f}  R2={r2:.3f}")

# Mean ± standard deviation of RMSE and mean R² across the folds
print(f"\nFINAL: RMSE={np.mean(all_rmse2):.3f} ± {np.std(all_rmse2):.3f} | R2={np.mean(all_r2_2):.3f}")


=== CV VÒNG 2: Top-54 + Tham số tối ưu ===
Fold 1: RMSE=2.586  R2=0.743
Fold 2: RMSE=2.485  R2=0.729
Fold 3: RMSE=2.312  R2=0.806
Fold 4: RMSE=2.557  R2=0.749
Fold 5: RMSE=2.176  R2=0.802

FINAL: RMSE=2.423 ± 0.156 | R2=0.766
