In [22]:
# import lib
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt


In [23]:
df = pd.read_csv('D:/project_data_mining/src/dataset.csv')

In [24]:
# encode category 
df["cat_id"] = df["Expenditure_category"].astype("category").cat.codes


In [25]:
# lag features
def create_lag_features(df):
    df = df.copy()

    for lag in [1, 3, 6, 12]:
        df[f"CPI_lag_{lag}"] = (
            df.groupby("Expenditure_category")["CPI"].shift(lag)
        )
    return df

In [26]:
# rolling
def create_rolling_features(df):
    df["CPI_roll_mean_3"] = (
        df.groupby("Expenditure_category")["CPI"]
          .shift(1)
          .rolling(3)
          .mean()
    )

    df["CPI_roll_std_6"] = (
        df.groupby("Expenditure_category")["CPI"]
          .shift(1)
          .rolling(6)
          .std()
    )

    return df

In [27]:
df = create_lag_features(df)
df = create_rolling_features(df)

# Drop NaN sinh ra do lag/rolling
df_ml = df.dropna().reset_index(drop=True)

In [28]:
# # split train/val/test 
train_df = df[(df["TIME_PERIOD"] >= "2011-01-01") & (df["TIME_PERIOD"] <= "2020-12-31")]
val_df   = df[(df["TIME_PERIOD"] >= "2021-01-01") & (df["TIME_PERIOD"] <= "2022-12-31")]
test_df  = df[(df["TIME_PERIOD"] >= "2023-01-01") & (df["TIME_PERIOD"] <= "2025-12-31")]

In [29]:
# feature set 
features = [
    "cat_id",
    "CPI_lag_1", "CPI_lag_3", "CPI_lag_6", "CPI_lag_12",
    "CPI_roll_mean_3",
    "CPI_roll_std_6"
]

X_train = train_df[features]
y_train = train_df["CPI"]

X_val = val_df[features]
y_val = val_df["CPI"]

X_test = test_df[features]
y_test = test_df["CPI"]


In [30]:
xgb = XGBRegressor(
    n_estimators=500,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

xgb.fit(X_train, y_train)

Parameters: { "min_samples_leaf" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
# eval
def smape(y_true, y_pred):
    return np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    ) * 100

def pred_within_10pct(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true) / y_true <= 0.10) * 100

def evaluate_global(y_true, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    smp  = smape(y_true, y_pred)
    p10  = pred_within_10pct(y_true, y_pred)

    print(f"\n{name} (xgb MODEL)")
    print(f"RMSE       : {rmse:.3f}")
    print(f"MAE        : {mae:.3f}")
    print(f"sMAPE      : {smp:.2f}%")
    print(f"Pred <10%  : {p10:.2f}%")


In [32]:
# results
val_pred  = xgb.predict(X_val)
test_pred = xgb.predict(X_test)

evaluate_global(y_val,  val_pred,  "Validation")
evaluate_global(y_test, test_pred, "Test")



Validation (xgb MODEL)
RMSE       : 1.007
MAE        : 0.686
sMAPE      : 0.75%
Pred <10%  : 100.00%

Test (xgb MODEL)
RMSE       : 1.084
MAE        : 0.850
sMAPE      : 0.86%
Pred <10%  : 100.00%


In [33]:
# CPI actual - CPI predict 
results_test = test_df[[
    "TIME_PERIOD",
    "Expenditure_category",
    "CPI"
]].copy()

results_test["CPI_pred_XGB"] = test_pred

results_test.head() 

Unnamed: 0,TIME_PERIOD,Expenditure_category,CPI,CPI_pred_XGB
144,2023-01-01,All Items,95.772,95.040886
145,2023-02-01,All Items,96.308,95.711952
146,2023-03-01,All Items,96.792,95.949287
147,2023-04-01,All Items,96.869,96.935303
148,2023-05-01,All Items,97.181,96.918823
