In [27]:
# import lib
import pandas as pd
import numpy as np

from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt


In [28]:
df = pd.read_csv('D:/project_data_mining/src/dataset.csv')

In [29]:
# encode category 
df["cat_id"] = df["Expenditure_category"].astype("category").cat.codes


In [30]:
# lag features
def create_lag_features(df):
    df = df.copy()

    for lag in [1, 3, 6, 12]:
        df[f"CPI_lag_{lag}"] = (
            df.groupby("Expenditure_category")["CPI"].shift(lag)
        )
    return df

In [31]:
# rolling
def create_rolling_features(df):
    df = df.copy()

    df["CPI_roll_mean_3"] = (
        df.groupby("Expenditure_category")["CPI"]
          .shift(1)
          .rolling(3)
          .mean()
    )

    df["CPI_roll_std_6"] = (
        df.groupby("Expenditure_category")["CPI"]
          .shift(1)
          .rolling(6)
          .std()
    )

    return df


In [32]:
df = create_lag_features(df)
df = create_rolling_features(df)

# Drop NaN sinh ra do lag/rolling
df_ml = df.dropna().reset_index(drop=True)

In [33]:
# # split train/val/test 
train_df = df[(df["TIME_PERIOD"] >= "2011-01-01") & (df["TIME_PERIOD"] <= "2020-12-31")]
val_df   = df[(df["TIME_PERIOD"] >= "2021-01-01") & (df["TIME_PERIOD"] <= "2022-12-31")]
test_df  = df[(df["TIME_PERIOD"] >= "2023-01-01") & (df["TIME_PERIOD"] <= "2025-12-31")]

In [34]:
# feature set 
features = [
    "cat_id",
    "CPI_lag_1", "CPI_lag_3", "CPI_lag_6", "CPI_lag_12",
    "CPI_roll_mean_3",
    "CPI_roll_std_6"
]

X_train = train_df[features]
y_train = train_df["CPI"]

X_val = val_df[features]
y_val = val_df["CPI"]

X_test = test_df[features]
y_test = test_df["CPI"]

In [35]:
lgb = lgb.LGBMRegressor(
    objective="regression",
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
# train
lgb.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 7
[LightGBM] [Info] Start training from score 87.359074


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,1000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [36]:
# eval
def smape(y_true, y_pred):
    return np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    ) * 100

def pred_within_10pct(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true) / y_true <= 0.10) * 100

def evaluate(y_true, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    smp  = smape(y_true, y_pred)
    p10  = pred_within_10pct(y_true, y_pred)

    print(f"\n{name} (lgb MODEL)")
    print(f"RMSE       : {rmse:.3f}")
    print(f"MAE        : {mae:.3f}")
    print(f"sMAPE      : {smp:.2f}%")
    print(f"Pred <10%  : {p10:.2f}%")


In [37]:
# results
val_pred  = lgb.predict(X_val)
test_pred = lgb.predict(X_test)

evaluate(y_val,  val_pred,  "Validation")
evaluate(y_test, test_pred, "Test")


Validation (lgb MODEL)
RMSE       : 1.129
MAE        : 0.737
sMAPE      : 0.81%
Pred <10%  : 100.00%

Test (lgb MODEL)
RMSE       : 1.101
MAE        : 0.876
sMAPE      : 0.88%
Pred <10%  : 100.00%


In [38]:
# CPI actual - CPI predict 
results_test = test_df[[
    "TIME_PERIOD",
    "Expenditure_category",
    "CPI"
]].copy()

results_test["CPI_pred_gbm"] = test_pred

results_test.head()

Unnamed: 0,TIME_PERIOD,Expenditure_category,CPI,CPI_pred_gbm
144,2023-01-01,All Items,95.772,94.32168
145,2023-02-01,All Items,96.308,94.878831
146,2023-03-01,All Items,96.792,95.491065
147,2023-04-01,All Items,96.869,96.364524
148,2023-05-01,All Items,97.181,96.634413
