In [11]:
# import lib
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, LSTM, Dense, Embedding,
    Concatenate, Dropout, Flatten
)
from tensorflow.keras.models import Model


import matplotlib.pyplot as plt

In [12]:
df = pd.read_csv('D:/project_data_mining/src/dataset.csv')

In [13]:
# encode category 
cat2id = {c: i for i, c in enumerate(df["Expenditure_category"].unique())}
df["cat_id"] = df["Expenditure_category"].map(cat2id)
N_CAT = len(cat2id)

In [14]:
# # split train/val/test 
train_df = df[(df["TIME_PERIOD"] >= "2011-01-01") & (df["TIME_PERIOD"] <= "2020-12-31")]
val_df   = df[(df["TIME_PERIOD"] >= "2021-01-01") & (df["TIME_PERIOD"] <= "2022-12-31")]
test_df  = df[(df["TIME_PERIOD"] >= "2023-01-01") & (df["TIME_PERIOD"] <= "2025-12-31")]


In [15]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler_cpi = StandardScaler()
scaler_cpi.fit(train_df[["CPI"]])

def scale_no_exog(df):
    df[["CPI"]] = scaler_cpi.transform(df[["CPI"]])
    return df

train_df = scale_no_exog(train_df)
val_df   = scale_no_exog(val_df)
test_df  = scale_no_exog(test_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["CPI"]] = scaler_cpi.transform(df[["CPI"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["CPI"]] = scaler_cpi.transform(df[["CPI"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["CPI"]] = scaler_cpi.transform(df[["CPI"]])


In [16]:
# create sequence
def make_sequence_no_exog(df, window=12):
    X_seq, y, cat, time_idx = [], [], [], []

    for cid, g in df.groupby("cat_id"):
        g = g.sort_values("TIME_PERIOD")
        values = g[["CPI"]].values  # ⬅️ chỉ CPI

        for i in range(window, len(g)):
            X_seq.append(values[i-window:i])
            y.append(values[i, 0])   # CPI(t)
            cat.append(cid)
            time_idx.append(g["TIME_PERIOD"].iloc[i])

    return (
        np.array(X_seq, dtype=np.float32),
        np.array(y, dtype=np.float32),
        np.array(cat, dtype=np.int32),
        np.array(time_idx)
    )


In [17]:
WINDOW = 12

X_train, y_train, cat_train, t_train = make_sequence_no_exog(train_df, WINDOW)
X_val,   y_val,   cat_val,   t_val   = make_sequence_no_exog(val_df, WINDOW)
X_test,  y_test,  cat_test,  t_test  = make_sequence_no_exog(test_df, WINDOW)


In [18]:
# model LSTM
seq_in = Input(shape=(WINDOW, 1), name="seq")
cat_in = Input(shape=(1,), name="cat")

cat_emb = Embedding(N_CAT, 8)(cat_in)
cat_emb = Flatten()(cat_emb)
cat_emb = Dense(8, activation="relu")(cat_emb)

x = LSTM(64)(seq_in)
x = Concatenate()([x, cat_emb])
x = Dense(32, activation="relu")(x)
out = Dense(1)(x)

model = Model([seq_in, cat_in], out)
model.compile(optimizer="adam", loss="mse")

model.summary()



In [19]:
# train
model.fit(
    [X_train, cat_train],     
    y_train,
    validation_data=([X_val, cat_val], y_val),
    epochs=30,
    batch_size=32,
    verbose=1
)


Epoch 1/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 0.1994 - val_loss: 0.0525
Epoch 2/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0159 - val_loss: 0.0521
Epoch 3/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0100 - val_loss: 0.0533
Epoch 4/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0094 - val_loss: 0.0515
Epoch 5/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0089 - val_loss: 0.0475
Epoch 6/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0088 - val_loss: 0.0427
Epoch 7/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0085 - val_loss: 0.0393
Epoch 8/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0089 - val_loss: 0.0368
Epoch 9/30
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x1cec8f20e90>

In [20]:
print("X_test shape :", X_test.shape)
print("y_test shape :", y_test.shape)
print("cat_test shape :", cat_test.shape)


X_test shape : (264, 12, 1)
y_test shape : (264,)
cat_test shape : (264,)


In [21]:
y_pred = model.predict([X_test, cat_test]).flatten()


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step


In [22]:
# metric
def smape(y_true, y_pred):
    return np.mean(
        2 * np.abs(y_pred - y_true) /
        (np.abs(y_true) + np.abs(y_pred) + 1e-8)
    ) * 100

def pred_10(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true) / np.abs(y_true) < 0.10)

# eval
def evaluate_global(y_true, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    smp  = smape(y_true, y_pred)
    p10  = pred_10(y_true, y_pred)

def evaluate(y_true, y_pred, name="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    smp  = smape(y_true, y_pred)
    p10  = pred_10(y_true, y_pred)

    # print ra màn hình
    print(f"\n{name} (LSTM MODEL)")
    print(f"RMSE       : {rmse:.4f}")
    print(f"MAE        : {mae:.4f}")
    print(f"sMAPE (%)  : {smp:.2f}")
    print(f"Pred(<10%) : {p10:.2f}")

evaluate(y_test, y_pred, name="")


 (LSTM MODEL)
RMSE       : 0.1205
MAE        : 0.0942
sMAPE (%)  : 6.86
Pred(<10%) : 0.80
