# Thư viện

In [1]:
import os
from dataclasses import dataclass, asdict
from typing import List, Tuple
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import joblib

# Cấu hình

In [2]:
# ----------------------------
# Config
# ----------------------------
@dataclass
class Config:
  #data_path: str = '/mnt/data/Metro_Interstate_Traffic_Volume.csv' # đường dẫn file (tùy chỉnh khi khởi tạo)
  data_path: str = '/content/Metro_Interstate_Traffic_Volume.csv'
  pred_col: str = 'traffic_volume'
  features: List[str] = None # nếu None sẽ tự động chọn numeric + time features
  lookback: int = 24
  horizon_task1: int = 2
  horizon_task2: int = 5
  test_size_ratio: float = 0.15
  val_size_ratio: float = 0.15
  batch_size: int = 64
  scaler_path: str = '/content/data/processed/scaler.save'
  cleaned_data_path: str = '/content/data/processed/cleaned_data.parquet'
  models_dir: str = '/content/data/models'
  device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

  def __post_init__(self):
    if self.features is None:
    # default numeric features plus time
      self.features = ['traffic_volume', 'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'hour', 'dayofweek']

# Xử lí dữ liệu thô

In [3]:
# ----------------------------
# Data processing OOP
# ----------------------------
class DataProcessor:
  """
      Class chịu trách nhiệm đọc, clean, feature-engineer, scale và xuất dữ liệu đã xử lý.
      Giao diện rõ ràng: khởi tạo với Config, gọi .run() để thực hiện và lưu file.
  """
  def __init__(self, cfg: Config):
    self.cfg = cfg
    os.makedirs(os.path.dirname(cfg.cleaned_data_path), exist_ok=True)
    os.makedirs(cfg.models_dir, exist_ok=True)
    self.scaler = None

  def read(self) -> pd.DataFrame:
    df = pd.read_csv(self.cfg.data_path)
    return df

  def clean_and_feature(self, df: pd.DataFrame) -> pd.DataFrame:
  # Kiểm tra cột datetime
    if 'date_time' not in df.columns:
      raise ValueError('Dữ liệu cần cột date_time')
    df['date_time'] = pd.to_datetime(df['date_time'])
    df = df.sort_values('date_time').reset_index(drop=True)
    df = df.set_index('date_time')

    # tạo time features
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    # điền missing cho các cột quan trọng
    needed = [self.cfg.pred_col] + [c for c in self.cfg.features if c!=self.cfg.pred_col]
    for c in needed:
      if c not in df.columns:
        raise ValueError(f'Column {c} không có trong dữ liệu')
    df[needed] = df[needed].interpolate().fillna(method='bfill').fillna(method='ffill')
    return df

  def scale(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, MinMaxScaler]:
    cols = self.cfg.features
    self.scaler = MinMaxScaler()
    arr = self.scaler.fit_transform(df[cols])
    df_scaled = pd.DataFrame(arr, index=df.index, columns=cols)
    return df_scaled, self.scaler

  def save(self, df_scaled: pd.DataFrame):
  # lưu parquet cho nhanh, lưu scaler
    df_scaled.to_parquet(self.cfg.cleaned_data_path, index=True)
    joblib.dump(self.scaler, self.cfg.scaler_path)

  def run(self) -> pd.DataFrame:
    df = self.read()
    df_clean = self.clean_and_feature(df)
    df_scaled, scaler = self.scale(df_clean)
    self.save(df_scaled)
    print('Saved cleaned scaled data to', self.cfg.cleaned_data_path)
    print('Saved scaler to', self.cfg.scaler_path)
    return df_scaled

  # test unit
  def test_read_and_clean(self):
    df = self.read()
    df2 = self.clean_and_feature(df.copy())
    print('Test read/clean: columns after processing ->', df2.columns.tolist())

# Load Dataset

In [4]:
# ----------------------------
# Dataset class (reusable)
# ----------------------------
class TimeSeriesDataset(Dataset):
  def __init__(self, df: pd.DataFrame, lookback: int, horizon: int, pred_col: str):
    self.array = df.values.astype(np.float32)
    self.lookback = lookback
    self.horizon = horizon
    self.pred_idx = df.columns.get_loc(pred_col)
    self.length = len(self.array) - lookback - horizon + 1

  def __len__(self):
    return max(0, self.length)

  def __getitem__(self, idx):
    s = idx
    X = self.array[s:s+self.lookback]
    y = self.array[s+self.lookback:s+self.lookback+self.horizon, self.pred_idx]
    return torch.from_numpy(X), torch.from_numpy(y)

# Mô hình

In [5]:
# ----------------------------
# Model definitions (each class is independent)
# ----------------------------
class SimpleRNN(nn.Module):
  def __init__(self, input_dim:int, hidden:int=64, num_layers:int=1, horizon:int=1):
    super().__init__()
    self.rnn = nn.RNN(input_dim, hidden, num_layers=num_layers, batch_first=True)
    self.head = nn.Sequential(nn.Linear(hidden, hidden//2), nn.ReLU(), nn.Linear(hidden//2, horizon))

  def forward(self, x):
    out, _ = self.rnn(x)
    last = out[:, -1, :]
    return self.head(last)

class SimpleLSTM(nn.Module):
  def __init__(self, input_dim:int, hidden:int=64, num_layers:int=1, horizon:int=1):
    super().__init__()
    self.lstm = nn.LSTM(input_dim, hidden, num_layers=num_layers, batch_first=True)
    self.head = nn.Sequential(nn.Linear(hidden, hidden//2), nn.ReLU(), nn.Linear(hidden//2, horizon))

  def forward(self, x):
    out, _ = self.lstm(x)
    last = out[:, -1, :]
    return self.head(last)

In [6]:
# Customized LSTM: thêm attention + residual connection
class CustomizedLSTM(nn.Module):
  def __init__(self, input_dim:int, hidden:int=128, num_layers:int=1, horizon:int=1):
    super().__init__()
    self.lstm = nn.LSTM(input_dim, hidden, num_layers=num_layers, batch_first=True, dropout=0.1)
    self.attn = nn.Sequential(nn.Linear(hidden, hidden//2), nn.Tanh(), nn.Linear(hidden//2, 1))
    self.res_fc = nn.Linear(input_dim, hidden)
    self.head = nn.Sequential(nn.Linear(hidden, hidden//2), nn.ReLU(), nn.Linear(hidden//2, horizon))

  def forward(self, x):
    # x: batch, seq, feat
    out, _ = self.lstm(x) # out: batch, seq, hidden
    # attention pooling over time
    scores = self.attn(out).squeeze(-1) # batch, seq
    weights = torch.softmax(scores, dim=1).unsqueeze(-1)
    context = (out * weights).sum(dim=1) # batch, hidden
    # residual from last input timestep features (projected)
    residual = self.res_fc(x[:, -1, :])
    combined = context + residual
    return self.head(combined)

In [7]:
# Seq2Seq with Attention (encoder-decoder) for multistep
class Encoder(nn.Module):
  def __init__(self, input_dim, hidden):
    super().__init__()
    self.lstm = nn.LSTM(input_dim, hidden, batch_first=True)

  def forward(self, x):
    out, (h,c) = self.lstm(x)
    return out, (h,c)

class Attention(nn.Module):
  def __init__(self, hidden):
    super().__init__()
    self.fc = nn.Linear(hidden*2, hidden)
    self.v = nn.Linear(hidden, 1, bias=False)

  def forward(self, hidden, encoder_outputs):
    seq = encoder_outputs.size(1)
    hidden = hidden.unsqueeze(1).repeat(1, seq, 1)
    e = torch.tanh(self.fc(torch.cat((hidden, encoder_outputs), dim=2)))
    s = self.v(e).squeeze(2)
    w = torch.softmax(s, dim=1)
    ctx = torch.bmm(w.unsqueeze(1), encoder_outputs).squeeze(1)
    return ctx, w

class Decoder(nn.Module):
  def __init__(self, input_dim, hidden):
    super().__init__()
    self.cell = nn.LSTMCell(input_dim, hidden)
    self.att = Attention(hidden)
    self.out = nn.Linear(hidden, 1)

  def forward(self, enc_out, h, c, tgt_len):
    batch = enc_out.size(0); device = enc_out.device
    inp = torch.zeros(batch, 1, device=device)
    h = h.squeeze(0); c = c.squeeze(0)
    outs = []
    for _ in range(tgt_len):
      ctx, _ = self.att(h, enc_out)
      rnn_in = torch.cat((inp, ctx), dim=1)
      h, c = self.cell(rnn_in, (h, c))
      o = self.out(h)
      outs.append(o)
      inp = o
    outs = torch.stack(outs, dim=1).squeeze(-1)
    return outs


class Seq2SeqAttention(nn.Module):
  def __init__(self, input_dim, enc_hidden=128, dec_hidden=128):
    super().__init__()
    self.enc = Encoder(input_dim, enc_hidden)
    self.dec = Decoder(1+enc_hidden, dec_hidden)

  def forward(self, x, tgt_len):
    enc_out, (h,c) = self.enc(x)
    return self.dec(enc_out, h, c, tgt_len)

In [8]:
# ----------------------------
# Trainer + Evaluator utilities
# ----------------------------
class Trainer:
  def __init__(self, cfg: Config, scaler_path: str = None):
    self.cfg = cfg
    self.device = torch.device(cfg.device)
    self.scaler = None
    if scaler_path and os.path.exists(scaler_path):
      self.scaler = joblib.load(scaler_path)

  def _inv_scale(self, y_scaled: np.ndarray, scaler) -> np.ndarray:
    n, h = y_scaled.shape
    mats = np.zeros((n, len(self.cfg.features)))
    outs = []
    for i in range(h):
      mats[:, self.cfg.features.index(self.cfg.pred_col)] = y_scaled[:, i]
      outs.append(scaler.inverse_transform(mats)[:, self.cfg.features.index(self.cfg.pred_col)])
    return np.stack(outs, axis=1)

  def train(self, model: nn.Module, train_loader: DataLoader, val_loader: DataLoader,
    epochs=10, lr=1e-3, task=1, horizon=1):
    model = model.to(self.device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.MSELoss()
    best_state = None; best_val = float('inf')
    for ep in range(1, epochs+1):
      model.train(); tr_losses = []
      for xb, yb in train_loader:
        xb = xb.to(self.device); yb = yb.to(self.device)
        opt.zero_grad()
        if task==1:
          out = model(xb)
        else:
          out = model(xb, horizon)
        loss = crit(out, yb)
        loss.backward(); opt.step(); tr_losses.append(loss.item())
      model.eval(); val_losses = []
      with torch.no_grad():
        for xb, yb in val_loader:
          xb = xb.to(self.device); yb = yb.to(self.device)
          if task==1:
            out = model(xb)
          else:
            out = model(xb, horizon)
          val_losses.append(crit(out, yb).item())
      tr_loss = float(np.mean(tr_losses)); val_loss = float(np.mean(val_losses))
      if val_loss < best_val:
        best_val = val_loss; best_state = model.state_dict()
      if ep % max(1, epochs//5) == 0 or ep==1:
        print(f'Epoch {ep}/{epochs} train={tr_loss:.6f} val={val_loss:.6f}')
    if best_state is not None:
      model.load_state_dict(best_state)
    return model

  def evaluate(self, model: nn.Module, loader: DataLoader, horizon:int, task=1, scaler=None):
    model = model.to(self.device); model.eval()
    Ys=[]; Ps=[]
    with torch.no_grad():
      for xb, yb in loader:
        xb = xb.to(self.device)
        if task==1:
          p = model(xb).cpu().numpy()
        else:
          p = model(xb, horizon).cpu().numpy()
        Ys.append(yb.numpy()); Ps.append(p)
    Y = np.vstack(Ys); P = np.vstack(Ps)
    if scaler is None:
      raise ValueError('scaler required to inverse transform')
    Y_inv = self._inv_scale(Y, scaler); P_inv = self._inv_scale(P, scaler)
    rows = []
    for h in range(horizon):
      yt = Y_inv[:, h]; yp = P_inv[:, h]
      mae = mean_absolute_error(yt, yp)
      rmse = np.sqrt(mean_squared_error(yt, yp))
      r2 = r2_score(yt, yp)
      denom = np.sum((yt - np.mean(yt))**2)
      nse = 1 - np.sum((yt-yp)**2)/denom if denom!=0 else np.nan
      rows.append({'horizon': h+1, 'MAE': mae, 'RMSE': rmse, 'R2': r2, 'NSE': nse})
    return pd.DataFrame(rows), Y_inv, P_inv

In [9]:
# ----------------------------
# Pipeline manager: tiện lợi để gọi
# ----------------------------
class PipelineManager:
  def __init__(self, cfg: Config):
    self.cfg = cfg
    os.makedirs(os.path.dirname(cfg.cleaned_data_path), exist_ok=True)
    os.makedirs(cfg.models_dir, exist_ok=True)
    self.processor = DataProcessor(cfg)
    self.trainer = Trainer(cfg, scaler_path=cfg.scaler_path if os.path.exists(cfg.scaler_path) else None)

  def prepare(self):
    df_scaled = pd.read_parquet(self.cfg.cleaned_data_path) if os.path.exists(self.cfg.cleaned_data_path) else self.processor.run()
    scaler = joblib.load(self.cfg.scaler_path) if os.path.exists(self.cfg.scaler_path) else self.processor.scaler
    return df_scaled, scaler

  def create_loaders(self, df_scaled: pd.DataFrame, lookback: int, horizon: int):
    n = len(df_scaled)
    train_n = int(n * (1 - self.cfg.val_size_ratio - self.cfg.test_size_ratio))
    val_n = int(n * self.cfg.val_size_ratio)
    train_df = df_scaled.iloc[:train_n]
    val_df = df_scaled.iloc[train_n:train_n+val_n]
    test_df = df_scaled.iloc[train_n+val_n:]
    tr_ds = TimeSeriesDataset(train_df, lookback, horizon, self.cfg.pred_col)
    va_ds = TimeSeriesDataset(val_df, lookback, horizon, self.cfg.pred_col)
    te_ds = TimeSeriesDataset(test_df, lookback, horizon, self.cfg.pred_col)
    tr = DataLoader(tr_ds, batch_size=self.cfg.batch_size, shuffle=True)
    va = DataLoader(va_ds, batch_size=self.cfg.batch_size, shuffle=False)
    te = DataLoader(te_ds, batch_size=self.cfg.batch_size, shuffle=False)
    return tr, va, te

  def save_model(self, model: nn.Module, name: str):
    path = os.path.join(self.cfg.models_dir, name + '.pth')
    torch.save(model.state_dict(), path)
    print('Saved model to', path)
    return path

# Demo và Test

In [10]:
# ----------------------------
# Demo + tests
# ----------------------------
def demo_full_pipeline(cfg: Config):
  pm = PipelineManager(cfg)

  # prepare data
  df_scaled, scaler = pm.prepare()
  print('Data scaled loaded shape:', df_scaled.shape)

  # create loaders for Task1
  tr1, va1, te1 = pm.create_loaders(df_scaled, cfg.lookback, cfg.horizon_task1)

  # instantiate models
  input_dim = len(cfg.features)
  rnn = SimpleRNN(input_dim, hidden=64, num_layers=1, horizon=cfg.horizon_task1)
  lstm = SimpleLSTM(input_dim, hidden=64, num_layers=1, horizon=cfg.horizon_task1)
  cust = CustomizedLSTM(input_dim, hidden=128, num_layers=1, horizon=cfg.horizon_task1)
  trainer = pm.trainer

  # train small epochs for demo
  print('Training SimpleRNN (demo)...')
  rnn = trainer.train(rnn, tr1, va1, epochs=6, lr=1e-3, task=1, horizon=cfg.horizon_task1)
  print('Training SimpleLSTM (demo)...')
  lstm = trainer.train(lstm, tr1, va1, epochs=6, lr=1e-3, task=1, horizon=cfg.horizon_task1)
  print('Training CustomizedLSTM (demo)...')
  cust = trainer.train(cust, tr1, va1, epochs=6, lr=1e-3, task=1, horizon=cfg.horizon_task1)

  # evaluate
  print('Evaluate RNN:')
  met_rnn, Y_rnn, P_rnn = trainer.evaluate(rnn, te1, cfg.horizon_task1, task=1, scaler=scaler)
  print(met_rnn)

  print('Evaluate LSTM:')
  met_lstm, Y_lstm, P_lstm = trainer.evaluate(lstm, te1, cfg.horizon_task1, task=1, scaler=scaler)
  print(met_lstm)

  print('Evaluate Customized:')
  met_cust, Y_cust, P_cust = trainer.evaluate(cust, te1, cfg.horizon_task1, task=1, scaler=scaler)
  print(met_cust)

  # save models
  pm.save_model(rnn, 'simple_rnn_task1')
  pm.save_model(lstm, 'simple_lstm_task1')
  pm.save_model(cust, 'custom_lstm_task1')

  # Task2: Seq2Seq
  tr2, va2, te2 = pm.create_loaders(df_scaled, cfg.lookback, cfg.horizon_task2)
  seq = Seq2SeqAttention(input_dim, enc_hidden=128, dec_hidden=128)
  print('Training Seq2Seq (demo)...')

  seq = trainer.train(seq, tr2, va2, epochs=8, lr=1e-3, task=2, horizon=cfg.horizon_task2)
  met_seq, Y_seq, P_seq = trainer.evaluate(seq, te2, cfg.horizon_task2, task=2, scaler=scaler)
  print('Seq2Seq metrics:')
  print(met_seq)
  pm.save_model(seq, 'seq2seq_task2')

  print('Demo finished. Models and processed data saved.')

# Unit tests for small parts
def test_dataset_class(cfg: Config):
  proc = DataProcessor(cfg)
  df_scaled = proc.run()
  # create small dataset
  ds = TimeSeriesDataset(df_scaled.iloc[:200], cfg.lookback, 1, cfg.pred_col)
  x, y = ds[0]
  assert x.shape == (cfg.lookback, len(cfg.features))
  assert y.shape[0] == 1
  print('test_dataset_class passed.')

def test_models_forward(cfg: Config):
  proc = DataProcessor(cfg)
  df_scaled = proc.run()
  input_dim = len(cfg.features)
  x = torch.randn(4, cfg.lookback, input_dim)
  rnn = SimpleRNN(input_dim, horizon=cfg.horizon_task1)
  lstm = SimpleLSTM(input_dim, horizon=cfg.horizon_task1)
  print('test_models_forward passed.')

In [11]:
# ----------------------------
# If run as script, show sample usage
# ----------------------------
if __name__ == '__main__':
  cfg = Config()
  print('Config:', asdict(cfg))
  # Run tests
  print('Running lightweight unit tests...')
  test_dataset_class(cfg)
  test_models_forward(cfg)
  # Run demo pipeline (small epochs) — uncomment to run demo
  demo_full_pipeline(cfg)

  print('Ready. Để chạy demo đầy đủ, call demo_full_pipeline(cfg) — chú ý tốn thời gian huấn luyện.')

Config: {'data_path': '/content/Metro_Interstate_Traffic_Volume.csv', 'pred_col': 'traffic_volume', 'features': ['traffic_volume', 'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'hour', 'dayofweek'], 'lookback': 24, 'horizon_task1': 2, 'horizon_task2': 5, 'test_size_ratio': 0.15, 'val_size_ratio': 0.15, 'batch_size': 64, 'scaler_path': '/content/data/processed/scaler.save', 'cleaned_data_path': '/content/data/processed/cleaned_data.parquet', 'models_dir': '/content/data/models', 'device': 'cuda'}
Running lightweight unit tests...


  df[needed] = df[needed].interpolate().fillna(method='bfill').fillna(method='ffill')


Saved cleaned scaled data to /content/data/processed/cleaned_data.parquet
Saved scaler to /content/data/processed/scaler.save
test_dataset_class passed.
Saved cleaned scaled data to /content/data/processed/cleaned_data.parquet
Saved scaler to /content/data/processed/scaler.save
test_models_forward passed.
Data scaled loaded shape: (48204, 7)
Training SimpleRNN (demo)...


  df[needed] = df[needed].interpolate().fillna(method='bfill').fillna(method='ffill')


Epoch 1/6 train=0.023493 val=0.008800
Epoch 2/6 train=0.011666 val=0.007056
Epoch 3/6 train=0.010187 val=0.006878
Epoch 4/6 train=0.009330 val=0.005641
Epoch 5/6 train=0.008689 val=0.005774
Epoch 6/6 train=0.008303 val=0.005184
Training SimpleLSTM (demo)...
Epoch 1/6 train=0.034783 val=0.010771
Epoch 2/6 train=0.012705 val=0.007732
Epoch 3/6 train=0.011056 val=0.007257
Epoch 4/6 train=0.010292 val=0.006920
Epoch 5/6 train=0.009690 val=0.006184
Epoch 6/6 train=0.009152 val=0.006010
Training CustomizedLSTM (demo)...
Epoch 1/6 train=0.020142 val=0.009283
Epoch 2/6 train=0.011274 val=0.008469
Epoch 3/6 train=0.009993 val=0.007641
Epoch 4/6 train=0.009177 val=0.006653
Epoch 5/6 train=0.008600 val=0.005737
Epoch 6/6 train=0.008090 val=0.005223
Evaluate RNN:
   horizon         MAE        RMSE        R2       NSE
0        1  307.849164  430.818606  0.952848  0.952848
1        2  416.666102  575.524407  0.915849  0.915849
Evaluate LSTM:
   horizon         MAE        RMSE        R2       NSE
0  