In [None]:
import pandas as pd
import numpy as np
import os
import gc
import torch
from torch import nn
from datetime import datetime
import joblib
import time
from sklearn.preprocessing import MinMaxScaler

# --- Configuration ---
# Path to the semicolon-separated input CSV (machine-specific; adjust as needed).
DATA_PATH = "E:\\Demo\\TIMESERIES\\Data1.csv"
# Root directory under which one sub-folder per household is created.
MODEL_DIR = "models"
# Inclusive timestamp bounds applied to the DateTime column while loading.
DATE_MIN = datetime(year=2011, month=12, day=1)
DATE_MAX = datetime(year=2014, month=2, day=28)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Optimisation hyper-parameters handed to train_model().
EPOCHS = 50
LR = 1e-3

# --- LSTM model ---
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by dropout and a linear multi-step head.

    The network consumes a batch-first sequence and produces ``output_dim``
    values computed from the LSTM output at the last time step.
    """

    def __init__(self, input_size=1, hidden_dim=64, output_dim=24):
        # The default of 24 outputs corresponds to 24 forecast steps (12 hours).
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the head's prediction for every sequence in the batch ``x``."""
        sequence_out, _ = self.lstm(x)
        # Only the output at the final time step feeds the prediction head.
        last_step = sequence_out[:, -1, :]
        return self.fc(self.dropout(last_step))

# --- Preprocessing ---
def clean_long_zero_sequences(series, threshold=6):
    """Replace long runs of zeros in ``series`` with filled-in values.

    Every run of consecutive zeros whose length is at least ``threshold`` is
    blanked out, then filled by pandas' default interpolation, with a
    forward fill and a backward fill for anything interpolation leaves
    missing. Shorter zero runs are kept. The input series is not modified.
    """
    is_zero = series.eq(0)
    # A new run starts wherever the zero/non-zero flag differs from its predecessor.
    run_id = is_zero.ne(is_zero.shift()).cumsum()
    # Summing the flag per run yields the run length for zero runs (0 otherwise).
    zeros_in_run = is_zero.groupby(run_id).transform("sum")
    long_zero_run = is_zero & zeros_in_run.ge(threshold)

    result = series.copy()
    result[long_zero_run] = np.nan
    result = result.interpolate()
    return result.ffill().bfill()

def get_household_series_from_chunks(csv_path, household_id):
    """Load one household's half-hourly consumption series from a large CSV.

    The file is streamed in chunks of 100,000 rows so it never has to fit in
    memory at once. Rows for ``household_id`` are parsed, restricted to the
    [DATE_MIN, DATE_MAX] window, resampled to a 30-minute grid, and cleaned
    (long zero runs filled, values clipped at the 99.5th percentile).

    Args:
        csv_path: Path of the semicolon-separated CSV file.
        household_id: Value of the ``LCLid`` column to select.

    Returns:
        A pandas Series indexed by timestamp, or ``None`` when the household
        has no rows with a parseable timestamp inside the date window.
    """
    value_col = "KWH/hh (per half hour)"
    frames = []
    # The context manager closes the file handle even if parsing fails part-way.
    with pd.read_csv(csv_path, sep=';', chunksize=100_000, on_bad_lines='skip') as reader:
        for chunk in reader:
            chunk.columns = chunk.columns.str.strip()
            if value_col not in chunk.columns:
                continue
            # Take an explicit copy so the column assignments below do not
            # operate on a filtered slice of the chunk.
            rows = chunk.loc[chunk["LCLid"] == household_id].copy()
            if rows.empty:
                continue
            # Accept a decimal comma by normalising it before numeric parsing;
            # unparseable readings become NaN.
            rows[value_col] = pd.to_numeric(
                rows[value_col].astype(str).str.replace(",", "."), errors='coerce')
            rows["DateTime"] = pd.to_datetime(rows["DateTime"], dayfirst=True, errors='coerce')
            rows = rows.dropna(subset=["DateTime"])
            rows = rows[(rows["DateTime"] >= DATE_MIN) & (rows["DateTime"] <= DATE_MAX)]
            # Skip chunks that contribute nothing so an all-empty result is
            # reported as None instead of reaching the resampling step.
            if rows.empty:
                continue
            frames.append(rows)
    if not frames:
        return None
    df = pd.concat(frames).set_index("DateTime")
    ts = df[value_col].resample("30min").mean().ffill()
    # NOTE(review): this drops negative (and still-missing) entries outright,
    # so the result may no longer sit on a strict 30-minute grid — confirm
    # that is intended before relying on fixed-length windows.
    ts = ts[ts >= 0]
    ts = clean_long_zero_sequences(ts)
    # Cap extreme spikes at the 99.5th percentile of the cleaned series.
    ts = ts.clip(upper=ts.quantile(0.995))
    return ts

def create_sequences(series, seq_len, forecast_steps):
    """Min-max scale ``series`` and cut it into supervised (input, target) windows.

    The scaler is fitted on the entire series before windowing.

    Returns:
        ``(X, y, scaler)`` where ``X[i]`` holds ``seq_len`` consecutive scaled
        values (kept as a single column), ``y[i]`` holds the ``forecast_steps``
        scaled values that immediately follow (flattened), and ``scaler`` is
        the fitted MinMaxScaler.
    """
    column = series.values.reshape(-1, 1)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(column)

    # Number of start positions leaving room for a full input and target window.
    n_windows = len(scaled) - seq_len - forecast_steps + 1
    inputs = [scaled[start:start + seq_len] for start in range(n_windows)]
    targets = [
        scaled[start + seq_len:start + seq_len + forecast_steps].flatten()
        for start in range(n_windows)
    ]
    return np.array(inputs), np.array(targets), scaler

def train_model(X_train, y_train, output_dim, epochs, lr, batch_size=64):
    """Train a fresh LSTMModel with Adam and mean-squared-error loss.

    Args:
        X_train: Array-like of input windows, convertible to a float32 tensor.
        y_train: Array-like of targets, one row per input window.
        output_dim: Number of values the model predicts per window.
        epochs: Number of full passes over the training data.
        lr: Learning rate for the Adam optimizer.
        batch_size: Mini-batch size; defaults to the previously fixed 64.

    Returns:
        The trained model on ``device``. It is left in training mode, so
        callers should switch it to ``eval()`` before running inference.

    Raises:
        ValueError: If ``X_train`` contains no samples.
    """
    # Fail early with a clear message: an empty loader would otherwise end in
    # a division by zero when the epoch's mean loss is printed.
    if len(X_train) == 0:
        raise ValueError("train_model needs at least one training sample")

    model = LSTMModel(output_dim=output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Reported value is the mean of the per-batch losses for this epoch.
        print(f"  Epoch {epoch+1}, Loss: {total_loss/len(loader):.5f}")
    return model

def full_training_pipeline(csv_path, household_id):
    """Train and persist the 12-hour forecasting model for one household.

    The household is skipped when ``final_model.pt`` already exists in its
    output folder, or when its series is too short to form a single
    (input, target) window. Otherwise the fitted scaler and the model's
    ``state_dict`` are written to ``MODEL_DIR/<household_id>_12h``.

    Args:
        csv_path: Path of the CSV file holding all households' readings.
        household_id: Identifier of the household to train on.

    Returns:
        None. Progress is reported through ``print``.
    """
    forecast_steps = 24  # 24 half-hour steps = 12 hours
    seq_len = 336        # 7 days of half-hourly history

    folder_name = f"{household_id}_12h"
    folder_path = os.path.join(MODEL_DIR, folder_name)
    model_file = os.path.join(folder_path, "final_model.pt")
    if os.path.exists(model_file):
        print(f"[{household_id}] Đã có model, bỏ qua.")
        return

    print(f"[{household_id}] Đang xử lý...")

    ts = get_household_series_from_chunks(csv_path, household_id)
    if ts is None or len(ts) < seq_len + forecast_steps:
        print(f"[{household_id}] Không đủ dữ liệu.")
        return

    X, y, scaler = create_sequences(ts, seq_len, forecast_steps)
    print(f"[{household_id}] Số mẫu huấn luyện: {len(X)}")

    model = train_model(X, y, output_dim=forecast_steps, epochs=EPOCHS, lr=LR)

    os.makedirs(folder_path, exist_ok=True)
    # final_model.pt doubles as the "already trained" marker checked above, so
    # it must be the last artifact to appear and must never be visible in a
    # partially written state: save the scaler first, then write the weights
    # to a temporary file and move it into place in one step.
    joblib.dump(scaler, os.path.join(folder_path, "scaler.save"))
    tmp_model_file = model_file + ".tmp"
    torch.save(model.state_dict(), tmp_model_file)
    os.replace(tmp_model_file, model_file)
    print(f"[{household_id}] Đã lưu model.")

# --- Run everything ---
print("=== BẮT ĐẦU HUẤN LUYỆN TOÀN BỘ 12 GIỜ ===")

# Collect the distinct household identifiers present in the file. Only the
# LCLid column is read, and the reader is closed as soon as the scan ends
# (or fails) so the CSV handle is not kept open for the rest of the session.
household_ids = set()
with pd.read_csv(DATA_PATH, sep=";", chunksize=100_000, on_bad_lines='skip', usecols=["LCLid"]) as chunks:
    for chunk in chunks:
        chunk.columns = chunk.columns.str.strip()
        household_ids.update(chunk["LCLid"].dropna().unique())

household_ids = sorted(household_ids)

# Train one model per household. A failure is reported and the loop moves on,
# so a single bad household does not abort the whole run.
# NOTE(review): every call re-reads the whole CSV to extract one household;
# with many households a single grouped pass would be much cheaper.
for household_id in household_ids:
    try:
        full_training_pipeline(DATA_PATH, household_id)
    except Exception as e:
        print(f"[{household_id}] Lỗi: {e}")

print("=== HOÀN TẤT TOÀN BỘ ===")


=== BẮT ĐẦU HUẤN LUYỆN TOÀN BỘ 12 GIỜ ===
[MAC000002] Đang xử lý...
[MAC000002] Số mẫu huấn luyện: 27673
  Epoch 1, Loss: 0.01396
  Epoch 2, Loss: 0.01162
  Epoch 3, Loss: 0.01120
  Epoch 4, Loss: 0.01034
  Epoch 5, Loss: 0.00979
  Epoch 6, Loss: 0.00955
  Epoch 7, Loss: 0.00941
