In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import os
import gc
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import joblib
import time

# --- Cấu hình ---
DATA_PATH = "E:\\Demo\\TIMESERIES\\Data1.csv"
MODEL_DIR = "models"
DATE_MIN = datetime(2011, 12, 1)
DATE_MAX = datetime(2014, 2, 28)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Map cấu hình huấn luyện ---
config_map = {
    "1 ngày (1d)": {"forecast_steps": 48, "seq_len": 336, "suffix": "1d"},
    "1 giờ (1h)": {"forecast_steps": 2, "seq_len": 48, "suffix": "1h"}
}

# --- Model ---
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_dim=64, output_dim=48):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])
        return self.fc(x)

# --- Tiền xử lý ---
def clean_long_zero_sequences(series, threshold=6):
    zero_mask = (series == 0)
    group = (zero_mask != zero_mask.shift()).cumsum()
    counts = zero_mask.groupby(group).transform("sum")
    to_nan = (zero_mask & (counts >= threshold))
    series_cleaned = series.copy()
    series_cleaned[to_nan] = np.nan
    return series_cleaned.interpolate().ffill().bfill()

@st.cache_data(show_spinner=False)
def load_data(path):
    chunks = pd.read_csv(path, sep=';', engine="c", chunksize=95_000, on_bad_lines='skip')
    df_list = []
    for chunk in chunks:
        chunk.columns = chunk.columns.str.strip()
        if "KWH/hh (per half hour)" in chunk.columns:
            chunk["KWH/hh (per half hour)"] = pd.to_numeric(
                chunk["KWH/hh (per half hour)"].astype(str).str.replace(",", "."), errors='coerce')
        df_list.append(chunk)
    df = pd.concat(df_list, ignore_index=True)
    del df_list, chunks
    gc.collect()
    df.dropna(subset=["LCLid", "stdorToU", "DateTime", "KWH/hh (per half hour)"], inplace=True)
    df["DateTime"] = pd.to_datetime(df["DateTime"], dayfirst=True, errors='coerce')
    df.dropna(subset=["DateTime"], inplace=True)
    df.set_index("DateTime", inplace=True)
    return df[(df.index >= DATE_MIN) & (df.index <= DATE_MAX)]

def get_household_series(df, household_id):
    df_house = df[df["LCLid"] == household_id]
    ts = df_house["KWH/hh (per half hour)"].resample("30min").mean().ffill()
    ts = ts[ts >= 0]
    ts = clean_long_zero_sequences(ts)
    ts = ts.clip(upper=ts.quantile(0.995))
    return ts

def create_sequences(series, seq_len, forecast_steps):
    values = series.values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(values)
    X, y = [], []
    for i in range(len(scaled) - seq_len - forecast_steps + 1):
        X.append(scaled[i:i+seq_len])
        y.append(scaled[i+seq_len:i+seq_len+forecast_steps].flatten())
    return np.array(X), np.array(y), scaler

def train_model(X_train, y_train, output_dim, epochs=10, lr=0.001):
    model = LSTMModel(output_dim=output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        st.write(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.5f}")
    return model

# --- Giao diện ---
st.set_page_config(page_title="Huấn luyện mô hình", layout="wide")
st.title("🎯 HUẤN LUYỆN MÔ HÌNH LSTM DỰ BÁO ĐIỆN")

with st.spinner("Đang tải dữ liệu..."):
    df = load_data(DATA_PATH)
    household_ids = sorted(df["LCLid"].unique())

selected_household = st.selectbox("Chọn hộ gia đình", household_ids)
selected_label = st.selectbox("Chọn khoảng dự báo", list(config_map.keys()))

if st.button("Bắt đầu huấn luyện"):
    config = config_map[selected_label]
    seq_len = config["seq_len"]
    forecast_steps = config["forecast_steps"]
    suffix = config["suffix"]

    st.write("Tiền xử lý dữ liệu...")
    ts = get_household_series(df, selected_household)

    if len(ts) < seq_len + forecast_steps:
        st.warning("Không đủ dữ liệu cho huấn luyện.")
        st.stop()

    X, y, scaler = create_sequences(ts, seq_len, forecast_steps)
    st.write(f"Tổng số mẫu huấn luyện: {len(X)}")

    st.write("Bắt đầu huấn luyện mô hình...")
    start_time = time.time()
    model = train_model(X, y, output_dim=forecast_steps, epochs=10, lr=0.001)
    end_time = time.time()

    # Lưu model
    folder_name = f"{selected_household}_{suffix}"
    folder_path = os.path.join(MODEL_DIR, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(folder_path, "final_model.pt"))
    joblib.dump(scaler, os.path.join(folder_path, "scaler.save"))

    st.success(f"Huấn luyện và lưu model hoàn tất sau {end_time - start_time:.2f} giây.")


In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Đánh giá mô hình
metrics = {}

for household_id, result in forecast_results.items():
    if isinstance(result, tuple):
        actual, forecast = result
        mae = mean_absolute_error(actual, forecast)
        rmse = np.sqrt(mean_squared_error(actual, forecast))
        mape = np.mean(np.abs((actual - forecast) / (actual + 1e-8))) * 100  # tránh chia cho 0
        r2 = r2_score(actual, forecast)

        metrics[household_id] = {
            "MAE": mae,
            "RMSE": rmse,
            "MAPE (%)": mape,
            "R² Score": r2
        }

# Hiển thị bảng đánh giá
metrics_df = pd.DataFrame(metrics).T  # Transpose để hàng là hộ
metrics_df.index.name = "Household ID"

print("\n🔍 Bảng đánh giá mô hình (1 ngày dự báo):")
print(metrics_df.round(4))

# (Tuỳ chọn) Lưu ra file CSV
# metrics_df.to_csv("evaluation_metrics_1d.csv")



🔍 Bảng đánh giá mô hình (1 ngày dự báo):
                       MAE          RMSE      MAPE (%)      R² Score
Household ID                                                        
MAC000034     7.348850e+06  1.691480e+07  4.978489e+05  2.740000e-02
MAC000049     2.408512e+06  4.774701e+06  4.365747e+05 -2.400000e-03
MAC000003     5.149548e+05  6.435414e+05  8.423354e+05 -4.853520e+06
MAC000024     2.244899e+06  5.178887e+06  2.797869e+06  1.288000e-01
MAC000032     1.325518e+06  2.673469e+06  6.783820e+14  1.493000e-01


In [None]:
# Vẽ biểu đồ so sánh thực tế và dự báo cho các hộ
def plot_forecast_results(forecast_results, forecast_label="1d"):
    step = forecast_steps_map[forecast_label]["steps"]
    num_plots = len(forecast_results)
    fig, axes = plt.subplots(num_plots, 1, figsize=(12, 3 * num_plots), sharex=True)

    if num_plots == 1:
        axes = [axes]

    for ax, (hid, data) in zip(axes, forecast_results.items()):
        if isinstance(data, str):
            ax.text(0.5, 0.5, f"Lỗi: {data}", ha='center', va='center', fontsize=12)
            ax.set_title(f"Hộ {hid}")
            continue

        actual, forecast = data
        ax.plot(actual, label="Thực tế", marker='o')
        ax.plot(forecast, label="Dự báo", marker='x', linestyle='--')
        ax.set_title(f"Hộ: {hid}")
        ax.set_ylabel("KWh")
        ax.legend()
        ax.grid(True)

    axes[-1].set_xlabel("Khung giờ (30 phút)")
    plt.suptitle("So sánh thực tế và dự báo", fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.show()

# Gọi hàm vẽ
plot_forecast_results(forecast_results, forecast_label="1d")


In [None]:
import pandas as pd
import numpy as np
import os
import gc
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import joblib
import time

# --- Cấu hình ---
DATA_PATH = "E:\\Demo\\TIMESERIES\\Data1.csv"
MODEL_DIR = "models"
DATE_MIN = datetime(2011, 12, 1)
DATE_MAX = datetime(2014, 2, 28)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Tham số huấn luyện ---
household_id = "MAC000002"  # Hộ gia đình cần train
forecast_type = "1h"  # Chọn '1d' (1 ngày) hoặc '1h' (1 giờ)
EPOCHS = 10
LR = 0.001

# --- Cấu hình theo loại dự báo ---
if forecast_type == "1d":
    forecast_steps = 48
    seq_len = 336
elif forecast_type == "1h":
    forecast_steps = 2
    seq_len = 48
else:
    raise ValueError("Chỉ chấp nhận '1d' hoặc '1h'!")

# --- Model ---
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_dim=64, output_dim=48):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])
        return self.fc(x)

# --- Tiền xử lý ---
def clean_long_zero_sequences(series, threshold=6):
    zero_mask = (series == 0)
    group = (zero_mask != zero_mask.shift()).cumsum()
    counts = zero_mask.groupby(group).transform("sum")
    to_nan = (zero_mask & (counts >= threshold))
    series_cleaned = series.copy()
    series_cleaned[to_nan] = np.nan
    return series_cleaned.interpolate().ffill().bfill()

def load_data(path):
    chunks = pd.read_csv(path, sep=';', engine="c", chunksize=95_000, on_bad_lines='skip')
    df_list = []
    for chunk in chunks:
        chunk.columns = chunk.columns.str.strip()
        if "KWH/hh (per half hour)" in chunk.columns:
            chunk["KWH/hh (per half hour)"] = pd.to_numeric(
                chunk["KWH/hh (per half hour)"].astype(str).str.replace(",", "."), errors='coerce')
        df_list.append(chunk)
    df = pd.concat(df_list, ignore_index=True)
    del df_list, chunks
    gc.collect()
    df.dropna(subset=["LCLid", "stdorToU", "DateTime", "KWH/hh (per half hour)"], inplace=True)
    df["DateTime"] = pd.to_datetime(df["DateTime"], dayfirst=True, errors='coerce')
    df.dropna(subset=["DateTime"], inplace=True)
    df.set_index("DateTime", inplace=True)
    return df[(df.index >= DATE_MIN) & (df.index <= DATE_MAX)]

def get_household_series(df, household_id):
    df_house = df[df["LCLid"] == household_id]
    ts = df_house["KWH/hh (per half hour)"].resample("30min").mean().ffill()
    ts = ts[ts >= 0]
    ts = clean_long_zero_sequences(ts)
    ts = ts.clip(upper=ts.quantile(0.995))
    return ts

def create_sequences(series, seq_len, forecast_steps):
    values = series.values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(values)
    X, y = [], []
    for i in range(len(scaled) - seq_len - forecast_steps + 1):
        X.append(scaled[i:i+seq_len])
        y.append(scaled[i+seq_len:i+seq_len+forecast_steps].flatten())
    return np.array(X), np.array(y), scaler

def train_model(X_train, y_train, output_dim, epochs, lr):
    model = LSTMModel(output_dim=output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.5f}")
    return model

# --- Chạy huấn luyện ---
print("=== BẮT ĐẦU HUẤN LUYỆN ===")

print("Đang load dữ liệu...")
df = load_data(DATA_PATH)

print("Tiền xử lý dữ liệu...")
ts = get_household_series(df, household_id)

if len(ts) < seq_len + forecast_steps:
    raise ValueError("Không đủ dữ liệu để huấn luyện!")

X, y, scaler = create_sequences(ts, seq_len, forecast_steps)
print(f"Số lượng mẫu train: {len(X)}")

print("Đang huấn luyện...")
start_time = time.time()
model = train_model(X, y, output_dim=forecast_steps, epochs=EPOCHS, lr=LR)
end_time = time.time()

# --- Lưu model ---
folder_name = f"{household_id}_{forecast_type}"
folder_path = os.path.join(MODEL_DIR, folder_name)
os.makedirs(folder_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(folder_path, "final_model.pt"))
joblib.dump(scaler, os.path.join(folder_path, "scaler.save"))

print(f"Huấn luyện và lưu model hoàn tất sau {end_time - start_time:.2f} giây.")
print(f"Model lưu tại: {folder_path}")
