In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import os
import gc
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import joblib
import time

# --- C·∫•u h√¨nh ---
DATA_PATH = "E:\\Demo\\TIMESERIES\\Data1.csv"
MODEL_DIR = "models"
DATE_MIN = datetime(2011, 12, 1)
DATE_MAX = datetime(2014, 2, 28)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Map c·∫•u h√¨nh hu·∫•n luy·ªán ---
config_map = {
    "1 ng√†y (1d)": {"forecast_steps": 48, "seq_len": 336, "suffix": "1d"},
    "1 gi·ªù (1h)": {"forecast_steps": 2, "seq_len": 48, "suffix": "1h"}
}

# --- Model ---
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_dim=64, output_dim=48):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])
        return self.fc(x)

# --- Ti·ªÅn x·ª≠ l√Ω ---
def clean_long_zero_sequences(series, threshold=6):
    zero_mask = (series == 0)
    group = (zero_mask != zero_mask.shift()).cumsum()
    counts = zero_mask.groupby(group).transform("sum")
    to_nan = (zero_mask & (counts >= threshold))
    series_cleaned = series.copy()
    series_cleaned[to_nan] = np.nan
    return series_cleaned.interpolate().ffill().bfill()

@st.cache_data(show_spinner=False)
def load_data(path):
    chunks = pd.read_csv(path, sep=';', engine="c", chunksize=95_000, on_bad_lines='skip')
    df_list = []
    for chunk in chunks:
        chunk.columns = chunk.columns.str.strip()
        if "KWH/hh (per half hour)" in chunk.columns:
            chunk["KWH/hh (per half hour)"] = pd.to_numeric(
                chunk["KWH/hh (per half hour)"].astype(str).str.replace(",", "."), errors='coerce')
        df_list.append(chunk)
    df = pd.concat(df_list, ignore_index=True)
    del df_list, chunks
    gc.collect()
    df.dropna(subset=["LCLid", "stdorToU", "DateTime", "KWH/hh (per half hour)"], inplace=True)
    df["DateTime"] = pd.to_datetime(df["DateTime"], dayfirst=True, errors='coerce')
    df.dropna(subset=["DateTime"], inplace=True)
    df.set_index("DateTime", inplace=True)
    return df[(df.index >= DATE_MIN) & (df.index <= DATE_MAX)]

def get_household_series(df, household_id):
    df_house = df[df["LCLid"] == household_id]
    ts = df_house["KWH/hh (per half hour)"].resample("30min").mean().ffill()
    ts = ts[ts >= 0]
    ts = clean_long_zero_sequences(ts)
    ts = ts.clip(upper=ts.quantile(0.995))
    return ts

def create_sequences(series, seq_len, forecast_steps):
    values = series.values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(values)
    X, y = [], []
    for i in range(len(scaled) - seq_len - forecast_steps + 1):
        X.append(scaled[i:i+seq_len])
        y.append(scaled[i+seq_len:i+seq_len+forecast_steps].flatten())
    return np.array(X), np.array(y), scaler

def train_model(X_train, y_train, output_dim, epochs=10, lr=0.001):
    model = LSTMModel(output_dim=output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        st.write(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.5f}")
    return model

# --- Giao di·ªán ---
st.set_page_config(page_title="Hu·∫•n luy·ªán m√¥ h√¨nh", layout="wide")
st.title("üéØ HU·∫§N LUY·ªÜN M√î H√åNH LSTM D·ª∞ B√ÅO ƒêI·ªÜN")

with st.spinner("ƒêang t·∫£i d·ªØ li·ªáu..."):
    df = load_data(DATA_PATH)
    household_ids = sorted(df["LCLid"].unique())

selected_household = st.selectbox("Ch·ªçn h·ªô gia ƒë√¨nh", household_ids)
selected_label = st.selectbox("Ch·ªçn kho·∫£ng d·ª± b√°o", list(config_map.keys()))

if st.button("B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán"):
    config = config_map[selected_label]
    seq_len = config["seq_len"]
    forecast_steps = config["forecast_steps"]
    suffix = config["suffix"]

    st.write("Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu...")
    ts = get_household_series(df, selected_household)

    if len(ts) < seq_len + forecast_steps:
        st.warning("Kh√¥ng ƒë·ªß d·ªØ li·ªáu cho hu·∫•n luy·ªán.")
        st.stop()

    X, y, scaler = create_sequences(ts, seq_len, forecast_steps)
    st.write(f"T·ªïng s·ªë m·∫´u hu·∫•n luy·ªán: {len(X)}")

    st.write("B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán m√¥ h√¨nh...")
    start_time = time.time()
    model = train_model(X, y, output_dim=forecast_steps, epochs=10, lr=0.001)
    end_time = time.time()

    # L∆∞u model
    folder_name = f"{selected_household}_{suffix}"
    folder_path = os.path.join(MODEL_DIR, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(folder_path, "final_model.pt"))
    joblib.dump(scaler, os.path.join(folder_path, "scaler.save"))

    st.success(f"Hu·∫•n luy·ªán v√† l∆∞u model ho√†n t·∫•t sau {end_time - start_time:.2f} gi√¢y.")


In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ƒê√°nh gi√° m√¥ h√¨nh
metrics = {}

for household_id, result in forecast_results.items():
    if isinstance(result, tuple):
        actual, forecast = result
        mae = mean_absolute_error(actual, forecast)
        rmse = np.sqrt(mean_squared_error(actual, forecast))
        mape = np.mean(np.abs((actual - forecast) / (actual + 1e-8))) * 100  # tr√°nh chia cho 0
        r2 = r2_score(actual, forecast)

        metrics[household_id] = {
            "MAE": mae,
            "RMSE": rmse,
            "MAPE (%)": mape,
            "R¬≤ Score": r2
        }

# Hi·ªÉn th·ªã b·∫£ng ƒë√°nh gi√°
metrics_df = pd.DataFrame(metrics).T  # Transpose ƒë·ªÉ h√†ng l√† h·ªô
metrics_df.index.name = "Household ID"

print("\nüîç B·∫£ng ƒë√°nh gi√° m√¥ h√¨nh (1 ng√†y d·ª± b√°o):")
print(metrics_df.round(4))

# (Tu·ª≥ ch·ªçn) L∆∞u ra file CSV
# metrics_df.to_csv("evaluation_metrics_1d.csv")



üîç B·∫£ng ƒë√°nh gi√° m√¥ h√¨nh (1 ng√†y d·ª± b√°o):
                       MAE          RMSE      MAPE (%)      R¬≤ Score
Household ID                                                        
MAC000034     7.348850e+06  1.691480e+07  4.978489e+05  2.740000e-02
MAC000049     2.408512e+06  4.774701e+06  4.365747e+05 -2.400000e-03
MAC000003     5.149548e+05  6.435414e+05  8.423354e+05 -4.853520e+06
MAC000024     2.244899e+06  5.178887e+06  2.797869e+06  1.288000e-01
MAC000032     1.325518e+06  2.673469e+06  6.783820e+14  1.493000e-01


In [None]:
# V·∫Ω bi·ªÉu ƒë·ªì so s√°nh th·ª±c t·∫ø v√† d·ª± b√°o cho c√°c h·ªô
def plot_forecast_results(forecast_results, forecast_label="1d"):
    step = forecast_steps_map[forecast_label]["steps"]
    num_plots = len(forecast_results)
    fig, axes = plt.subplots(num_plots, 1, figsize=(12, 3 * num_plots), sharex=True)

    if num_plots == 1:
        axes = [axes]

    for ax, (hid, data) in zip(axes, forecast_results.items()):
        if isinstance(data, str):
            ax.text(0.5, 0.5, f"L·ªói: {data}", ha='center', va='center', fontsize=12)
            ax.set_title(f"H·ªô {hid}")
            continue

        actual, forecast = data
        ax.plot(actual, label="Th·ª±c t·∫ø", marker='o')
        ax.plot(forecast, label="D·ª± b√°o", marker='x', linestyle='--')
        ax.set_title(f"H·ªô: {hid}")
        ax.set_ylabel("KWh")
        ax.legend()
        ax.grid(True)

    axes[-1].set_xlabel("Khung gi·ªù (30 ph√∫t)")
    plt.suptitle("So s√°nh th·ª±c t·∫ø v√† d·ª± b√°o", fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.show()

# G·ªçi h√†m v·∫Ω
plot_forecast_results(forecast_results, forecast_label="1d")


In [None]:
import pandas as pd
import numpy as np
import os
import gc
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import joblib
import time

# --- C·∫•u h√¨nh ---
DATA_PATH = "E:\\Demo\\TIMESERIES\\Data1.csv"
MODEL_DIR = "models"
DATE_MIN = datetime(2011, 12, 1)
DATE_MAX = datetime(2014, 2, 28)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Tham s·ªë hu·∫•n luy·ªán ---
household_id = "MAC000002"  # H·ªô gia ƒë√¨nh c·∫ßn train
forecast_type = "1h"  # Ch·ªçn '1d' (1 ng√†y) ho·∫∑c '1h' (1 gi·ªù)
EPOCHS = 10
LR = 0.001

# --- C·∫•u h√¨nh theo lo·∫°i d·ª± b√°o ---
if forecast_type == "1d":
    forecast_steps = 48
    seq_len = 336
elif forecast_type == "1h":
    forecast_steps = 2
    seq_len = 48
else:
    raise ValueError("Ch·ªâ ch·∫•p nh·∫≠n '1d' ho·∫∑c '1h'!")

# --- Model ---
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_dim=64, output_dim=48):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])
        return self.fc(x)

# --- Ti·ªÅn x·ª≠ l√Ω ---
def clean_long_zero_sequences(series, threshold=6):
    zero_mask = (series == 0)
    group = (zero_mask != zero_mask.shift()).cumsum()
    counts = zero_mask.groupby(group).transform("sum")
    to_nan = (zero_mask & (counts >= threshold))
    series_cleaned = series.copy()
    series_cleaned[to_nan] = np.nan
    return series_cleaned.interpolate().ffill().bfill()

def load_data(path):
    chunks = pd.read_csv(path, sep=';', engine="c", chunksize=95_000, on_bad_lines='skip')
    df_list = []
    for chunk in chunks:
        chunk.columns = chunk.columns.str.strip()
        if "KWH/hh (per half hour)" in chunk.columns:
            chunk["KWH/hh (per half hour)"] = pd.to_numeric(
                chunk["KWH/hh (per half hour)"].astype(str).str.replace(",", "."), errors='coerce')
        df_list.append(chunk)
    df = pd.concat(df_list, ignore_index=True)
    del df_list, chunks
    gc.collect()
    df.dropna(subset=["LCLid", "stdorToU", "DateTime", "KWH/hh (per half hour)"], inplace=True)
    df["DateTime"] = pd.to_datetime(df["DateTime"], dayfirst=True, errors='coerce')
    df.dropna(subset=["DateTime"], inplace=True)
    df.set_index("DateTime", inplace=True)
    return df[(df.index >= DATE_MIN) & (df.index <= DATE_MAX)]

def get_household_series(df, household_id):
    df_house = df[df["LCLid"] == household_id]
    ts = df_house["KWH/hh (per half hour)"].resample("30min").mean().ffill()
    ts = ts[ts >= 0]
    ts = clean_long_zero_sequences(ts)
    ts = ts.clip(upper=ts.quantile(0.995))
    return ts

def create_sequences(series, seq_len, forecast_steps):
    values = series.values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(values)
    X, y = [], []
    for i in range(len(scaled) - seq_len - forecast_steps + 1):
        X.append(scaled[i:i+seq_len])
        y.append(scaled[i+seq_len:i+seq_len+forecast_steps].flatten())
    return np.array(X), np.array(y), scaler

def train_model(X_train, y_train, output_dim, epochs, lr):
    model = LSTMModel(output_dim=output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.5f}")
    return model

# --- Ch·∫°y hu·∫•n luy·ªán ---
print("=== B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN ===")

print("ƒêang load d·ªØ li·ªáu...")
df = load_data(DATA_PATH)

print("Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu...")
ts = get_household_series(df, household_id)

if len(ts) < seq_len + forecast_steps:
    raise ValueError("Kh√¥ng ƒë·ªß d·ªØ li·ªáu ƒë·ªÉ hu·∫•n luy·ªán!")

X, y, scaler = create_sequences(ts, seq_len, forecast_steps)
print(f"S·ªë l∆∞·ª£ng m·∫´u train: {len(X)}")

print("ƒêang hu·∫•n luy·ªán...")
start_time = time.time()
model = train_model(X, y, output_dim=forecast_steps, epochs=EPOCHS, lr=LR)
end_time = time.time()

# --- L∆∞u model ---
folder_name = f"{household_id}_{forecast_type}"
folder_path = os.path.join(MODEL_DIR, folder_name)
os.makedirs(folder_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(folder_path, "final_model.pt"))
joblib.dump(scaler, os.path.join(folder_path, "scaler.save"))

print(f"Hu·∫•n luy·ªán v√† l∆∞u model ho√†n t·∫•t sau {end_time - start_time:.2f} gi√¢y.")
print(f"Model l∆∞u t·∫°i: {folder_path}")
