In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import ipywidgets as widgets
import os
from sklearn.preprocessing import StandardScaler
from IPython import get_ipython

#import joblib
import random

In [3]:
# --- Paths: everything is resolved relative to the notebook's working directory ---
BASE_DIR = Path.cwd()                    # directory the kernel was started in
PROJECT_DIR = BASE_DIR.parent            # one level above the working directory
DATASET_DIR = PROJECT_DIR / "dataset"    # folder the learning CSV is read from


# --- Hyper-parameters that are also embedded in the output file/directory names ---
HIDDEN_SIZE = 8                # hidden size passed to the LSTM
SEQUENCE_LENGTH = 10           # number of consecutive rows in one input window
MODEL_NAME = "LSTM_SWaT"       # prefix of every saved artefact
MODEL_VERSION = "v1"           # suffix of every saved artefact
NUM_EPOCHS = 2                 # upper bound on training epochs
# Number of consecutive epochs without calibration-loss improvement before training stops.
# NOTE(review): with NUM_EPOCHS (2) smaller than this patience (10) the early-stopping
# branch of the training loop can never fire; raise NUM_EPOCHS for a real run.
EARLY_STOPPING_PATIENCE = 10

In [None]:
# --- Artefact names derived from the hyper-parameters of the configuration cell ---
MODEL_FILENAME = f"{MODEL_NAME}_HS{HIDDEN_SIZE}_IW{SEQUENCE_LENGTH}_{MODEL_VERSION}.pt"
MODEL_DIR = f"{PROJECT_DIR}/Models/HS{HIDDEN_SIZE}_IW{SEQUENCE_LENGTH}_{MODEL_VERSION}"
NORM_REFERENCE_FILENAME = f"featureNormRef_{MODEL_NAME}_HS{HIDDEN_SIZE}_IW{SEQUENCE_LENGTH}_{MODEL_VERSION}.csv"
LEARN_DATA_FILENAME = "SWaT_Dataset_Normal_v0.csv"

# NOTE(review): DROPOUT is not referenced by any code visible in this notebook section.
DROPOUT = 0.0
NUM_LAYERS = 1                 # number of stacked LSTM layers
BATCH_SIZE = 32                # mini-batch size of the training DataLoader
THRESHOLD_PERCENTILE = 92      # percentile of the calibration MSE used as the threshold
LEARNING_RATE = 0.001          # Adam learning rate
SEED = 42                      # seed applied to torch / numpy / random

# --- 1. Hard-stop execution if MODEL_DIR exists and user cancels ---
# NOTE(review): nothing in this cell waits for a button click; after display() the cell
# finishes, so a "Run All" presumably proceeds to the next cells without a choice
# having been made. Confirm whether a blocking guard is intended.
out = widgets.Output()  # <-- SEPARATE output widget that receives the callbacks' messages

if os.path.exists(MODEL_DIR):
    print(f"[WARNING] Directory already exists at: {MODEL_DIR}\nCONTINUE?")

    yes_button = widgets.Button(description="Yes", button_style='danger')
    no_button = widgets.Button(description="No, STOP execution", button_style='success')

    def on_yes_clicked(b):
        """Replace the content of `out` with an overwrite warning; nothing else changes."""
        with out:
            out.clear_output()
            print("[WARNING] Existing files in this directory may be OVERWRITTEN!")

    def on_no_clicked(b):
        """Show an abort message in `out`, then ask the IPython kernel to shut down."""
        with out:
            out.clear_output()
            print("[ABORTED] User cancelled execution. Kernel will shut down.")
        # NOTE(review): relies on the kernel object exposing do_shutdown(); verify this
        # against the installed ipykernel version.
        get_ipython().kernel.do_shutdown(restart=False)

    yes_button.on_click(on_yes_clicked)
    no_button.on_click(on_no_clicked)

    display(widgets.HBox([yes_button, no_button]))
    display(out)

else:
    # First run for this configuration: create the output directory.
    os.makedirs(MODEL_DIR)
    print(f"[INFO] Directory created at:\n{MODEL_DIR}")

CONTINUE?


HBox(children=(Button(button_style='danger', description='Yes', style=ButtonStyle()), Button(button_style='suc…

Output()

: 

In [None]:
# Ensure the model output directory is present and report which case applied.
if os.path.exists(MODEL_DIR):
    print(f"DIRECTORY ALREADY EXIST AT:\n{MODEL_DIR}")
else:
    os.makedirs(MODEL_DIR)
    print(f"Directory created at:\n{MODEL_DIR}")

In [None]:
# Seed every random number generator this notebook draws from with the same SEED.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# --- 1. Read the learning CSV and tidy its column labels ---
learn_csv_path = os.path.join(DATASET_DIR, LEARN_DATA_FILENAME)
df = pd.read_csv(learn_csv_path, encoding='utf-8-sig', header=1)
df.columns = df.columns.str.strip()  # header names may carry surrounding whitespace

# --- 2. Numeric columns are the candidate model inputs ---
numeric_df = df.select_dtypes(include='number')
numeric_columns = numeric_df.columns.tolist()

# --- 3. Whatever is not numeric (meta, timestamp, or labels) is set aside ---
non_numeric_columns = [name for name in df.columns if name not in numeric_columns]

# --- 4. Flag numeric columns whose min equals their max as constant (uninformative) ---
min_row, max_row = numeric_df.min(), numeric_df.max()
value_span = max_row - min_row
status_row = np.where(value_span == 0, 'CONST', 'VALID')
transposed_df = pd.DataFrame(
    data=[min_row.values, max_row.values, status_row],
    index=['Min', 'Max', 'Status'],
    columns=min_row.index,
)
status_by_column = transposed_df.loc['Status']
features_for_model = transposed_df.columns[status_by_column == 'VALID'].tolist()
constant_numeric_columns = transposed_df.columns[status_by_column == 'CONST'].tolist()

# Summary of the column selection.
print(f"Model candidate input features: {len(numeric_columns)}")
print(numeric_columns)
print(f"\nDiscarded non-numeric columns: {len(non_numeric_columns)}")
print(non_numeric_columns)
print(f"\nDiscarded constant numeric columns: {len(constant_numeric_columns)}")
print(constant_numeric_columns)
print(f"\nFinal model input features (non-constant): {len(features_for_model)}")
print(features_for_model)
print(f"\nTotal samples before removing missing values: {len(df)}")

In [None]:
# --- 5. Drop every row that lacks a value in at least one model input feature ---
missing_mask = df[features_for_model].isna().any(axis="columns")
n_missing = missing_mask.sum()
df_clean = df.loc[~missing_mask].copy()

print(f"Samples with missing input features (removed): {n_missing}")
print(f"Total samples after removing missing values: {len(df_clean)}")

In [None]:
# --- 6. Chronological Train / Calib (threshold) / Test split: first 60% / next 20% / rest ---
n = len(df_clean)
train_end, calib_end = int(n * 0.6), int(n * 0.8)
split_bounds = ((None, train_end), (train_end, calib_end), (calib_end, None))
train_df, calib_df, test_df = (
    df_clean.iloc[lo:hi].copy() for lo, hi in split_bounds
)

# --- 7. Z-score normalization: statistics come from TRAIN only, then applied to all splits ---
scaler = StandardScaler()
scaler.fit(train_df[features_for_model])
for split_df in (train_df, calib_df, test_df):
    split_df[features_for_model] = scaler.transform(split_df[features_for_model])

# --- 8. Sanity-check the normalized TRAIN split (mean close to 0, std close to 1) ---
mean_tolerance = 1e-2
std_tolerance = 1e-2

normalized_train = train_df[features_for_model]
means, stds = normalized_train.mean(), normalized_train.std()

bad_means = means[means.abs() > mean_tolerance]
bad_stds = stds[(stds - 1).abs() > std_tolerance]

if bad_means.empty and bad_stds.empty:
    print("Z-score normalization check: All TRAIN features normalized successfully.")
else:
    print("WARNING: Train features were not properly normalized.")
    if not bad_means.empty:
        print("Features with mean significantly different from 0:")
        print(bad_means)
    if not bad_stds.empty:
        print("Features with std significantly different from 1:")
        print(bad_stds)

# The model maps a window of features back onto the same feature space, so its input
# and output width are both the number of selected features.
NUM_CLASSES = len(features_for_model)
print("Model input/output dim (NUM_CLASSES):", NUM_CLASSES)

In [None]:
# --- 9. Dataset and DataLoader Preparation ---
class TempDataset(Dataset):
    """Map-style dataset of fixed-length sliding windows over selected DataFrame columns.

    Window ``i`` covers rows ``[i * stride, i * stride + seq_len)`` of the selected
    columns, converted once to a float32 array.

    Args:
        data: DataFrame holding at least the columns named in ``features``.
        seq_len: Number of consecutive rows per window (must be >= 1).
        features: Column labels to extract, in the order they should appear.
        stride: Row offset between the starts of consecutive windows (must be >= 1).

    Raises:
        ValueError: If ``seq_len`` or ``stride`` is smaller than 1.
    """

    def __init__(self, data, seq_len, features, stride=1):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.data = data[features].values.astype('float32')
        self.seq_len = seq_len
        self.stride = stride

    def __len__(self):
        """Return the number of complete windows (0 when the data is shorter than one window)."""
        spare_rows = len(self.data) - self.seq_len
        if spare_rows < 0:
            # Without this clamp the formula goes negative and len() itself raises.
            return 0
        return spare_rows // self.stride + 1

    def __getitem__(self, idx):
        """Return window ``idx`` as a ``(seq_len, n_features)`` float32 tensor.

        Negative indices count from the end. The tensor shares memory with the
        dataset's internal array (``torch.from_numpy`` does not copy).

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``. This also
                makes plain ``for window in dataset`` iteration terminate.
        """
        n_windows = len(self)
        if idx < 0:
            idx = idx + n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for dataset with {n_windows} windows")
        start = idx * self.stride
        seq = self.data[start:start + self.seq_len, :]
        return torch.from_numpy(seq)

# Every split is windowed identically: SEQUENCE_LENGTH rows per window, one-row stride.
window_kwargs = dict(seq_len=SEQUENCE_LENGTH, features=features_for_model, stride=1)
train_dataset = TempDataset(train_df, **window_kwargs)
calib_dataset = TempDataset(calib_df, **window_kwargs)
test_dataset = TempDataset(test_df, **window_kwargs)

# Training batches are shuffled; calib/test are read one window at a time, in order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
calib_loader = DataLoader(dataset=calib_dataset, batch_size=1, shuffle=False)  # early stop + threshold
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)    # untouched hold-out measurement

# --- 10. Model Definition ---
class TempLSTMAutoencoder(nn.Module):
    """LSTM followed by a linear layer that maps the last hidden state to feature space.

    Args:
        input_size: Number of features per time step (also the output width).
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        sequence_length: Window length; only stored on the instance, not used in forward().
    """

    def __init__(self, input_size, hidden_size, num_layers, sequence_length):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=input_size)
        self.sequence_length = sequence_length  # stored for reference only

    def forward(self, x):
        """Run a batch-first ``(B, T, F)`` input through the LSTM and return a ``(B, F)`` tensor."""
        # Explicit all-zero initial hidden and cell states, created on the input's device.
        state_shape = (self.lstm.num_layers, x.size(0), self.lstm.hidden_size)
        initial_state = tuple(torch.zeros(state_shape, device=x.device) for _ in range(2))
        hidden_seq, _ = self.lstm(x, initial_state)
        last_hidden = hidden_seq[:, -1, :]  # hidden state at the final time step, (B, H)
        return self.fc(last_hidden)         # (B, F)

### Train and create the model


In [None]:
# --- 11. Training ---
# Trains the model to reproduce the last time step of each window, evaluates it on the
# calibration split after every epoch, writes a checkpoint per epoch, and applies early
# stopping on the calibration loss. When training ends, the weights of the epoch with the
# lowest calibration loss are loaded back into `model`, so everything that follows
# (threshold, final save) uses the best model rather than the last one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TempLSTMAutoencoder(
    input_size=NUM_CLASSES,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    sequence_length=SEQUENCE_LENGTH
).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

early_stopping_patience = EARLY_STOPPING_PATIENCE
best_loss = float('inf')
best_epoch = 0        # 1-based epoch number that produced best_loss (0 = none yet)
best_state = None     # snapshot of the weights at best_epoch
epochs_without_improvement = 0

train_loss_history = []
calib_loss_history = []

for epoch in range(NUM_EPOCHS):
    # MODEL TRAINING PHASE
    model.train()
    total_train_loss = 0.0

    for inputs in train_loader:
        inputs = inputs.to(device)

        target = inputs[:, -1, :]      # (B,F) last time step of every window
        outputs = model(inputs)        # (B,F)
        loss = criterion(outputs, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch average is per window, not per batch.
        total_train_loss += loss.item() * inputs.size(0)

    avg_train_loss = total_train_loss / len(train_dataset)
    train_loss_history.append(avg_train_loss)

    # --- CALIBRATION/VALIDATION PHASE (early stopping is decided from this loss) ---
    model.eval()
    total_calib_loss = 0.0
    with torch.no_grad():
        for inputs in calib_loader:
            inputs = inputs.to(device)
            target = inputs[:, -1, :]
            outputs = model(inputs)
            loss = criterion(outputs, target)
            total_calib_loss += loss.item() * inputs.size(0)

    avg_calib_loss = total_calib_loss / len(calib_dataset)
    calib_loss_history.append(avg_calib_loss)

    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.6f} | Calib Loss: {avg_calib_loss:.6f}")

    # --- SAVE MODEL FOR EACH EPOCH ---
    model_path = f"{MODEL_DIR}/{MODEL_NAME}_HS{HIDDEN_SIZE}_IW{SEQUENCE_LENGTH}_{MODEL_VERSION}_model_epoch_{epoch+1:02d}.pt"
    torch.save(model.state_dict(), model_path)

    # --- EARLY STOPPING (based on CALIB loss) ---
    if avg_calib_loss < best_loss:
        best_loss = avg_calib_loss
        best_epoch = epoch + 1
        # state_dict() hands out references to the live parameters, so clone them;
        # otherwise later optimizer steps would silently change the snapshot.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stopping_patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

# --- RESTORE BEST WEIGHTS ---
# Only needed when the final epoch was not the best one.
if best_state is not None and best_epoch != len(calib_loss_history):
    model.load_state_dict(best_state)
    print(f"[INFO] Restored weights from best epoch {best_epoch} (Calib Loss: {best_loss:.6f})")

# --- 12. Loss Visualization: train vs. calibration loss per epoch on a log y-axis ---
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_loss_history, label='Train Loss', marker='o')
ax.plot(calib_loss_history, label='Calib Loss', marker='s')
ax.set_yscale('log')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (log scale)')
ax.set_title(f'Train vs Calib Loss per Epoch ({MODEL_NAME}_HS{HIDDEN_SIZE}_IW{SEQUENCE_LENGTH}_{MODEL_VERSION})')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# --- 13. Persist the per-epoch losses next to the model files ---
loss_history_path = f"{MODEL_DIR}/{MODEL_NAME}_HS{HIDDEN_SIZE}_IW{SEQUENCE_LENGTH}_{MODEL_VERSION}_loss_history.csv"
epoch_numbers = [number for number, _ in enumerate(train_loss_history, start=1)]
loss_df = pd.DataFrame({
    'Epoch': epoch_numbers,
    'Train_Loss': train_loss_history,
    'Calib_Loss': calib_loss_history
})
loss_df.to_csv(loss_history_path, index=False)

# --- 14. Calculate THRESHOLD on CALIB dataset ---
def _last_step_mse(window_batch):
    """Per-window MSE between the model output and the last time step of each window."""
    window_batch = window_batch.to(device)
    observed = window_batch[:, -1, :].cpu().numpy()
    predicted = model(window_batch).cpu().numpy()
    return ((observed - predicted) ** 2).mean(axis=1)


model.eval()
with torch.no_grad():
    mse_calib = np.concatenate(
        [_last_step_mse(window_batch) for window_batch in calib_loader],
        axis=0,
    )
print(f"\nCalib reconstruction error: mean MSE = {mse_calib.mean():.6f}, std = {mse_calib.std():.6f}")

# The chosen percentile of the calibration errors becomes the prediction threshold.
threshold_percentile = THRESHOLD_PERCENTILE
pred_threshold = np.percentile(mse_calib, threshold_percentile)
print(f"Prediction threshold (MSE, {threshold_percentile} percentile) FROM CALIB: {pred_threshold:.6f}")

# Histogram of the calibration errors with the threshold marked.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(mse_calib, bins=100, alpha=0.7)
ax.axvline(pred_threshold, color='red', linestyle='--',
           label=f'Threshold ({threshold_percentile} percentile)')
ax.set_title('CALIB Set Reconstruction Error Distribution')
ax.set_xlabel('MSE')
ax.set_ylabel('Sample Count')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

### LSTM model

In [None]:
# --- 15. Confirm and Save Model and Normalization Parameters ---
def confirm_and_save_model(model, dummy_input):
    """Verify the model on a dummy input, then save it together with the scaler statistics.

    The model file is written to ``MODEL_DIR/MODEL_FILENAME`` and the normalization
    reference (``scaler.mean_`` / ``scaler.scale_`` indexed by ``features_for_model``)
    to ``MODEL_DIR/NORM_REFERENCE_FILENAME``. If the model file already exists, two
    buttons are shown and nothing is written until "Yes, overwrite" is clicked.

    Args:
        model: Trained ``nn.Module`` to save; it is switched to eval mode.
        dummy_input: Tensor used for one forward pass before anything is written.
            Pass ``None`` to skip the check.

    Raises:
        Any exception raised by ``model(dummy_input)`` propagates, so a model that
        cannot process the expected input shape is not saved.
    """
    target_path = os.path.join(MODEL_DIR, MODEL_FILENAME)

    # Forward-pass sanity check, done up front so a failure surfaces in the calling
    # cell instead of inside a button callback.
    model.eval()
    if dummy_input is not None:
        with torch.no_grad():
            check_output = model(dummy_input)
        if not torch.isfinite(check_output).all():
            print("[WARNING] Model produced non-finite values for the dummy input.")

    def save_all():
        """Write the normalization CSV and the full model file."""
        model.eval()

        # Save normalization parameters to CSV
        scaler_df = pd.DataFrame(
            {'mean': scaler.mean_, 'std': scaler.scale_},
            index=features_for_model
        )
        scaler_df.to_csv(os.path.join(MODEL_DIR, NORM_REFERENCE_FILENAME))

        # Save FULL PyTorch model (.pt).
        # NOTE(review): this pickles the whole module object, so loading presumably
        # needs the model class to be importable and torch.load(..., weights_only=False);
        # only load such files from trusted sources.
        torch.save(model, target_path)

        print("Full PyTorch model (.pt) and normalization parameters saved. ")

    # Check for existing PyTorch model file
    if not os.path.exists(target_path):
        save_all()
        return

    print(f"PyTorch model file already exists at: {target_path}")
    yes_button = widgets.Button(description="Yes, overwrite", button_style='danger')
    no_button = widgets.Button(description="No")
    button_row = widgets.HBox([yes_button, no_button])
    # Callback output is routed through a dedicated Output widget, matching the
    # directory guard earlier in the notebook, so the messages show up in the cell.
    status_out = widgets.Output()

    def on_yes_clicked(b):
        """Overwrite the existing files and report the result in ``status_out``."""
        button_row.close()  # remove the buttons once a choice has been made
        with status_out:
            status_out.clear_output()
            save_all()

    def on_no_clicked(b):
        """Leave the existing files untouched."""
        button_row.close()
        with status_out:
            status_out.clear_output()
            print("Model saving cancelled.")

    yes_button.on_click(on_yes_clicked)
    no_button.on_click(on_no_clicked)
    display(button_row)
    display(status_out)


In [None]:
# --- 16. Build one random window on the training device and hand it, with the model, to the save routine ---
dummy_shape = (1, SEQUENCE_LENGTH, NUM_CLASSES)  # (batch, time steps, features)
dummy_input = torch.randn(*dummy_shape).to(device)
confirm_and_save_model(model, dummy_input)
