In [None]:
# ## 04 - Modeling with Deep Learning for Time Series
#
# **Objective:** Train a deep learning model directly on the raw, windowed time-series data.

import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# --- Configuration ---
# All paths are relative, so they resolve against the current working directory.
PROCESSED_DATA_DIR = '../data/processed/'
MODELS_DIR = '../models/'
# Pickled task windows consumed by the loading step below.
INPUT_FILE = os.path.join(PROCESSED_DATA_DIR, 'task_windows.pkl')
# Destination of the trained model's state_dict.
MODEL_OUTPUT_FILE = os.path.join(MODELS_DIR, 'cnn_lstm_model.pt')

# --- Hyperparameters ---
# Number of time steps every window is zero-padded or truncated to.
SEQUENCE_LENGTH = 512
# Number of windows per DataLoader batch (training and test).
BATCH_SIZE = 32
# Number of full passes over the training loader.
EPOCHS = 15
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# --- Load and Prepare Data ---
# NOTE(security): unpickling can execute arbitrary code, so INPUT_FILE must come
# from a trusted source (presumably the output of notebook 01 -- confirm).
try:
    task_windows = pd.read_pickle(INPUT_FILE)
except FileNotFoundError as err:
    # Stop with a readable message and a non-zero exit status. The builtin
    # `exit()` is an interactive-shell helper injected by `site`; it is not
    # meant for programs and would report success (status 0) here.
    raise SystemExit(
        f"Error: {INPUT_FILE} not found. Run notebook 01 first."
    ) from err
print(f"Loaded {len(task_windows)} task windows.")

# Every record is indexed by the keys 'CognitiveLoad' (the target) and
# 'EEG_Data' / 'GSR_Data' (the two signals of the window).
labels = [d['CognitiveLoad'] for d in task_windows]
data = [(d['EEG_Data'], d['GSR_Data']) for d in task_windows]

# Hold out 20% of the windows for testing. The split is stratified on the label
# and uses a fixed seed so it is reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=42, stratify=labels
)

# --- Pre-process and Clean Data Windows ---
# Every window is standardized on its own and has its NaN/inf values replaced
# by 0.0 (see clean_and_scale_data) before the PyTorch datasets are built.
print("Pre-processing and cleaning all data windows...")

def clean_and_scale_data(data_list):
    """Standardize each (EEG, GSR) window and zero out non-finite values.

    A fresh ``StandardScaler`` is fitted for every window, so each window is
    scaled with its own statistics only. The GSR signal is reshaped to a
    single column for the scaler and flattened back to 1-D afterwards.

    Args:
        data_list: Iterable of ``(eeg, gsr)`` pairs. ``eeg`` is passed to the
            scaler unchanged; ``gsr`` must support ``reshape(-1, 1)``.

    Returns:
        A list of ``(eeg, gsr)`` tuples, in input order, holding the scaled
        arrays with NaN, +inf and -inf replaced by ``0.0``.
    """

    def _zero_non_finite(values):
        # NaN and both infinities are mapped to 0.0.
        return np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0)

    processed = []
    for eeg_window, gsr_window in tqdm(data_list):
        eeg_standardized = StandardScaler().fit_transform(eeg_window)

        # The scaler needs 2-D input: one column in, 1-D vector back out.
        gsr_column = gsr_window.reshape(-1, 1)
        gsr_standardized = StandardScaler().fit_transform(gsr_column).flatten()

        processed.append(
            (_zero_non_finite(eeg_standardized), _zero_non_finite(gsr_standardized))
        )
    return processed

# The train and test windows go through the same cleaning function separately.
# Scaling inside it is fitted per window, so no statistics are shared between
# the two splits.
cleaned_train_data = clean_and_scale_data(train_data)
cleaned_test_data = clean_and_scale_data(test_data)

print("Data cleaning complete.")

# --- PyTorch Dataset Class ---
class CognitiveLoadDataset(Dataset):
    """Serve pre-scaled (EEG, GSR) windows as fixed-length tensors.

    Windows shorter than ``seq_len`` are zero-padded at the end; longer ones
    keep only their first ``seq_len`` time steps.

    Args:
        data: Indexable collection of ``(eeg, gsr)`` pairs, where ``eeg`` is a
            2-D array laid out as (time, channels) and ``gsr`` is a 1-D array
            over time.
        labels: Indexable collection of integer class labels, aligned with
            ``data``.
        seq_len: Number of time steps of every returned sample.
    """

    def __init__(self, data, labels, seq_len):
        self.data = data
        self.labels = labels
        self.seq_len = seq_len

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def _fit_to_length(self, signal):
        """Pad with trailing zeros or truncate ``signal`` along axis 0."""
        missing = self.seq_len - signal.shape[0]
        if missing <= 0:
            return signal[:self.seq_len]
        # Only the time axis grows; any further axes are left untouched.
        pad_spec = [(0, missing)] + [(0, 0)] * (signal.ndim - 1)
        return np.pad(signal, pad_spec, 'constant')

    def __getitem__(self, idx):
        """Return ``(eeg, gsr, label)`` tensors for sample ``idx``.

        ``eeg`` has shape (channels, seq_len) and ``gsr`` has shape
        (1, seq_len), both float32; ``label`` is a 0-dim int64 tensor.
        """
        eeg_window, gsr_window = self.data[idx]
        target = self.labels[idx]

        eeg_fixed = self._fit_to_length(eeg_window)
        gsr_fixed = self._fit_to_length(gsr_window)

        # Channels first: (time, channels) -> (channels, time).
        eeg_tensor = torch.FloatTensor(eeg_fixed).permute(1, 0)
        # Give the 1-D GSR signal a leading channel axis of size one.
        gsr_tensor = torch.FloatTensor(gsr_fixed).unsqueeze(0)
        label_tensor = torch.LongTensor([target]).squeeze()
        return eeg_tensor, gsr_tensor, label_tensor

# Use the pre-cleaned data to create the datasets; every sample is padded or
# truncated to SEQUENCE_LENGTH time steps when it is fetched.
train_dataset = CognitiveLoadDataset(cleaned_train_data, train_labels, SEQUENCE_LENGTH)
test_dataset = CognitiveLoadDataset(cleaned_test_data, test_labels, SEQUENCE_LENGTH)

# Only the training batches are shuffled; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Define the CNN-LSTM Model ---
class CN_LSTM_Model(nn.Module):
    """Two-branch CNN-LSTM classifier over an EEG and a GSR sequence.

    Each branch applies two Conv1d + ReLU layers (kernel size 3, padding 1,
    so the sequence length is preserved) followed by a single-layer LSTM.
    The last time step of both LSTM outputs is concatenated and fed to a
    small fully connected head that returns one logit per class.

    Args:
        eeg_channels: Number of input channels of the EEG branch.
        gsr_channels: Number of input channels of the GSR branch.
        num_classes: Number of logits produced by the head.
    """

    def __init__(self, eeg_channels=20, gsr_channels=1, num_classes=3):
        super().__init__()

        # EEG branch: 64 convolutional features per step -> 50 LSTM units.
        self.eeg_cnn = nn.Sequential(
            nn.Conv1d(eeg_channels, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.eeg_lstm = nn.LSTM(
            input_size=64, hidden_size=50, num_layers=1, batch_first=True
        )

        # GSR branch: 32 convolutional features per step -> 20 LSTM units.
        self.gsr_cnn = nn.Sequential(
            nn.Conv1d(gsr_channels, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.gsr_lstm = nn.LSTM(
            input_size=32, hidden_size=20, num_layers=1, batch_first=True
        )

        # Head over the concatenated branch summaries (50 + 20 features).
        self.fc = nn.Sequential(
            nn.Linear(50 + 20, 32),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, num_classes),
        )

    @staticmethod
    def _encode(cnn, lstm, signal):
        """Run one branch and return the LSTM output at the last time step."""
        # Conv1d emits (batch, features, time); the batch_first LSTM consumes
        # (batch, time, features).
        steps = cnn(signal).permute(0, 2, 1)
        sequence, _ = lstm(steps)
        return sequence[:, -1, :]

    def forward(self, eeg, gsr):
        """Return class logits of shape (batch, num_classes).

        Args:
            eeg: Tensor laid out as (batch, eeg_channels, time).
            gsr: Tensor laid out as (batch, gsr_channels, time).
        """
        eeg_summary = self._encode(self.eeg_cnn, self.eeg_lstm, eeg)
        gsr_summary = self._encode(self.gsr_cnn, self.gsr_lstm, gsr)
        fused = torch.cat((eeg_summary, gsr_summary), dim=1)
        return self.fc(fused)

# Build the network with its default arguments and move its parameters to DEVICE.
model = CN_LSTM_Model().to(DEVICE)
# Adam updates every model parameter using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Cross-entropy between the model's class logits and the integer class labels.
criterion = nn.CrossEntropyLoss()

# --- Training Loop ---
# The per-batch targets are named `batch_labels` so that neither loop rebinds
# the module-level `labels` list built while loading the data; re-running the
# earlier split would otherwise see a tensor of one batch instead of all labels.
print("\nStarting model training...")
for epoch in range(EPOCHS):
    model.train()  # training mode: dropout in the head is active
    total_loss = 0.0
    for eeg, gsr, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        eeg, gsr, batch_labels = eeg.to(DEVICE), gsr.to(DEVICE), batch_labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(eeg, gsr)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")

# --- Evaluation ---
model.eval()  # evaluation mode: dropout is disabled
all_preds, all_labels = [], []
with torch.no_grad():
    for eeg, gsr, batch_labels in test_loader:
        outputs = model(eeg.to(DEVICE), gsr.to(DEVICE))
        # Predicted class = index of the largest logit. `argmax` replaces the
        # legacy `torch.max(outputs.data, 1)` access to `.data`.
        predicted = outputs.argmax(dim=1)
        all_preds.extend(predicted.cpu().tolist())
        all_labels.extend(batch_labels.cpu().tolist())

accuracy = accuracy_score(all_labels, all_preds)
# Pin the class ids so the three display names always line up, even when a
# class is missing from both the test targets and the predictions.
report = classification_report(
    all_labels,
    all_preds,
    labels=[0, 1, 2],
    target_names=['Low', 'Medium', 'High'],
    zero_division=0,
)

print("-" * 50)
print(f"\nDeep Learning Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

# --- Save the Model ---
# Create the output directory first: torch.save does not create missing parent
# directories, and failing here would throw away the whole training run.
os.makedirs(MODELS_DIR, exist_ok=True)
# Only the learned parameters (state_dict) are written, not the model object.
torch.save(model.state_dict(), MODEL_OUTPUT_FILE)
print(f"\nTrained model saved successfully to: {MODEL_OUTPUT_FILE}")

Using device: cpu
Loaded 1364 task windows.
Pre-processing and cleaning all data windows...


100%|██████████| 1091/1091 [00:02<00:00, 537.65it/s]
100%|██████████| 273/273 [00:00<00:00, 428.59it/s]


Data cleaning complete.

Starting model training...


Epoch 1/15: 100%|██████████| 35/35 [00:20<00:00,  1.71it/s]


Epoch 1 - Training Loss: 1.1027


Epoch 2/15: 100%|██████████| 35/35 [00:20<00:00,  1.72it/s]


Epoch 2 - Training Loss: 1.0867


Epoch 3/15: 100%|██████████| 35/35 [00:21<00:00,  1.63it/s]


Epoch 3 - Training Loss: 1.0681


Epoch 4/15: 100%|██████████| 35/35 [00:21<00:00,  1.62it/s]


Epoch 4 - Training Loss: 1.0702


Epoch 5/15: 100%|██████████| 35/35 [00:22<00:00,  1.58it/s]


Epoch 5 - Training Loss: 1.0647


Epoch 6/15: 100%|██████████| 35/35 [00:43<00:00,  1.23s/it]


Epoch 6 - Training Loss: 1.0436


Epoch 7/15: 100%|██████████| 35/35 [00:51<00:00,  1.48s/it]


Epoch 7 - Training Loss: 1.0507


Epoch 8/15: 100%|██████████| 35/35 [00:47<00:00,  1.37s/it]


Epoch 8 - Training Loss: 1.0285


Epoch 9/15: 100%|██████████| 35/35 [00:22<00:00,  1.57it/s]


Epoch 9 - Training Loss: 1.0036


Epoch 10/15: 100%|██████████| 35/35 [00:23<00:00,  1.51it/s]


Epoch 10 - Training Loss: 1.0243


Epoch 11/15: 100%|██████████| 35/35 [00:23<00:00,  1.52it/s]


Epoch 11 - Training Loss: 0.9932


Epoch 12/15: 100%|██████████| 35/35 [00:23<00:00,  1.51it/s]


Epoch 12 - Training Loss: 0.9636


Epoch 13/15: 100%|██████████| 35/35 [00:22<00:00,  1.55it/s]


Epoch 13 - Training Loss: 0.9583


Epoch 14/15: 100%|██████████| 35/35 [00:21<00:00,  1.61it/s]


Epoch 14 - Training Loss: 0.9209


Epoch 15/15: 100%|██████████| 35/35 [00:19<00:00,  1.78it/s]


Epoch 15 - Training Loss: 0.8883
--------------------------------------------------

Deep Learning Model Accuracy: 0.4066

Classification Report:
              precision    recall  f1-score   support

         Low       0.40      0.58      0.47        83
      Medium       0.31      0.14      0.19        86
        High       0.45      0.49      0.47       104

    accuracy                           0.41       273
   macro avg       0.39      0.40      0.38       273
weighted avg       0.39      0.41      0.38       273


Trained model saved successfully to: ../models/cnn_lstm_model.pt
