# NeuroVision

## Dataset Preparation

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil

In [5]:
# function to read the dir contents of dataset folder and segregate them
# into n separate classes.
def create_dataset_folders(metadata_file:str, csv_dir:str, output_dir:str):
    class_id_to_folder = {}

    with open(metadata_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')

            if len(parts) < 3:
                continue

            label_str, _, class_id = parts
            # print(label_str, class_id)
            first_label = label_str.split(',')[0].strip()
            # print(first_label)
            class_id_to_folder[class_id] = first_label

        count = 0
        for filename in os.listdir(csv_dir):
            if not filename.endswith('.csv'):
                continue

            class_id = filename.split('_')[3]

            folder_name = class_id_to_folder.get(class_id)
            print(folder_name)

            if not folder_name:
                print(f'Unknown class id: {class_id}')
                continue

            safe_folder = folder_name.replace('/', '_').replace('\\', '_').strip()

            dest_folder = os.path.join(output_dir, safe_folder)
            os.makedirs(dest_folder, exist_ok=True)

            src_path = os.path.join(csv_dir, filename)
            dst_path = os.path.join(dest_folder, filename)

            # print(f"Move: {src_path} to {dst_path}")
            count+=1
            print(count)
            shutil.copy(src_path, dst_path)


In [6]:
# create_dataset_folders('../data/WordReport-v1.04.txt',
#                        '../data/MindBigData-Imagenet',
#                        '../data/Segregated_Dataset')

In [7]:
import shutil
import json
import os

def reorganize_dataset(mapping_file, src_root, dst_root, move=False):
    with open(mapping_file, 'r') as f:
        mapping = json.load(f)

    os.makedirs(dst_root, exist_ok=True)

    for super_class, sub_classes in mapping.items():
        super_cls_dir = os.path.join(dst_root, super_class)
        os.makedirs(super_cls_dir, exist_ok=True)

        for sub_class in sub_classes:
            sub_cls_dir = os.path.join(src_root, sub_class)
            if not os.path.exists(sub_cls_dir):
                print(f"[Warning] Sub-class folder not found: {sub_cls_dir}")
                continue

            for file_name in os.listdir(sub_cls_dir):
                src_file = os.path.join(sub_cls_dir, file_name)
                dst_file = os.path.join(super_cls_dir, file_name)

                if move:
                    shutil.move(src_file, dst_file)

                else:
                    shutil.copy2(src_file, dst_file)

            print(f"[OK] {'Moved' if move else 'Copied'} {sub_class} -> {super_class}")
    print("Dataset reorganization complete!")


In [8]:
# reorganize_dataset(mapping_file='../data/class_mapping_v2.json',
#                    src_root='../data/Segregated_Dataset',
#                    dst_root='../data/Class_Combined_Balanced_Dataset',
#                    move=False)

## Dataset Processing for PyTorch

In [9]:
import torch
import os
import pandas as pd
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split

In [10]:
class EEGDataset(Dataset):
    def __init__(self, root_dir, samples, transform=None):
        self.root_dir = root_dir
        self.samples = samples
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        file_path, label = self.samples[idx]

        df = pd.read_csv(file_path, header=None, index_col=0)
        eeg_data = torch.tensor(df.values, dtype=torch.float32)

        if eeg_data.shape[0] < eeg_data.shape[1]:
            eeg_data = eeg_data.T

        if self.transform:
            eeg_data = self.transform(eeg_data)

        return eeg_data, label


In [11]:
def make_datasets(root_dir, val_ratio=0.25, random_state=42):
    class_names = os.listdir(root_dir)
    class_to_idx = {cls:idx for idx, cls in enumerate(class_names)}

    all_samples = []
    all_labels = []

    for cls in class_names:
        cls_dir = os.path.join(root_dir, cls)

        for fname in os.listdir(cls_dir):
            if fname.endswith('.csv'):
                path = os.path.join(cls_dir, fname)
                all_samples.append((path, class_to_idx[cls]))
                all_labels.append(class_to_idx[cls])

    train_idx, val_idx = train_test_split(
        list(range(len(all_samples))),
        test_size=val_ratio,
        random_state=random_state,
        stratify=all_labels
    )

    train_samples = [all_samples[i] for i in train_idx]
    val_samples = [all_samples[i] for i in val_idx]

    train_dataset = EEGDataset(root_dir, train_samples)
    val_dataset = EEGDataset(root_dir, val_samples)

    return train_dataset, val_dataset

In [12]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    sequences, labels = zip(*batch)

    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.long)
    padded_seqs = pad_sequence(sequences, batch_first=True)

    return padded_seqs, torch.tensor(labels), lengths

In [13]:
def get_class_weights(dataset):
    from collections import defaultdict

    cls_samples_count = defaultdict(int)

    for sample in dataset.samples:
        cls_idx = sample[1]
        cls_samples_count[cls_idx] += 1

    for cls_idx, count in cls_samples_count.items():
        cls_samples_count[cls_idx] = round(count/len(dataset.samples), 4)

    return cls_samples_count

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
root_dir = '..//content/drive/MyDrive/Class_Combined_Balanced_Dataset'

In [16]:
train_dataset, val_dataset = make_datasets(root_dir)

In [17]:
train_cls_wts_dict = get_class_weights(train_dataset)

train_cls_wts = [item[1] for item in sorted(train_cls_wts_dict.items())]

In [18]:
from torch.utils.data import WeightedRandomSampler

sampler = WeightedRandomSampler(weights=train_cls_wts,
                                num_samples=len(train_dataset.samples),
                                replacement=True)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=False, sampler=sampler,
                          collate_fn=collate_fn, num_workers=2, pin_memory=False,
                          persistent_workers=True, prefetch_factor=2)

val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn,
                        num_workers=2, pin_memory=False, persistent_workers=True, prefetch_factor=2)

In [20]:
# # class_weights
# samples_count = 0
# dataset_dir = '../data/Class_Combined_Balanced_Dataset'
# class_wts = defaultdict(float)

# for cls in os.listdir(dataset_dir):
#     print(cls)
#     cls_count = len(os.listdir(os.path.join(dataset_dir, cls)))
#     samples_count += cls_count
#     class_wts[cls] = cls_count

# print(f"Total Samples: {samples_count}")

In [21]:
# with open('../data/class_samples_count.json', 'w') as f:
#     json.dump(class_wts, f)

In [22]:
# for cls, count in class_wts.items():
#     class_wts[cls] = count/samples_count

In [23]:
# with open('../data/class_weights.json', 'w') as f:
#     json.dump(class_wts, f)

## Model Architecture

In [24]:
import torch
import torch.nn as nn

In [25]:
class EegLstm(nn.Module):
    def __init__(self, input_dims=5, hidden_dims=128, num_layers=3, dropout=0.3 , num_classes=len(os.listdir(root_dir))):
        super(EegLstm, self).__init__()

        self.lstm = nn.LSTM(
            input_size=input_dims,
            hidden_size=hidden_dims,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers >= 2 else 0
        )

        self.fc = nn.Sequential(
            nn.Linear(hidden_dims, hidden_dims),
            nn.BatchNorm1d(hidden_dims),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dims, hidden_dims),
            nn.BatchNorm1d(hidden_dims),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dims, hidden_dims//2),
            nn.BatchNorm1d(hidden_dims//2),
            nn.LeakyReLU(),
            nn.Linear(hidden_dims//2, num_classes)
        )

    def forward(self, x, lengths=None):
        if lengths is not None:
            packed = nn.utils.rnn.pack_padded_sequence(
                x, lengths.cpu(), batch_first=True, enforce_sorted=False
            )

            packed_out, (h_n, c_n) = self.lstm(packed)

        else:
            out, (h_n, c_n) = self.lstm(x)

        last_hidden = h_n[-1]
        logits=self.fc(last_hidden)

        return logits

In [26]:
from tqdm import tqdm
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [27]:
class EarlyStopping(object):
    def __init__(self, model, save_path='../content/drive/MyDrive/eeg_classifier.pt', patience=4, tol=1e-3):
        self.model = model
        self.save_path = save_path
        self.patience = patience
        self.counter = 0
        self.tol = tol
        self.best_val_loss = float('inf')
        self.early_stop = False

    def __call__(self, batch_val_loss):
        if batch_val_loss < self.best_val_loss - self.tol:
            torch.save(self.model.state_dict(), self.save_path)
            self.best_val_loss = batch_val_loss
            self.counter = 0
            print(f'Validation Loss improved -> model saved to {self.save_path}')

        else:
            if self.counter < self.patience:
                self.counter += 1
                print(f'No improvement in Val Loss. Counter: {self.counter}/{self.patience}')

            else:
                self.early_stop = True
                print(f"Early Stopping triggered!")


In [28]:
def train_model(model, model_name, train_loader, val_loader, epochs=15, lr=1e-3, device='cpu'):
    log_dir = f'../content/drive/MyDrive/NeuroVision/runs/{model_name}'
    save_path = f'../content/drive/MyDrive/NeuroVision/models/{model_name}_v1_best.pth'
    os.makedirs(os.path.dirname(log_dir), exist_ok=True)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    criterion = nn.CrossEntropyLoss(weight=torch.Tensor(train_cls_wts).to(device))
    optimizer = optim.Adam(model.parameters(), lr=lr)
    writer = SummaryWriter(log_dir=log_dir)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
    early_stopping = EarlyStopping(model, save_path=save_path, patience=6)
    model.to(device)

    for epoch in range(epochs):
        model.train()
        train_loss, train_correct, train_total = 0.0, 0, 0
        train_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train Pass]', leave=True)

        for batch_x, batch_y, lengths in train_bar:
            batch_x, batch_y, lengths = batch_x.to(device), batch_y.to(device), lengths.to(device)

            optimizer.zero_grad()
            y_preds = model(batch_x, lengths)

            loss = criterion(y_preds, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

            train_loss += loss.item() * batch_x.size(0)
            _, preds = torch.max(y_preds, 1)
            train_correct += (preds == batch_y).sum().item()
            train_total += batch_y.size(0)

            train_bar.set_postfix(loss=loss.item())

        train_acc = train_correct / train_total
        train_loss /= train_total


        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0

        val_bar = tqdm(val_loader, desc=f"Epoch{epoch+1}/{epochs} [Val Pass]", leave=True)

        with torch.no_grad():
            for batch_x, batch_y, lengths in val_bar:
                batch_x, batch_y, lengths = batch_x.to(device), batch_y.to(device), lengths.to(device)

                y_preds = model(batch_x, lengths)
                loss = criterion(y_preds, batch_y)

                val_loss += loss.item() * batch_x.size(0)
                _, preds = torch.max(y_preds, 1)
                val_correct += (preds == batch_y).sum().item()
                val_total += batch_y.size(0)

                val_bar.set_postfix(loss=loss.item())

        val_acc = val_correct / val_total
        val_loss /= val_total

        scheduler.step(val_loss)

        early_stopping(val_loss)
        if early_stopping.early_stop:
            break


        # logging
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/val', val_loss, epoch)
        writer.add_scalar('Accuracy/train', train_acc, epoch)
        writer.add_scalar('Accuracy/val', val_acc, epoch)

        print(f"Epoch {epoch+1}/{epochs}:\nTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f} %\nVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f} %")

    writer.close()

In [29]:
def model_summary(model):
    print('========================================= Model Summary ==============================================\n')
    print(f"\n{'='*55}")
    print(f"{'| Parameter Name':31}|| Number of Parameters|")
    print(f"{'='*55}")

    total_params = 0

    for name, param in model.named_parameters():
        print(f'| {name:30}|{param.numel():20} |')
        print(f"{'-'*55}")
        total_params += param.numel()

    print(f"\nTotal Parameters: {total_params:,}")

## Model Training

In [30]:
lstm_model = EegLstm(input_dims=5, hidden_dims=256, num_layers=5, dropout=0.3)

In [31]:
model_summary(lstm_model)



| Parameter Name               || Number of Parameters|
| lstm.weight_ih_l0             |                5120 |
-------------------------------------------------------
| lstm.weight_hh_l0             |              262144 |
-------------------------------------------------------
| lstm.bias_ih_l0               |                1024 |
-------------------------------------------------------
| lstm.bias_hh_l0               |                1024 |
-------------------------------------------------------
| lstm.weight_ih_l1             |              262144 |
-------------------------------------------------------
| lstm.weight_hh_l1             |              262144 |
-------------------------------------------------------
| lstm.bias_ih_l1               |                1024 |
-------------------------------------------------------
| lstm.bias_hh_l1               |                1024 |
-------------------------------------------------------
| lstm.weight_ih_l2             |             

In [32]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_model(lstm_model, 'EEG_LSTM', train_loader, val_loader, 10, 1e-3, device)

Epoch 1/10 [Train Pass]: 100%|██████████| 82/82 [01:52<00:00,  1.37s/it, loss=1.57]
Epoch1/10 [Val Pass]: 100%|██████████| 28/28 [12:12<00:00, 26.16s/it, loss=5.2]


Validation Loss improved -> model saved to ../content/drive/MyDrive/NeuroVision/models/EEG_LSTM_v1_best.pth
Epoch 1/10:
Train Loss: 1.417 | Train Acc: 49.49 %
Val Loss: 3.231 | Val Acc: 21.80 %


Epoch 2/10 [Train Pass]: 100%|██████████| 82/82 [01:42<00:00,  1.24s/it, loss=1.96]
Epoch2/10 [Val Pass]: 100%|██████████| 28/28 [00:32<00:00,  1.15s/it, loss=3.56]


Validation Loss improved -> model saved to ../content/drive/MyDrive/NeuroVision/models/EEG_LSTM_v1_best.pth
Epoch 2/10:
Train Loss: 0.942 | Train Acc: 54.04 %
Val Loss: 2.244 | Val Acc: 21.80 %


Epoch 3/10 [Train Pass]: 100%|██████████| 82/82 [01:40<00:00,  1.22s/it, loss=0.932]
Epoch3/10 [Val Pass]: 100%|██████████| 28/28 [00:32<00:00,  1.16s/it, loss=3.5]


No improvement in Val Loss. Counter: 1/6
Epoch 3/10:
Train Loss: 0.902 | Train Acc: 55.00 %
Val Loss: 2.250 | Val Acc: 21.80 %


Epoch 4/10 [Train Pass]: 100%|██████████| 82/82 [01:38<00:00,  1.21s/it, loss=0.76]
Epoch4/10 [Val Pass]: 100%|██████████| 28/28 [00:31<00:00,  1.13s/it, loss=3.66]


No improvement in Val Loss. Counter: 2/6
Epoch 4/10:
Train Loss: 0.948 | Train Acc: 54.10 %
Val Loss: 2.365 | Val Acc: 21.80 %


Epoch 5/10 [Train Pass]: 100%|██████████| 82/82 [01:38<00:00,  1.21s/it, loss=1.41]
Epoch5/10 [Val Pass]: 100%|██████████| 28/28 [00:32<00:00,  1.16s/it, loss=3.29]


No improvement in Val Loss. Counter: 3/6
Epoch 5/10:
Train Loss: 0.987 | Train Acc: 52.90 %
Val Loss: 2.359 | Val Acc: 21.80 %


Epoch 6/10 [Train Pass]: 100%|██████████| 82/82 [01:39<00:00,  1.22s/it, loss=0.868]
Epoch6/10 [Val Pass]: 100%|██████████| 28/28 [00:31<00:00,  1.13s/it, loss=3.43]


Validation Loss improved -> model saved to ../content/drive/MyDrive/NeuroVision/models/EEG_LSTM_v1_best.pth
Epoch 6/10:
Train Loss: 0.911 | Train Acc: 54.93 %
Val Loss: 2.219 | Val Acc: 21.80 %


Epoch 7/10 [Train Pass]: 100%|██████████| 82/82 [01:39<00:00,  1.22s/it, loss=0.998]
Epoch7/10 [Val Pass]: 100%|██████████| 28/28 [00:31<00:00,  1.11s/it, loss=3.8]


No improvement in Val Loss. Counter: 1/6
Epoch 7/10:
Train Loss: 0.929 | Train Acc: 53.86 %
Val Loss: 2.415 | Val Acc: 21.80 %


Epoch 8/10 [Train Pass]: 100%|██████████| 82/82 [01:38<00:00,  1.20s/it, loss=0.636]
Epoch8/10 [Val Pass]: 100%|██████████| 28/28 [00:31<00:00,  1.14s/it, loss=3.77]


No improvement in Val Loss. Counter: 2/6
Epoch 8/10:
Train Loss: 0.914 | Train Acc: 54.85 %
Val Loss: 2.419 | Val Acc: 21.80 %


Epoch 9/10 [Train Pass]: 100%|██████████| 82/82 [01:38<00:00,  1.20s/it, loss=0.506]
Epoch9/10 [Val Pass]: 100%|██████████| 28/28 [00:31<00:00,  1.11s/it, loss=3.95]


No improvement in Val Loss. Counter: 3/6
Epoch 9/10:
Train Loss: 0.898 | Train Acc: 54.79 %
Val Loss: 2.482 | Val Acc: 21.80 %


Epoch 10/10 [Train Pass]: 100%|██████████| 82/82 [01:38<00:00,  1.20s/it, loss=0.712]
Epoch10/10 [Val Pass]: 100%|██████████| 28/28 [00:31<00:00,  1.13s/it, loss=3.98]

No improvement in Val Loss. Counter: 4/6
Epoch 10/10:
Train Loss: 0.927 | Train Acc: 53.96 %
Val Loss: 2.520 | Val Acc: 21.80 %





## Model Testing