# NeuroVision

## Load Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil

In [2]:
print(torch.__version__)

NameError: name 'torch' is not defined

## Dataset Preparation

In [48]:
# function to read the dir contents of dataset folder and segregate them 
# into n separate classes.
def create_dataset_folders(metadata_file:str, csv_dir:str, output_dir:str):
    class_id_to_folder = {}

    with open(metadata_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')

            if len(parts) < 3:
                continue

            label_str, _, class_id = parts
            # print(label_str, class_id)
            first_label = label_str.split(',')[0].strip()
            # print(first_label)
            class_id_to_folder[class_id] = first_label

        count = 0
        for filename in os.listdir(csv_dir):
            if not filename.endswith('.csv'):
                continue

            class_id = filename.split('_')[3]

            folder_name = class_id_to_folder.get(class_id)
            print(folder_name)

            if not folder_name:
                print(f'Unknown class id: {class_id}')
                continue

            safe_folder = folder_name.replace('/', '_').replace('\\', '_').strip()

            dest_folder = os.path.join(output_dir, safe_folder)
            os.makedirs(dest_folder, exist_ok=True)

            src_path = os.path.join(csv_dir, filename)
            dst_path = os.path.join(dest_folder, filename)

            # print(f"Move: {src_path} to {dst_path}")
            count+=1
            print(count)
            shutil.copy(src_path, dst_path)
            

In [None]:
# create_dataset_folders('../data/WordReport-v1.04.txt', 
#                        '../data/MindBigData-Imagenet', 
#                        '../data/Segregated_Dataset')

In [None]:
import os
os.listdir('../data/Segregated_Dataset')

## Dataset Processing for PyTorch

In [3]:
import torch
import os
import pandas as pd
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split

In [4]:
class EEGDataset(Dataset):
    def __init__(self, root_dir, samples, transform=None):
        self.root_dir = root_dir
        self.samples = samples
        self.transform = transform

    def __len__(self): 
        return len(self.samples)          

    def __getitem__(self, idx):
        file_path, label = self.samples[idx]

        df = pd.read_csv(file_path, header=None, index_col=0)
        eeg_data = torch.tensor(df.values, dtype=torch.float32)

        if eeg_data.shape[0] < eeg_data.shape[1]:
            eeg_data = eeg_data.T

        if self.transform:
            eeg_data = self.transform(eeg_data)

        return eeg_data, label


In [5]:
def make_datasets(root_dir, val_ratio=0.2, random_state=42): 
    class_names = sorted(os.listdir(root_dir))
    class_to_idx = {cls:idx for idx, cls in enumerate(class_names)}

    all_samples = []
    all_labels = []

    for cls in class_names:
        cls_dir = os.path.join(root_dir, cls)
        
        for fname in os.listdir(cls_dir): 
            if fname.endswith('.csv'):
                path = os.path.join(cls_dir, fname)
                all_samples.append((path, class_to_idx[cls]))
                all_labels.append(class_to_idx[cls])

    train_idx, val_idx = train_test_split(
        list(range(len(all_samples))), 
        test_size=val_ratio, 
        random_state=random_state, 
        stratify=all_labels
    )

    train_samples = [all_samples[i] for i in train_idx]
    val_samples = [all_samples[i] for i in val_idx]

    train_dataset = EEGDataset(root_dir, train_samples)
    val_dataset = EEGDataset(root_dir, val_samples)

    return train_dataset, val_dataset

In [6]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    sequences, labels = zip(*batch)

    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.long)
    padded_seqs = pad_sequence(sequences, batch_first=True)

    return padded_seqs, torch.tensor(labels), lengths

In [7]:
train_dataset, val_dataset = make_datasets('../data/Segregated_Dataset')

In [8]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [11]:
x, y, lengths = next(iter(train_loader))

print(x.shape)
print(y)

torch.Size([32, 404, 5])
tensor([471, 178, 497, 250, 518, 414, 150, 237, 518, 199, 236, 507, 533, 177,
        525, 539,   6,   7,  35, 529, 368, 260, 515, 559,  20, 156, 209, 511,
         36, 105, 268, 210])


In [13]:
# pd.read_csv('../data/Segregated_Dataset/accordion/MindBigData_Imagenet_Insight_n02672831_563_1_2136.csv', header=None, index_col=0).head()

## Model Training