Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import torch
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [2]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
set_random_seed(0)

from trainer_hub import TrainerHub

In [3]:
# https://github.com/N-Nieto/Inner_Speech_Dataset

# Load the Inner Speech Dataset
# =============================
# This dataset comprises raw EEG data collected from subject 'sub-01' during session 'ses-01'.
# Source: https://github.com/N-Nieto/Inner_Speech_Dataset
#
# Overview:
# - The dataset is part of a study on inner speech, capturing brain activity via EEG.
# - Each row in the dataset corresponds to a timestamp of EEG readings.
# - Columns represent various EEG channels (electrodes placed on the scalp).
#
# Usage:
# - The data is primarily used for cognitive neuroscience research, focusing on the neural correlates of inner speech.
# - Users can analyze EEG signals to investigate brain activity patterns associated with the cognitive processes of inner speech.
#
# File Structure:
# - Located at '../data/RAW_EEG/sub-01/sub-01_ses-01.csv' relative to this script.
# - It is advisable to preprocess the data (filtering, normalization) before detailed analysis.
#
# Example:
# - To load this data into a DataFrame for analysis and processing, use the following code snippet.


df = None
for csv in ["../data/RAW_EEG/sub-01/sub-01_ses-01.csv", "../data/RAW_EEG/sub-01/sub-01_ses-02.csv", "../data/RAW_EEG/sub-01/sub-01_ses-03.csv"]:
    tmp_df = pd.read_csv(path_append + csv)
    if df is None:
        df = tmp_df
    else:
        df = pd.concat([df, tmp_df])
df = df.reset_index(drop=True)
df


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,D24,D25,D26,D27,D28,D29,D30,D31,D32,event
0,3549.790315,4533.538497,3619.665186,3077.291188,-1380.325575,6120.066816,-4072.820600,-2256.511456,1820.012261,-2815.635423,...,-7240.845997,7034.252627,8458.062496,5905.223463,6147.660515,2458.073582,-7465.876831,-3604.133966,-5445.224315,5
1,3551.227812,4534.850995,3622.540181,3077.322438,-1377.575581,6123.066810,-4069.851856,-2252.167714,1825.168502,-2803.072947,...,-7227.283522,7039.627617,8463.874985,5911.598451,6153.504254,2463.354822,-7461.033090,-3594.258985,-5435.693082,5
2,3556.727802,4539.850986,3629.040169,3081.978679,-1370.419344,6130.348047,-4063.508118,-2249.292720,1828.074746,-2804.041695,...,-7227.158522,7048.502600,8473.562467,5921.348433,6163.004236,2469.854810,-7460.470591,-3591.540240,-5433.568086,5
3,3557.915300,4541.225983,3628.540169,3083.197427,-1372.263090,6130.410547,-4062.070620,-2251.667715,1825.856000,-2803.572946,...,-7224.189777,7042.346362,8464.593734,5917.660940,6160.972990,2467.011066,-7458.158095,-3597.008980,-5437.474329,5
4,3553.352808,4535.757243,3622.477681,3079.572434,-1377.763080,6125.598056,-4066.570612,-2255.136459,1821.981008,-2808.041687,...,-7219.971035,7044.658857,8466.843729,5914.848445,6156.785498,2466.948566,-7457.501846,-3585.821500,-5428.630595,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588018,-623.326974,2269.261431,2575.479615,285.733846,907.388947,-491.014719,-2998.447586,1886.043389,1659.637557,416.296105,...,-7176.689865,2116.667963,-901.138961,-227.327706,-657.170662,3025.322534,-12313.149124,-3810.071086,-5620.505241,10
588019,-627.420717,2264.448940,2570.323375,281.077605,903.482705,-490.702219,-3001.260080,1884.387142,1657.012562,414.702358,...,-7179.502360,2118.074210,-900.607712,-227.046456,-659.389408,3027.760030,-12307.211635,-3809.946086,-5621.098990,10
588020,-631.764459,2260.730197,2566.917131,275.546365,902.045207,-493.545964,-3006.103821,1886.199639,1658.512560,424.202340,...,-7177.439864,2118.199210,-900.920211,-226.140208,-659.764407,3027.103781,-12305.774138,-3805.633594,-5614.880251,10
588021,-625.076971,2265.605188,2573.354619,281.702604,904.982702,-490.795969,-3001.416330,1888.387135,1659.418808,420.077348,...,-7172.002374,2119.730457,-898.170216,-224.515211,-656.576913,3032.822520,-12303.742892,-3804.133597,-5614.192752,10


In [4]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

# Example setup, assuming df and mm are defined as DataFrame and RobustScaler respectively

# Assuming df['event'] contains the class labels
event_counts = df['event'].value_counts()
max_class_number = df['event'].max()

# Print each number of classes
print("Counts of each class in the 'event' column:")
print(event_counts)

# Print the maximum class number
print("\nMaximum class number:")
print(max_class_number)

num_classes = max_class_number + 1
# Additionally, verify against the num_classes variable
print("\nExpected number of classes (from num_classes variable):", num_classes)

Counts of each class in the 'event' column:
event
1     57650
0     57650
3     57650
2     57650
13    57650
12    57650
11    57650
10    57650
6     28825
9     28825
8     28825
7     28825
5     11523
Name: count, dtype: int64

Maximum class number:
13

Expected number of classes (from num_classes variable): 14


In [5]:
# Assuming df is defined and already includes an 'event' column
# Assuming 'event' column contains class labels
event_changes = df['event'].diff().ne(0)
change_indices = event_changes[event_changes].index.tolist()

# Calculate and print lengths between changes
lengths_between_changes = [change_indices[i] - change_indices[i-1] for i in range(1, len(change_indices))]

# Find the minimum cycle length where the label changes
min_cycle_length = min(lengths_between_changes)

print("Indices where the 'event' label changes:", change_indices)
print("Lengths between changes:", lengths_between_changes)
print(f"Minimum cycle length: {min_cycle_length}")

Indices where the 'event' label changes: [0, 3841, 4994, 6147, 8453, 9606, 10759, 11912, 13065, 14218, 15371, 17677, 18830, 19983, 21136, 23442, 24595, 25748, 26901, 28054, 29207, 30360, 31513, 33819, 34972, 36125, 38431, 39584, 40737, 41890, 45349, 46502, 48808, 49961, 52267, 55726, 56879, 58032, 60338, 61491, 62644, 63797, 66103, 67256, 68409, 69562, 70715, 71868, 73021, 74174, 75327, 77633, 78786, 81092, 82245, 83398, 85704, 86857, 88010, 89163, 90316, 91469, 92622, 94928, 96081, 98387, 101846, 102999, 104152, 106458, 107611, 108764, 109917, 112223, 113376, 114529, 115682, 116835, 117988, 119141, 120294, 121447, 123753, 124906, 127212, 128365, 129518, 131824, 132977, 134130, 135283, 136436, 137589, 138742, 141048, 142201, 143354, 144507, 146813, 149119, 150272, 151425, 152578, 153731, 157190, 158343, 160649, 161802, 162955, 164108, 165261, 166414, 168720, 169873, 172179, 173332, 175638, 176791, 179097, 180250, 181403, 182556, 183709, 184862, 186015, 187168, 188321, 190627, 192933, 1

In [6]:

# Correctly select only the numerical columns (exclude the 'event' column) and convert to a PyTorch tensor
df_tensor = torch.tensor(df.iloc[:, :-1].values).float().cuda()  # Using .iloc and .values to correctly handle DataFrame slicing
print("df_tensor shape:", df_tensor.shape)
# Define a function to perform robust scaling using PyTorch
def robust_scale_gpu(data):
    median = torch.median(data, dim=0, keepdim=True).values
    q75, q25 = torch.quantile(data, torch.tensor([0.75, 0.25], device=data.device), dim=0, keepdim=True)
    iqr = q75 - q25

    return (data - median) / iqr

def standard_scale_gpu(data):
    mean = torch.mean(data, dim=0, keepdim=True)
    std = torch.std(data, dim=0, keepdim=True)

    return (data - mean) / (std + 1e-8)

for start, end in zip(change_indices[:-1], change_indices[1:]):
    segment_length = end - start
    if segment_length >= min_cycle_length and segment_length % min_cycle_length == 0:
        # Normalize each sub-segment within the main segment
        for offset in range(0, segment_length, min_cycle_length):
            sub_start = start + offset
            sub_end = sub_start + min_cycle_length
            segment = df_tensor[sub_start:sub_end, :]
            scaled_segment = standard_scale_gpu(segment)
            df_tensor[sub_start:sub_end, :] = scaled_segment  # Correctly place the scaled data back into the DataFrame
    else:
        irregular_num = segment_length//min_cycle_length
        # Normalize each sub-segment within the main segment
        for i in range(irregular_num):
            sub_start = start + i * min_cycle_length
            if i == irregular_num - 1:
                sub_end = end
            else:
                sub_end = sub_start + min_cycle_length
            segment = df_tensor[sub_start:sub_end, :]
            scaled_segment = standard_scale_gpu(segment)
            df_tensor[sub_start:sub_end, :] = scaled_segment  # Correctly place the scaled data back into the DataFrame

# Optionally, convert back to DataFrame if needed for further processing
scaled_df = pd.DataFrame(df_tensor.cpu().numpy(), columns=df.columns[:-1])
scaled_df['event'] = df['event']
num_features = len(scaled_df.columns) - 1
print(scaled_df.head())

df_tensor shape: torch.Size([588023, 128])
         A1        A2        A3        A4        A5        A6        A7   
0 -2.488196 -2.276049 -2.513326 -2.187655 -0.665328 -1.345729 -0.058758  \
1 -2.275978 -2.101950 -2.131188 -2.183079 -0.334046 -0.958461  0.322920   
2 -1.464012 -1.438715 -1.267226 -1.501220  0.528039 -0.018531  1.138506   
3 -1.288701 -1.256325 -1.333684 -1.322747  0.305930 -0.010463  1.323319   
4 -1.962264 -1.981739 -2.139496 -1.853590 -0.356633 -0.631705  0.744775   

         A8        A9       A10  ...       D24       D25       D26       D27   
0  0.063271  0.040405 -1.022892  ... -1.909194 -1.004350 -0.560398 -1.681318  \
1  0.508570  0.570278 -0.405529  ... -1.421690 -0.624266  0.064576 -0.879298   
2  0.803300  0.868933 -0.453136  ... -1.417197  0.003315  1.106200  0.347321   
3  0.559827  0.640927 -0.430100  ... -1.310485 -0.432014  0.141858 -0.116593   
4  0.204229  0.242720 -0.649710  ... -1.158842 -0.268490  0.383784 -0.470425   

        D28       D29    

In [7]:
import torch
from torch.utils.data import Dataset
import random

class EEG_Dataset(Dataset):
    def __init__(self, df, indices, max_window_size):
        self.df = df
        self.indices = indices  # List of start indices
        self.max_window_size = max_window_size
        self.min_window_size = max_window_size // 2

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        start_idx = self.indices[idx]
        # Randomly choose a window size between min_window_size and max_window_size
        window_size = random.randint(self.min_window_size, self.max_window_size)
        
        end_idx = start_idx + window_size
        # Make sure the end index does not go out of the bounds of the DataFrame
        end_idx = min(end_idx, len(self.df))

        # Retrieve the sequence using the calculated indices
        seq = self.df.iloc[start_idx:end_idx]
        X, y = seq.values[:, :-1], seq.values[:, -1]
        
        # Convert to PyTorch tensors
        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.long)  # ensure y is a tensor of type long
        y = torch.nn.functional.one_hot(y, num_classes=num_classes)  # correct use
        return X, y

# Assuming 'df' is your DataFrame, 'indices' are the start indices, and 'max_window_size' is defined
# trainset = EEG_Dataset(df, indices, max_window_size)
# DataLoader code would follow initialization


In [8]:
from sklearn.model_selection import train_test_split
from random import shuffle

# Assume 'df' is your DataFrame and 'event' is the column containing labels

def generate_indices(df, max_window_size):
    indices = []
    max_index = len(df) - max_window_size + 1  # Calculate the maximum starting index
    
    for i in range(max_index):
        # Check if all labels in the window are the same
        if len(df['event'][i:i + max_window_size].unique()) == 1:
            indices.append(i)
    
    return indices

# Example usage
max_window_size = 128
indices = generate_indices(df, max_window_size)
shuffle(indices)  # Shuffle the indices to randomize the data order

# Split the indices into training and testing sets
train_indices, test_indices = train_test_split(indices, test_size=0.2, shuffle=False)

# Assuming you have an EEG_Dataset class defined as before
trainset = EEG_Dataset(df=scaled_df, indices=train_indices, max_window_size=max_window_size)
testset = EEG_Dataset(df=scaled_df, indices=test_indices, max_window_size=max_window_size)

In [9]:
data_config = DataConfig(dataset_name = 'eeg-sub-01', task_type='multi_class_classification', obs_shape=[num_features], label_size=num_classes)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters()

In [10]:
ml_params.core_model_name = 'gpt' 
ml_params.encoder_model_name = 'none'
ml_params.training.max_epoch = 200

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False) 

In [11]:
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/200 [00:00<?, ?it/s]

Iterations:   0%|          | 0/6720 [00:00<?, ?it/s]

[0/200][50/6720][Time 21.82]
Unified LR across all optimizers: 0.00019969466861371834
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.3227	Gen: 17.0459	Rec: 16.9905	E: 21.4960	R: 15.1877	P: 1948.8082
--------------------Test Metrics------------------------
accuracy: 0.1542
precision: 0.0902
recall: 0.0955
f1_score: 0.0684

[0/200][100/6720][Time 20.87]
Unified LR across all optimizers: 0.0001993957766378747
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.1598	Gen: 11.3946	Rec: 11.3696	E: 10.4920	R: 7.6522	P: 1292.2542
--------------------Test Metrics------------------------
accuracy: 0.1536
precision: 0.0985
recall: 0.1245
f1_score: 0.0883

[0/200][150/6720][Time 20.87]
Unified LR across all optimizers: 0.00019909733202706992
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.1364	Gen: 12.4864	Rec: 12.4670	E: 8.8670	R: 6.6616	P: 1414.6276
--------------------Test Metrics------------------------
accur

In [None]:
trainer_hub.test(testset)