In [1]:
import json
import os
import random
from tqdm import tqdm
import xgboost as xgb
from sklearn.metrics import accuracy_score,log_loss

Load the `channel-based` annotation files into Python objects.

In [2]:
# Locations of the channel-based annotation JSON files: one for the training
# split and one for the dev split (used as the test set in this notebook).
training_data_path = "../bipolar_eeg_dataset/train_filtred_channel_based.json"
testing_data_path = "../bipolar_eeg_dataset/dev_filtred_channel_based.json"

In [3]:
# Parse the training-split annotation file.
with open(training_data_path, mode='r') as train_file:
    training_data = json.loads(train_file.read())

# Parse the dev-split annotation file.
with open(testing_data_path, mode='r') as test_file:
    testing_data = json.loads(test_file.read())

## Data generator

In [4]:
import numpy as np
import torch
import torchaudio

In [5]:
def get_batch(sampled_data:torch.Tensor,target:torch.Tensor,batch_size:int = 8):
    """Split windowed EEG data and its labels into aligned, complete mini-batches.

    Both tensors are chunked along dimension 0. Only chunks holding exactly
    `batch_size` windows are kept; a trailing partial chunk is dropped. A final
    chunk that is complete is kept (it used to be dropped unconditionally).

    Args:
      sampled_data (torch.Tensor): resampled EEG windows with shape (nW,nC,D)
      target (torch.Tensor): target labels for each window, first dimension nW
      batch_size (int): number of windows per batch

    Returns:
      (tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]): the data batches and
      the matching label batches, in the original window order.
    """
    x_batch = tuple(
        chunk
        for chunk in torch.split(sampled_data, batch_size, dim=0)
        if chunk.shape[0] == batch_size  # discard only an incomplete remainder
    )
    y_batch = tuple(
        chunk
        for chunk in torch.split(target, batch_size, dim=0)
        if chunk.shape[0] == batch_size
    )

    return x_batch,y_batch

In [6]:

def create_window(eeg_sample:np.ndarray,channel_annot:dict,s_freq:int,window_size:int=20):
    """Cut a multi-channel EEG recording into fixed-length windows and label them.

    The recording is zero-padded at the end so that its length is a whole
    number of windows, then sliced so that window ``w`` holds, for every
    channel, the samples ``[w * context_length, (w + 1) * context_length)``.

    A (window, channel) pair is labelled 1 when some annotation of that channel
    with a class other than ``"bckg"`` fully contains the window's time span;
    otherwise it is 0. Labels are indexed by the channel's position in
    ``channel_annot`` (assumed to follow the row order of ``eeg_sample`` —
    TODO confirm against the annotation producer).

    Args:
      eeg_sample (np.ndarray | torch.Tensor): EEG data of shape (nC, T).
      channel_annot (dict): per-channel lists of ``(start_time, stop_time, class)``
        entries, times in seconds.
      s_freq (int): sampling frequency of ``eeg_sample`` in Hz.
      window_size (int): window length in seconds.

    Returns:
      (torch.Tensor, torch.Tensor): windows of shape (nW, nC, window_size * s_freq)
      and float labels of shape (nW, nC).
    """
    # Accept both NumPy arrays and tensors; a raw ndarray cannot be slice-assigned
    # into a tensor.
    signal = torch.as_tensor(eeg_sample, dtype=torch.float32)
    n_channels = signal.shape[0]
    sample_length = signal.shape[-1]  # total length of the raw eeg signal

    context_length = window_size * s_freq  # window length in samples
    n_windows = -(-sample_length // context_length)  # exact integer ceil-division

    # pad the signal with zeros up to a whole number of windows
    padded = torch.zeros(n_channels, n_windows * context_length)
    padded[..., 0:sample_length] = signal

    # The buffer is channel-major, so split the time axis first and only then
    # move the window axis to the front. A direct view(-1, nC, L) would place
    # consecutive time segments of a single channel on the channel axis.
    windows = padded.view(n_channels, n_windows, context_length).permute(1, 0, 2).contiguous()

    # class labels, one per (window, channel)
    target = torch.zeros(n_windows, n_channels)
    for idx in range(n_windows):
        sample_start_time = idx * window_size
        sample_stop_time = (idx + 1) * window_size

        for channel_idx, labels in enumerate(channel_annot.values()):
            if channel_idx >= n_channels:
                break  # more annotated channels than signal rows; nothing to label
            for start_time, stop_time, c in labels:
                if c != "bckg" and sample_start_time >= start_time and sample_stop_time <= stop_time:
                    target[idx, channel_idx] = 1.0

    return windows,target

In [7]:
class DataLoader:
    """Iterate over the fixed-length, labelled windows of one EEG recording.

    On construction the recording referenced by the annotation is loaded,
    resampled to ``new_s_freq`` and cut into windows of ``window_size`` seconds.
    Iterating yields one ``(x, y)`` pair per window, where ``x`` is the real
    part of the per-channel FFT of the window and ``y`` holds one label per
    channel.
    """

    def __init__(self,annotation:dict,new_s_freq:int = 256,window_size:int=20) -> None:
        """Load, resample and window the recording described by ``annotation``.

        Args:
          annotation (dict): must provide ``"s_freq"``, ``"montage"``,
            ``"npz_filepath"`` (an ``.npz`` file whose ``"arr_0"`` entry is the
            signal) and ``"channel_annot"``.
          new_s_freq (int): sampling frequency to resample to, in Hz.
          window_size (int): window length in seconds.
        """
        self.idx = 0
        self.new_s_freq = new_s_freq
        self.window_size = window_size
        self.annotation = annotation

        default_channel_nums = 22
        sample_freq = annotation["s_freq"]  # original sampling frequency
        montage = annotation["montage"]
        # resample EEG to a fixed sampling frequency.
        resampler = torchaudio.transforms.Resample(sample_freq,new_s_freq)

        with np.load(annotation["npz_filepath"]) as npz_file:
            raw_eeg = npz_file["arr_0"]

        raw_eeg = torch.from_numpy(raw_eeg).to(torch.float32)
        # resample
        raw_eeg_resample = resampler(raw_eeg)

        # Montages other than the two listed are zero-padded to
        # `default_channel_nums` rows so every recording has the same channel count.
        if montage not in ["01_tcp_ar","02_tcp_le"]:
            zero_eeg = torch.zeros(default_channel_nums,raw_eeg_resample.shape[-1])
            zero_eeg[0:raw_eeg_resample.shape[0],...] = raw_eeg_resample

            raw_eeg_resample = zero_eeg

        self.x,self.y =  self.create_window(raw_eeg_resample,self.annotation["channel_annot"])


    def create_window(self,eeg_sample:torch.Tensor,channel_annot:dict):
        """Cut a recording into windows and build the per-channel label tensor.

        The signal is zero-padded at the end to a whole number of windows of
        ``self.window_size * self.new_s_freq`` samples. Window ``w`` holds, for
        every channel, the samples of that window's time span.

        A (window, channel) pair is labelled 1 when some annotation of that
        channel with a class other than ``"bckg"`` fully contains the window's
        time span; otherwise it is 0. Labels are indexed by the channel's
        position in ``channel_annot`` (assumed to follow the row order of
        ``eeg_sample`` — TODO confirm).

        Args:
          eeg_sample (torch.Tensor): EEG data of shape (nC, T).
          channel_annot (dict): per-channel lists of
            ``(start_time, stop_time, class)`` entries, times in seconds.

        Returns:
          (torch.Tensor, torch.Tensor): windows of shape (nW, nC, L) and float
          labels of shape (nW, nC).
        """
        signal = torch.as_tensor(eeg_sample, dtype=torch.float32)
        n_channels = signal.shape[0]
        sample_length = signal.shape[-1]  # total length of the eeg signal

        context_length = self.window_size * self.new_s_freq  # window length in samples
        n_windows = -(-sample_length // context_length)  # exact integer ceil-division

        # pad the signal with zeros up to a whole number of windows
        padded = torch.zeros(n_channels, n_windows * context_length)
        padded[..., 0:sample_length] = signal

        # The buffer is channel-major: split the time axis first, then bring the
        # window axis to the front. A direct view(-1, nC, L) would place
        # consecutive time segments of one channel on the channel axis.
        windows = padded.view(n_channels, n_windows, context_length).permute(1, 0, 2).contiguous()

        # class labels, one per (window, channel)
        target = torch.zeros(n_windows, n_channels)
        for idx in range(n_windows):
            sample_start_time = idx * self.window_size
            sample_stop_time = (idx + 1) * self.window_size

            for channel_idx, labels in enumerate(channel_annot.values()):
                if channel_idx >= n_channels:
                    break  # more annotated channels than signal rows
                for start_time, stop_time, c in labels:
                    if c != "bckg" and sample_start_time >= start_time and sample_stop_time <= stop_time:
                        target[idx, channel_idx] = 1.0

        return windows,target

    def __len__(self):
        """Number of windows in the recording."""
        return len(self.x)

    def __iter__(self):
        """Restart from the first window so the loader can be iterated again."""
        self.idx = 0
        return self

    def __next__(self):
        """Return the next ``(x, y)`` pair.

        ``x`` is the real part of the orthonormal FFT of the window along the
        time axis (the imaginary part is discarded). ``y`` is a copy of the
        window's labels. When every channel is labelled positive, one randomly
        chosen channel is zeroed in ``x`` and relabelled 0 in ``y``.

        Raises:
          StopIteration: when all windows have been consumed.
        """
        if self.idx > len(self.x) - 1:
            raise StopIteration

        x = torch.fft.fft(self.x[self.idx],norm="ortho",dim=-1).real
        # Clone so the relabelling below does not overwrite the stored labels.
        y = self.y[self.idx].clone()

        if torch.all(y):
            idx = random.randint(0,len(y)-1)
            x[idx,...] = torch.zeros(x.shape[-1])
            y[idx] = 0.
        self.idx +=1

        return x,y

In [6]:
# Run-wide settings: compute device (CUDA when available, otherwise CPU) and
# the hyper-parameters referenced by the neural-network training code.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 5
batch_size = 8
lr = 1e-3

In [7]:
from tqdm import tqdm
import torch.optim as optim
from sklearn.metrics import accuracy_score
# NOTE(review): the training loop below is disabled by wrapping it in a bare
# string literal. It references `model` and `data_loader`, neither of which is
# defined in the visible part of this notebook. Because the string is the last
# expression of the cell, the notebook echoes it as the cell output.
""" 
# train a simple model
optimizer = optim.Adam(model.parameters(),lr=lr)

for epoch in range(epochs):
  model.train()

  losses = []
  accuracy = []
  loader = tqdm(
      data_loader(training_data,new_s_freq = 256,window_size=20,batch_size=batch_size)
  )

  for x, y in loader:
    x = x.to(device)
    y = y.to(device)

    logits,loss = model(x,y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    losses.append(loss.detach().item())

    preds = torch.sigmoid(logits)
    preds = (preds > 0.5).float().detach().tolist()
    
    accuracy.append(accuracy_score(y.tolist(),preds))
    
    loader.set_description(f"Epoch {epoch + 1}/{epochs}") 
    loader.set_postfix(loss=np.mean(losses),acc=np.mean(accuracy)) """

' \n# train a simple model\noptimizer = optim.Adam(model.parameters(),lr=lr)\n\nfor epoch in range(epochs):\n  model.train()\n\n  losses = []\n  accuracy = []\n  loader = tqdm(\n      data_loader(training_data,new_s_freq = 256,window_size=20,batch_size=batch_size)\n  )\n\n  for x, y in loader:\n    x = x.to(device)\n    y = y.to(device)\n\n    logits,loss = model(x,y)\n\n    optimizer.zero_grad(set_to_none=True)\n    loss.backward()\n    optimizer.step()\n\n    losses.append(loss.detach().item())\n\n    preds = torch.sigmoid(logits)\n    preds = (preds > 0.5).float().detach().tolist()\n    \n    accuracy.append(accuracy_score(y.tolist(),preds))\n    \n    loader.set_description(f"Epoch {epoch + 1}/{epochs}") \n    loader.set_postfix(loss=np.mean(losses),acc=np.mean(accuracy)) '

## XGBoost

In [9]:
# GPU settings for XGBoost. They are passed to the classifier below; previously
# this dict was built but never used, so the `gpu_id` choice had no effect.
params = {
    'tree_method': 'gpu_hist',  # histogram-based tree construction on the GPU
    'gpu_id': 0,                # ID of the GPU to use
}

# Binary classifier trained with log-loss.
# NOTE(review): learning_rate=1e-3 is very small for the default number of
# boosting rounds — confirm this is intended.
xgb_clf = xgb.XGBClassifier(learning_rate=1e-3,
                            nthread=4,
                            max_depth=10,
                            eval_metric='logloss',
                            objective="binary:logistic",
                            **params)

In [None]:
from sklearn.preprocessing import LabelEncoder
import sys
# Add the parent directory to the import path; this must happen before the
# `utils` import below.
sys.path.append("..")

from utils.custom_loader import XGBLoader

# Fit the encoder on the two label values 0.0 and 1.0 so float labels can be
# mapped to integer class ids later.
le = LabelEncoder()
labels = [0.,1.]
le.fit_transform(labels)

In [37]:
# Build an XGBLoader for the annotation entry at index 1 of the test set.
test_loader = XGBLoader(testing_data[1])

In [38]:
# Pick the first window of the test recording whose labels are not all zero.
x_t = None
y_t = None
for x,y in test_loader:
    if y.sum() != 0:
        x_t,y_t = x,y
        break
else:
    # Fail with a clear message instead of an AttributeError on `None.numpy()`.
    raise ValueError("test_loader produced no window with a non-zero label")

x_t = x_t.numpy()#.reshape(x.shape[0],-1)
y_t = y_t.numpy()#.reshape(x.shape[0],-1)

# Map the float labels to the integer class ids of the fitted encoder.
y_t = le.transform(y_t)

In [39]:
# Display the encoded labels of the selected window.
y_t

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

## Evaluate trained XGBoost.

1. `Benchmark`: No preprocessing of signal

In [44]:
benchmark_model_file = "../utils/xgb_model.json"
# The bare `os.path.exists(...)` call discarded its result; make the check effective.
if not os.path.exists(benchmark_model_file):
    raise FileNotFoundError(f"Benchmark model file not found: {benchmark_model_file}")

model = xgb.Booster()

# NOTE(review): "depth" does not look like an XGBoost parameter name
# ("max_depth" is the documented one) — confirm before relying on it.
model.set_param({"tree_method":"gpu_hist","depth":10,"nthread":16})
model.load_model(benchmark_model_file)

In [45]:
# Display the shape of the selected window's feature array.
x_t.shape

(22, 5120)

In [49]:
# Run the loaded booster on the selected window and display its raw outputs.
pred = model.predict(xgb.DMatrix(x_t))
pred

array([0.4971228 , 0.48866928, 0.52708787, 0.5058152 , 0.5012345 ,
       0.49603844, 0.48642415, 0.48960558, 0.48289624, 0.48838705,
       0.4964315 , 0.49772727, 0.5038789 , 0.49455923, 0.5288117 ,
       0.51224506, 0.5104085 , 0.48718044, 0.48567632, 0.4839459 ,
       0.4865774 , 0.4858613 ], dtype=float32)

In [47]:
# Threshold the scores at 0.5 to get 0/1 predictions. `np.asarray` keeps the
# cell re-runnable: once `pred` is a list, a bare `pred > 0.5` raises TypeError.
pred = np.uint8(np.asarray(pred) > 0.5).tolist()

In [48]:
# Print the thresholded predictions above the true labels for comparison.
print(pred)
print(y_t.tolist())

[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [24]:
# Collect training windows from every recording, keeping only the windows
# whose labels do not sum to zero.
x_train, y_train = [], []

bar = tqdm(training_data, total=len(training_data), desc="Evaluating")
for i, eeg_annot in enumerate(bar):
    dg = XGBLoader(eeg_annot)
    for x, y in dg:
        if y.sum() != 0:
            x_train.append(x)
            y_train.append(y)



Evaluating:   0%|          | 0/723 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 723/723 [02:44<00:00,  4.38it/s]


In [25]:
# Stack the collected windows and labels along a new leading axis.
x_train_np = np.stack(x_train)
y_train_np = np.stack(y_train)

In [26]:
# Flatten the labels to one dimension.
y_train_np = y_train_np.reshape(-1)

In [27]:
# Display the shape of the flattened label array.
y_train_np.shape

(139436,)

In [28]:
# Collapse all leading axes so each row is one feature vector of the original last-axis length.
x_train_np = x_train_np.reshape(-1,x_train_np.shape[-1])

In [29]:
# Display the shape of the flattened feature matrix.
x_train_np.shape

(139436, 5120)

In [30]:
# Fit the classifier on the flattened training rows.
# NOTE(review): the recorded output shows this run ended with KeyboardInterrupt.
xgb_clf.fit(x_train_np,y_train_np,verbose=True)

KeyboardInterrupt: 