# Install Extra Packages Offline

1) tslearn

In [1]:
!pip install '/kaggle/input/hmsofflinepackages/tslearn-0.6.3-py3-none-any.whl'

Processing /kaggle/input/hmsofflinepackages/tslearn-0.6.3-py3-none-any.whl
Installing collected packages: tslearn
Successfully installed tslearn-0.6.3


In [2]:
import os
import zipfile

def zip_folder(folder_path, output_zip):
    """
    Zip the contents of an entire folder (with that folder included
    in the archive). Empty directories are included in the archive as well.
    """
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        lenDirPath = len(folder_path)
        for root, _, files in os.walk(folder_path):
            # Include all subdirectories, including empty ones.
            for dirName in os.listdir(root):
                dirPath = os.path.join(root, dirName)
                if os.path.isdir(dirPath):
                    zipf.write(dirPath, os.path.relpath(dirPath, folder_path))
            # Add files
            for file in files:
                filePath = os.path.join(root, file)
                zipf.write(filePath, os.path.relpath(filePath, folder_path))

2) signatory

In [3]:
zip_folder("/kaggle/input/hmsofflinepackages/signatory-1.2.6.1.9.0", "./signatory.zip")

!pip install "./signatory.zip"

Processing ./signatory.zip
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuilding wheels for collected packages: signatory
  Building wheel for signatory (setup.py) ... [?25l- \ | / - \ | / done
[?25h  Created wheel for signatory: filename=signatory-1.2.6.1.9.0-cp310-cp310-linux_x86_64.whl size=237896 sha256=0aebdaefb1e8469874d14ad8694485d0843fc825c3bd357528407f7e7d2da22f
  Stored in directory: /tmp/pip-ephem-wheel-cache-a8gb2wlm/wheels/54/c2/57/159645f76fece22063dc3e9b8dc35b9c7a2d53632ae423ab4b
Successfully built signatory
Installing collected packages: signatory
Successfully installed signatory-1.2.6.1.9.0


In [4]:
# standard packages
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import pandas as pd
import torch

In [5]:
# from offline package installations
from tslearn.preprocessing import TimeSeriesScalerMinMax
import signatory

# Prepare Train Set

Load signature features and prepare for training NN

In [6]:
def TrainPreProcessing(DataPortion, Validation_Split):
    
    # Load all numpy arrays 
    EEG_Sig_Total = np.load('/kaggle/input/logsig3scaled/eeg_data_scaled.npy')   
    NumVotes_Total = np.load('/kaggle/input/logsig3scaled/num_votes.npy')
    Targets_Total = np.load('/kaggle/input/logsig3scaled/targets.npy')

    n_features = 2470
    
    # Combine columns into one array: ¦2470 Features¦6 Targets¦NumVotes¦ 
    TotalDataset = np.hstack((EEG_Sig_Total, Targets_Total, NumVotes_Total))
    
    # We want to drop all rows with nans in them
    nan_rows = np.isnan(TotalDataset).any(axis=1)
    # Drop rows with NaN values
    TotalDataset = TotalDataset[~nan_rows]
    
    TotalN = TotalDataset.shape[0]

    # shuffle rows
    np.random.seed(69)
    p = np.random.permutation(TotalN)

    TotalDataset = TotalDataset[p]

    # Work with only a small portion of the dataset for experimentation 
    N = round(DataPortion*TotalN)

    TruncatedDataset = TotalDataset[:N]
    
    # set proportion of train data for validation set
    Validation_N = round(Validation_Split*N)

    ValidationDataset = TruncatedDataset[:Validation_N]
    TrainDataset = TruncatedDataset[Validation_N:]

    X_val, y_val = ValidationDataset[:,:n_features], ValidationDataset[:,n_features:n_features+6]
    X_train, y_train = TrainDataset[:,:n_features], TrainDataset[:,n_features:n_features+6]
    N_votes_train = TrainDataset[:,-1]

    return X_train, y_train, X_val, y_val, N_votes_train


In [7]:
# Do not need a test set from the train data since we are submitting
X_train, y_train, X_test, y_test, N_votes_train = TrainPreProcessing(DataPortion=1, Validation_Split=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(106789, 2470) (106789, 6)
(0, 2470) (0, 6)


# Build MLP Model

In [8]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import kullback_leibler_divergence
from tensorflow.keras.callbacks import EarlyStopping


# Define the number of input features and output classes
num_features = 2470

num_classes = 6

2024-04-03 01:29:45.466634: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 01:29:45.466725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 01:29:45.629204: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# Set random seeds for reproducibility
seed_value = 69

# 1. Set the seed for Python's built-in random number generator
random.seed(seed_value)
# 2. Set the seed for NumPy
np.random.seed(seed_value)
# 3. Set the seed for TensorFlow
tf.random.set_seed(seed_value)

# Performance on total train dataset w/ 10% portioned for test: 0.4381



# Random attempt
def create_model():
    inputs = tf.keras.Input(shape=(num_features,))
    
    x = layers.Dense(800, activation='sigmoid')(inputs) # 800 is good
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)
    
    x = layers.Dense(500, activation='relu')(x) # 500 is good
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)  
    
    x = layers.Dense(500, activation='relu')(x) # 500 is good
    x = layers.Dropout(0.2)(x)  
    
    x = layers.Dense(350, activation='relu')(x) # 350 is good   
    x = layers.Dropout(0.1)(x)

    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = Model(inputs, outputs)
    return model

# Instantiate the model
model = create_model()

# Compile the model with KL divergence loss
model.compile(optimizer='adam',
              loss=kullback_leibler_divergence)

# Print the model summary
model.summary()

# Load MLP Weights

In [10]:
model.load_weights('/kaggle/input/praneesh-mlp-model-weights/TrainedModel.weights.h5')


  trackable.load_own_variables(weights_store.get(inner_path))


# Process Test Set

In [11]:
test = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
print('Test shape:',test.shape)

Test shape: (1, 3)


In [12]:
EEG_SUB_PATH_TEMPL = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/'
SP_SUB_PATH_TEMPL = '/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/'

def get_sub_eeg_sp_data(train_row):
    """Gets EEG and Spectogram data from a specific row in the dataset"""
    
    eeg_id = train_row.eeg_id
    sp_id = train_row.spectrogram_id
    
    eeg_parquet = pd.read_parquet(f'{EEG_SUB_PATH_TEMPL}{eeg_id}.parquet')
    sp = pd.read_parquet(f'{SP_SUB_PATH_TEMPL}{sp_id}.parquet')
    
    rows = len(eeg_parquet)
    eeg_offset = (rows-10_000)//2
    
    
    # get middle 50 seconds of eeg data
    #eeg_offset = int(train_row.eeg_label_offset_seconds + 20) #only 10 central seconds from 50 secs were labeled, which should be seconds 20-30 in the sample
    eeg_data = eeg_parquet.iloc[eeg_offset:eeg_offset + 10_000]
    
    
    # sp_offset = int(train_row.spectrogram_label_offset_seconds )
    
    # get spectrogram data
    # sp = sp_parquet.loc[(sp_parquet.time>=sp_offset)&(sp_parquet.time<sp_offset+SP_WIN)]
    sp = sp.loc[:, sp.columns != 'time']
    sp = {
        "LL": sp.filter(regex='^LL', axis=1),
        "RL": sp.filter(regex='^RL', axis=1),
        "RP": sp.filter(regex='^RP', axis=1),
        "LP": sp.filter(regex='^LP', axis=1)}
    
    # calculate eeg data
    # print(eeg_data.keys()) # Has keys Index(['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz',
                            # 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']
    # assert 0 == 1
    
    CHAINS = {
    'LL' : [("Fp1","F7"),("F7","T3"),("T3","T5"),("T5","O1")],
    'RL' : [("Fp2","F8"),("F8","T4"),("T4","T6"),("T6","O2")],
    'LP' : [("Fp1","F3"),("F3","C3"),("C3","P3"),("P3","O1")],
    'RP' : [("Fp2","F4"),("F4","C4"),("C4","P4"),("P4","O2")],
    'other' : [("Fz","Cz"), ("Cz", "Pz"), ("EKG")]
}
    
    eeg = pd.DataFrame({})
    for chain in CHAINS.keys():
        for s_i, signals in enumerate(CHAINS[chain]):
            if len(signals) == 2:
                diff=eeg_data[signals[0]]-eeg_data[signals[1]] # Subtracts relevant fields as in the image above
                diff.ffill(inplace = True) # forward fills in the casse of nan values
                eeg[f"{chain}: {signals[0]} - {signals[1]}"] = diff
            
            elif len(signals) == 1:
                sig=eeg_data[signals[0]]
                sig.ffill(inplace = True) 
                eeg[f"{chain}: {signals[0]}"] = sig
                
                
    
    return eeg, sp   

In [13]:
def preprocess(pre_eeg):

    tot_len = 53400
    n_chunks = 10
    chunk = int(tot_len // n_chunks)

    for i in range(n_chunks):
        if i == n_chunks-1:
            eeg_data = pre_eeg[i*chunk:]
        else:
            eeg_data = pre_eeg[i*chunk:(i+1)*chunk]
            
        print(eeg_data.shape)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(device)

        ts_scaler = TimeSeriesScalerMinMax()
        eeg_data = ts_scaler.fit_transform(eeg_data)
        print(np.nanmax(eeg_data), np.nanmin(eeg_data))

        eeg_data_cuda = torch.from_numpy(eeg_data).to(device)
        exp = torch.linspace(0, 1, eeg_data_cuda.shape[1], device=device).unsqueeze(0).unsqueeze(-1).expand_as(eeg_data_cuda[:, :, 0].unsqueeze(-1))
        eeg_data_cuda = torch.cat([eeg_data_cuda, exp], dim=2)
        print(eeg_data_cuda.shape)

        sig = signatory.logsignature(eeg_data_cuda, 3)
        print(sig.shape)
        eeg_data = sig.cpu().numpy()

        # eeg_sig = np.concatenate(eeg_data, axis=-1)
        return eeg_data

In [14]:
eeg_arr = []
for i in tqdm(range(len(test))):
        exp_row = test.iloc[i]
        eeg_data, sp_dict = get_sub_eeg_sp_data(exp_row)
        eeg_arr.append(eeg_data.to_numpy())

eeg_arr = np.array(eeg_arr)

eeg_arr

100%|██████████| 1/1 [00:00<00:00,  3.55it/s]


array([[[ 26.11      ,   6.09      ,   2.83      , ..., -12.41      ,
          31.380001  ,  24.289999  ],
        [ 65.39001   , -47.380005  ,  -6.5200005 , ..., -12.78      ,
          35.34      ,  23.3       ],
        [  8.649998  ,  -3.289999  ,   0.52000046, ..., -25.64      ,
          26.83      ,  21.        ],
        ...,
        [-43.39      ,  18.630001  ,  28.78      , ..., -52.12      ,
         -16.39      ,  21.629997  ],
        [-44.690002  ,  23.130001  ,  25.96      , ..., -50.050003  ,
         -14.73      ,  20.920002  ],
        [-37.53      ,  27.5       ,  22.44      , ..., -40.63      ,
          -3.75      ,  23.39      ]]], dtype=float32)

In [15]:
sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
prep_eeg_sub = preprocess(eeg_arr)


(1, 10000, 18)
cuda
1.0 0.0
torch.Size([1, 10000, 19])
torch.Size([1, 2470])


# Predict from processed test set

In [16]:
sub_pred = model.predict(prep_eeg_sub)

print("This is the sub pred sum:",sub_pred.sum())
print(sub_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1000ms/step
This is the sub pred sum: 1.0000001
[[5.4647678e-01 1.5370744e-01 1.0347281e-09 1.8320108e-03 3.5487128e-06
  2.9798031e-01]]


I0000 00:00:1712107798.977915     140 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [17]:
# CREATE SUBMISSION.CSV
from IPython.display import display

sub = pd.DataFrame({'eeg_id':test.eeg_id.values})

TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
sub[TARGETS] = sub_pred
sub.to_csv('submission.csv',index=False)
print('Submission shape',sub.shape)
display( sub.head() )

# SANITY CHECK TO CONFIRM PREDICTIONS SUM TO ONE
print('Sub row 0 sums to:',sub.iloc[0,-6:].sum())

Submission shape (1, 7)


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.546477,0.153707,1.034728e-09,0.001832,4e-06,0.29798


Sub row 0 sums to: 1.0000000951228645
