In [None]:
import os
import json
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


# Single seed shared by every RNG used in the notebook (NumPy, Python's
# `random`, TensorFlow) so that shuffles and weight initialisation are repeatable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Kaggle input locations: metadata CSVs plus the directories holding one
# `<id>.parquet` file per EEG recording / spectrogram.
TRAIN_CSV = "/kaggle/input/hms-harmful-brain-activity-classification/train.csv"
TEST_CSV  = "/kaggle/input/hms-harmful-brain-activity-classification/test.csv"
EEG_TRAIN_DIR = "/kaggle/input/hms-harmful-brain-activity-classification/train_eegs"
SPEC_TRAIN_DIR = "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms"
EEG_TEST_DIR = "/kaggle/input/hms-harmful-brain-activity-classification/test_eegs"
SPEC_TEST_DIR = "/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms"

# Signal geometry. NOTE(review): presumably samples per second and window
# lengths in seconds, going by the names -- confirm against the dataset description.
EEG_SAMPLE_RATE = 200
EEG_WINDOW_SEC = 50
SPEC_WINDOW_SEC = 10

# Training hyper-parameters. VAL_SIZE is the fraction of shuffled rows held out
# for validation.
BATCH_SIZE = 32
EPOCHS = 5
VAL_SIZE = 0.15

# Vote-count columns; each row's counts are normalised into the target
# probability vector, and the model emits one output per column.
label_cols = ["seizure_vote", "lpd_vote", "gpd_vote", "lrda_vote", "grda_vote", "other_vote"]

In [None]:
def safe_pad(arr, target_shape):
    """Crop or zero-pad `arr` so that it has exactly `target_shape`.

    Along every axis the leading `min(have, want)` entries are copied; anything
    beyond `target_shape` is dropped and anything missing is filled with zeros.
    Works for arrays of any rank (the 2-D `(rows, cols)` case behaves as before).

    Args:
        arr: Array-like input.
        target_shape: Desired output shape; a tuple with one entry per axis of
            `arr`, or a bare int for 1-D input.

    Returns:
        A new float32 ndarray of shape `target_shape`.

    Raises:
        ValueError: If `arr` and `target_shape` have a different number of axes.
    """
    arr = np.asarray(arr)
    if isinstance(target_shape, (int, np.integer)):
        target_shape = (target_shape,)
    target_shape = tuple(int(dim) for dim in target_shape)

    if arr.ndim != len(target_shape):
        raise ValueError(
            f"safe_pad: array has {arr.ndim} axes but target_shape {target_shape} "
            f"has {len(target_shape)}"
        )

    padded = np.zeros(target_shape, dtype=np.float32)
    # The overlapping region is the same slice in both source and destination.
    overlap = tuple(slice(0, min(have, want)) for have, want in zip(arr.shape, target_shape))
    padded[overlap] = arr[overlap]
    return padded


def extract_eeg_window_fixed(eeg_id, offset_seconds, base_path="train_eegs", target_len=None,
                             sample_rate=200):
    """Load one EEG parquet file and return the samples starting at an offset.

    Missing values are replaced with 0 and the data is cast to float32.

    Args:
        eeg_id: Identifier used to build the file name `<base_path>/<eeg_id>.parquet`.
        offset_seconds: Start of the window, in seconds from the beginning of
            the recording. Negative offsets are clamped to 0.
        base_path: Directory containing the parquet files.
        target_len: Number of rows to return. When given, the window is cropped
            or zero-padded to exactly this many rows via `safe_pad`. When None,
            every row from the offset to the end of the recording is returned.
        sample_rate: Rows per second used to convert `offset_seconds` to a row
            index (defaults to the previously hard-coded 200).

    Returns:
        float32 ndarray of shape `(rows, channels)`.
    """
    path = f"{base_path}/{eeg_id}.parquet"
    eeg = pd.read_parquet(path)
    eeg = eeg.fillna(0).values.astype(np.float32)

    # A negative start would make the slice count from the end of the recording
    # and silently return the wrong (usually empty) window.
    start_idx = max(0, int(offset_seconds * sample_rate))

    if target_len is None:
        return eeg[start_idx:]

    window = eeg[start_idx:start_idx + target_len]
    return safe_pad(window, (target_len, eeg.shape[1]))


def extract_spectrogram_window_fixed(spec_id, offset_seconds, base_path="train_spectrograms", target_len=None):
    """Load one spectrogram parquet file and return the rows starting at an offset.

    Previously `offset_seconds` was accepted but ignored, so every labelled
    window of a spectrogram produced the same leading rows. The offset is now
    applied through the frame's `time` column when one exists; frames without a
    `time` column are returned from their first row, as before.

    Args:
        spec_id: Identifier used to build `<base_path>/<spec_id>.parquet`.
        offset_seconds: Start of the window. Rows whose `time` value is below
            this are skipped. None, NaN, zero or negative offsets keep all rows.
        base_path: Directory containing the parquet files.
        target_len: When given, the result is cropped or zero-padded to exactly
            this many rows via `safe_pad`.

    Returns:
        float32 ndarray of shape `(rows, columns)`; every column of the parquet
        file (including `time`, if present) is kept so the width is unchanged.
    """
    path = f"{base_path}/{spec_id}.parquet"
    spec_df = pd.read_parquet(path)

    # NOTE(review): assumes the `time` column holds seconds since the start of
    # the spectrogram, in the same units as `offset_seconds` -- confirm against
    # the dataset description.
    has_offset = offset_seconds is not None and not pd.isna(offset_seconds) and offset_seconds > 0
    if has_offset and "time" in spec_df.columns:
        spec_df = spec_df[spec_df["time"] >= offset_seconds]

    spec = spec_df.fillna(0).values.astype(np.float32)

    if target_len is not None:
        spec = safe_pad(spec, (target_len, spec.shape[1]))

    return spec


def normalize_votes_to_probs(row, cols):
    """Turn the vote counts in `row[cols]` into a float32 probability vector.

    The counts are divided by their sum. If the sum is exactly zero a uniform
    distribution over `cols` is returned instead.
    """
    counts = np.asarray(row[cols].astype(float))
    denom = counts.sum()
    n_classes = len(cols)

    if denom != 0:
        return (counts / denom).astype(np.float32)

    # No votes at all: fall back to equal mass on every class.
    return np.full(n_classes, 1.0 / n_classes, dtype=np.float32)


def example_generator(meta_df, batch_size, eeg_len, spec_len, eeg_path="train_eegs", spec_path="train_spectrograms"):
    """Endlessly yield shuffled `(inputs, labels)` batches built from `meta_df`.

    Each pass reshuffles the rows, then walks them in chunks of `batch_size`
    (the last chunk of a pass may be smaller). For every row the EEG window and
    the spectrogram are loaded and cropped/padded to `eeg_len` / `spec_len`
    rows, and the vote columns in `label_cols` become the target distribution.

    Args:
        meta_df: DataFrame with `eeg_id`, `eeg_label_offset_seconds`,
            `spectrogram_id`, `spectrogram_label_offset_seconds` and the vote
            columns listed in `label_cols`.
        batch_size: Maximum number of rows per yielded batch; must be >= 1.
        eeg_len: Number of EEG rows per example.
        spec_len: Number of spectrogram rows per example.
        eeg_path: Directory with the EEG parquet files.
        spec_path: Directory with the spectrogram parquet files.

    Yields:
        `({"eeg_input": float32 array, "spec_input": float32 array}, float32 labels)`;
        the spectrogram gets a trailing channel axis of size 1.

    Raises:
        ValueError: On first iteration, if `meta_df` is empty or `batch_size < 1`.
            Without this check the `while True` loop would spin forever without
            ever yielding.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(meta_df) == 0:
        raise ValueError("example_generator received an empty meta_df; nothing to yield.")

    while True:
        meta_df = meta_df.sample(frac=1).reset_index(drop=True)
        for i in range(0, len(meta_df), batch_size):
            batch = meta_df.iloc[i:i+batch_size]
            eeg_batch, spec_batch, y_batch = [], [], []

            for _, row in batch.iterrows():
                eeg = extract_eeg_window_fixed(row["eeg_id"], row["eeg_label_offset_seconds"],
                                               base_path=eeg_path, target_len=eeg_len)
                spec = extract_spectrogram_window_fixed(row["spectrogram_id"], row["spectrogram_label_offset_seconds"],
                                                        base_path=spec_path, target_len=spec_len)
                # Conv2D expects an explicit channel axis.
                spec = np.expand_dims(spec, -1)
                y = normalize_votes_to_probs(row, label_cols)

                eeg_batch.append(eeg)
                spec_batch.append(spec)
                y_batch.append(y)

            yield (
                {
                    "eeg_input": np.array(eeg_batch, dtype=np.float32),
                    "spec_input": np.array(spec_batch, dtype=np.float32),
                },
                np.array(y_batch, dtype=np.float32),
            )

In [None]:
train_df = pd.read_csv(TRAIN_CSV)
if len(train_df) == 0:
    raise RuntimeError("Train CSV empty or not found.")

# The EEG window has a fixed, configured length. Without `target_len` the
# extractor returns "everything after the offset", so the inferred number of
# timesteps would depend on which recording happened to be sampled.
eeg_window_len = EEG_SAMPLE_RATE * EEG_WINDOW_SEC

# Probe up to five random rows until one loads, to learn the channel count and
# the spectrogram shape.
sample_row = None
last_error = None
for _, r in train_df.sample(min(5, len(train_df)), random_state=RANDOM_STATE).iterrows():
    try:
        eeg_vals = extract_eeg_window_fixed(r['eeg_id'], r['eeg_label_offset_seconds'],
                                            base_path=EEG_TRAIN_DIR, target_len=eeg_window_len)
        spec_vals = extract_spectrogram_window_fixed(r['spectrogram_id'], r['spectrogram_label_offset_seconds'],
                                                     base_path=SPEC_TRAIN_DIR)
    except Exception as exc:
        # Keep probing, but do not hide why a sample was rejected.
        last_error = exc
        print(f"Skipping sample eeg_id={r.get('eeg_id')}: {exc!r}")
        continue
    sample_row = r
    break
if sample_row is None:
    raise RuntimeError("Could not read any sample EEG/spectrogram to infer shapes.") from last_error

EEG_TIMESTEPS, EEG_CHANNELS = eeg_vals.shape
SPEC_H, SPEC_W = spec_vals.shape

print("Inferred shapes:")
print("EEG_TIMESTEPS:", EEG_TIMESTEPS, "EEG_CHANNELS:", EEG_CHANNELS, "SPEC_H:", SPEC_H, "SPEC_W:", SPEC_W)

In [None]:
def build_custom_cnn(eeg_timesteps, eeg_channels, spec_h, spec_w, output_dim=6,
                     dropout_rate=0.3, base_filters=32):
    """Assemble and compile the two-branch EEG + spectrogram classifier.

    The EEG branch is a stack of three Conv1D stages (kernel sizes 7, 5, 3) and
    the spectrogram branch a stack of three 3x3 Conv2D stages. In both, filter
    counts go `base_filters`, `2 * base_filters`, `4 * base_filters`, each
    convolution is followed by batch normalisation, the first two stages are
    max-pooled and the last is globally average-pooled, then projected to 128
    units with dropout. The two 128-d vectors are concatenated and passed
    through Dense(256) and Dense(128) blocks (batch norm + dropout) before a
    softmax layer of size `output_dim`.

    Args:
        eeg_timesteps: Rows of the `eeg_input` tensor.
        eeg_channels: Columns of the `eeg_input` tensor.
        spec_h: Height of the `spec_input` tensor.
        spec_w: Width of the `spec_input` tensor (one trailing channel is added).
        output_dim: Number of softmax outputs.
        dropout_rate: Rate used by every Dropout layer.
        base_filters: Filter count of the first convolution in each branch.

    Returns:
        A Keras `Model` compiled with Adam (lr 1e-4) and KL-divergence loss.
    """
    conv_opts = dict(padding="same", activation="relu")
    widths = (base_filters, base_filters * 2, base_filters * 4)

    # --- EEG branch: 1-D convolutions over time ---------------------------
    eeg_input = layers.Input(shape=(eeg_timesteps, eeg_channels), name="eeg_input")
    eeg_feat = eeg_input
    for width, kernel in zip(widths[:2], (7, 5)):
        eeg_feat = layers.Conv1D(width, kernel_size=kernel, **conv_opts)(eeg_feat)
        eeg_feat = layers.BatchNormalization()(eeg_feat)
        eeg_feat = layers.MaxPool1D(pool_size=4)(eeg_feat)
    eeg_feat = layers.Conv1D(widths[2], kernel_size=3, **conv_opts)(eeg_feat)
    eeg_feat = layers.BatchNormalization()(eeg_feat)
    eeg_feat = layers.GlobalAveragePooling1D()(eeg_feat)
    eeg_feat = layers.Dense(128, activation="relu")(eeg_feat)
    eeg_feat = layers.Dropout(dropout_rate)(eeg_feat)

    # --- Spectrogram branch: 2-D convolutions over the image --------------
    spec_input = layers.Input(shape=(spec_h, spec_w, 1), name="spec_input")
    spec_feat = spec_input
    for width in widths[:2]:
        spec_feat = layers.Conv2D(width, kernel_size=(3, 3), **conv_opts)(spec_feat)
        spec_feat = layers.BatchNormalization()(spec_feat)
        spec_feat = layers.MaxPool2D(pool_size=(2, 2))(spec_feat)
    spec_feat = layers.Conv2D(widths[2], kernel_size=(3, 3), **conv_opts)(spec_feat)
    spec_feat = layers.BatchNormalization()(spec_feat)
    spec_feat = layers.GlobalAveragePooling2D()(spec_feat)
    spec_feat = layers.Dense(128, activation="relu")(spec_feat)
    spec_feat = layers.Dropout(dropout_rate)(spec_feat)

    # --- Fusion head ------------------------------------------------------
    fused = layers.Concatenate()([eeg_feat, spec_feat])
    for units in (256, 128):
        fused = layers.Dense(units, activation="relu")(fused)
        fused = layers.BatchNormalization()(fused)
        fused = layers.Dropout(dropout_rate)(fused)

    probs = layers.Dense(output_dim, activation="softmax", name="output")(fused)

    model = Model(inputs=[eeg_input, spec_input], outputs=probs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss=tf.keras.losses.KLDivergence(),
        metrics=[
            tf.keras.metrics.KLDivergence(name="kl_div"),
            tf.keras.metrics.MeanSquaredError(name="mse"),
            tf.keras.metrics.CategoricalCrossentropy(name="cce"),
            "accuracy",
        ],
    )
    return model

# Instantiate the network with the shapes inferred from the sample files and
# one output per label column, then print its layer summary.
model = build_custom_cnn(
    eeg_timesteps=EEG_TIMESTEPS,
    eeg_channels=EEG_CHANNELS,
    spec_h=SPEC_H,
    spec_w=SPEC_W,
    output_dim=len(label_cols),
)
model.summary()

In [None]:
# Shuffle once with a fixed seed, then carve the first VAL_SIZE fraction of
# rows off as the validation set and keep the remainder for training.
train_df = train_df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
n = len(train_df)
val_n = int(n * VAL_SIZE)
val_df, train_df_part = (
    train_df.iloc[:val_n].reset_index(drop=True),
    train_df.iloc[val_n:].reset_index(drop=True),
)

print(f"Train rows: {len(train_df_part)}, Val rows: {len(val_df)}")

# Element spec of one generator batch: a dict of the two model inputs plus the
# label matrix; the leading (batch) dimension is left unspecified.
_eeg_spec = tf.TensorSpec(shape=(None, EEG_TIMESTEPS, EEG_CHANNELS), dtype=tf.float32)
_spec_spec = tf.TensorSpec(shape=(None, SPEC_H, SPEC_W, 1), dtype=tf.float32)
_label_spec = tf.TensorSpec(shape=(None, len(label_cols)), dtype=tf.float32)
output_signature = ({"eeg_input": _eeg_spec, "spec_input": _spec_spec}, _label_spec)


def _dataset_from_frame(frame):
    """Wrap `example_generator` over `frame` in a tf.data.Dataset."""
    def _batches():
        return example_generator(frame, BATCH_SIZE, EEG_TIMESTEPS, SPEC_H,
                                 eeg_path=EEG_TRAIN_DIR, spec_path=SPEC_TRAIN_DIR)

    return tf.data.Dataset.from_generator(_batches, output_signature=output_signature)


train_gen = _dataset_from_frame(train_df_part)
val_gen = _dataset_from_frame(val_df)

# Let tf.data overlap batch preparation with model execution.
train_ds = train_gen.prefetch(tf.data.AUTOTUNE)
val_ds = val_gen.prefetch(tf.data.AUTOTUNE)

In [None]:
# Callbacks, all keyed on validation loss: keep the best model on disk, stop
# when it stalls for 5 epochs (restoring the best weights), and halve the
# learning rate after 2 stalled epochs.
ckpt = ModelCheckpoint(
    "cnn_model_best.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)
early = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
rlr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-8,
    verbose=1,
)

# The generators never terminate, so Keras must be told how many batches make
# up one pass: ceil(rows / BATCH_SIZE), and never fewer than one.
steps_per_epoch = max(1, -(-len(train_df_part) // BATCH_SIZE))
validation_steps = max(1, -(-len(val_df) // BATCH_SIZE))

print("steps_per_epoch:", steps_per_epoch, "validation_steps:", validation_steps)

fit_options = {
    "epochs": EPOCHS,
    "steps_per_epoch": steps_per_epoch,
    "validation_data": val_ds,
    "validation_steps": validation_steps,
    "callbacks": [ckpt, early, rlr],
    "verbose": 1,
}
history = model.fit(train_ds, **fit_options)

In [None]:
# 1) Full model (architecture + weights + optimizer state) in one HDF5 file.
model.save("cnn_model.h5")
print("Saved cnn_model.h5")

# 2) Architecture as JSON and weights as a separate file, tied together by a
#    small metadata bundle that also records the input shapes and label order.
model_json = model.to_json()

weights_file = "cnn_model.weights.h5"
model.save_weights(weights_file)

bundle = dict(
    type="KerasModelBundle",
    model_json=model_json,
    weights_path=weights_file,
    eeg_timesteps=EEG_TIMESTEPS,
    eeg_channels=EEG_CHANNELS,
    spec_h=SPEC_H,
    spec_w=SPEC_W,
    label_cols=label_cols,
)

joblib.dump(bundle, "model.pkl")
print("Saved portable bundle to model.pkl")


In [None]:
# TEST_CSV points at a .csv file, so it has to be parsed with read_csv;
# read_parquet cannot read it.
test_df = pd.read_csv(TEST_CSV)
test_df = test_df.dropna(how="all")
test_df = test_df.dropna(subset=["eeg_id", "spectrogram_id"])

# Fallback used when a test example cannot be loaded or scored: equal
# probability for every class.
uniform_pred = np.full((len(label_cols),), 1.0 / len(label_cols), dtype="float32")

preds, ids = [], []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Preprocessing test"):
    # Resolve the identifier before the try block so that a missing "id" column
    # can never abort the row after its prediction has already been stored
    # (which used to append two predictions for a single row).
    row_id = row.get("id", row.get("eeg_id", -1))

    try:
        eeg_arr = extract_eeg_window_fixed(row['eeg_id'], 0.0, base_path=EEG_TEST_DIR, target_len=EEG_TIMESTEPS)
        spec_arr = extract_spectrogram_window_fixed(row['spectrogram_id'], 0.0, base_path=SPEC_TEST_DIR, target_len=SPEC_H)

        # Enforce the exact input shape the model was built with (the
        # extractors only fix the number of rows, not the number of columns).
        eeg_arr = safe_pad(eeg_arr, (EEG_TIMESTEPS, EEG_CHANNELS))
        spec_arr = safe_pad(spec_arr, (SPEC_H, SPEC_W))
        spec_arr = np.expand_dims(spec_arr, -1)

        eeg_arr = np.nan_to_num(eeg_arr, nan=0.0, posinf=0.0, neginf=0.0).astype("float32")
        spec_arr = np.nan_to_num(spec_arr, nan=0.0, posinf=0.0, neginf=0.0).astype("float32")

        pred = model.predict({"eeg_input": np.expand_dims(eeg_arr, 0),
                              "spec_input": np.expand_dims(spec_arr, 0)}, verbose=0)[0]
    except Exception as exc:
        # Best effort: keep going, but say which row failed and why.
        print(f"Prediction failed for id={row_id}: {exc!r}; using uniform fallback.")
        pred = uniform_pred.copy()

    # Exactly one prediction and one id per test row, in test_df order.
    preds.append(pred)
    ids.append(row_id)

In [None]:
# Stack the per-row probability vectors into one (n_rows, n_classes) matrix
# and display it.
preds = np.concatenate([np.atleast_2d(p) for p in preds], axis=0)
preds

In [None]:
# One column per label, one row per prediction; shown as the cell output.
submission = pd.DataFrame(data=preds, columns=label_cols)
submission