In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Raw recording inspected in this cell (path segments S2\2).
# The raw-string prefix keeps the Windows backslashes literal.
file_path = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Raw\MI\train\S2\2\EEGdata.csv"

# Fail early, with the offending path in the message, when the file is absent.
file_is_present = os.path.exists(file_path)
if not file_is_present:
    raise FileNotFoundError(f"File not found: {file_path}")

# Read the whole CSV into a DataFrame.
eeg_df = pd.read_csv(file_path)

# Quick look at the table: leading rows, then the column names.
print("First 5 rows of the data:")
print(eeg_df.head())
print("\nColumns in data:", eeg_df.columns.tolist())

# Channels inspected for motor imagery (MI); adjust to the dataset at hand.
mi_channels = ['C3', 'CZ', 'C4']

# Abort on the first requested channel that the file does not provide.
for ch in mi_channels:
    if ch in eeg_df.columns:
        continue
    raise ValueError(f"Channel {ch} not found in the data columns!")

# Overlay the first 1000 samples of every MI channel
# (1000 samples correspond to 4 s at the 250 Hz rate stated in the axis label).
plt.figure(figsize=(12, 6))
for ch in mi_channels:
    plt.plot(eeg_df[ch].iloc[:1000], label=ch)

plt.title('Raw EEG Signals (MI Channels)')
plt.xlabel('Sample number (250 Hz)')
plt.ylabel('EEG Amplitude (µV)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Raw recording inspected in this cell (path segments S1\1).
# The raw-string prefix keeps the Windows backslashes literal.
file_path = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Raw\MI\train\S1\1\EEGdata.csv"

# Fail early, with the offending path in the message, when the file is absent.
file_is_present = os.path.exists(file_path)
if not file_is_present:
    raise FileNotFoundError(f"File not found: {file_path}")

# Read the whole CSV into a DataFrame (replaces any eeg_df loaded earlier).
eeg_df = pd.read_csv(file_path)

# Quick look at the table: leading rows, then the column names.
print("First 5 rows of the data:")
print(eeg_df.head())
print("\nColumns in data:", eeg_df.columns.tolist())

# Channels inspected for motor imagery (MI); adjust to the dataset at hand.
mi_channels = ['C3', 'CZ', 'C4']

# Abort on the first requested channel that the file does not provide.
for ch in mi_channels:
    if ch in eeg_df.columns:
        continue
    raise ValueError(f"Channel {ch} not found in the data columns!")

# Overlay the first 1000 samples of every MI channel
# (1000 samples correspond to 4 s at the 250 Hz rate stated in the axis label).
plt.figure(figsize=(12, 6))
for ch in mi_channels:
    plt.plot(eeg_df[ch].iloc[:1000], label=ch)

plt.title('Raw EEG Signals (MI Channels)')
plt.xlabel('Sample number (250 Hz)')
plt.ylabel('EEG Amplitude (µV)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Motor-imagery channels used by the cells that follow.
mi_channels = ['C3', 'CZ', 'C4']

# Column subset of the loaded recording; every other column is left out.
eeg_mi_df = eeg_df.loc[:, mi_channels]

# Sanity check: dimensions first, then the leading rows.
print(eeg_mi_df.shape)
print(eeg_mi_df.head())


In [None]:
# Step 3: Re-referencing EEG signals (average reference over the MI channels)

# Work on an independent copy so the columns added below never touch eeg_df.
mi_channels = ['C3', 'CZ', 'C4']
eeg_mi_df = eeg_df[mi_channels].copy()

# Row-wise mean of the selected MI channels, stored alongside them.
eeg_mi_df['Average'] = eeg_mi_df.mean(axis=1)

# One "<channel>_ref" column per channel: the channel minus that row-wise mean.
ref_columns = [ch + '_ref' for ch in mi_channels]
for ch, ref_name in zip(mi_channels, ref_columns):
    eeg_mi_df[ref_name] = eeg_mi_df[ch] - eeg_mi_df['Average']

# Only the re-referenced columns are carried into the following steps.
eeg_mi_ref = eeg_mi_df[ref_columns]

# Show the leading rows of the result.
print("First 5 rows of re-referenced EEG data:")
print(eeg_mi_ref.head())


In [None]:
import matplotlib.pyplot as plt

# Overlay the first 1000 samples of each re-referenced channel; the legend
# shows the channel name without its "_ref" suffix.
plt.figure(figsize=(12, 6))
for ch in (name + '_ref' for name in mi_channels):
    plt.plot(eeg_mi_ref[ch].iloc[:1000], label=ch.replace('_ref', ''))
plt.title('EEG Signals after Average Re-referencing (MI Channels)')
plt.xlabel('Sample number (250 Hz)')
plt.ylabel('EEG Amplitude (µV)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
from scipy.signal import butter, filtfilt
import matplotlib.pyplot as plt

# --- Band-pass configuration read by the filtering cells ---
fs = 250                 # sampling rate, in Hz
lowcut, highcut = 8, 30  # lower / upper pass-band edge, in Hz
order = 4                # order handed to the Butterworth design


# --- Butterworth bandpass filter function ---
def butter_bandpass(lowcut, highcut, fs, order=4):
    """Design a Butterworth band-pass filter.

    Both cutoffs are divided by the Nyquist frequency (half of ``fs``) and
    handed to ``scipy.signal.butter`` with ``btype='band'``.

    Args:
        lowcut: Lower pass-band edge, in the same unit as ``fs``.
        highcut: Upper pass-band edge, in the same unit as ``fs``.
        fs: Sampling frequency.
        order: Order passed to ``butter``.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``butter``.
    """
    nyquist = fs / 2.0
    normalized_band = [edge / nyquist for edge in (lowcut, highcut)]
    return butter(order, normalized_band, btype='band')


def bandpass_filter(data, lowcut, highcut, fs, order=4):
    """Band-pass ``data`` along axis 0 with ``scipy.signal.filtfilt``.

    The coefficients come from ``butter_bandpass(lowcut, highcut, fs, order)``.

    Args:
        data: Array-like signal; filtering runs along its first axis.
        lowcut: Lower pass-band edge.
        highcut: Upper pass-band edge.
        fs: Sampling frequency.
        order: Order forwarded to the filter design.

    Returns:
        The filtered array returned by ``filtfilt``.
    """
    coefficients = butter_bandpass(lowcut, highcut, fs, order)
    return filtfilt(*coefficients, data, axis=0)


# --- Band-pass the re-referenced MI channels ---
filtered_eeg = bandpass_filter(eeg_mi_ref.to_numpy(), lowcut, highcut, fs, order)

# Wrap the filtered array in a DataFrame that reuses the "_ref" column names.
filtered_eeg_df = pd.DataFrame(filtered_eeg, columns=eeg_mi_ref.columns)

# --- Plot the first 1000 filtered samples of every channel ---
plt.figure(figsize=(12, 6))
for ch, trace in filtered_eeg_df.iloc[:1000].items():
    plt.plot(trace, label=ch.replace('_ref', ''))
plt.title('Bandpass Filtered EEG Signals (8–30 Hz, MI Channels)')
plt.xlabel('Sample number (250 Hz)')
plt.ylabel('Filtered EEG Amplitude (a.u.)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Reference plot: the re-referenced signal before any band-pass filtering
# (first 1000 samples per channel, legend without the "_ref" suffix).
plt.figure(figsize=(12, 4))
for ch, trace in eeg_mi_ref.iloc[:1000].items():
    plt.plot(trace, label=ch.replace('_ref', ''))
plt.title("Raw (Re-referenced) EEG, before filtering")
plt.legend()
plt.show()


In [None]:
# Subtract each channel's own mean before filtering.
channel_means = eeg_mi_ref.mean()
eeg_mi_ref_demeaned = eeg_mi_ref.sub(channel_means, axis='columns')

# Band-pass the demeaned signal; this replaces the earlier filtered_eeg / filtered_eeg_df.
filtered_eeg = bandpass_filter(eeg_mi_ref_demeaned.to_numpy(), lowcut, highcut, fs, order)
filtered_eeg_df = pd.DataFrame(filtered_eeg, columns=eeg_mi_ref.columns)

# Plot the first 1000 filtered samples of every channel.
plt.figure(figsize=(12, 6))
for ch, trace in filtered_eeg_df.iloc[:1000].items():
    plt.plot(trace, label=ch.replace('_ref', ''))
plt.title('Bandpass Filtered EEG Signals (after mean removal, 8–30 Hz, MI Channels)')
plt.xlabel('Sample number (250 Hz)')
plt.ylabel('Filtered EEG Amplitude (a.u.)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Artifact Removal by Amplitude Threshold
import numpy as np
import matplotlib.pyplot as plt

# Absolute-amplitude limit above which a sample counts as an artifact
# (tune it, e.g. 75 or 150, to the range of the filtered data).
threshold = 100

# A row is flagged as soon as ANY channel exceeds the limit in magnitude.
artifact_mask = filtered_eeg_df.abs().gt(threshold).any(axis=1)

# Report how many rows were flagged.
print(f"Number of artifact samples: {artifact_mask.sum()} out of {len(artifact_mask)}")

# Option 1: drop the flagged rows (the original row labels are kept).
filtered_eeg_clean = filtered_eeg_df.loc[~artifact_mask]

# Option 2: keep every row and use artifact_mask later to reject whole trials.

# --- C3 before vs. after dropping flagged rows (first 1000 rows of each) ---
plt.figure(figsize=(12, 5))
c3_before = filtered_eeg_df['C3_ref'].iloc[:1000]
c3_after = filtered_eeg_clean['C3_ref'].iloc[:1000]
plt.plot(c3_before, label='Original C3', alpha=0.6)
plt.plot(c3_after, label='Cleaned C3', alpha=0.9)
plt.legend()
plt.title("C3 Before and After Artifact Rejection")
plt.show()


In [None]:
import numpy as np

# Cut `filtered_eeg_clean` (filtered, re-referenced, artifact rows removed)
# into consecutive fixed-length epochs.
#
# NOTE(review): `filtered_eeg_clean` is built by dropping flagged rows, so once
# any row has been removed these fixed-length slices no longer start at the
# original trial boundaries. Confirm this is intended, or epoch the full-length
# signal and reject whole trials instead.
n_samples = filtered_eeg_clean.shape[0]
samples_per_trial = 2250  # MI task
n_trials = n_samples // samples_per_trial

print(f"Total samples: {n_samples}, Trials detected: {n_trials}")

# Create epochs: a list of DataFrames, one per complete trial
epochs = []
for i in range(n_trials):
    start_idx = i * samples_per_trial
    end_idx = start_idx + samples_per_trial
    epoch = filtered_eeg_clean.iloc[start_idx:end_idx].reset_index(drop=True)
    epochs.append(epoch)

print(f"Number of epochs extracted: {len(epochs)}")
# Guard the preview: with fewer than `samples_per_trial` rows there is no
# epoch to inspect, and indexing epochs[0] would raise IndexError.
if epochs:
    print("Shape of one epoch:", epochs[0].shape)
else:
    print("No complete epoch available (fewer rows than samples_per_trial).")


In [None]:
# Per-sample artifact flags on the full-length filtered signal (True = artifact sample)
artifact_mask = (np.abs(filtered_eeg_df) > threshold).any(axis=1)

# Trial bookkeeping has to follow the signal that is sliced below.
# `n_samples` / `n_trials` from the epoching cell describe the shortened
# `filtered_eeg_clean`; using them here would skip complete trials at the end
# of `filtered_eeg_df` whenever artifact rows had been dropped.
n_samples_full = len(filtered_eeg_df)
n_trials_full = n_samples_full // samples_per_trial  # complete trials only

# A trial is 'good' only when none of its samples is flagged
trial_good = []
for i in range(n_trials_full):
    start_idx = i * samples_per_trial
    end_idx = start_idx + samples_per_trial
    # .iloc makes the positional slicing explicit
    trial_good.append(not artifact_mask.iloc[start_idx:end_idx].any())

# Only keep good epochs (sliced from the unshortened signal, so boundaries stay aligned)
epochs = []
for i, is_good in enumerate(trial_good):
    if is_good:
        start_idx = i * samples_per_trial
        end_idx = start_idx + samples_per_trial
        epoch = filtered_eeg_df.iloc[start_idx:end_idx].reset_index(drop=True)
        epochs.append(epoch)

print(f"Number of artifact-free epochs: {len(epochs)}")


In [None]:
# Keep an epoch when fewer than this fraction of its samples are flagged
max_artifact_fraction = 0.05  # 5% of the samples in a trial

# Count complete trials on the signal that is sliced below. The
# `n_samples` / `n_trials` values from the epoching cell describe the shortened
# `filtered_eeg_clean` and would drop trailing trials of `filtered_eeg_df`.
n_samples_full = len(filtered_eeg_df)
n_trials_full = n_samples_full // samples_per_trial

epochs = []
for i in range(n_trials_full):
    start_idx = i * samples_per_trial
    end_idx = start_idx + samples_per_trial
    # Every slice is a complete trial, so dividing by samples_per_trial gives
    # the flagged fraction; .iloc makes the positional slicing explicit.
    frac_artifact = artifact_mask.iloc[start_idx:end_idx].sum() / samples_per_trial
    if frac_artifact < max_artifact_fraction:
        epoch = filtered_eeg_df.iloc[start_idx:end_idx].reset_index(drop=True)
        epochs.append(epoch)

print(f"Number of 'mostly clean' epochs: {len(epochs)}")


# Feature Extraction

In [None]:
from scipy.signal import welch
import numpy as np

fs = 250  # Hz, as before


def compute_psd_features(epoch_df, fs=250, bands=((8, 13), (13, 30))):
    """Return the mean Welch PSD inside each band, for each channel of an epoch.

    Args:
        epoch_df: DataFrame with one column per channel and one row per sample.
        fs: Sampling frequency in Hz, forwarded to ``scipy.signal.welch``.
        bands: Iterable of ``(low, high)`` frequency pairs. Both edges are
            inclusive, so a frequency bin lying exactly on a shared edge
            (13 Hz with the defaults) contributes to both neighbouring bands.

    Returns:
        A flat list ordered channel-major: for each column of ``epoch_df`` in
        order, one value per entry of ``bands``. A band that contains no
        frequency bin yields ``nan``.
    """
    psd_features = []
    n_rows = len(epoch_df)
    # Welch segment: 2 s worth of samples, shortened to the epoch length when
    # the epoch is shorter (scipy would fall back to that length with a warning).
    nperseg = min(fs * 2, n_rows) if n_rows else fs * 2
    for ch in epoch_df.columns:
        freqs, psd = welch(epoch_df[ch], fs=fs, nperseg=nperseg)

        # Mean (not sum) of the PSD over the bins that fall inside each band
        for (low, high) in bands:
            in_band = np.logical_and(freqs >= low, freqs <= high)
            if in_band.any():
                band_power = np.mean(psd[in_band])
            else:
                # No bin in range: report nan explicitly instead of relying on
                # the empty-slice mean and its RuntimeWarning.
                band_power = np.nan
            psd_features.append(band_power)
    return psd_features


# Feature matrix: one row of band-power features per epoch
# (each row holds n_channels * n_bands values).
per_epoch_features = list(map(compute_psd_features, epochs))
X_psd = np.array(per_epoch_features)
print("Shape of PSD feature matrix:", X_psd.shape)


In [None]:
import os
import numpy as np

# Output folder, relative to the notebook's working directory; created on demand.
processed_dir = 'Data/Processed'
os.makedirs(processed_dir, exist_ok=True)

# Persist the PSD feature matrix as a NumPy .npy file.
features_path = os.path.join(processed_dir, 'X_psd.npy')
np.save(features_path, X_psd)
print("Saved features for ML to Data/Processed/X_psd.npy")


# Save

In [None]:
import pandas as pd
import os

# Writes `filtered_eeg_clean` (built by the artifact-removal cell) to disk.

# ----------- EDIT THESE -----------
save_dir = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed"
# NOTE(review): the loading cells of this notebook read ...\S2\2\... and
# ...\S1\1\..., neither of which matches the "S2_1" in this file name.
# Confirm the name against the recording that is actually loaded.
basename = "S2_1_EEGdata_preprocessed.csv"  # Change this for each file if looping
save_path = os.path.join(save_dir, basename)
# ----------------------------------

# Make sure the target folder exists, then write the CSV without the row index.
os.makedirs(save_dir, exist_ok=True)
filtered_eeg_clean.to_csv(save_path, index=False)
print(f"Preprocessed EEG saved to: {save_path}")


In [None]:
import pandas as pd
import numpy as np
import os

# ----------- SETTINGS (EDIT THESE) -----------
input_csv = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\S2_1_EEGdata_preprocessed.csv"
save_dir = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready"
channels = ['C3_ref', 'CZ_ref', 'C4_ref']
samples_per_trial = 2250
# ---------------------------------------------

# Output folder is created up front.
os.makedirs(save_dir, exist_ok=True)

# 1. Read the preprocessed table
df = pd.read_csv(input_csv)
print(f"Loaded: {input_csv} | Shape: {df.shape}")

# 2. Abort on the first expected channel that is missing
for ch in channels:
    if ch in df.columns:
        continue
    raise ValueError(f"Channel {ch} not found! Available: {df.columns.tolist()}")

# 3. Number of complete trials, and the rows left over after the last one
n_samples = len(df)
n_trials, leftover_samples = divmod(n_samples, samples_per_trial)
usable_samples = n_samples - leftover_samples
if leftover_samples:
    print(f"Warning: {leftover_samples} samples discarded (incomplete trial)")

# 4. Keep only the rows that belong to complete trials
df_epochs = df.loc[:, channels].iloc[:usable_samples]

# 5. (rows, channels) -> (n_trials, samples_per_trial, n_channels),
#    then swap the last two axes -> (n_trials, n_channels, samples_per_trial)
trial_major = df_epochs.to_numpy().reshape(n_trials, samples_per_trial, len(channels))
epochs = np.swapaxes(trial_major, 1, 2)
print(f"Epochs shape (n_trials, n_channels, samples): {epochs.shape}")

# 6. Store the array next to the other DL-ready files
input_stem = os.path.splitext(os.path.basename(input_csv))[0]
base_name = input_stem + "_DLready.npy"
output_npy = os.path.join(save_dir, base_name)
np.save(output_npy, epochs)
print(f"Saved array: {output_npy}")

# 7. (Optional) One index row per trial, numbered from 1
subject = "S2"      # Edit if needed
session = 1         # Edit if needed
trial_index = pd.DataFrame({
    "subject": [subject] * n_trials,
    "session": [session] * n_trials,
    "trial": np.arange(1, n_trials + 1)
})
meta_name = input_stem + "_index.csv"
meta_path = os.path.join(save_dir, meta_name)
trial_index.to_csv(meta_path, index=False)
print(f"Saved trial index: {meta_path}")


# Data Engineering Pipeline

In [4]:

import pandas as pd
import numpy as np
from scipy.signal import butter, filtfilt
import os


def load_eeg_csv(file_path, channels):
    """Read an EEG CSV file and return only the requested channel columns.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        channels: List of column names to keep, in the order to return them.

    Returns:
        DataFrame restricted to ``channels``.

    Raises:
        ValueError: Naming the first entry of ``channels`` that is not a
            column of the file.
    """
    table = pd.read_csv(file_path)
    available = table.columns
    for name in channels:
        if name in available:
            continue
        raise ValueError(f"Channel {name} not found in data columns!")
    return table[channels]


def rereference_average(df):
    """Subtract the row-wise mean of all columns from every column.

    Args:
        df: DataFrame with one column per channel.

    Returns:
        A new DataFrame of the same shape whose columns are named
        ``"<original>_ref"``; ``df`` itself is left unchanged.
    """
    row_mean = df.mean(axis=1)
    centered = df.sub(row_mean, axis=0)
    centered.columns = ["{}_ref".format(col) for col in df.columns]
    return centered


def bandpass_filter(data, lowcut=8, highcut=30, fs=250, order=4):
    """Band-pass ``data`` along axis 0 with a Butterworth filter and ``filtfilt``.

    Args:
        data: Array-like signal; filtering runs along its first axis.
        lowcut: Lower pass-band edge, in the same unit as ``fs``.
        highcut: Upper pass-band edge, in the same unit as ``fs``.
        fs: Sampling frequency.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The filtered array returned by ``scipy.signal.filtfilt``.
    """
    nyquist = 0.5 * fs
    numerator, denominator = butter(
        order,
        [lowcut / nyquist, highcut / nyquist],
        btype='band',
    )
    return filtfilt(numerator, denominator, data, axis=0)


def filter_dataframe(df, fs=250, lowcut=8, highcut=30, order=4):
    """Band-pass every column of ``df`` and return the result as a DataFrame.

    Args:
        df: DataFrame with one column per channel and one row per sample.
        fs: Sampling frequency.
        lowcut: Lower pass-band edge.
        highcut: Upper pass-band edge.
        order: Filter order forwarded to ``bandpass_filter``.

    Returns:
        DataFrame with the same column labels and the same row index as
        ``df``, holding the filtered values.
    """
    arr = df.values
    arr_filt = bandpass_filter(arr, lowcut, highcut, fs, order)
    # Carry the input index over: rebuilding the frame from the bare array
    # would replace it with 0..n-1 and make the output impossible to align
    # with the input when the input was not range-indexed.
    return pd.DataFrame(arr_filt, index=df.index, columns=df.columns)


def artifact_reject(df, threshold=100):
    """Drop every row in which any column exceeds ``threshold`` in magnitude.

    Args:
        df: DataFrame with one column per channel.
        threshold: Absolute-value limit; a row is rejected when at least one
            of its values is strictly greater in magnitude.

    Returns:
        Tuple ``(kept, mask)``: ``kept`` holds the surviving rows re-indexed
        from 0, and ``mask`` is the boolean Series over the original rows
        (True = rejected).
    """
    exceeds_limit = df.abs().gt(threshold)
    mask = exceeds_limit.any(axis=1)
    kept = df.loc[~mask].reset_index(drop=True)
    return kept, mask


def preprocess_eeg_file(
        file_path,
        channels=['C3', 'CZ', 'C4'],
        fs=250,
        lowcut=8,
        highcut=30,
        threshold=100,
        order=4,
        drop_artifacts=True
):
    """Load one EEG CSV and run re-referencing, band-pass filtering and artifact rejection.

    Args:
        file_path: Path of the raw CSV file.
        channels: Column names to load (the list is only read, never modified).
        fs: Sampling frequency used by the band-pass filter.
        lowcut: Lower pass-band edge.
        highcut: Upper pass-band edge.
        threshold: Absolute-amplitude limit used by ``artifact_reject``.
        order: Band-pass filter order forwarded to ``filter_dataframe``.
        drop_artifacts: When True (default), rows exceeding ``threshold`` are
            removed from the returned frame. When False, the filtered signal
            is returned at full length.

    Returns:
        DataFrame of filtered ``"<channel>_ref"`` columns.

    Raises:
        ValueError: Propagated from ``load_eeg_csv`` when a channel is missing.

    NOTE(review): with ``drop_artifacts=True`` the output has fewer rows than
    the recording, so cutting it into fixed-length trials afterwards (as
    ``make_dl_ready`` does) shifts trial boundaries and can lose the last
    trial. This looks like the cause of the "Trial number 10 out of range"
    messages in the notebook output; confirm, and consider
    ``drop_artifacts=False`` plus trial-level rejection.
    """
    # 1. Load
    raw = load_eeg_csv(file_path, channels)
    # 2. Re-reference
    reref = rereference_average(raw)
    # 3. Filter
    filt = filter_dataframe(reref, fs, lowcut, highcut, order)
    if not drop_artifacts:
        return filt
    # 4. Artifact rejection (the per-row mask is not needed here)
    clean, _ = artifact_reject(filt, threshold)
    return clean


def save_preprocessed(clean_df, save_path):
    """Write ``clean_df`` to ``save_path`` as CSV (without the index column).

    The parent folder of ``save_path`` is created when it does not exist.

    Args:
        clean_df: DataFrame to store.
        save_path: Destination file path; may be a bare file name.
    """
    parent_dir = os.path.dirname(save_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the folder when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    clean_df.to_csv(save_path, index=False)
    print(f"Preprocessed EEG saved to: {save_path}")


def make_dl_ready(
        csv_path,
        save_dir,
        channels=['C3_ref', 'CZ_ref', 'C4_ref'],
        samples_per_trial=2250,
        subject="S1",
        session=1
):
    """Cut a preprocessed CSV into fixed-length trials and save them as ``.npy``.

    Two files are written into ``save_dir`` (created if needed), both named
    after the CSV's base name: ``<name>_DLready.npy`` with an array of shape
    ``(n_trials, n_channels, samples_per_trial)`` and ``<name>_index.csv``
    with one ``subject`` / ``session`` / ``trial`` row per trial (trials
    numbered from 1). Rows beyond the last complete trial are discarded with
    a printed warning.

    Args:
        csv_path: Preprocessed CSV to read.
        save_dir: Output folder.
        channels: Columns to keep, in output channel order.
        samples_per_trial: Number of consecutive rows per trial.
        subject: Value written to the ``subject`` column of the index file.
        session: Value written to the ``session`` column of the index file.

    Raises:
        ValueError: Naming the first entry of ``channels`` missing from the CSV.
    """
    os.makedirs(save_dir, exist_ok=True)
    table = pd.read_csv(csv_path)
    for name in channels:
        if name in table.columns:
            continue
        raise ValueError(f"Channel {name} not found in preprocessed CSV!")

    # Complete trials, and the rows left over after the last one.
    total_rows = len(table)
    n_trials, leftover_rows = divmod(total_rows, samples_per_trial)
    if leftover_rows:
        print(f"Warning: {leftover_rows} samples discarded (incomplete trial)")

    # (rows, channels) -> (trial, sample, channel) -> (trial, channel, sample)
    usable = table[channels].iloc[:n_trials * samples_per_trial].to_numpy()
    trial_major = usable.reshape(n_trials, samples_per_trial, len(channels))
    epochs = np.swapaxes(trial_major, 1, 2)

    stem = os.path.splitext(os.path.basename(csv_path))[0]
    array_path = os.path.join(save_dir, stem + "_DLready.npy")
    np.save(array_path, epochs)
    print(f"Saved array: {array_path}")

    # Companion table identifying each trial of the saved array.
    trial_index = pd.DataFrame({
        "subject": [subject] * n_trials,
        "session": [session] * n_trials,
        "trial": np.arange(1, n_trials + 1)
    })
    index_path = os.path.join(save_dir, stem + "_index.csv")
    trial_index.to_csv(index_path, index=False)
    print(f"Saved trial index: {index_path}")


In [5]:
import glob

# Folders for the batch run over the raw MI test recordings.
raw_dir = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Raw\MI\test"
save_dir = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI"
dl_ready_dir = r"C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready"
samples_per_trial = 2250
raw_channels = ['C3', 'CZ', 'C4']             # column names in the raw CSV files
dl_channels = ['C3_ref', 'CZ_ref', 'C4_ref']  # column names after re-referencing

# Every <raw_dir>/S*/<folder>/EEGdata.csv file.
search_pattern = os.path.join(raw_dir, "S*", "*", "EEGdata.csv")
all_files = glob.glob(search_pattern)

for file_path in all_files:
    # The two folders above the file are taken as subject and session.
    parts = file_path.split(os.sep)
    subject, session = parts[-3], parts[-2]

    # 1. Preprocess the raw recording and store it as CSV.
    clean = preprocess_eeg_file(file_path, raw_channels)
    basename = f"{subject}_{session}_EEGdata_preprocessed.csv"
    save_path = os.path.join(save_dir, basename)
    save_preprocessed(clean, save_path)

    # 2. Cut the stored CSV into trials and save the DL-ready array.
    make_dl_ready(
        save_path,
        dl_ready_dir,
        channels=dl_channels,
        samples_per_trial=samples_per_trial,
        subject=subject,
        session=session,
    )
print("Batch preprocessing complete!")


Preprocessed EEG saved to: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\S36_1_EEGdata_preprocessed.csv
Saved array: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready\S36_1_EEGdata_preprocessed_DLready.npy
Saved trial index: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready\S36_1_EEGdata_preprocessed_index.csv
Preprocessed EEG saved to: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\S37_1_EEGdata_preprocessed.csv
Saved array: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready\S37_1_EEGdata_preprocessed_DLready.npy
Saved trial index: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready\S37_1_EEGdata_preprocessed_index.csv
Preprocessed EEG saved to: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\S38_1_EEGdata_preprocessed.csv
Saved array: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready\S38_1_EEGdata_preprocessed_DLready.npy
Saved trial index: C:\Users\zezom\PycharmProjects\HorusEye\Data\Test\MI\DL_ready\S38_1_EE

# Batching

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

# --- SETTINGS ---
base_dir = r"C:\Users\zezom\PycharmProjects\HorusEye"
dl_ready_dir = os.path.join(base_dir, "Data", "Processed", "DL_ready")
labels_path = os.path.join(base_dir, "train.csv")

# --- Load competition labels ---
labels_df = pd.read_csv(labels_path)

X = []
y = []
missing = []  # one entry per label row whose .npy file does not exist

# Remember the most recently loaded file so consecutive label rows that point
# at the same .npy do not reload it from disk every time.
loaded_path = None
loaded_epochs = None

for i, row in labels_df.iterrows():
    subject = row['subject_id']
    session = row['trial_session']
    trial_num = int(row['trial'])    # 1-based
    label = row['label']
    # Path to the DL-ready array of this subject/session
    npy_path = os.path.join(dl_ready_dir, f"{subject}_{session}_EEGdata_preprocessed_DLready.npy")
    if npy_path != loaded_path:
        if not os.path.exists(npy_path):
            missing.append(npy_path)
            continue
        loaded_epochs = np.load(npy_path)  # (n_trials, n_channels, samples)
        loaded_path = npy_path
    epochs = loaded_epochs
    # Safety check on both ends: a trial number below 1 would otherwise turn
    # into a negative index and silently pick a trial from the end of the file.
    # NOTE(review): the recorded output of this cell reports trial 10 as out of
    # range for several files; that is what happens when the arrays were built
    # after artifact rows had been dropped. Confirm against the pipeline.
    if not 1 <= trial_num <= epochs.shape[0]:
        print(f"Trial number {trial_num} out of range for file: {npy_path}")
        continue
    # Append correct trial (trial number - 1 for zero-indexing)
    X.append(epochs[trial_num - 1])
    y.append(label)

# np.stack([]) fails with an unhelpful message; say what actually went wrong.
if not X:
    missing_hint = (
        f" {len(missing)} label rows pointed at missing files, e.g.: {missing[:3]}"
        if missing else ""
    )
    raise ValueError(
        f"No trials could be loaded from {dl_ready_dir}.{missing_hint}"
    )

X = np.stack(X)  # shape: (num_trials, n_channels, n_samples)
# Keep the fitted encoder so predictions can be mapped back to label names.
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

print("Loaded data shape:", X.shape)
print("Encoded labels shape:", y_enc.shape)
if missing:
    print(f"Missing {len(missing)} files, e.g.:", missing[:3])


Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_1_EEGdata_preprocessed_DLready.npy
Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_2_EEGdata_preprocessed_DLready.npy
Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_3_EEGdata_preprocessed_DLready.npy
Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_4_EEGdata_preprocessed_DLready.npy
Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_5_EEGdata_preprocessed_DLready.npy
Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_6_EEGdata_preprocessed_DLready.npy
Trial number 10 out of range for file: C:\Users\zezom\PycharmProjects\HorusEye\Data\Processed\DL_ready\S1_7_EEGdata_preprocessed_DLready.npy
Trial number 