# Imports

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from scipy.stats import mode

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from scipy.signal import butter, lfilter
import numpy as np




# Data labeling

In [None]:
# Column names assigned to the raw data matrix of each recording, in order.
columns = [
    # Leading bookkeeping columns
    'ED_COUNTER', 'ED_INTERPOLATED', 'ED_RAW_CQ',
    # The 14 EEG channels used as model features
    'ED_AF3', 'ED_F7', 'ED_F3', 'ED_FC5', 'ED_T7', 'ED_P7', 'ED_O1',
    'ED_O2', 'ED_P8', 'ED_T8', 'ED_FC6', 'ED_F4', 'ED_F8', 'ED_AF4',
    # Trailing columns that are not used as features
    'ED_GYROX', 'ED_GYROY', 'ED_TIMESTAMP', 'ED_ES_TIMESTAMP',
    'ED_FUNC_ID', 'ED_FUNC_VALUE', 'ED_MARKER', 'ED_SYNC_SIGNAL',
]

In [None]:
# Integer class ids used as labels throughout the notebook.
FOCUSED_ID = 0
UNFOCUSED_ID = 1
DROWSY_ID = 2


def get_state(timestamp):
    """Map a sample index within a recording to a mental-state class id.

    ``timestamp`` is a sample counter (128 samples per second), so the two
    cut-offs below sit at 10 and 20 minutes of recording.

    Returns FOCUSED_ID up to and including the 10-minute mark, DROWSY_ID
    between the 10- and 20-minute marks (inclusive of the latter) and
    UNFOCUSED_ID afterwards.

    NOTE(review): the middle segment is labelled DROWSY and the final
    segment UNFOCUSED -- confirm this ordering against the dataset's
    recording protocol before relying on the class names.
    """
    samples_per_minute = 128 * 60
    first_cutoff = 10 * samples_per_minute
    second_cutoff = 20 * samples_per_minute

    if timestamp <= first_cutoff:
        return FOCUSED_ID
    if timestamp > second_cutoff:
        return UNFOCUSED_ID
    return DROWSY_ID

# Data preprocessing

In [None]:
def bandpass_filter(data, low_freq, high_freq, fs, order=4):
    """Band-pass ``data`` with a Butterworth filter.

    Parameters:
    - data: signal to filter (passed straight to ``scipy.signal.lfilter``)
    - low_freq: lower band edge, in the same unit as ``fs``
    - high_freq: upper band edge, in the same unit as ``fs``
    - fs: sampling frequency
    - order: order handed to ``scipy.signal.butter``

    Returns the filtered signal produced by a single ``lfilter`` pass.
    """
    # butter() expects the band edges as fractions of the Nyquist frequency.
    nyquist = 0.5 * fs
    band_edges = [low_freq / nyquist, high_freq / nyquist]
    numerator, denominator = butter(order, band_edges, btype='band')
    return lfilter(numerator, denominator, data)

def visualize_filters(original_data, filtered_signals, fs, channel_idx=0, duration=2):
    """
    Visualize original and filtered signals in the time domain.

    One row is drawn for the original channel, followed by one row per band
    listed in the module-level ``brainwave_ranges`` dictionary.

    Parameters:
    - original_data: Original EEG data, indexed as [sample, channel]
    - filtered_signals: Dictionary containing filtered signals for each frequency band
    - fs: Sampling frequency
    - channel_idx: Channel index to visualize
    - duration: Duration in seconds to visualize
    """
    # Imported here because the file's import block does not bring GridSpec
    # into scope; without it the GridSpec call below raises NameError.
    from matplotlib.gridspec import GridSpec

    # Calculate time axis (never longer than the data itself)
    n_samples = min(int(duration * fs), original_data.shape[0])
    time = np.arange(n_samples) / fs

    # Create figure: one row for the original plus one per frequency band
    n_rows = 1 + len(brainwave_ranges)
    plt.figure(figsize=(12, 10))
    gs = GridSpec(n_rows, 1, height_ratios=[1] * n_rows)

    # Plot original signal
    ax0 = plt.subplot(gs[0])
    ax0.plot(time, original_data[:n_samples, channel_idx], 'k', label='Original')
    ax0.set_title(f'Original EEG Signal (Channel {channel_idx})')
    ax0.set_ylabel('Amplitude')
    ax0.legend()
    ax0.grid(True)

    # Plot filtered signals
    wave_colors = {
        'Delta': 'b',
        'Theta': 'g',
        'Alpha': 'r',
        'Beta': 'purple'
    }

    # Start from ax0 so the x-label below still has a target if no band is drawn.
    ax = ax0
    for i, (wave, (low, high)) in enumerate(brainwave_ranges.items()):
        ax = plt.subplot(gs[i + 1], sharex=ax0)
        ax.plot(
            time,
            filtered_signals[wave][:n_samples, channel_idx],
            color=wave_colors.get(wave, 'k'),  # fall back to black for unlisted bands
            label=f'{wave} ({low}-{high} Hz)',
        )
        ax.set_title(f'{wave} Band ({low}-{high} Hz)')
        ax.set_ylabel('Amplitude')
        ax.legend()
        ax.grid(True)

    # Only the bottom-most axis carries the shared x-label.
    ax.set_xlabel('Time (s)')
    plt.tight_layout()
    plt.show()

# Function to plot frequency spectrum
def plot_frequency_spectrum(original_data, filtered_signals, fs, channel_idx=0):
    """
    Draw the magnitude spectrum of one channel before and after band filtering.

    The figure is laid out as five stacked rows: the first shows the
    unfiltered channel, the following rows show one band each, in the
    iteration order of the module-level ``brainwave_ranges`` dictionary.

    Parameters:
    - original_data: Original EEG data, indexed as [sample, channel]
    - filtered_signals: Dictionary containing filtered signals for each frequency band
    - fs: Sampling frequency
    - channel_idx: Channel index to visualize
    """
    band_colors = {'Delta': 'b', 'Theta': 'g', 'Alpha': 'r', 'Beta': 'purple'}
    max_freq_hz = 50  # upper x-limit shared by every panel, for readability

    plt.figure(figsize=(12, 10))

    # Spectrum of the unfiltered channel; its frequency axis is reused for
    # every band panel below.
    raw_channel = original_data[:, channel_idx]
    raw_magnitude = np.abs(np.fft.rfft(raw_channel))
    freq_axis = np.fft.rfftfreq(len(raw_channel), 1.0/fs)

    plt.subplot(5, 1, 1)
    plt.plot(freq_axis, raw_magnitude, 'k')
    plt.title(f'Frequency Spectrum - Original Signal (Channel {channel_idx})')
    plt.ylabel('Magnitude')
    plt.xlim(0, max_freq_hz)
    plt.grid(True)

    # One panel per band, starting in the second row.
    for row, (wave, (low, high)) in enumerate(brainwave_ranges.items(), start=2):
        band_magnitude = np.abs(np.fft.rfft(filtered_signals[wave][:, channel_idx]))

        plt.subplot(5, 1, row)
        plt.plot(freq_axis, band_magnitude, color=band_colors[wave])
        plt.title(f'Frequency Spectrum - {wave} Band ({low}-{high} Hz)')
        plt.ylabel('Magnitude')
        plt.xlim(0, max_freq_hz)
        # Dashed markers at the lower and upper edge of the pass band
        for edge in (low, high):
            plt.axvline(x=edge, color='grey', linestyle='--', alpha=0.7)
        plt.grid(True)

    plt.xlabel('Frequency (Hz)')
    plt.tight_layout()
    plt.show()


# Pass band (low, high) in Hz for each brainwave band. Insertion order matters:
# the plotting helpers draw the bands in this order.
brainwave_ranges = dict(
    Delta=(0.5, 4),
    Theta=(4, 8),
    Alpha=(8, 13),
    Beta=(13, 30),
)

# Windowing parameters: each training sample is a 4-second window at 128 Hz.
FREQUENCY_HZ = 128
SAMPLE_LENGTH_SECOND = 4
# Despite the name, this is the number of samples per window (128 * 4 = 512).
SAMPLE_LENGTH_HZ = FREQUENCY_HZ * SAMPLE_LENGTH_SECOND

# Per-recording results are appended to these lists and stacked afterwards.
features, labels = [], []
delta_features, theta_features, alpha_features, beta_features = [], [], [], []

# Standardises every channel to zero mean and unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

# Diagnostic plots are produced for the first recording only.
visualized = False

# Destination list for each band's windows (the lists are created above).
band_feature_lists = {
    "Delta": delta_features,
    "Theta": theta_features,
    "Alpha": alpha_features,
    "Beta": beta_features,
}

for i in [3,4,5,6,7,10,11,12,13,14,17,18,19,20,21,24,25,26,27,31,32,33,34]:
    print(f"Extracting file {i}")
    mat_data = loadmat(f'/kaggle/input/eeg-data-for-mental-attention-state-detection/EEG Data/eeg_record{i}.mat')
    data = mat_data['o'][0][0]['data']
    eeg_df = pd.DataFrame(data, columns=columns)

    # Promote the row number to a 'timestamp' column; it is a sample index,
    # which is what get_state() expects.
    eeg_df.reset_index(inplace=True)
    eeg_df.rename(columns={'index': 'timestamp'}, inplace=True)
    eeg_df['state'] = eeg_df['timestamp'].apply(get_state)

    # After reset_index, positions 4..17 are ED_AF3 .. ED_AF4 (the 14 EEG channels).
    feature = eeg_df.iloc[:, 4:18].values
    label = eeg_df['state'].values

    # Standardise each channel; the scaler is re-fitted on every recording.
    feature = scaler.fit_transform(feature)

    # Band-pass every channel (axis 0 = time) once per brainwave band.
    brainwave_features = {}
    for wave, (low, high) in brainwave_ranges.items():
        filtered = np.apply_along_axis(
            bandpass_filter, 0, feature, low, high, FREQUENCY_HZ
        )
        brainwave_features[wave] = filtered  # Do not scale these

    if not visualized:
        print(f"Generating visualization for file {i}")
        # Choose a channel to visualize (0-13, corresponding to the 14 EEG channels)
        channel_to_visualize = 0  # First channel (ED_AF3)

        # Visualize time domain signals
        visualize_filters(feature, brainwave_features, FREQUENCY_HZ,
                          channel_idx=channel_to_visualize, duration=2)

        # Visualize frequency domain
        plot_frequency_spectrum(feature, brainwave_features, FREQUENCY_HZ,
                                channel_idx=channel_to_visualize)

        # Heatmap overview: all channels, before and after filtering
        plt.figure(figsize=(15, 10))
        time_segment = slice(0, int(FREQUENCY_HZ * 2))  # 2 seconds of data

        heatmap_panels = [('Original Signal - All Channels', feature)]
        heatmap_panels += [
            (f'{band} Filtered Signal - All Channels', band_data)
            for band, band_data in brainwave_features.items()
        ]

        # Size the grid from the number of panels so that no band is dropped
        # (a fixed 2x2 grid only has room for three of the four bands).
        n_cols = 2
        n_rows = -(-len(heatmap_panels) // n_cols)  # ceiling division
        for panel_idx, (panel_title, panel_data) in enumerate(heatmap_panels, start=1):
            plt.subplot(n_rows, n_cols, panel_idx)
            plt.imshow(panel_data[time_segment, :].T, aspect='auto', cmap='viridis')
            plt.title(panel_title)
            plt.xlabel('Time (samples)')
            plt.ylabel('Channel')
            plt.colorbar(label='Amplitude')

        plt.tight_layout()
        plt.show()

        print("Visualization complete!")
        visualized = True

    # Cut the recording into non-overlapping windows; the incomplete tail is dropped.
    num_samples = len(feature) // SAMPLE_LENGTH_HZ
    usable_length = num_samples * SAMPLE_LENGTH_HZ
    n_channels = feature.shape[1]

    feature = feature[:usable_length].reshape(num_samples, SAMPLE_LENGTH_HZ, n_channels, 1)
    label = label[:usable_length].reshape(num_samples, SAMPLE_LENGTH_HZ)
    # A window takes the most frequent per-sample label inside it.
    consensus_labels = mode(label, axis=1)[0].flatten()

    features.append(feature)
    labels.append(consensus_labels)

    # Window each band-filtered signal the same way and file it under its band.
    for wave in brainwave_ranges.keys():
        brainwave_feature = brainwave_features[wave][:usable_length]
        brainwave_feature = brainwave_feature.reshape(num_samples, SAMPLE_LENGTH_HZ, n_channels, 1)
        target_list = band_feature_lists.get(wave)
        if target_list is not None:  # bands without a dedicated list are skipped
            target_list.append(brainwave_feature)

# Merge the per-recording window arrays into one array per signal type
# (stacked along the first, i.e. window, axis).
features, delta_features, theta_features, alpha_features, beta_features = (
    np.vstack(per_file_windows)
    for per_file_windows in (
        features,
        delta_features,
        theta_features,
        alpha_features,
        beta_features,
    )
)
# Labels are 1-D per recording, so a plain concatenation is enough.
labels = np.concatenate(labels)

print(f"Delta Features Shape: {delta_features.shape}")
print(f"Theta Features Shape: {theta_features.shape}")
print(f"Alpha Features Shape: {alpha_features.shape}")
print(f"Beta Features Shape: {beta_features.shape}")
print(f"Final Labels Shape: {labels.shape}")


# Data classes distribution

In [None]:
import numpy as np


# Count how many windows carry each class id (0, 1, 2 as returned by get_state).
unique_classes, class_counts = np.unique(labels, return_counts=True)

# Print one line per class present in the labels.
for cls, count in zip(unique_classes, class_counts):
    print(f"Class {cls}: {count} samples")


In [None]:
# Display the DataFrame left over from the last recording processed by the loop above.
eeg_df

## Model SVM without class balancing

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# Use every window as-is (no class balancing).
X_unbalanced, y_unbalanced = features, labels

# Collapse each window to a single flat feature vector.
X_flattened_unbalanced = X_unbalanced.reshape(len(X_unbalanced), -1)

# Hold out 20% of the windows for testing; the seed fixes the split.
(
    X_train_unbalanced,
    X_test_unbalanced,
    y_train_unbalanced,
    y_test_unbalanced,
) = train_test_split(
    X_flattened_unbalanced,
    y_unbalanced,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.decomposition import PCA
# A float n_components asks PCA to keep as many components as needed to
# explain that fraction (95%) of the variance.
pca = PCA(n_components=0.95)
# Fit the projection on the training split only, then reuse it for the test split.
X_train_pca_unbalanced = pca.fit_transform(X_train_unbalanced)
X_test_pca_unbalanced = pca.transform(X_test_unbalanced)

In [None]:
# Shapes after PCA: (number of windows, number of retained components).
print(X_train_pca_unbalanced.shape)
print(X_test_pca_unbalanced.shape)

In [None]:

# Fit an RBF-kernel SVM on the PCA-reduced training split of the unbalanced data.
# NOTE(review): probability=True is requested, but no predict_proba call appears
# in the visible cells -- confirm it is needed, since it adds work at fit time.
svm_model_unbalanced_rbf = SVC(kernel='rbf', probability=True)
svm_model_unbalanced_rbf.fit(X_train_pca_unbalanced, y_train_unbalanced)

In [None]:

# Predict class ids for the held-out, PCA-reduced test windows.
y_pred_unbalanced = svm_model_unbalanced_rbf.predict(X_test_pca_unbalanced)



In [None]:
# Display the classification report for the unbalanced RBF SVM on the test split
print("Classification Report:")
print(classification_report(y_test_unbalanced, y_pred_unbalanced))


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


# Pin the label order so the matrix is always 3x3 (rows/columns: class 0
# Focused, class 1 Unfocused, class 2 Drowsy), even if a class is missing
# from the test split or from the predictions.
cm = confusion_matrix(y_test_unbalanced, y_pred_unbalanced, labels=[0, 1, 2])

# Number of true samples per class in the *test split*, i.e. the row totals
# of the matrix. Dividing by these turns each row into percentages that sum
# to 100; dividing by whole-dataset class sizes instead would cap every row
# at the test fraction (about 20%).
class_sizes = {class_id: int(row_total) for class_id, row_total in enumerate(cm.sum(axis=1))}


def normalize_cm_by_class_size(cm, class_sizes=None):
    """Convert confusion-matrix rows from counts to percentages.

    Parameters:
    - cm: square confusion matrix of counts (rows = true class)
    - class_sizes: optional mapping/sequence giving the number of samples of
      each class, indexed by row position. When omitted, each row is divided
      by its own total, so every non-empty row sums to 100.

    Returns a new float array; ``cm`` itself is not modified. A row whose
    class size is 0 is returned as all zeros instead of dividing by zero.
    """
    cm_normalized = np.asarray(cm).astype('float')
    for i in range(cm_normalized.shape[0]):
        # Read the row total before the row is overwritten below.
        size = cm_normalized[i].sum() if class_sizes is None else class_sizes[i]
        if size == 0:
            cm_normalized[i] = 0.0
            continue
        cm_normalized[i] = cm_normalized[i] / size * 100
    return cm_normalized


# Row/column captions for the heatmap (true class on rows, prediction on columns).
class_names = ['Focused', 'Unfocused', 'Drowsy']
predicted_class_names = ['Predicted Focused', 'Predicted Unfocused', 'Predicted Drowsy']

# Percent-scaled confusion matrix, wrapped in a labelled DataFrame for plotting.
cm_normalized_by_class = normalize_cm_by_class_size(cm, class_sizes)
cm_normalized_df = pd.DataFrame(
    cm_normalized_by_class,
    index=class_names,
    columns=predicted_class_names,
)

# The colour scale is centred on the average cell value.
cm_mean = cm_normalized_df.mean().mean()

# Diverging palette kept for reference; the heatmap below uses "Blues".
cmap = sns.diverging_palette(250, 10, as_cmap=True, s=100, l=50)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_normalized_df,
    annot=True,
    cmap="Blues",
    center=cm_mean,
    linewidths=0.5,
)
plt.title('Confusion Matrix Normalized')
plt.show()


## SVM balanced classes

In [None]:
X = features
y = labels


from sklearn.utils import shuffle

# Window indices of each class, using the class ids defined with get_state().
focused_indices = np.where(y == FOCUSED_ID)[0]
unfocused_indices = np.where(y == UNFOCUSED_ID)[0]
drowsy_indices = np.where(y == DROWSY_ID)[0]

num_drowsy_samples = len(drowsy_indices)

# Undersample the unfocused class down to the size of the drowsy class.
# A seeded generator makes the chosen subset (and therefore every downstream
# metric) reproducible, matching the fixed seeds used for shuffle/split.
undersample_rng = np.random.RandomState(42)
unfocused_indices_undersampled = undersample_rng.choice(
    unfocused_indices, num_drowsy_samples, replace=False
)

final_indices = np.concatenate([focused_indices, unfocused_indices_undersampled, drowsy_indices])

# Mix the classes so they are no longer grouped in blocks.
final_indices = shuffle(final_indices, random_state=42)

X_balanced = X[final_indices]
y_balanced = y[final_indices]

# Report the class distribution after balancing.
unique_classes, class_counts = np.unique(y_balanced, return_counts=True)
for cls, count in zip(unique_classes, class_counts):
    print(f"Class {cls}: {count} samples")




In [None]:
# Collapse each balanced window to a single flat feature vector.
X_flattened_balanced = X_balanced.reshape(len(X_balanced), -1)

# Hold out 20% of the balanced windows for testing; the seed fixes the split.
(
    X_train_balanced,
    X_test_balanced,
    y_train_balanced,
    y_test_balanced,
) = train_test_split(
    X_flattened_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.decomposition import PCA
# A float n_components asks PCA to keep as many components as needed to
# explain that fraction (95%) of the variance.
# Note: this rebinds `pca`, replacing the projection fitted on the unbalanced data.
pca = PCA(n_components=0.95)
# Fit the projection on the balanced training split only, then reuse it for the test split.
X_train_pca_balanced = pca.fit_transform(X_train_balanced)
X_test_pca_balanced = pca.transform(X_test_balanced)

In [None]:
# Shapes after PCA: (number of windows, number of retained components).
print(X_train_pca_balanced.shape)
print(X_test_pca_balanced.shape)

In [None]:
# Fit an RBF-kernel SVM on the PCA-reduced training split of the balanced data.
# NOTE(review): probability=True is requested, but no predict_proba call appears
# in the visible cells -- confirm it is needed, since it adds work at fit time.
svm_model_balanced = SVC(kernel='rbf', probability=True)

svm_model_balanced.fit(X_train_pca_balanced, y_train_balanced)

In [None]:

# Predict class ids for the held-out balanced test windows.
y_pred_balanced = svm_model_balanced.predict(X_test_pca_balanced)

from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision/recall/F1 of the balanced RBF SVM on the test split.
print(classification_report(y_test_balanced, y_pred_balanced))


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Pin the label order so the matrix is always 3x3 (rows/columns: class 0
# Focused, class 1 Unfocused, class 2 Drowsy), even if a class is missing
# from the test split or from the predictions.
cm_balanced = confusion_matrix(y_test_balanced, y_pred_balanced, labels=[0, 1, 2])

# Number of true samples per class in the *test split*, i.e. the row totals
# of the matrix. Dividing by these turns each row into percentages that sum
# to 100; dividing by whole-dataset class sizes instead would cap every row
# at the test fraction (about 20%).
class_sizes_balanced = {
    class_id: int(row_total) for class_id, row_total in enumerate(cm_balanced.sum(axis=1))
}

def normalize_cm_by_class_size(cm, class_sizes=None):
    """Convert confusion-matrix rows from counts to percentages.

    Parameters:
    - cm: square confusion matrix of counts (rows = true class)
    - class_sizes: optional mapping/sequence giving the number of samples of
      each class, indexed by row position. When omitted, each row is divided
      by its own total, so every non-empty row sums to 100.

    Returns a new float array; ``cm`` itself is not modified. A row whose
    class size is 0 is returned as all zeros instead of dividing by zero.
    """
    cm_normalized = np.asarray(cm).astype('float')  # float copy; avoids integer division
    for i in range(cm_normalized.shape[0]):  # Loop over rows (classes)
        # Read the row total before the row is overwritten below.
        size = cm_normalized[i].sum() if class_sizes is None else class_sizes[i]
        if size == 0:
            cm_normalized[i] = 0.0
            continue
        cm_normalized[i] = cm_normalized[i] / size * 100  # Convert to percentage
    return cm_normalized

# Row/column captions for the heatmap (true class on rows, prediction on columns).
class_names = ['Focused', 'Unfocused', 'Drowsy']
predicted_class_names = ['Predicted Focused', 'Predicted Unfocused', 'Predicted Drowsy']

# Percent-scaled confusion matrix, wrapped in a labelled DataFrame for plotting.
cm_normalized_by_class_balanced = normalize_cm_by_class_size(cm_balanced, class_sizes_balanced)
cm_normalized_df_balanced = pd.DataFrame(
    cm_normalized_by_class_balanced,
    index=class_names,
    columns=predicted_class_names,
)

# The colour scale is centred on the average cell value.
cm_mean_balanced = cm_normalized_df_balanced.mean().mean()

# Diverging palette kept for reference; the heatmap below uses "Blues".
cmap_balanced = sns.diverging_palette(250, 10, as_cmap=True, s=100, l=50)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_normalized_df_balanced,
    annot=True,
    cmap="Blues",
    center=cm_mean_balanced,
    linewidths=0.5,
)
plt.title('Confusion Matrix Normalized for Balanced Data')
plt.show()


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

## SVM optimized (linear kernel, C=0.5)

In [None]:
# Sanity check: feature and label arrays of each split should agree on their first dimension.
print(X_train_pca_balanced.shape, y_train_balanced.shape, X_test_pca_balanced.shape, y_test_balanced.shape)

In [None]:
# Linear-kernel SVM trained on the balanced, PCA-reduced data.
# NOTE(review): confirm that C=0.5 (rather than 10) is the intended value.
svm_model_c1 = SVC(kernel='linear', C=0.5)  # C is the regularization parameter (0.5 here)

svm_model_c1.fit(X_train_pca_balanced, y_train_balanced)

In [None]:

# Predict class ids for the held-out balanced test windows with the linear SVM.
y_pred_c1 = svm_model_c1.predict(X_test_pca_balanced)

In [None]:


# Per-class precision/recall/F1 of the linear SVM on the balanced test split.
print("Classification Report for SVM on Balanced Data:")
print(classification_report(y_test_balanced, y_pred_c1))




## LDA balanced data