In [1]:
# Python 3.10.15

# Import necessary libraries
import os
import numpy as np
import mne
import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow and Keras 2.15.0
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout, Flatten, Dense)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Suppress TensorFlow warnings (optional)
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)


In [2]:
# Data Preparation
# Define the data directory where subject folders are located
data_dir = '/Users/BAEK/Code/neurEx/data/N170/Data_Preprocessed'

# List all subject folders
subject_folders = [sub for sub in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, sub))]
'''
subjects = []
for sub in os.listdir(data_dir):
    if os.path.isdir(os.path.join(data_dir, sub)):
        subjects.append(sub)
'''

# Initialize lists to hold data and labels from all subjects
X_list = []
y_list = []

# Loop over each subject folder
for subject in subject_folders:
    
    subject_data_dir = os.path.join(data_dir, subject)
    
    # Construct file paths for the subject's data
    X_file = os.path.join(subject_data_dir, f'Epochs_{subject}.fif')
    
    # Check if data files exist
    if os.path.exists(X_file):
        
        print()
        print(f'***** Loading the processed data: {subject}')
        print()
        
        # Load the data
        X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
        y_subject = X_subject.events[:, 2]
        
        # Append to the list
        X_sub_data = X_subject.get_data()
        X_list.append(X_sub_data)
        y_list.append(y_subject)
        
    else:
        print()
        print(f'***** Data file Does Not Exist: {subject}')
        print()

# Ensure that at least one subject has been loaded
if len(X_list) == 0:
    print()
    raise ValueError("No data was loaded. Please check your data directory and files.")



***** Loading the processed data: sub-021



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-026


***** Loading the processed data: sub-019



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-010


***** Loading the processed data: sub-017


***** Loading the processed data: sub-028



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-016


***** Loading the processed data: sub-029


***** Loading the processed data: sub-011



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-027


***** Loading the processed data: sub-018



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-020


***** Loading the processed data: sub-002



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-005


***** Loading the processed data: sub-033


***** Loading the processed data: sub-034


***** Loading the processed data: sub-035



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-032


***** Loading the processed data: sub-004



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-003


***** Loading the processed data: sub-040



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-025


***** Loading the processed data: sub-022



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-014


***** Loading the processed data: sub-013



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-012


***** Loading the processed data: sub-015



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-023


***** Loading the processed data: sub-024



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-039


***** Loading the processed data: sub-006



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-001


***** Loading the processed data: sub-008


***** Loading the processed data: sub-037



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-030


***** Loading the processed data: sub-031



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-009


***** Loading the processed data: sub-036



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)



***** Loading the processed data: sub-038


***** Loading the processed data: sub-007



  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)
  X_subject = mne.read_epochs(X_file, preload=True, verbose=False)


### P8 (Right Parietal): 

The parietal lobe is involved in sensory processing and the integration of sensory information. Specifically, the right parietal region is important for spatial awareness, visual processing, and attention. It is often engaged in tasks related to visual perception and sensory-motor coordination.

### PO8 (Right Parieto-Occipital): 

The occipital lobe is the primary visual processing center in the brain, and the parietal lobe integrates sensory information. The PO8 region is crucial for processing visual information from both the environment and sensory input. It is especially important in the recognition of objects, including faces, and is involved in spatial processing and attention to visual stimuli.

### O2 (Right Occipital): 

The occipital lobe is the brain’s main area for visual processing, including perception of visual stimuli such as shapes, colors, and faces. The right occipital lobe is particularly active during tasks related to visual processing and recognition of visual patterns.

### P10 (Right Parietal-Temporal): 

The parietal lobe is involved in sensory integration, spatial awareness, and attention, while the temporal lobe is important for processing sensory input, especially auditory and visual information. The temporal lobe is heavily involved in memory, recognition, and face processing.


        a = np.array([[1, 2], 
                    [3, 4]])
        b = np.array([[5, 6], 
                    [7, 8]])

        np.concatenate((a, b), axis=0)
        # [[1, 2],
        #  [3, 4],
        #  [5, 6],
        #  [7, 8]]

        np.vstack((a,b))
        # [[1, 2],
        #  [3, 4],
        #  [5, 6],
        #  [7, 8]]

        np.concatenate((a, b), axis=1)
        # [[1 2 5 6]
        #  [3 4 7 8]]

        a = np.array([[[1, 2], 
                    [3, 4]]])  # Shape: (1, 2, 2)

        b = np.array([[[5, 6], 
                    [7, 8]]])  # Shape: (1, 2, 2)

        np.concatenate((a, b), axis=2)
        # [[[1, 2, 5, 6],
        #   [3, 4, 7, 8]]]

        np.vstack((a,b))
        # [[[1, 2],
        #   [3, 4]],
        #
        #   [[5, 6],
        #    [7, 8]]]





        # Mock EEG data with shape (3 samples, 4 channels, 5 timepoints)
        X = np.array([
            [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 17, 18, 19, 20]],
            [[21, 22, 23, 24, 25], [26, 27, 28, 29, 30], [31, 32, 33, 34, 35], [36, 37, 38, 39, 40]],
            [[41, 42, 43, 44, 45], [46, 47, 48, 49, 50], [51, 52, 53, 54, 55], [56, 57, 58, 59, 60]]
        ])
        # Shape: (3 samples, 4 channels, 5 timepoints)

        chan_idx = [1, 3]  # Select only channels 1 and 3

        X = X[:, chan_idx, :]

        # Extracts only the specified channels
        X = np.array([
            [[6, 7, 8, 9, 10], [16, 17, 18, 19, 20]],
            [[26, 27, 28, 29, 30], [36, 37, 38, 39, 40]],
            [[46, 47, 48, 49, 50], [56, 57, 58, 59, 60]]
        ])
        # Shape: (3 samples, 2 channels, 5 timepoints)





        # Labels for three subjects
        y_list = [
            np.array([0, 1, 0, 1]),  # Subject 1
            np.array([1, 1, 0, 0]),  # Subject 2
            np.array([0, 0, 1, 1])   # Subject 3
        ]

        np.concatenate(y_list, axis=0)
        #[0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1])  # Shape: (12,)

In [3]:
### Get channel names from the last loaded subject
# This retrieves the names of the channels from the EEG data. These names represent the locations or positions of the EEG electrodes on the scalp, like ‘FP1’, ‘F3’, ‘P3’, etc.
channels = X_subject.ch_names

# Channels such as P8, PO8, O2, and P10 are situated over or near these regions in the parietal and occipital lobes, which are strongly involved in facial processing and visual stimuli processing.
# Studies on the N170 often report that the most reliable and strongest N170 signals are found in right-lateralized occipital-temporal regions. 
coi = ['P8', 'PO8', 'O2', 'P10']

# Creates a list of indices (chan_idx) for the channels of interest. 
# It finds the index (position) of each channel in the coi list from the full channels list.
chan_idx = [channels.index(chan) for chan in coi] # [25, 27, 29, 26]
'''
chan_idx = []
for chan in coi:
    index = channels.index(chan)    # Get the index of the channel in the full list
    chan_idx.append(index)          # Append the index to the list
'''

### Concatenate data from all subjects
# X_list: This is a list containing the EEG data arrays for multiple subjects. Each array in X_list typically has a shape like (trials, channels, time_points).

# NumPy arrays are multi-dimensional, and each dimension is associated with an axis:
# 0 represents the rows (vertical direction).
# 1 represents the columns (horizontal direction).
# 2, 3, and beyond represent additional dimensions (for higher-dimensional arrays).

# Combines the EEG data for all subjects along the trials dimension. Same as X = np.vstack(X_list) when concatenate(X_list, axis = 0)
# After concatenation, X will contain all trials from all subjects in a single array.
# If, for example, each subject’s data has 100 trials, and there are 10 subjects, then X will have a shape (1000, channels, time_points).
X = np.concatenate(X_list, axis = 0) # will combine all x of X(x,y,z)

# A slicing operation on the X array, and it extracts specific subsets of the data along its second axis (dimensions).
X = X[:, chan_idx, :]  # will change the y of X(x,y,z)

# Combines all the individual y arrays in y_list into a single, larger 1D or 2D array along the first axis (axis=0)
y = np.concatenate(y_list, axis = 0)

print(f'Combined data shape before filtering: X={X.shape}, y={y.shape}')

### Filter stimulus events
stimulus_labels = [1, 2]  # 1: Face, 2: Car
# Checks each element in y to see if it is in the stimulus_labels list.
# Returns a boolean array (stimulus_mask) of the same length as y.
stimulus_mask = np.isin(y, stimulus_labels)
X = X[stimulus_mask] # will change the x of X(x,y,z)
y = y[stimulus_mask]

# Adjust labels to start from 0
y = y - 1

# Focus on the N170 response by selecting data around 170 ms 
# Selecting Data Around the N170 Response (time window selection)
# This defines the time window you’re interested in, i.e., 100ms to 200ms after the stimulus.
# This means you are interested in data collected from 0.1 seconds to 0.2 seconds after the stimulus
tmin, tmax = 0.10, 0.2   # 100 ms to 200 ms
times = X_subject.times  # Time vector from the epochs
# Filter the times so that only the times that lies between tmin and tmax stay
time_mask = (times >= tmin) & (times <= tmax)
X_focused = X[:, :, time_mask]

print(f'Combined data shape after filtering: X={X_focused.shape}, y={y.shape}')

X_focused[0,0]

Combined data shape before filtering: X=(23602, 4, 820), y=(23602,)
Combined data shape after filtering: X=(6104, 4, 102), y=(6104,)


array([-2.65535473e-07, -2.33084976e-07, -2.56221681e-07, -3.35722922e-07,
       -4.71088408e-07, -6.60497783e-07, -9.00880692e-07, -1.18786204e-06,
       -1.51586949e-06, -1.87812459e-06, -2.26676702e-06, -2.67311763e-06,
       -3.08767533e-06, -3.50033306e-06, -3.90088247e-06, -4.27891778e-06,
       -4.62421035e-06, -4.92699383e-06, -5.17809914e-06, -5.36951111e-06,
       -5.49421166e-06, -5.54619740e-06, -5.52102088e-06, -5.41581248e-06,
       -5.22933673e-06, -4.96202801e-06, -4.61585473e-06, -4.19435119e-06,
       -3.70253657e-06, -3.14671301e-06, -2.53441977e-06, -1.87421798e-06,
       -1.17549646e-06, -4.48325484e-07,  2.96874105e-07,  1.04961514e-06,
        1.79936683e-06,  2.53584337e-06,  3.24924111e-06,  3.93041848e-06,
        4.57091855e-06,  5.16319560e-06,  5.70080136e-06,  6.17847393e-06,
        6.59201477e-06,  6.93824957e-06,  7.21520756e-06,  7.42191790e-06,
        7.55842064e-06,  7.62581203e-06,  7.62584541e-06,  7.56124113e-06,
        7.43529985e-06,  

In [4]:
print(X_focused.shape)

X_focused

(6104, 4, 102)


array([[[-2.65535473e-07, -2.33084976e-07, -2.56221681e-07, ...,
          2.21065354e-06,  2.27832317e-06,  2.31764697e-06],
        [ 1.52646923e-06,  1.55423677e-06,  1.55282390e-06, ...,
          7.41099594e-07,  9.07904801e-07,  1.05732774e-06],
        [-7.22147654e-06, -7.38670919e-06, -7.53458689e-06, ...,
         -4.07241284e-07, -2.46243744e-07, -8.30428524e-08],
        [ 5.03884314e-06,  5.25305508e-06,  5.36733101e-06, ...,
          8.15229414e-06,  8.16074655e-06,  8.08678148e-06]],

       [[-1.47737646e-06, -1.67229044e-06, -1.82316827e-06, ...,
          3.79423474e-06,  3.87306189e-06,  3.89641451e-06],
        [-6.68424724e-07, -6.97640298e-07, -7.08379923e-07, ...,
         -1.09889269e-06, -8.65716336e-07, -6.51313899e-07],
        [-7.48422084e-07, -9.07281396e-07, -1.03889632e-06, ...,
         -8.53871045e-07, -6.54174386e-07, -4.62437718e-07],
        [-4.21238231e-06, -4.83159445e-06, -5.40723513e-06, ...,
         -1.00315418e-08,  2.54398673e-07,  5.16855

1. X

NumPy arrays are multidimensional data structures. Each dimension corresponds to a different aspect of the data.

First Dimension (595):
•	Represents 595 epochs (or trials) in your experiment. Each epoch corresponds to a single trial where the EEG data is collected during a stimulus presentation or event.
	•	This dimension will vary as you add more subjects or trials, but each epoch will contain data in the format specified by the next two dimensions.

Second Dimension (33):
•	Represents 33 EEG channels (electrodes) capturing data from various scalp locations. Each channel corresponds to a different electrode that records electrical activity from the brain.
•	These 33 channels provide spatial information about where the brain activity is occurring on the scalp.
•	This dimension will remain the same across all subjects, as you are using the same number of electrodes to capture the data.

Third Dimension (820):
•	Represents 820 time points for each epoch. These are the EEG values recorded over time during each trial. The EEG signal is continuous, and the time points correspond to the sampling rate and the duration of the recording.
•	This dimension also remains the same across all subjects, as the data is sampled at the same rate and duration for each trial.

2. For example 

import numpy as np
box = np.random.rand(10, 20, 5)  # Generate random values in a (10, 20, 5) shape
print(box)

  array([[[0.72, 0.43, 0.99, 0.58, 0.16],  # Epoch 1, Channel 1
          [0.63, 0.34, 0.88, 0.44, 0.01],  # Epoch 1, Channel 2
          ...
          [0.56, 0.11, 0.98, 0.75, 0.22]], # Epoch 1, Channel 20
         [[0.89, 0.22, 0.78, 0.33, 0.55],  # Epoch 2, Channel 1
          [0.71, 0.48, 0.64, 0.88, 0.19],
          ...
          [0.12, 0.95, 0.72, 0.66, 0.23]], # Epoch 2, Channel 20
          ...
         [[0.82, 0.73, 0.61, 0.88, 0.34],  # Epoch 10, Channel 1
          [0.44, 0.56, 0.77, 0.93, 0.18],
          ...
          [0.62, 0.29, 0.38, 0.99, 0.41]]  # Epoch 10, Channel 20
        ])

Step 1: Mapping xx to Excel Files

The structure of your EEG data (xx) can be thought of as:
	•	Rows in the Excel sheet (y-axis): These correspond to the 33 EEG channels.
	•	Columns in the Excel sheet (x-axis): These correspond to the 820 time points.
	•	Multiple Excel sheets (z-axis): Each sheet corresponds to one epoch (595 epochs total).

Imagine one epoch of data as a single Excel file. For Epoch 1, you have a 33×820 table:

Channel/Time	 t1	    t2	 t3	  …	t820
         Ch1	0.12	-0.34	0.56	…	0.44
         Ch2	0.15	-0.23	0.54	…	0.39
         Ch3	0.10	-0.18	0.45	…	0.35
          …	   …	    …	   …	  …	 …
         Ch33	0.22	-0.28	0.67	…	0.50

Step 3: Visualizing Across Epochs

Now, you stack 595 Excel files one on top of the other. Each file has the same dimensions (33×820), but the data values inside them differ because each epoch captures a slightly different response (depending on the stimulus, noise, or other factors).

Step 4: Linking to the Labels (yy)

Each Excel file (epoch) corresponds to a single value in yy, which indicates the event type or condition. For example:

Epoch	Label (Event Type)
  1	    1   (Face stimulus)
  2	    2   (Car stimulus)
  3    	1   (Face stimulus)
  …	    …       …
 595   	4   (Scrambled car)

Step 5: Overall Data Structure

Think of your data as 595 Excel files:
•	Each file has 33 rows (EEG channels) and 820 columns (time points).
•	You also have a separate label file (yy) that tells you what condition or event type corresponds to each Excel file.
•	For the table we created for epoch 1 with a 33×820 table, it's name is 1 (Face stimulus) according to yy.
•	x (first dimension): Epochs (each epoch is like a “table”).
•	y (second dimension): Channels (columns in the table).
•	z (third dimension): Time points (rows in the table).

3. Normalization (Z-score normalization)

Normalization is a standard pre-processing step in machine learning and signal processing to ensure that all features (in this case, the EEG signal values across all epochs) are on the same scale. Without normalization, features with larger scales (like EEG channels with higher amplitude signals) might dominate over features with smaller scales (like signals with less variance). This can make the training of machine learning models less effective.

Z-score Normalization is a type of standardization, where we scale the data such that it has a mean of 0 and a standard deviation of 1. 

Z = \frac{X - \mu}{\sigma}

Where:
•	X is the original data point (EEG signal value at a specific time and channel),
•	\mu is the mean of the data (average signal value),
•	\sigma is the standard deviation (how much the data varies from the mean).

This normalization ensures that the data has a consistent scale, which helps improve the performance of machine learning algorithms, especially deep learning models.

4. For example 

x = [
    [1, 2, 3, 4],  # Epoch 1
    [5, 6, 7, 8],  # Epoch 2
    [9, 10, 11, 12] # Epoch 3
]

Here, each row corresponds to an epoch (a trial), and each column corresponds to a feature (in this case, 4 features).
	•	Epoch 1: [1, 2, 3, 4]
	•	Epoch 2: [5, 6, 7, 8]
	•	Epoch 3: [9, 10, 11, 12]

Step-by-Step Z-Score Normalization


Mean and Standard Deviation Calculation:
For each feature (column), we calculate the mean and standard deviation:
	
  •	Mean of Feature 1:  \frac{1 + 5 + 9}{3} = 5 
	•	Mean of Feature 2:  \frac{2 + 6 + 10}{3} = 6 
	•	Mean of Feature 3:  \frac{3 + 7 + 11}{3} = 7 
	•	Mean of Feature 4:  \frac{4 + 8 + 12}{3} = 8 
	
  •	Standard Deviation of Feature 1:
 \sqrt{\frac{(1-5)^2 + (5-5)^2 + (9-5)^2}{3}} = 3.464 
	•	Standard Deviation of Feature 2:
 \sqrt{\frac{(2-6)^2 + (6-6)^2 + (10-6)^2}{3}} = 3.464 
	•	Standard Deviation of Feature 3:
 \sqrt{\frac{(3-7)^2 + (7-7)^2 + (11-7)^2}{3}} = 3.464 
	•	Standard Deviation of Feature 4:
 \sqrt{\frac{(4-8)^2 + (8-8)^2 + (12-8)^2}{3}} = 3.464 

Apply Z-Score Normalization:

Z-score normalization is applied by the formula:

z = \frac{x - \mu}{\sigma}

where:
	•	 x  is the original value,
	•	 \mu  is the mean of the feature,
	•	 \sigma  is the standard deviation of the feature.
For Epoch 1, applying Z-score to each feature:
	•	Feature 1:  \frac{1 - 5}{3.464} = -1.155 
	•	Feature 2:  \frac{2 - 6}{3.464} = -1.155 
	•	Feature 3:  \frac{3 - 7}{3.464} = -1.155 
	•	Feature 4:  \frac{4 - 8}{3.464} = -1.155 
Normalized Epoch 1: [-1.155, -1.155, -1.155, -1.155]

Similarly, apply the Z-score normalization for Epoch 2 and Epoch 3:

Normalized Epoch 2:
	•	Feature 1:  \frac{5 - 5}{3.464} = 0 
	•	Feature 2:  \frac{6 - 6}{3.464} = 0 
	•	Feature 3:  \frac{7 - 7}{3.464} = 0 
	•	Feature 4:  \frac{8 - 8}{3.464} = 0 
Normalized Epoch 2: [0, 0, 0, 0]

Normalized Epoch 3:
	•	Feature 1:  \frac{9 - 5}{3.464} = 1.155 
	•	Feature 2:  \frac{10 - 6}{3.464} = 1.155 
	•	Feature 3:  \frac{11 - 7}{3.464} = 1.155 
	•	Feature 4:  \frac{12 - 8}{3.464} = 1.155 
Normalized Epoch 3: [1.155, 1.155, 1.155, 1.155]

x = [
    [-1.155, -1.155, -1.155, -1.155],  # Epoch 1
    [0,     0,     0,     0    ],     # Epoch 2
    [1.155, 1.155, 1.155, 1.155]      # Epoch 3
]


In [None]:
# Normalize data per channel (not global)
def normalize_per_channel(X_data):
    # Creates an array of the same shape as X_data, but with all values initialized to zero. 
    # The purpose of this array is to store the normalized data.
    X_norm = np.zeros_like(X_data)
    for i in range(X_data.shape[1]):  # Loop over the range channels: total 4 times from i=0 to i=3
        scaler = StandardScaler()
        # X_channel will have a shape of (6104, 61), representing the data for all trials across all time points for channel i
        # X_data = [[[a1], [a2], [a3], [a4]], 
        #           [[b1], [b2], [b3], [b4]], 
        #           [[c1], [c2], [c3], [c4]]]
        # X_channel = [[a1],[b1],[c1]]
        X_channel = X_data[:, i, :]
        # First, it computes the mean and standard deviation for each feature (each column) in X_channel. 
        # After calculating the mean and standard deviation, it transforms the data by subtracting the mean and dividing by the standard deviation
        X_norm[:, i, :] = scaler.fit_transform(X_channel)
    return X_norm

X_normalized = normalize_per_channel(X_focused)

# Reshape data for Conv2D (samples, channels, times, depth/feature)
# A typical CNN expects the input to be in a 4D shape: (samples, height, width, channels) or (batch_size, rows, columns, channels). 
# This is because CNNs are designed to process multi-dimensional data (images or time series) in terms of both spatial and channel dimensions.
# The additional dimension (1) indicates that each sample is being treated as having a single channel (similar to grayscale images). 
# In other words, you’re telling the model that each time sample has just one feature (or one channel)
# By reshaping it to (1000, 64, 50, 1), you’re essentially telling the model that for each of the 1000 samples, you have 64 channels and 50 time samples, and you are treating each one as a single-channel input.
# Convolutional layers typically expect a 4D input, where the last dimension (1 in this case) represents the number of channels. By reshaping the input, the model can apply convolutional operations over the time and channel dimensions correctly, learning temporal patterns from the data.
X = X_normalized[..., np.newaxis]  # Shape: (samples, channels, times, 1)

'''
Before the np.newaxis operation:
•	The shape of your array X_normalized is something like (samples, channels, time_points).
•	The data is structured in a 3D array with samples as the first dimension, channels as the second dimension, and time points as the third dimension.
•	Each time point for every channel in every sample is a scalar value (real number), and the array looks like this:
 array([[[ 3.24529244e-01,  4.11996053e-01,  4.91609174e-01, ...],
         [ 1.84049422e-01,  2.35332724e-01,  2.83798373e-01, ...],
         [-2.38773378e-01, -2.04526201e-01, -1.74333753e-01, ...],
         [ 4.25627014e-01,  4.73232880e-01,  5.16544162e-01, ...]],

        [[-2.96749702e-01, -2.86137936e-01, -2.64685230e-01, ...],
         [-4.21221455e-01, -4.03617815e-01, -3.79205829e-01, ...],
         [ 2.98252681e-01,  2.99339691e-01,  3.04971293e-01, ...],
         [-2.24049276e-02, -2.27949784e-02, -2.29925722e-02, ...]],

        [[-4.12923850e-01, -3.98015639e-01, -3.82177900e-01, ...], 
        ...

After the np.newaxis operation:
•	The shape changes to (samples, channels, time_points, 1).
•	This operation wraps each individual value (the scalar for each time point) into its own array (of size 1) and adds a new dimension.
•	Now, each time point for each channel/sample is a 1D array (each value in your array is now encapsulated as a single-element list or array).
array([[[[ 3.24529244e-01],
         [ 4.11996053e-01],
         [ 4.91609174e-01],
         ...],
 
        [[ 1.84049422e-01],
         [ 2.35332724e-01],
         [ 2.83798373e-01],
         ...],

        [[-2.38773378e-01],
         [-2.04526201e-01],
         [-1.74333753e-01],
         ...],

        [[ 4.25627014e-01],
         [ 4.73232880e-01],
         [ 5.16544162e-01],
         ...]]], 
       ...
'''

'\nBefore the np.newaxis operation:\n•\tThe shape of your array X_normalized is something like (samples, channels, time_points).\n•\tThe data is structured in a 3D array with samples as the first dimension, channels as the second dimension, and time points as the third dimension.\n•\tEach time point for every channel in every sample is a scalar value (real number), and the array looks like this:\n array([[[ 3.24529244e-01,  4.11996053e-01,  4.91609174e-01, ...],\n         [ 1.84049422e-01,  2.35332724e-01,  2.83798373e-01, ...],\n         [-2.38773378e-01, -2.04526201e-01, -1.74333753e-01, ...],\n         [ 4.25627014e-01,  4.73232880e-01,  5.16544162e-01, ...]],\n\n        [[-2.96749702e-01, -2.86137936e-01, -2.64685230e-01, ...],\n         [-4.21221455e-01, -4.03617815e-01, -3.79205829e-01, ...],\n         [ 2.98252681e-01,  2.99339691e-01,  3.04971293e-01, ...],\n         [-2.24049276e-02, -2.27949784e-02, -2.29925722e-02, ...]],\n\n        [[-4.12923850e-01, -3.98015639e-01, -3.821

In [None]:
# Data Augmentation
def augment_data(X, y):
    
    X_augmented = []
    y_augmented = []
    
    # Loop over each sample (trial or instance) in X
    for i in range(X.shape[0]):
        
        # Original data
        # For each sample, we first append the original data (X[i]) and the corresponding label (y[i]) to the X_augmented and y_augmented lists. 
        # This ensures that the original data is kept in the augmented dataset.
        X_augmented.append(X[i])
        y_augmented.append(y[i])

        # Time-shifted data
        # This block generates new samples by shifting the time steps of the data. 
        # The np.roll() function is used to shift the data along the time axis (axis=2). 
        # It shifts the entire time series by 1 or 2 time steps in both forward and backward directions (given by the shift values [-2, -1, 1, 2]).
	    # X_shifted = np.roll(X[i], shift, axis=2) shifts the time data in X[i] by shift units along the time axis.
        '''
        X[0] (shape: (3, 4)):
        [[0.1, 0.2, 0.3, 0.4],
         [0.5, 0.6, 0.7, 0.8],
         [0.9, 1.0, 1.1, 1.2]]
        
        X_shifted = np.roll(X[0], 1, axis=2)
        
        X_shifted (shift = 1):
        [[0.4, 0.1, 0.2, 0.3],
         [0.8, 0.5, 0.6, 0.7],
         [1.2, 0.9, 1.0, 1.1]]
        '''
	    # The shifted data (X_shifted) is then appended to the augmented dataset (X_augmented), and the corresponding label (y[i]) is appended to y_augmented.
        # Shifting the data with np.roll can potentially disrupt the chronological order of the data, and this could negatively affect the ability of the model to learn meaningful patterns from the data, especially when detecting a time-sensitive event like the N170.
        for shift in [-2, -1, 1, 2]:  # Shift by 1 or 2 time steps
            X_shifted = np.roll(X[i], shift, axis = 2)
            X_augmented.append(X_shifted)
            y_augmented.append(y[i])
        
        # Noise-injected data
        # We add noise to the data to create new augmented samples that simulate real-world variations.
	    # noise = np.random.normal(0, 0.01, X[i].shape) generates random noise from a normal distribution with a mean of 0 and a standard deviation of 0.01. The noise has the same shape as the original data sample X[i].
	    # X_noisy = X[i] + noise adds the generated noise to the original sample.
	    # The noisy data (X_noisy) is then appended to X_augmented, and the corresponding label (y[i]) is appended to y_augmented.
        noise = np.random.normal(0, 0.01, X[i].shape)
        X_noisy = X[i] + noise
        X_augmented.append(X_noisy)
        y_augmented.append(y[i])

    return np.array(X_augmented), np.array(y_augmented)

# Apply augmentation on the whole dataset before splitting
X_aug, y_aug = augment_data(X, y)

print(f'Augmented data shape: X={X_aug.shape}, y={y_aug.shape}')

X_aug

Augmented data shape: X=(36624, 4, 102, 1), y=(36624,)


array([[[[-0.31605341],
         [-0.32619682],
         [-0.34412525],
         ...,
         [ 0.32308469],
         [ 0.31376044],
         [ 0.30061452]],

        [[-0.11632468],
         [-0.12962838],
         [-0.14648719],
         ...,
         [-0.11461243],
         [-0.11596127],
         [-0.11936317]],

        [[-0.99076982],
         [-1.01335294],
         [-1.03497392],
         ...,
         [-0.41966924],
         [-0.41317054],
         [-0.40664085]],

        [[ 0.28014587],
         [ 0.29523614],
         [ 0.29830576],
         ...,
         [ 0.88123175],
         [ 0.86926358],
         [ 0.84805751]]],


       [[[-0.31605341],
         [-0.32619682],
         [-0.34412525],
         ...,
         [ 0.32308469],
         [ 0.31376044],
         [ 0.30061452]],

        [[-0.11632468],
         [-0.12962838],
         [-0.14648719],
         ...,
         [-0.11461243],
         [-0.11596127],
         [-0.11936317]],

        [[-0.99076982],
         [-1.0

In [None]:
#X_aug.shape[2]
print(X_subject.info['sfreq'])

X_aug.shape

1024.0
[[[-4.85724608e-01]
  [-5.27119373e-01]
  [-5.62268219e-01]
  [-5.90478106e-01]
  [-6.11181882e-01]
  [-6.23947065e-01]
  [-6.28502286e-01]
  [-6.24761993e-01]
  [-6.12808235e-01]
  [-5.92900675e-01]
  [-5.65509937e-01]
  [-5.31264647e-01]
  [-4.90954150e-01]
  [-4.45518758e-01]
  [-3.96053484e-01]
  [-3.43758174e-01]
  [-2.89920554e-01]
  [-2.35899617e-01]
  [-1.83065431e-01]
  [-1.32769500e-01]
  [-8.62986058e-02]
  [-4.48436433e-02]
  [-9.45373979e-03]
  [ 1.89875131e-02]
  [ 3.98056557e-02]
  [ 5.25734208e-02]
  [ 5.71027349e-02]
  [ 5.34707654e-02]
  [ 4.19962167e-02]
  [ 2.32894154e-02]
  [-1.81074607e-03]
  [-3.22203967e-02]
  [-6.66553415e-02]
  [-1.03667216e-01]
  [-1.41677501e-01]
  [-1.78994895e-01]
  [-2.13913476e-01]
  [-2.44730521e-01]
  [-2.69807191e-01]
  [-2.87654699e-01]
  [-2.96989723e-01]
  [-2.96749702e-01]
  [-2.86137936e-01]
  [-2.64685230e-01]
  [-2.32264439e-01]
  [-1.89059806e-01]
  [-1.35538586e-01]
  [-7.24695246e-02]
  [-8.88158521e-04]
  [ 7.7942052

(36624, 4, 102, 1)

    1.	nb_classes
	•	Represents the number of output classes for the classification task.
	•	The number of output nodes in the final dense layer of your CNN
	•	For example, in binary classification (like your case of detecting two classes), nb_classes=2.
	
    2.	Chans=4
	•	The number of input channels in the data (e.g., EEG channels such as P8, PO8, O2, P10).
	•	The architecture of the model uses this to define filter shapes (e.g., in the spatial convolution step).
	•	If you select 4 channels from the preprocessed data (['P8', 'PO8', 'O2', 'P10']), set Chans=4.
	
    3.	Samples=50
	•	The number of time samples in the input data.
	•	Helps the model understand the temporal dimension of the signal.
	
	•	In your case, if each epoch spans from t=0.10 to t=0.20 seconds and is sampled at 500 Hz, you would have: Samples = (0.20 - 0.10) seconds * 1000 Hz = 100 samples.
	•	The sampling frequency (or sampling rate) of the EEG data is typically provided in the metadata of the dataset or is known beforehand based on the experimental setup used during the EEG recording.
	
	•	Sampling Frequency = Number of Samples / Time Duration
	•	From your selected range t_min, t_max, which is 0.2 - 0.1 = 0.1 seconds. Sampling Frequency = 100 / 0.1 = 1000 Hz
	•	X_subject.info['sfreq'] returns 1024 Hz and X.shape[2] returns 102.



### Overfitting

Overfitting occurs when a model learns the noise or specific details of the training data too well, and as a result, it performs poorly on unseen data (test data). Dropout is a regularization technique designed to prevent overfitting.

By randomly deactivating (dropping out) a fraction of the neurons during training, dropout forces the network to learn more robust features. The model cannot rely on specific neurons too much, and this helps generalize better to unseen data.

Why Use a Gradually Increasing Dropout Rate?

	1.	Early Layers: Retain Information
	•	Early layers of a neural network often learn low-level features (such as edges in images or simple patterns in time-series data).
	•	In these layers, dropout is typically lower (e.g., 0.2 or 0.3) because the network still needs to retain the important low-level features to build more complex representations in the later layers. Too much dropout here could hurt the network’s ability to learn these basic features.
	
	2.	Later Layers: Prevent Overfitting
	•	Later layers (e.g., the fully connected layers) tend to learn more abstract and complex features or combinations of lower-level features.
	•	As the model progresses, it becomes more powerful (i.e., more parameters), and hence the risk of overfitting increases. To counteract this, dropout is typically higher in later layers (e.g., 0.5), meaning 50% of the neurons are dropped. This helps prevent the model from relying too heavily on specific neurons and promotes better generalization.
	
	3.	Progressive Regularization:
	•	Increasing the dropout rate as you move deeper into the network acts as a form of progressive regularization.
	•	Early in training, the network needs more capacity to learn the basic patterns, so a smaller dropout rate ensures it has sufficient power. However, once the basic features are learned, increasing the dropout in deeper layers reduces the chance that the model will memorize the specific details of the training data, helping to avoid overfitting.

### Blocks

1. block1 = Conv2D(25, (1, 5), padding = 'same', use_bias = False) (input_main)

•	This is a 2D convolutional layer that learns spatial features from the input data.

•	Conv2D is the 2D convolutional layer that is one of the building blocks of a Convolutional Neural Network (CNN). 

•	This layer performs convolutional operations on the input data to extract spatial features by sliding a filter (kernel) over the input. 

•	It is applied to 2D data (like images or time series data with multiple channels).

•	Filters (also called kernels) are learned during the training process. Each filter is responsible for detecting a specific pattern or feature in the input data (e.g., edges, temporal patterns in the case of EEG signals).

#### EX

•	25: The number of filters or kernels(neurons for each layer) to apply in this convolutional layer. Each filter learns different features from the input, so this layer will output 25 feature maps. 25 means that 25 separate filters will be applied to the input data. This means that after applying this convolution, you will get 25 output feature maps (one for each filter). These feature maps will represent different learned features from the input data.

•	(1, 5): The size of the filter (kernel) used in the convolution. The filter has dimensions 1x5:

•	1 is the height of the filter, meaning it will span across the channels dimension (i.e., it’s looking at the entire channel).

•	5 is the width of the filter, meaning it spans across 5 time samples.

•	padding='same': Ensures that the output feature map has the same width and height as the input feature map (i.e., padding is added to the input if necessary).

•	use_bias=False: No bias is used in this layer.

2. EEG-Specific Filters in CNNs:

•	For EEG data, filters may span across time and channels, such as:
	•	1 by 5: Detecting temporal patterns in a single channel.
	•	4 by 1: Capturing spatial relationships across 4 channels.
	•	4 by 5: Combining spatial and temporal information. 
		ex: # Block 1: Combining spatial and temporal information block1 = Conv2D(25, (Chans, 5), padding='same', use_bias=False)(input_main)  

•	input_main vs. block1:
	•	You apply the first convolution directly on the input data (input_main).
	•	For subsequent layers, you apply the convolution on the output of the previous layer (e.g., block1).

3. Temporal Filter for Input (Conv2D(25, (1, 5), padding='same')): Captures Temporal Features

•	Temporal: Detects frequency-specific patterns (e.g., alpha, beta, gamma bands).

•	Captures patterns or relationships over time within each individual channel (electrode).

•	EEG signals are time-series data, so temporal filters help in detecting features like oscillations, event-related potentials (ERPs), or rapid changes in voltage.

•	By applying a temporal filter of size (1, 5), we analyze small windows of time points for each channel independently.

•	Height = 1: The height is fixed to 1 because the goal is to analyze each channel independently. By keeping the height as 1, the filter does not mix information and stay independent across channels.

•	Width = 5: The width of 5 indicates the size of the temporal window being analyzed. This is a design choice and can vary based on:
	•	The sampling rate of your EEG data (e.g., how densely the time points are sampled).
	•	The temporal resolution of the patterns you’re trying to capture (e.g., short bursts vs. long trends).

•	For example:
	•	Detecting short-term changes in amplitude or frequency.
	•	Recognizing local patterns like spikes or bursts in the signal.

•	This step ensures the network extracts temporal dependencies in the data, which are crucial for identifying patterns like rhythmic 		 activity (alpha waves, theta waves) or time-locked responses.

4. Spatial Filter for Temporal Features (Conv2D(25, (Chans, 1))): Captures Spatial Features

when the kernel is applied, the result is placed at the top-left corner (or the position where the kernel is centered)

•	Spatial: Identifies brain regions generating or interacting with these oscillations.

•	After extracting temporal patterns within each channel, we apply a spatial filter to combine information across channels.

•	Combines information across multiple channels (electrodes).

•	Captures the spatial distribution of activity, which can reveal interactions or correlations between different parts of the brain.

•	EEG signals recorded at different electrodes (channels) are not independent—they often exhibit spatial dependencies:
	•	For instance, responses in occipital electrodes (back of the head) might correlate with visual stimuli.
	•	Spatial filters help detect distributed patterns of activation or relationships between electrodes.
	•	In some cases, specific brain activities (e.g., N170 ERP) may appear localized in certain regions but still involve coordinated activity across multiple electrodes.

•	By applying a filter of size (Chans, 1), the network learns how different channels contribute together to meaningful patterns.

#### EX for X_aug.shape = (36624, 4, 102, 1)

Where:
	•	36624 is the batch size (number of samples).
	•	4 is the number of channels (EEG electrodes).
	•	102 is the number of time samples (time points).
	•	1 is the depth (single-channel data).

##### Step 1: Temporal Convolution (Conv2D(25, (1, 5), padding='same', use_bias=False)(input_main))

This layer applies a temporal filter of size (1, 5) across the time dimension (102 time points) for each channel independently. The filter has:
	•	A height of 1 (which means it’s applied across all the channels), and
	•	A width of 5 (which means it’s applied across 5 consecutive time points).

Since you’re using padding='same', the output width will remain the same as the input width (102), and the depth increases to 25 (because you are using 25 filters). The height remains 4 since the filter only operates along the time dimension, and the channels are not affected.

After this operation, the output shape will be: (36624, 4, 102, 25)

	•	Batch size: 36624 (unchanged).
	•	Channels: 4 (unchanged).
	•	Time points (Samples): 102 (unchanged, because of padding='same').
	•	Depth (Filters): 25 (as specified by the number of filters).

So, the neurons in the first layer will have the shape (4, 102, 25). This means for each time point (in the time dimension) and each EEG channel (in the channel dimension), there will be 25 different features (neurons) created by the 25 filters.

##### Step 2: Spatial Convolution (Conv2D(25, (Chans, 1), use_bias=False)(block1))

This second convolution layer applies a spatial filter of size (4, 1) (because Chans = 4) across the channel dimension (4 EEG channels). The filter has:
	•	A height of 4 (which means it’s applied across all 4 channels), and
	•	A width of 1 (which means it’s applied across a single time point).

Since you are using padding='same', the width (time samples) remains unchanged at 102, but the height (channels) will reduce to 1 because the filter aggregates information across all channels into a single spatial representation for each time point.

After this operation, the output shape will be: (36624, 1, 102, 25)

	•	Batch size: 36624 (unchanged).
	•	Channels: 1 (because the filter aggregates all channels into one).
	•	Time points (Samples): 102 (unchanged).
	•	Depth (Filters): 25 (as specified by the number of filters).


5. Why Perform These Steps Sequentially?

•	Temporal first, then spatial:

•	EEG signals are first processed channel-wise because the temporal dynamics within a channel are critical and often unique to that channel.

•	Once these temporal features are extracted, combining them across channels enables the network to learn global spatiotemporal patterns that span the entire brain region.

•	Avoid losing temporal resolution early:
•	If we combined channels first (spatial filtering) before applying temporal filters, we would lose detailed temporal information about each channel’s signal.

6. Why Not Apply Both Simultaneously?

It’s possible to design filters that analyze both temporal and spatial dimensions at the same time (e.g., (Chans, 5)). However:
•	Decoupling spatial and temporal processing simplifies training because:
•	Temporal filters only learn from time-domain relationships (fewer parameters to optimize).
•	Spatial filters only learn from spatial relationships (also fewer parameters).

•	Sequential processing is particularly useful for EEG data, where:
•	Temporal patterns (oscillations, bursts) are often more consistent across trials, while spatial patterns can vary.
•	Isolating the two dimensions helps the network focus on distinct aspects of the data.

7. What happens with padding=‘same’ in convolution:

(Chans, samples, depth)

•	Padding = ‘same’ means that the spatial dimensions (height and width) of the input and output will remain the same if the stride is 1.

•	Spatial dimension (height): If you’re using padding='same', the height of the input will stay the same in the output (if the stride is 1). In your case, this is the number of EEG channels, which is fixed.

•	Temporal dimension (width): The width of the input (number of time samples) will also stay the same because of the same padding, assuming stride is 1. This corresponds to the number of time points.

•	Depth: The number of feature maps or filters used in the previous layer
•	First Convolution (block1): After applying the convolution with the filter size (1, 5), the depth will be 25 (as specified).
•	Second Convolution (block2): After applying the next convolution with the filter size (1, 5), the depth increases to 50 because you specified 50 filters in the layer.

8. What is Bias in a Neural Network?

In a typical neural network, each layer has two components that influence the output:
	1.	Weights: These define the strength of the connections between the neurons (or, in the case of convolutional layers, the kernel/filter).
	2.	Bias: This is an additional parameter added to the output of the convolution operation before applying the activation function. It shifts the activation of the neuron and can help the network learn a wider range of functions.

In EEG, the temporal and spatial dependencies between different channels and time points are critical. By using use_bias=False in conjunction with batch normalization, you are allowing the model to focus more on these dependencies without introducing unnecessary complexity. In fact, when dealing with EEG, the model needs to:
•	Learn temporal patterns (how EEG signals evolve over time),
•	Learn spatial patterns (how different channels might correlate or react together to certain stimuli, like faces),
•	Adapt to these patterns through convolutional filters and batch normalization (which controls variance and scaling).

9. Batch Normalization (block1 = BatchNormalization()(block1))

Batch normalization is applied to the feature maps generated by the previous convolutional layer. Here’s what it does:

It normalizes the activations of the previous layer across the mini-batch (across the batch dimension). This helps in reducing internal covariate shift, where the distribution of activations changes during training, which can slow down training and make it unstable.



Activations are the values produced by neurons in a layer after the network processes the input data and applies an activation function to them. 

For example, in a fully connected layer, the activation might be calculated as a weighted sum of the inputs to the neuron, followed by the application of a non-linear function like ReLU, Sigmoid, or ELU (feature map in out case).
•	Activation = sigma(W * X + b) 



In the context of batch normalization, the activation changes refer to the modification of these outputs across the network’s layers, particularly as they propagate forward during training.

Batch normalization changes the activations of neurons across the network in a specific way:

•	For each mini-batch of data, the activations of each neuron in that mini-batch are calculated.

•	The mean and standard deviation of the activations for each feature map (i.e., each channel or neuron) are computed across all examples in the mini-batch.

•	The activations are then normalized by subtracting the mean and dividing by the standard deviation. This process makes the activations have zero mean and unit variance, reducing internal covariate shift, which helps with model stability and speed during training.



The goal of normalizing activations is to reduce the variability in the distribution of activations from one layer to the next. 

Without normalization, the distribution of activations can vary significantly during training, which can cause problems for training stability and convergence speed (this is called internal covariate shift).
	
•	Internal covariate shift occurs when the distribution of outputs from one layer shifts during training as the network weights are updated, which can make it harder for the next layer to learn.
	
•	Batch normalization reduces this issue by ensuring that activations have a stable distribution across layers.

•	During training, the model processes the data in mini-batches rather than using the entire dataset at once (which is called batch gradient descent) or one data point at a time (which is stochastic gradient descent).

•	A mini-batch is a subset of the training data that the model processes simultaneously. For example, if the batch size is 32, each mini-batch will consist of 32 randomly selected samples from the training dataset.

•	Batch Normalization is applied independently at each layer. When you have mini-batches of size 32 (for example), each of the layers (block1, block2, block3, block4) will receive and normalize the data in mini-batches of 32 samples, and each layer will compute the normalization using the batch it is processing at that time. 

•	The epochs for the mini-batch selection are chosen from the original data: normalizing the feature data at each layer based on the mini-batch from the original data, not the processed feature maps

•	The feature maps for each neuron are normalized by subtracting the mini-batch mean and dividing by the mini-batch standard deviation. This helps ensure that the activations from different neurons in the layer are in a similar range, improving training stability.

10. ELU (Exponential Linear Unit)

Activation functions like ELU, ReLU, Sigmoid, and others are essential components of neural networks because they introduce non-linearity into the model. Here’s why we apply these functions to the output of neurons in a neural network:

activation functions help “normalize” or “scale” the input values to a range that’s easier for the network to work with

Introducing Non-Linearity:

•	Neural networks aim to model complex relationships in data. Without activation functions, a neural network would essentially be just a linear model, no matter how many layers it had. This is because stacking multiple linear operations (like matrix multiplications) still results in a linear transformation of the input.

•	Non-linear activation functions allow the network to model more complex patterns. They enable the network to learn and approximate complex functions or decision boundaries that would be impossible with just linear transformations.

•	In short, without these activation functions, even a deep neural network would behave like a single-layer model, unable to capture the intricacies of complex data like EEG signals.

ELU = 
x, if x > 0 
alpha (exp(x) - 1), if x 

•	ELU behaves like ReLU for positive values (outputs the input directly).

•	For negative values, it outputs an exponentially decaying value, which helps to prevent neurons from being “dead” (always outputting zero like in ReLU).

•	Both tanh and sigmoid suffer from the vanishing gradient problem when the input values are very large or very small. This is because their derivatives approach zero at the extremes of their input range.

•	ELU, on the other hand, avoids this issue for positive inputs (since its gradient is constant for positive values) and smoothens the behavior for negative inputs, making it a good choice for deep networks.

•	In your EEG project, particularly with N170 detection, the data likely contains negative values (as EEG signals fluctuate around a baseline), so using ReLU could cause neurons to “die” and stop contributing to learning if they encounter negative values.

•	ELU ensures that both negative and positive activations are treated properly, and the network learns better from both types of input.

•	ELU also helps speed up training, which is important when dealing with large datasets like EEG signals with many epochs.

11. max pooling operation

MaxPooling is a downsampling operation commonly used in convolutional neural networks (CNNs). 

It reduces the spatial dimensions (height and width) of the input while retaining the most important information. 

This helps to reduce the number of parameters, computational load, and the risk of overfitting.

The MaxPooling2D layer operates over a 2D window (or filter) that slides over the input.

The window size is defined by the tuple (1, 2). This means that:

	Along the height (time dimension), the window moves by 1 unit (no change in the height dimension).

	Along the width (sample dimension), the window moves by 2 units at a time, reducing the width by a factor of 2.

For each window, the maximum value within the window is selected and passed to the next layer.

##### Ex. Applying MaxPooling2D((1, 2)):

•	The window size is (1, 2), which means the pooling operation will look at 2 samples at a time (along the columns) and pick the maximum value for each row across these two adjacent samples. The height (time dimension) remains the same because the window size along the height is 1.

Time 1 (a1,1)	Time 2 (a1,2)	Time 3 (a1,3)	Time 4 (a1,4)	Time 5 (a1,5)	Time 6 (a1,6)	Time 7 (a1,7)	Time 8 (a1,8)
a1,1			a1,2			a1,3			a1,4			a1,5			a1,6			a1,7			a1,8
a2,1			a2,2			a2,3			a2,4			a2,5			a2,6			a2,7			a2,8
a3,1			a3,2			a3,3			a3,4			a3,5			a3,6			a3,7			a3,8
a4,1			a4,2			a4,3			a4,4			a4,5			a4,6			a4,7			a4,8

	1.	First window (samples 1 and 2):
		•	Look at columns 1 and 2:
		•	Max(a1,1, a1,2) = max(a1,1, a1,2)
		•	Max(a2,1, a2,2) = max(a2,1, a2,2)
		•	Max(a3,1, a3,2) = max(a3,1, a3,2)
		•	Max(a4,1, a4,2) = max(a4,1, a4,2)
		•	So, the pooled output for the first window is the maximum value from columns 1 and 2 for each row.
	
	2.	Second window (samples 3 and 4):
	•	Look at columns 3 and 4:
	•	Max(a1,3, a1,4) = max(a1,3, a1,4)
	•	Max(a2,3, a2,4) = max(a2,3, a2,4)
	•	Max(a3,3, a3,4) = max(a3,3, a3,4)
	•	Max(a4,3, a4,4) = max(a4,3, a4,4)
	
	3.	Third window (samples 5 and 6):
	•	Look at columns 5 and 6:
	•	Max(a1,5, a1,6) = max(a1,5, a1,6)
	•	Max(a2,5, a2,6) = max(a2,5, a2,6)
	•	Max(a3,5, a3,6) = max(a3,5, a3,6)
	•	Max(a4,5, a4,6) = max(a4,5, a4,6)
	
	4.	Fourth window (samples 7 and 8):
	•	Look at columns 7 and 8:
	•	Max(a1,7, a1,8) = max(a1,7, a1,8)
	•	Max(a2,7, a2,8) = max(a2,7, a2,8)
	•	Max(a3,7, a3,8) = max(a3,7, a3,8)
	•	Max(a4,7, a4,8) = max(a4,7, a4,8)

Time 1 (a1,1)	Time 2 (a1,3)	Time 3 (a1,5)	Time 4 (a1,7)
max(a1,1, a1,2)	max(a1,3, a1,4)	max(a1,5, a1,6)	max(a1,7, a1,8)
max(a2,1, a2,2)	max(a2,3, a2,4)	max(a2,5, a2,6)	max(a2,7, a2,8)
max(a3,1, a3,2)	max(a3,3, a3,4)	max(a3,5, a3,6)	max(a3,7, a3,8)
max(a4,1, a4,2)	max(a4,3, a4,4)	max(a4,5, a4,6)	max(a4,7, a4,8)

12. What is Dropout?

•	A base dropout rate for regularization.
•	Specifies the fraction of neurons to randomly “turn off” during training to prevent overfitting.

•	During training, 50% of the neurons in the layer will be randomly deactivated (or “dropped out”) at each training step
•	This helps prevent the network from relying too heavily on any single neuron, promoting more robust learning.

•	0 means no dropout (i.e., all neurons are active during training).
•	1 means all neurons are deactivated, which is not practical because the network would not learn.
•	0.2 (for early layers)
•	0.3 to 0.4 (for deeper layers)
•	0.5 (for the final fully connected layers to prevent overfitting)

•	Recommendations:
•	0.2 to 0.3: For shallow networks or when you want to allow more information flow through the network.
•	0.4 to 0.5: For deeper networks or more complex models to ensure strong regularization and prevent overfitting.

•	In your case, in the code you shared, different dropout rates are applied to different layers:
•	Block 1: Dropout rate is 0.2. So, 20% of the neurons in this layer will be randomly deactivated.
•	Block 2: Dropout rate is 0.3. 30% of the neurons in this layer will be dropped.
•	Block 3: Dropout rate is 0.4. 40% of the neurons in this layer will be dropped.
•	Block 4: Dropout rate is 0.5. 50% of the neurons in this layer will be dropped.



How does Dropout work?

•	During training:

•	Dropout will randomly set 20% of the activations in the input tensor to zero.
•	The remaining 80% of neurons will be scaled up to ensure that the overall output is balanced. This means the activations of the remaining neurons are multiplied by a factor of 1/(1 - 0.2) = 1.25 during training.


•	During inference (testing or prediction):

•	No dropout is applied. All neurons are used for computation, but the weights are scaled down by the same factor of 0.8 to match the effect of training.



13. Why Dropout?

•	Prevents overfitting: Dropout forces the model to not rely on specific neurons and helps it generalize better to unseen data.

•	Improves robustness: By randomly disabling neurons during training, the network learns to create more robust features and can generalize to new patterns more effectively.


Example:

Imagine that block1 is a feature map with shape (36624, 1, 102, 25) (for simplicity, let’s assume it’s just 25 channels and 102 time points for a single epoch). When Dropout is applied:

•	Randomly 20% of the values from the block1 output will be set to zero.

•	This means that during each training iteration, the network will “forget” about certain neurons (features) and only focus on the remaining 80%.


Increasing the Number of Filters:

•	Purpose: The number of filters (also known as channels or feature maps) increases as we go deeper into the network because deeper layers are meant to learn increasingly abstract and complex features from the data.

•	Reason: Early layers of the network (e.g., Block 1) focus on simple features like edges, textures, or local patterns. As the network progresses, it captures more complex, high-level features that are built from the simpler ones detected earlier.

•	For example:
•	Block 1 (25 filters) might learn simple patterns in the data (like edges or small features in EEG signals).
•	Block 2 (50 filters) might combine these simple features to detect more complex patterns.
•	Block 3 (100 filters) could detect even higher-level patterns, such as combinations of previous patterns.
•	Block 4 (200 filters) can capture even more intricate patterns or more abstract representations.

By increasing the number of filters, the network has the capacity to learn more complex features from the data. This is especially useful as the layers get deeper, and the network needs more “capacity” to model more abstract representations.

14. Increasing the Dropout Rate:

•	Purpose: Dropout is a regularization technique that helps prevent overfitting by randomly “dropping out” (setting to zero) a fraction of the neurons in the network during training. The purpose of increasing the dropout rate as you go deeper in the network is to reduce overfitting and improve generalization as the model learns more complex patterns.

•	Reason:

•	In the early layers (Block 1), the network is learning basic features, so overfitting is less of an issue. A lower dropout rate (e.g., 0.2) allows the model to focus on learning the simpler features effectively.

•	As the network gets deeper, the features it learns become more complex. At this point, overfitting to these more complex features can become a greater risk. Increasing the dropout rate (e.g., to 0.5 in Block 4) forces the network to regularize more heavily and prevents it from relying too much on any single neuron, which helps improve generalization to unseen data.

14. Overview of Training Phases

	1.	Block 1 to Block 4: Feature Extraction
	•	The layers in Block 1 to Block 4 are primarily focused on feature extraction. These blocks consist of convolutional layers (Conv2D), activation functions (e.g., ELU), normalization (BatchNormalization), pooling layers (MaxPooling2D), and dropout layers (Dropout).
	•	The role of these layers is to process the input data, extract important features, and reduce the spatial dimensions while increasing the depth (number of filters).
	•	During training, these layers are trained through backpropagation, meaning the weights in the convolutional layers (filters) are adjusted to minimize the loss function.
	
	2.	Flattening the Output
	•	After Block 4, you have a feature map with 1 row and 1 column (if we assume this scenario). This output is passed through a Flatten layer, which converts the multi-dimensional feature map into a 1D vector.
	•	This flattened vector is what will be passed to the Dense layer.
	
	3.	Dense Layer: Final Classification (Fully Connected Layer)
	•	The Dense layer is a fully connected layer, meaning it connects every input neuron (from the flattened feature map) to every output neuron (nb_classes, in this case).
	•	In the Dense layer, the network learns to map the features extracted by the previous blocks to the final classification output (e.g., class probabilities).
	
	4.	Activation (Softmax)
	•	The output from the Dense layer is passed through a Softmax activation. Softmax is typically used in classification tasks to convert the output scores into probability distributions over the classes (for multi-class classification). It ensures that the sum of the output values is 1 and each value represents the likelihood of each class.


In [None]:
### CNN Model Implementation using DeepConvNet
# Defines a convolutional neural network (CNN) tailored for classification tasks.

def DeepConvNet(nb_classes, Chans, Samples):
    
    # This defines the shape of the input data that the model will receive.
    # 1: Indicates that the input is a single-channel input (i.e., grayscale or a single feature per time sample)
    input_main = Input((Chans, Samples, 1))
    
    # Block 1
    block1 = Conv2D(25, (1, 5), padding = 'same', use_bias = False) (input_main)
    block1 = Conv2D(25, (Chans, 1), use_bias = False) (block1)
    block1 = BatchNormalization()(block1)
    # applying the ELU (Exponential Linear Unit) activation function to the output of the previous layer
    block1 = Activation('elu')(block1)
    block1 = MaxPooling2D((1, 2))(block1)
    # Hard coded dropout rate instead of using a parameter for the progessive regularization
    block1 = Dropout(0.2)(block1)
    
    # Block 2
    block2 = Conv2D(50, (1, 5), padding='same', use_bias=False)(block1)
    block2 = BatchNormalization()(block2)
    block2 = Activation('elu')(block2)
    block2 = MaxPooling2D((1, 2))(block2)
    block2 = Dropout(0.3)(block2)
    
    # Block 3
    block3 = Conv2D(100, (1, 5), padding='same', use_bias=False)(block2)
    block3 = BatchNormalization()(block3)
    block3 = Activation('elu')(block3)
    block3 = MaxPooling2D((1, 2))(block3)
    block3 = Dropout(0.4)(block3)
    
    # Block 4
    block4 = Conv2D(200, (1, 5), padding='same', use_bias=False)(block3)
    block4 = BatchNormalization()(block4)
    block4 = Activation('elu')(block4)
    block4 = MaxPooling2D((1, 2))(block4)
    block4 = Dropout(0.5)(block4)
    
    ### Flatten and Dense Layers
    # After your convolutional and pooling layers, the feature map size has been reduced to 1 \times 1 \times N, where N is the number of channels (depth). 
    # The Flatten() layer is used to convert this 3D tensor into a 1D vector (flattened vector) so that it can be fed into fully connected (dense) layers.
    '''
    1.	Input to Flatten:
	•	Suppose the output from Block 4 is a feature map of size (1, 8, 200). This means it has 1 row, 8 columns, and 200 channels (depth). These 200 channels represent the feature maps learned by Block 4.
	2.	Flattening Process:
	•	The Flatten operation takes the entire (1, 8, 200) feature map and converts it into a 1D vector.
	•	The size of the vector will be the product of the dimensions of the feature map. So, if the feature map is (1, 8, 200), 
        flattening it will result in a vector of size: 1 * 8 * 200 = 1600
	•	This means the output of the Flatten operation will be a 1D vector of size 1600.
    '''
    flatten = Flatten()(block4)
	# The Dense layer (also called a fully connected layer) applies a set of weights to the input vector. Each neuron in the Dense layer is connected to every value in the input vector.
	# nb_classes represents the number of output units or neurons in the dense layer, which typically corresponds to the number of categories (or classes) you want to classify.
    # For example, if nb_classes = 10, this means that the Dense layer will output a vector of size 10.
	# In a Dense layer, the input vector (of size 1600, in our case) is multiplied by the weight matrix of the layer and then has a bias term added to it.
    # If nb_classes = 2 and the flatten is a 1D vector with 1600 values, the Dense layer will output a 1D vector of size 2, which represents the scores for the two classes.
    # W = [[w1,1, w1,2, ..., w1,1600],  is multiplied by 1 by 1600 input vector x
    #      [w2,1, w2,2, ..., w2,1600]]
    dense = Dense(nb_classes)(flatten)
    # After the Dense layer, you apply an activation function (like Softmax or Sigmoid) to get probabilities for each class, which helps in classification tasks.
    # Softmax: Usually used when you want to treat the outputs as mutually exclusive classes (one class is chosen).
	# Example: Classifying an image into one of 10 categories (e.g., cat, dog, bird, etc.).
	# Sigmoid: Used for binary decisions or when multiple classes can be true simultaneously (multi-label classification).
	# Example: Classifying an image as containing a cat and a dog (both can be true at the same time).
    softmax = Activation('softmax')(dense)
    
    return Model(inputs = input_main, outputs = softmax)




1. Gradient Descent and Local Minima

What is Gradient Descent?

	•	Gradient Descent is an optimization algorithm used to minimize a loss function (a mathematical measure of how far the network’s predictions are from the actual labels).
	
    •	The gradient of the loss function with respect to the weights indicates the direction and magnitude of change required to reduce the loss.

How Does It Work?

	1.	Forward Pass:
	•	Input data is passed through the network layers (convolutions, activations, pooling, etc.), producing an output (prediction).
	
    2.	Compute Loss:
	•	The output is compared to the ground truth (actual labels) using a loss function (e.g., cross-entropy for classification).
	
    3.	Backward Pass (Backpropagation):
	•	The gradient of the loss with respect to each weight in the network is computed using the chain rule.
	•	Gradients flow backward from the output layer to earlier layers, updating the weights to reduce the loss.

Key Role of Local Minima:

	•	By adjusting weights based on the gradient, the algorithm moves toward a local minimum of the loss function, where the loss is lowest and the predictions are most accurate.
	
    •	Global vs. Local Minima: In deep learning, the loss landscape is highly complex with many local minima. Modern optimization techniques (like stochastic gradient descent) help navigate this landscape effectively.

2. Gradient Calculation

	•	For each training sample or batch:
	•	During the forward pass, the model predicts an output using the current weights.
	•	The loss is calculated by comparing the predicted output to the true label.
	•	During the backward pass, gradients of the loss with respect to each weight (∂Loss/∂W) are computed. These gradients represent the direction and magnitude of change needed to minimize the loss.

	For each weight  W_{ij} :
	•	Gradients are calculated independently for that weight.
	•	The weight is updated using the gradient:

	W_{ij} \leftarrow W_{ij} - \eta \cdot \frac{\partial \text{Loss}}{\partial W_{ij}}

	Where:
		•	 \eta  is the learning rate (a small positive number determining the step size).
		•	 \frac{\partial \text{Loss}}{\partial W_{ij}}  is the gradient of the loss with respect to  W_{ij} .
	
	When training in batches or mini-batches:
	•	Gradients are computed for each sample in the batch.
	•	The gradients are then averaged across the batch.
	•	The averaged gradient is used to update the weights.

	This averaging ensures that the updates are more stable and reflect the overall trend of the batch rather than being influenced by individual noisy samples.

3.	Optimize the Cost Function using Derivatives

Derivatives quantify how small changes in one variable (e.g., a weight or bias) affect another variable (e.g., the cost). 

	•	A derivative measures the rate of change of one variable with respect to another.
	•	For a function  f(x) , the derivative  f{\prime}(x)  tells us how much  f(x)  changes when  x  changes slightly.

	Example:
	Suppose  f(x) = x^2 :
	•	When  x = 2 ,  f'(x) = 2x = 4 
	•	This means that at  x = 2 , if  x  increases by a tiny amount (Delta x),  f(x)  will increase approximately by  4 * Delta(x)

	The derivative gives us a local understanding of how changes in  x  affect  f(x) .



The goal of training a neural network is to minimize the cost function (e.g., mean squared error or cross-entropy), which measures how far the model’s predictions are from the true labels.

	•	To find the optimal weights (W) that minimize the cost, we need to know:
	•	In which direction to adjust W.
	•	How much to adjust W.
	•	The derivative of the cost w.r.t. the weights provides this information:

\frac{\partial \text{Cost}}{\partial W}

	•	Sign of the derivative: Determines the direction of adjustment.
	•	If positive, reduce the weight.
	•	If negative, increase the weight.
	•	Magnitude of the derivative: Determines the size of the adjustment.
	•	Large derivative → larger change.
	•	Small derivative → smaller change.

In a neural network, the cost function  C  depends on the weights  W  (and biases). Each weight contributes to the model’s output, which in turn affects the cost.

The derivative  \frac{\partial C}{\partial W}  tells us:
	•	How sensitive the cost is to a small change in a particular weight.
	•	Specifically, it answers: “If I slightly increase or decrease this weight, how will the cost change?”

Why is this helpful?

	1.	If the cost decreases significantly when a weight increases, we want to increase that weight.
	2.	If the cost increases when a weight increases, we want to decrease that weight.



Gradient descent also uses derivatives to iteratively adjust the weights:

W_{\text{new}} = W_{\text{old}} - \eta \cdot \frac{\partial \text{Cost}}{\partial W}

	•	By repeatedly moving in the direction of the negative gradient, the algorithm converges toward the minimum of the cost function.


Measure Sensitivity

	•	The derivative tells us how sensitive the cost is to changes in a specific weight or input. This helps identify which parameters have the most significant impact on the network’s performance.



Chain Rule in Backpropagation :

	•	In a neural network,  C  depends on  W  indirectly through intermediate layers:

	w -> z -> a -> c 

	•	C = f(g(h(W)))

	•	To compute  \frac{\partial C}{\partial W} , we use the chain rule:

	•	\frac{\partial C}{\partial W} = \frac{\partial C}{\partial h} \cdot \frac{\partial h}{\partial g} \cdot \frac{\partial g}{\partial W}

	•	This ensures the gradient accounts for all layers and connections between  W  and  C .




1. compile

The optimizer is responsible for updating the weights during training to minimize the loss function.

Adam (used in this code):

•	Adam (Adaptive Moment Estimation) combines:
	•	Momentum: Accelerates convergence by considering past gradients.
	•	Adaptive Learning Rates: Scales learning rates for each weight individually based on past gradients.

    •	Why it matters: Adam automatically adjusts the learning rate for each weight based on the gradient history. This is particularly useful in CNNs, where:
	
    •	Gradients might vary significantly across layers (e.g., early layers vs. deeper layers).
	
    •	Features extracted in convolutional layers may lead to diverse scales of gradient values.
	
    •	Benefit for your project: Since your CNN involves multiple blocks (block1 to block4), adaptive learning rates ensure stable and efficient updates across all layers without requiring manual tuning.

    •	Each weight in the network (whether in block1, block2, or block4) gets its own learning rate that adjusts dynamically during training.
	
    •	Adam essentially scales the influence of the gradient based on how it behaved in previous iterations, ensuring smoother and more stable updates.
	

•	Parameters:
	•	learning_rate = 1e-3: Initial step size for weight updates.
	•	Common values: 10^{-2}, 10^{-3}, 10^{-4}.
    
    •	Higher values (e.g., 1e-2) can lead to faster initial learning but risk overshooting the optimal weights.
	
    •	Lower values (e.g., 1e-4) result in slower training, which might be unnecessary given Adam’s built-in safeguards against large gradient steps.

•	Other Adam-specific parameters:
	•	beta_1=0.9: Exponential decay rate for the first moment estimate.
	•	beta_2=0.999: Exponential decay rate for the second moment estimate.
	•	epsilon=1e-7: Small value to prevent division by zero.

Other Common Optimizers:

•	SGD (Stochastic Gradient Descent):
	•	Updates weights using a fixed learning rate.
	•	Parameters:
	•	learning_rate: Fixed step size.
	•	momentum: Adds inertia to updates.
	•	Example: optimizer=SGD(learning_rate=0.01, momentum=0.9)

    •	Simple Approach: Updates weights based on the gradient of the loss function with respect to weights.
	
    •	Learning Rate Scheduling: Requires careful tuning of the learning rate, often needing decay strategies for optimal results.
	
    •	Momentum (Optional): Can include momentum to accelerate convergence.

When to Choose What?

Adam:

	•	When you’re working with:
	•	A deep architecture (e.g., Transformers, CNNs, RNNs).
	•	Sparse data or embeddings (e.g., language models, recommendation systems).
	•	A new problem where hyperparameter tuning is challenging.

SGD:

	•	When:
	•	You can invest time in tuning learning rates and momentum.
	•	You prioritize generalization over faster convergence.
	•	Working on image tasks like ResNet or models with batch normalization.



loss = 'sparse_categorical_crossentropy'

What Does It Mean?

	•	The loss function is the measure of how well the model’s predictions match the true labels. During training, the optimizer minimizes this loss.
	•	sparse_categorical_crossentropy is a specific type of loss function used for classification problems where:
	•	The target labels are integers (e.g., 0, 1, 2, …).
	•	The model outputs probabilities for each class via a softmax activation.

Why Sparse?

	•	Sparse means the target labels are provided as integers instead of one-hot encoded vectors.
	•	Example: If you have 3 classes, a target label of 1 would represent the class [0, 1, 0] in one-hot encoding.
	•	sparse_categorical_crossentropy saves computation and memory by working directly with integer labels.



metrics = ['accuracy']

What Does It Mean?

	•	Metrics are used to evaluate the performance of your model during training and validation. Unlike the loss function, metrics don’t influence training directly but provide useful feedback.

Why Accuracy?

	•	Accuracy is the percentage of correctly classified samples:

\text{Accuracy} = \frac{\text{Number of Correct Predictions}}{\text{Total Number of Predictions}}

	•	For classification tasks, it’s a straightforward and interpretable metric.

When to Use:

	•	Use accuracy for balanced datasets where each class has roughly the same number of examples.
	•	If your dataset is imbalanced, consider alternative metrics like precision, recall, or F1-score.



2. EarlyStopping

•	What It Does:
	•	Monitors the model’s performance on the validation dataset (val_loss in this case).
	•	Stops training early if the monitored metric (val_loss) does not improve for a specified number of epochs (patience).
	•	Prevents overfitting and saves computation time by avoiding unnecessary training epochs.
	
•	Key Parameters:
	•	monitor: The metric to track. Common choices are:
	•	'val_loss': Validation loss (used here).
	•	'val_accuracy': Validation accuracy.

•	patience: Number of epochs to wait for improvement before stopping training.
	•	In this case, training will stop if val_loss does not improve for 25 consecutive epochs.

•	restore_best_weights: Ensures the weights from the best epoch are used at the end of training.
	•	Prevents the model from ending with weights from a less optimal epoch.

3. ReduceLROnPlateau

•	What It Does:
	•	Dynamically adjusts the learning rate during training if the model’s performance plateaus (stops improving).
	•	Reducing the learning rate allows the model to make smaller, more precise adjustments to the weights, which can help it converge better.
	
•	Key Parameters:
	•	monitor: The metric to track. Like EarlyStopping, this is set to 'val_loss' here.
	
	•	factor: The factor by which the learning rate is reduced.
		•	In this case, the learning rate is halved (factor=0.5) when the validation loss stops improving.
	
	•	patience: Number of epochs to wait before reducing the learning rate.
		•	If val_loss does not improve for 10 epochs, the learning rate is reduced.
	
	•	min_lr: The minimum learning rate allowed.
		•	Prevents the learning rate from becoming too small and stopping useful updates.
		•	Here, it’s set to 1e-6.
	
	•	verbose: Controls the verbosity.
		•	verbose=1 prints a message when the learning rate is reduced.


In [None]:
### Parameters for DeepConvNet
# Binary classification
nb_classes = 2  
# Chans: Number of channels.
Chans = X.shape[1]
# Samples: Number of time samples.
Samples = X.shape[2]

# Compile the model
model = DeepConvNet(nb_classes = nb_classes, 
                    Chans = Chans, 
                    Samples = Samples)

model.compile(optimizer = Adam(learning_rate = 1e-3),
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

# Adjust EarlyStopping and ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', 
                               patience=25, 
                               restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', 
                              factor=0.5, 
                              patience=10,
                              min_lr=1e-6, 
                              verbose=1)


1. epochs

•	Definition:
	•	The number of full passes over the training dataset during training.
	•	Each epoch involves training the model on the entire dataset once.
	
•	Value Set:
	•	epochs = 500 means the model will go through the dataset up to 500 times unless interrupted by EarlyStopping.
	
•	Why 500?:
	•	It’s a high enough value to ensure the model has ample opportunity to learn.
	•	In practice, EarlyStopping will likely terminate the training before reaching 500 epochs if the validation loss stops improving.

The epochs in the context of training a neural network refer to how many times the entire training dataset is passed through the model, not the number of individual data points in the dataset.

Backpropagation will occur once per epoch, so if you have 500 epochs, backpropagation will be applied 500 times.

•	Epochs: One epoch means that the model has seen every data point in the training set once.
	
    •	For example, if you have a dataset with 13,000 samples, and you set epochs = 500, this means the model will go through the 13,000 samples 500 times during training.

    •	When you train the model, it will process all 13,000 samples in one epoch.

	•	After completing the first pass (epoch) over the data, the model will adjust the weights using backpropagation and then start the next epoch, repeating this 500 times.

In one epoch, you:
	1.	Do the forward pass to get predictions.
	2.	Compute the loss based on the predictions.
	3.	Perform backpropagation to compute the gradients of the loss with respect to the model’s weights.
	4.	Use gradient descent (or Adam, or other optimizers) to update the weights.

2. batch_size

•	Batch size is the number of samples from your training dataset that the model will process simultaneously in one forward and backward pass before the weights are updated.

In your case, with batch_size = 16, the model will:
	1.	Take 16 samples from the training dataset (e.g., 16 images or 16 data points).
	2.	Perform a forward pass through the network with these 16 samples.
	3.	Compute the loss (error) for all 16 samples.
	4.	Perform backpropagation to calculate the gradients of the loss for each of the weights, considering all 16 samples at once.
	5.	Use the optimizer (e.g., Adam) to update the weights based on the calculated gradients.

After these steps, the model will have completed one batch of training, and then it will move on to the next batch of 16 samples, continuing the training process for the entire dataset.

Why Batch Size Matters:

•	Smaller Batch Sizes (like 16):
	•	Pros:
	•	Faster computation per batch since fewer samples are processed at a time.
	•	Can provide a more accurate estimate of the gradients for each batch.
	•	Helps in better generalization by adding more noise in the gradient estimates (this can help avoid overfitting).
	
    •	Cons:
	•	The updates to the weights can be noisier and less stable, as the gradient estimate might be less accurate for smaller batches.

•	Larger Batch Sizes:
	•	Pros:
	•	More stable gradient updates as the weight updates are computed over a larger number of samples.
	•	Can potentially speed up training since fewer updates are needed for each epoch.
	
    •	Cons:
	•	Can be more computationally expensive and require more memory.
	•	Might overfit more since the gradients are less noisy.

Given 13,000 samples and a batch size of 16, the dataset will be divided into batches, each containing 16 samples. For each epoch, the model will process all of these batches sequentially. 

When batch normalization is used in the model, it is applied independently to each batch rather than to the entire dataset.


In [None]:

# Training
epochs = 500
batch_size = 16  # Keep batch size small for better gradient estimation

# Split data into training and testing sets
# split the dataset into training and test sets, ensuring that the model can be trained on one portion of the data and evaluated on another to check its generalization performance.
# test_size=0.2:
    # This parameter specifies the proportion of the dataset to be used for the test set.
    # 0.2 means 20% of the data will be used for testing, and the remaining 80% will be used for training (the model will be trained on 80% of the data and tested on 20%).
# random_state=42:
    # This is a seed for the random number generator that ensures the split is reproducible. If you run the code multiple times with the same random_state, you will get the same split of data each time.
    # It’s a good practice to set a random_state when you want to ensure that your results can be replicated exactly.
# stratify=y_aug:
    # This ensures that the distribution of classes in the training and test sets is similar to the distribution in the original dataset (y_aug).
    # For example, if your data has an equal number of classes, stratification ensures that both the training and test sets will have roughly the same proportions of each class.
    # This is especially useful in imbalanced datasets, where one class might be underrepresented. Without stratification, the test set might end up with very few or no samples of a certain class, which would hurt the model evaluation.
X_train_val, X_test, y_train_val, y_test = train_test_split(X_aug, 
                                                            y_aug, 
                                                            test_size = 0.2, 
                                                            random_state = 42, 
                                                            stratify = y_aug)


# Train the model
# validation_data=(X_test, y_test):
	# During training, the model will also evaluate its performance on the validation data (X_test, y_test) at the end of each epoch (i.e., after each pass through the training data).
	# The validation data is used to track how well the model is generalizing to unseen data, and it’s crucial for monitoring overfitting.
	# The loss and accuracy on the validation data will be calculated and printed alongside the training loss and accuracy.
history = model.fit(X_train_val, 
                    y_train_val,
                    epochs = epochs,
                    batch_size = batch_size,
                    validation_data = (X_test, y_test),
                    callbacks = [early_stopping, reduce_lr],
                    verbose = 1)

In [None]:
# Evaluation and Visualization
# Evaluates the trained model on the test data (X_test) and the corresponding true labels (y_test).
# This function calculates two things:
	# Test Loss: The loss value on the test data (how well the model’s predictions match the true labels).
	# Test Accuracy: The accuracy of the model on the test data (how many predictions were correct).
# verbose=0: This suppresses output during evaluation. If set to 1, it will display the progress bar.
# print(f'Test Accuracy: {test_accuracy * 100:.2f}%'): This prints the test accuracy, multiplied by 100 to express it as a percentage.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose = 1)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# Visualize training history
# Plot accuracy
plt.figure(figsize = (12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label = 'Train Accuracy', color = 'blue')
plt.plot(history.history['val_accuracy'], label = 'Validation Accuracy', color = 'orange')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label = 'Train Loss', color = 'blue')
plt.plot(history.history['val_loss'], label = 'Validation Loss', color = 'orange')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()


# Create the 'figures' directory if it does not exist
if not os.path.exists('figures'):
    os.makedirs('figures')
    
# Save the figure to the 'figures' directory in the current working directory
plt.savefig('figures/model_accuracy_loss.png')
# plt.close() 

In [None]:
# Predictions
# This generates the predicted probabilities for each class for the test samples. 
# Since the model output is a probability distribution over the classes (using the softmax activation function in the final layer), the output y_pred_prob is a 2D array with the shape (num_samples, num_classes).
y_pred_prob = model.predict(X_test)
# This finds the class with the highest predicted probability for each sample. 
# The axis=1 means it looks across the columns (the class probabilities for each sample), and np.argmax returns the index of the highest value (which corresponds to the predicted class). 
# The result is a 1D array of predicted class labels (y_pred).
y_pred = np.argmax(y_pred_prob, axis=1)

# Confusion Matrix and Classification Report
# This calculates the confusion matrix, which compares the true labels (y_test) to the predicted labels (y_pred). 
# The confusion matrix is a table used to describe the performance of a classification model. 
# It shows the counts of true positive, true negative, false positive, and false negative predictions.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))

sns.heatmap(cm, 
            annot = True, 
            fmt = 'd', 
            cmap = 'Blues',
            xticklabels = ['Face', 'Car'],
            yticklabels = ['Face', 'Car'])

plt.xlabel('Predicted Label')

plt.ylabel('True Label')

plt.title('Confusion Matrix')

plt.show()


# Save the figure to the 'figures' directory in the current working directory
plt.savefig('figures/confusion_matrix.png')

# plt.close()


report = classification_report(y_test, y_pred, target_names=['Face', 'Car'])

print(report)


# Save the classification report to a text file
with open('classification_report.txt', 'w') as f:
    f.write(report)  # Write the report to the file


model.save('figures/V2.keras')