### Todo
1. fix plotting graph error
2. get the data
  - calculate how much data is needed each time, and try to balance the "on" and "off" classes
3. test the model and adjust the model layers
5. create an app that automatically preprocesses the data
  - Select between SMOTE and RandomOverSampler
  - modifying the Layers
    - dropout selection
    - LSTM vs GRU
    - modify the number of nodes used in each layer
      - if the model is overfitting, decrease the number of nodes
      - if the model is underfitting, increase the number of nodes
6. add the model analysis graph at the end of the training


### For real time EEG use (if we already have a trained model)

1. real time Data acquisition
2. Data Preprocessing Pipeline
3. Main Real time Loop

## Let's set up and load some files

In [None]:
!pip install imblearn

In [None]:
# Imports, grouped as: standard library, data/plotting stack, class balancing,
# and TensorFlow/Keras.
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, ReLU, MaxPool1D, LSTM, Flatten, Dense, Dropout
# EarlyStopping, ModelCheckpoint and ReduceLROnPlateau are training callbacks;
# they are provided by `keras.callbacks`, not `keras.optimizers`.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [None]:
# Mount Google Drive at /content/drive; the data and model paths used in this
# notebook all start with that prefix.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Unpack the zipped CSV export stored on Google Drive.
# Set `filepath` to the archive and `extract_path` to the folder that should
# receive its contents.
filepath = "/content/drive/MyDrive/EEG_on_off/.csv.zip"
extract_path = "/content/drive/MyDrive/EEG_on_off/"
# The handle is named `archive` rather than `zip` so the built-in zip() is not
# shadowed for the rest of the notebook.
with ZipFile(filepath, "r") as archive:
    archive.extractall(extract_path)

In [None]:

# Plan (author's note): record one file while thinking "on" and one file while
# thinking "off", then label every row of each file here as on/off (1/0).
# Set both paths to the extracted CSV files before running this cell.
on_path = ""
off_path = ""

# Fail early with an actionable message when a path is still the empty
# placeholder, instead of passing "" to pd.read_csv.
if not on_path or not off_path:
    raise ValueError(
        "Set on_path and off_path to the 'on' and 'off' CSV files before loading."
    )

on_data = pd.read_csv(on_path)
off_data = pd.read_csv(off_path)

### Data Exploration

In [None]:
# Report the (rows, columns) shape of each recording, "on" first.
shape_on, shape_off = on_data.shape, off_data.shape
print(shape_on, shape_off, sep="\n")

Add a label to both datasets

In [None]:
# Tag every row with its class: 1 for the "on" recording, 0 for the "off" one.
on_data["label"], off_data["label"] = 1, 0

In [None]:
# Summary statistics for both recordings, "on" first and "off" second.
display(on_data.describe(), off_data.describe())

In [None]:
# Preview the first five rows of the "on" recording.
on_data.head(n=5)

In [None]:
# Preview the first five rows of the "off" recording.
off_data.head(n=5)

In [None]:
# Print the column/dtype/memory summary of each recording, "on" first.
for frame in (on_data, off_data):
    frame.info()

Compare Sample data



In [None]:
"""
!!! Don't forget to change the sequence length here
"""

# Pick one random row to compare between the two recordings. The upper bound
# is capped by the shorter frame so the index is valid for both of them; a
# fixed bound of 100 would index past the end of a shorter recording.
n = np.random.randint(min(100, len(on_data), len(off_data)))
# Number of leading columns treated as the EEG signal; update it whenever the
# recording length changes.
seq_len = 2549

# .iloc[row, column]: one row, first `seq_len` columns.
on_eeg = on_data.iloc[n, :seq_len]
off_eeg = off_data.iloc[n, :seq_len]

# Plot the raw values so the x-axis is the position within the recording.
# Plotting the Series directly would use the CSV column names as x values,
# which matplotlib treats as categories when they are strings.
plt.plot(on_eeg.to_numpy(), label="on")
plt.plot(off_eeg.to_numpy(), label="off")
plt.xlabel("Sample index")
plt.legend()
plt.show()

Plot Entire data

In [None]:
# Overview of all recordings: one subplot per class, one line per sample (row).
plt.figure(figsize=(16,10), dpi=200)

# 2 rows x 1 column grid, position 1: the "on" recordings.
plt.subplot(2, 1, 1)
# Transposing puts the signal position on the x-axis and draws one line per
# row. The untransposed frame would draw one line per column, and passing
# label= to plot() would add a legend entry for every one of those lines.
on_lines = plt.plot(on_data.iloc[:, :seq_len].to_numpy().T)
if on_lines:
    # Label a single line so the legend holds one entry for the whole class.
    on_lines[0].set_label("on")
plt.title("On")
plt.legend()

# 2 rows x 1 column grid, position 2: the "off" recordings.
plt.subplot(2, 1, 2)
off_lines = plt.plot(off_data.iloc[:, :seq_len].to_numpy().T)
if off_lines:
    off_lines[0].set_label("off")
plt.title("Off")
plt.legend()

plt.show()

Combine both data frames

In [None]:
# Build one dataset from both recordings.
# Labels: 1 = on, 0 = off (assigned again here).
on_data["label"], off_data["label"] = 1, 0

# Stack the rows of both frames; ignore_index=True renumbers the rows instead
# of keeping each frame's own row numbers.
combined_data = pd.concat((on_data, off_data), axis=0, ignore_index=True)

# Inspect both ends of the stacked frame, then show its shape.
print(combined_data.head(), combined_data.tail(), sep="\n")
combined_data.shape


In [None]:
# Print the column/dtype/memory summary of the combined frame.
combined_data.info()

Check Null data


In [None]:
# Show every column when a DataFrame is displayed ("display.max_columns" is
# the full option name).
pd.set_option('display.max_columns', None)

# Report missing values for both recordings. Only the last expression of a
# cell is echoed, so the counts are printed explicitly; a bare
# `on_data.isnull().sum()` on an earlier line would be discarded unseen.
for name, frame in (("on", on_data), ("off", off_data)):
    null_counts = frame.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    print(f"{name}: {int(null_counts.sum())} missing values in {len(columns_with_nulls)} column(s)")
    if not columns_with_nulls.empty:
        # to_string() lists every affected column instead of a truncated view.
        print(columns_with_nulls.to_string())

Check Each type of the Sample


In [None]:
# Split the combined frame back into its two classes, keeping only the first
# `seq_len` columns of each row.
on_eeg = combined_data.loc[combined_data['label'] == 1, :].iloc[:, :seq_len]
off_eeg = combined_data.loc[combined_data['label'] == 0, :].iloc[:, :seq_len]

plt.figure(figsize=(16,10), dpi=200)

plt.subplot(2,1,1)
# .T plots each sample as one line. The values are plotted as a NumPy array
# so the x-axis is the position in the signal rather than the column names.
on_lines = plt.plot(on_eeg.to_numpy().T)
if on_lines:
    # Label one line only: label= on plot() would repeat the legend entry for
    # every sample.
    on_lines[0].set_label("on")
plt.title("On")
plt.legend()

plt.subplot(2,1,2)
off_lines = plt.plot(off_eeg.to_numpy().T)
if off_lines:
    off_lines[0].set_label("off")
plt.title("Off")
plt.legend()

plt.show()

See the distribution of the data

In [None]:
# Bar chart of how many rows each class has in the combined data.
# `labels` maps the numeric class in the last column to the name shown on the
# x-axis.
labels = {0: "off", 1: "on"}

value_counts = combined_data.iloc[:, -1].value_counts()
value_counts = value_counts.rename(labels)

plt.bar(x=value_counts.index, height=value_counts.values)
plt.title("Distribution of Labels")
plt.xlabel("Label")
plt.ylabel("Count")
plt.xticks(rotation=15)
plt.show()

## Data Preprocessing

### Data Augmentation

In [None]:
# Balance the classes with RandomOverSampler (fixed seed for repeatability).
# Features are every column but the last; the label is the last column.
# NOTE(review): a train/test split made from this oversampled frame can place
# copies of the same row in more than one split (data leakage) - consider
# oversampling only the training portion.
data, labels = combined_data.iloc[:, :-1], combined_data.iloc[:, -1]

ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(data, labels)

# Put the label column back to the right of the features to form the
# resampled frame.
df = pd.concat((X_train_resampled, y_train_resampled), axis="columns")

df.head()

In [None]:
# Show the (rows, columns) shape of the resampled frame.
df.shape

Check the data Distribution

In [None]:
# Bar chart of how many rows each class has in the resampled frame `df`.
# `labels` maps the numeric class in the last column to the name shown on the
# x-axis.
labels = {0: "off", 1: "on"}

value_counts = df.iloc[:, -1].value_counts()
value_counts = value_counts.rename(labels)

plt.bar(x=value_counts.index, height=value_counts.values)
plt.title("Distribution of Labels")
plt.xlabel("Label")
plt.ylabel("Count")
plt.xticks(rotation=15)
plt.show()

### Train Test Split

In [None]:
"""
we can adjust the split of the data here if we want to see the difference
"""

# 60 % train / 20 % validation / 20 % test.
# The split is made on `combined_data` rather than on the already-oversampled
# `df`: oversampling duplicates rows, and splitting afterwards would put copies
# of the same row into both training and evaluation sets, inflating the
# validation/test scores. `stratify` keeps the class ratio equal in every split.
train, temp = train_test_split(
    combined_data,
    test_size=0.4,
    random_state=42,
    stratify=combined_data.iloc[:, -1],
)
val, test = train_test_split(
    temp,
    test_size=0.5,
    random_state=42,
    stratify=temp.iloc[:, -1],
)

# Balance the classes in the training split only; validation and test keep
# their natural distribution and contain no duplicated rows.
train_features, train_labels = RandomOverSampler(random_state=42).fit_resample(
    train.iloc[:, :-1], train.iloc[:, -1]
)
train = pd.concat([train_features, train_labels], axis=1)

# `.shape` is an attribute; calling it like a method raises TypeError.
print("train:", train.shape)
print("test: ", test.shape)
print("val:  ", val.shape)

Split Label and Feature

In [None]:
# For every split, separate the features (all columns but the last) from the
# label (last column).
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

X_val = val.iloc[:, :-1]
y_val = val.iloc[:, -1]

# `.shape` is an attribute; calling it like a method raises TypeError. The
# shapes are printed because only a cell's last expression is echoed.
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
    ("val", X_val, y_val),
):
    print(f"{split_name}: X={features.shape}, y={target.shape}")

Data Reshape

In [None]:
# Add a trailing axis of size 1: (samples, timesteps) -> (samples, timesteps, 1).
# Each voltage value becomes a one-element feature vector, which leaves room
# for more features per time step later.
# DataFrames have no .reshape(), so the features are converted to NumPy arrays
# first (float32 is Keras' default float type). np.asarray also accepts an
# array, so the cell can be re-run safely.
X_train = np.asarray(X_train, dtype="float32").reshape(len(X_train), -1, 1)
X_test = np.asarray(X_test, dtype="float32").reshape(len(X_test), -1, 1)
X_val = np.asarray(X_val, dtype="float32").reshape(len(X_val), -1, 1)

# `.shape` is an attribute, not a method.
print("X_train:", X_train.shape)
print("X_test: ", X_test.shape)
print("X_val:  ", X_val.shape)

In [None]:
"""
Don't need to run this. but in the future we maybe have to run this
because now our output is just 0,1
but later we maybe want to convert it to array
"""
classification_num = 2 # on off

# Opt-in switch. Keep it False for the binary model in this notebook, which
# ends in Dense(1, sigmoid) trained with binary_crossentropy and therefore
# takes plain 0/1 labels; one-hot labels of shape (n, 2) would not match its
# single output. Turn it on only together with a softmax output layer and a
# categorical loss.
use_one_hot_labels = False

if use_one_hot_labels:
    # The class count comes from `classification_num` so it is set in one place.
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=classification_num)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=classification_num)
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=classification_num)


## Create the model

I want the model to look something like this:

"Let's try many combinations; we still have some time."
Input layer
1. 1D CNN layers x3 (32, 64, 128)
  - BatchNormalization
  - ReLU()
  - Maxpool1D
2. LSTM layer (1-2 layer would be good)
  - 64 - 128 units
3. Dense layer
  - 64 ReLu
  - 32 ReLu
  - 1 Sigmoid


In [None]:
def _conv_stage(filters, drop_rate):
    """Return the layers of one stage: Conv1D, BatchNorm, ReLU, MaxPool, Dropout."""
    return [
        Conv1D(filters, 5, padding="same"),
        BatchNormalization(),
        ReLU(),
        MaxPool1D(pool_size=2),
        Dropout(drop_rate),
    ]


# CNN + LSTM binary classifier. The input shape is taken from X_train without
# its first (sample) axis.
model_on_off = Sequential([
    Input(shape=(X_train.shape[1:])),

    # Three convolutional stages with growing filter counts and dropout rates.
    *_conv_stage(32, 0.1),
    *_conv_stage(64, 0.15),
    *_conv_stage(128, 0.2),

    # Two stacked LSTMs; the first returns the full sequence so the second has
    # a sequence to consume.
    LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, activation="tanh"),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2, activation="tanh"),
    Dropout(0.3),

    # Dense head ending in a single sigmoid unit for the on/off decision.
    Dense(64, activation="relu"),
    Dropout(0.4),
    Dense(32, activation="relu"),
    Dropout(0.3),
    # A softmax output may be needed later if more classes are added.
    Dense(1, activation="sigmoid"),
])

model_on_off.summary()

### Compile the model

In [None]:
# Binary classification setup: binary cross-entropy loss, Adam optimizer,
# accuracy reported as the metric.
model_on_off.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

### Model Callbacks

In [None]:
# The callback classes are provided by `tensorflow.keras.callbacks`.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

callbacks = [
    # Stop training once val_loss has not improved for 8 epochs.
    EarlyStopping(monitor='val_loss', patience=8),
    # Lower the learning rate when val_loss plateaus. Its patience has to be
    # shorter than EarlyStopping's (8), otherwise training always stops before
    # the learning rate is ever reduced. The keyword is `cooldown` (one word):
    # epochs to wait after a reduction before monitoring resumes.
    ReduceLROnPlateau(
        monitor='val_loss',
        patience=3,
        cooldown=2,
        min_lr=1e-5),
    # Keep the weights of the epoch with the lowest val_loss on disk.
    ModelCheckpoint('best_model.h5',
                    save_best_only=True,
                    monitor='val_loss'),
]

In [None]:
# Train for up to 10 epochs in batches of 32, evaluating on the validation
# split each epoch; `history` keeps the object returned by fit().
history = model_on_off.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
)

In [None]:
# Write the trained model to Google Drive as an HDF5 (.h5) file.
model_on_off.save(
    filepath='/content/drive/MyDrive/my_models/my_eeg_model.h5',
)

## Access Model and use model

In [None]:
import tensorflow as tf

# Read the saved model back from Google Drive.
loaded_model = tf.keras.models.load_model(
    filepath='/content/drive/MyDrive/my_models/my_eeg_model.h5',
)

# Print the architecture to confirm the model loaded as expected.
loaded_model.summary()

To use this model with the EEG measurement tool, I need to build an app that automatically preprocesses incoming data the same way the data was preprocessed before training the model.