<a href="https://colab.research.google.com/github/AndrewKruszka/NeuralMachineLearning/blob/main/HW7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Read in the data and determine the problem type (classification or regression)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Location of the EEG dataset (raw CSV)
url = "https://raw.githubusercontent.com/AndrewKruszka/NeuralMachineLearning/main/EEG_data.csv"

# Pull the CSV into a DataFrame
df = pd.read_csv(url)

# Preview the top of the table
print("First few rows of the dataset:")
display(df.head())

# List every column present in the file
print("\nColumn Names:")
print(df.columns)

# Split the frame into inputs and label: 'predefinedlabel' is taken as the
# target, and both label columns are excluded from the inputs
target = df['predefinedlabel']
features = df.drop(columns=['predefinedlabel', 'user-definedlabeln'])

print("\nFeature Variables:", features.columns.to_list())
print("Target Variable:", target.name)
print("\n")  # emits two blank lines
# Heuristic: a target with at most 10 distinct values is treated as a
# classification label, anything else as a regression target
print(
    "\nProblem Type: Classification"
    if target.nunique() <= 10
    else "\nProblem Type: Regression"
)


First few rows of the dataset:


Unnamed: 0,SubjectID,VideoID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel,user-definedlabeln
0,0.0,0.0,56.0,43.0,278.0,301963.0,90612.0,33735.0,23991.0,27946.0,45097.0,33228.0,8293.0,0.0,0.0
1,0.0,0.0,40.0,35.0,-50.0,73787.0,28083.0,1439.0,2240.0,2746.0,3687.0,5293.0,2740.0,0.0,0.0
2,0.0,0.0,47.0,48.0,101.0,758353.0,383745.0,201999.0,62107.0,36293.0,130536.0,57243.0,25354.0,0.0,0.0
3,0.0,0.0,47.0,57.0,-5.0,2012240.0,129350.0,61236.0,17084.0,11488.0,62462.0,49960.0,33932.0,0.0,0.0
4,0.0,0.0,44.0,53.0,-8.0,1005145.0,354328.0,37102.0,88881.0,45307.0,99603.0,44790.0,29749.0,0.0,0.0



Column Names:
Index(['SubjectID', 'VideoID', 'Attention', 'Mediation', 'Raw', 'Delta',
       'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2',
       'predefinedlabel', 'user-definedlabeln'],
      dtype='object')

Feature Variables: ['SubjectID', 'VideoID', 'Attention', 'Mediation', 'Raw', 'Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
Target Variable: predefinedlabel



Problem Type: Classification


Fit a LogisticRegression using sklearn and a neural network classifier using PyTorch on the data

In [None]:
# Build the design matrix: identifiers and both label columns are not inputs
non_feature_cols = ['SubjectID', 'VideoID', 'predefinedlabel', 'user-definedlabeln']
X = df.drop(columns=non_feature_cols)
y = df['predefinedlabel']  # labels to predict

# Hold out 20% of the rows for testing, keeping the class ratio (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize: statistics are learned from the training rows only and then
# reused unchanged on the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the baseline logistic regression (max_iter raised to 1000)
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Score the held-out rows
y_pred = log_reg.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Report
print("\033[1mLogistic Regression Model Performance (After Scaling):\033[0m")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



[1mLogistic Regression Model Performance (After Scaling):[0m
Accuracy: 0.5380

Classification Report:
              precision    recall  f1-score   support

         0.0       0.54      0.73      0.62      1333
         1.0       0.53      0.33      0.41      1230

    accuracy                           0.54      2563
   macro avg       0.54      0.53      0.51      2563
weighted avg       0.54      0.54      0.52      2563



The baseline logistic regression model has an accuracy of 53.8%. It performs better on class 0 (recall 0.73) than on class 1 (recall 0.33), which suggests that some features may have more influence than others.

Next, we will perform some feature engineering to try to improve the classification.

In [None]:
# Feature-engineering pipeline. Each step builds on the previous one:
#   1. standardize all candidate features
#   2. drop rows flagged as outliers by the IQR rule
#   3. keep only the EEG band-power features (from the outlier-free rows)
#   4. restrict to the first five subjects (from the outlier-free rows)
# Previously steps 3 and 4 restarted from the unfiltered `df`, so the outlier
# removal reported in step 2 never reached the data used for modelling.

# Load the dataset
url = "https://raw.githubusercontent.com/AndrewKruszka/NeuralMachineLearning/main/EEG_data.csv"
df = pd.read_csv(url)

# Drop non-feature columns
X = df.drop(columns=['SubjectID', 'VideoID', 'predefinedlabel', 'user-definedlabeln'])
y = df['predefinedlabel']

### 🔹 Step 1: Normalize Features
# NOTE: `scaler` is re-fitted at every step below, so once this cell has run it
# holds the statistics of the last array it was fitted on (the subject subset).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\033[1mFeature normalization completed.\033[0m")

### 🔹 Step 2: Eliminate Outliers using IQR
# A row is dropped when ANY of its features lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for that feature.
Q1 = np.percentile(X_scaled, 25, axis=0)
Q3 = np.percentile(X_scaled, 75, axis=0)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

mask = ~((X_scaled < lower_bound) | (X_scaled > upper_bound)).any(axis=1)

X_filtered = X_scaled[mask]
y_filtered = y[mask]
# Outlier-free view of the original frame; steps 3 and 4 select from this so
# that the filtering above is actually applied to the modelling data.
df_filtered = df.loc[mask]

# The three filtered objects must describe the same rows
assert len(X_filtered) == len(y_filtered) == len(df_filtered)

print(f"\033[1mOutliers removed:\033[0m {X_scaled.shape[0] - X_filtered.shape[0]} samples")

### 🔹 Step 3: Feature Selection
selected_features = ['Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
X_selected = df_filtered[selected_features].values

# Normalize after feature selection
X_selected_scaled = scaler.fit_transform(X_selected)

print(f"\033[1mSelected {len(selected_features)} key EEG features.\033[0m")

### 🔹 Step 4: Subset of SubjectIDs
# First 5 subjects (in order of appearance) that still have rows after filtering
selected_subjects = df_filtered['SubjectID'].unique()[:5]
df_subset = df_filtered[df_filtered['SubjectID'].isin(selected_subjects)]

# X_subject is the UNSCALED selection; X_subject_scaled is standardized using
# statistics of all its rows. When this data is later split into train/test,
# fitting the scaler on the training split of X_subject avoids leaking
# test-set statistics into preprocessing.
X_subject = df_subset[selected_features].values
y_subject = df_subset['predefinedlabel'].values

X_subject_scaled = scaler.fit_transform(X_subject)

print(f"\033[1mTraining on {len(selected_subjects)} subjects.\033[0m")


[1mFeature normalization completed.[0m
[1mOutliers removed:[0m 4430 samples
[1mSelected 8 key EEG features.[0m
[1mTraining on 5 subjects.[0m


In [None]:
# Logistic regression on the feature-engineered data from the previous cell
# (selected EEG band features, subject subset).
# X_final / y_final keep their original bindings for any code that reads them.
X_final = X_subject_scaled  # Feature-engineered dataset (scaled on all rows)
y_final = y_subject         # Corresponding labels

# Split the UNSCALED features (80% train, 20% test, stratified). Splitting
# before scaling keeps test-set statistics out of the preprocessing step,
# matching how the baseline model above is prepared.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_subject, y_final, test_size=0.2, random_state=42, stratify=y_final
)

# Fit the scaler on the training rows only, then apply it to both splits.
# A separate scaler object is used so the existing `scaler` is left untouched.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)

print("\033[1mLogistic Regression Model Performance (After Feature Engineering):\033[0m")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[1mLogistic Regression Model Performance (After Feature Engineering):[0m
Accuracy: 0.5306

Classification Report:
              precision    recall  f1-score   support

         0.0       0.54      0.70      0.61       668
         1.0       0.52      0.35      0.42       623

    accuracy                           0.53      1291
   macro avg       0.53      0.52      0.51      1291
weighted avg       0.53      0.53      0.52      1291

