In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the demographic table that ships with the dataset.
df = pd.read_csv('/kaggle/input/button-tone-sz/demographic.csv')
# Preview the first rows only; the full frame adds nothing to the narrative.
df.head()

In [None]:
# Column dtypes and non-null counts of the demographic table.
df.info()

In [None]:
# Summary statistics for the numeric columns of the demographic table.
df.describe()

# Manual Feature Extraction

In [None]:
# Subject 18's raw recording has no header row; names are attached later
# from columnLabels.csv.
df_18 = pd.read_csv(
    '/kaggle/input/button-tone-sz/18.csv/18.csv',
    header=None,
)
# Lookup tables: time axis, column labels, per-subject demographics.
df_time, df_column, df_demographic = map(
    pd.read_csv,
    (
        '/kaggle/input/button-tone-sz/time.csv',
        '/kaggle/input/button-tone-sz/columnLabels.csv',
        '/kaggle/input/button-tone-sz/demographic.csv',
    ),
)

In [None]:
# Show the header exactly as read from disk, then strip surrounding
# whitespace: 'group' carries a leading space that would break lookups.
raw_header = list(df_demographic.columns)
print(raw_header)
df_demographic.columns = [name.strip() for name in raw_header]


In [None]:
# Step 1: Label the raw recording with the names from columnLabels.csv
df_18.columns = df_column.columns

# Step 2: Attach the time table by joining on the shared 'sample' column
df_18 = df_18.merge(df_time, on='sample', how='left')

# Step 3: Look up subject 18's row in the demographic table
subject_id = 18
subject_rows = df_demographic[df_demographic['subject'] == subject_id]
if subject_rows.empty:
    # Fail with a readable message instead of the IndexError .iloc[0] would raise
    raise ValueError(f"Subject {subject_id} not found in the demographic table")
subject_meta = subject_rows.iloc[0]

# Step 4: Broadcast the subject-level metadata onto every row of df_18
df_18 = df_18.assign(
    subject=subject_id,
    group=subject_meta['group'],  # 0 = Control, 1 = Schizophrenia
    age=subject_meta['age'],
    gender=subject_meta['gender'],
    education=subject_meta['education'],
)

# Show result
df_18.head()


In [None]:
# EEG channel names: everything after the four metadata columns
# ['subject', 'trial', 'condition', 'sample'].
eeg_channels = df_column.columns[4:]

# Map each trial number to that trial's EEG matrix, laid out [channels x time].
trials = {
    int(trial_num): trial_df[eeg_channels].values.T
    for trial_num, trial_df in df_18.groupby('trial')
}


In [None]:
# Sanity check: shape of trial 1's matrix, laid out as (channels, time points).
print("Trial 1 EEG shape:", trials[1].shape)  # NOTE(review): actual numbers depend on the data; later cells assume 70 channels per trial, confirm


In [None]:
import numpy as np
import pandas as pd
import pywt
from scipy.stats import skew, kurtosis
from scipy.signal import welch

# Stand-in for one EEG trial: 70 channels x 6144 samples of seeded Gaussian noise
np.random.seed(42)
eeg_data = np.random.standard_normal((70, 6144))

# Sampling rate (from time.csv: ~0.976 ms/sample → ~1024 Hz)
fs = 1024

# Frequency bands as name -> (low, high) edges in Hz
bands = dict(
    delta=(0.5, 4),
    theta=(4, 8),
    alpha=(8, 12),
    beta=(12, 30),
    gamma=(30, 100),
)

def extract_features(eeg_trial, fs, band_ranges=None, wavelet='db4', level=5):
    """Turn one EEG trial into a flat list of per-channel features.

    ``eeg_trial`` is iterated row by row and every row is treated as one
    channel. For each channel the function appends, in this order:

    1. mean, std, skewness, kurtosis, min, max, median;
    2. one band-power value per entry of ``band_ranges``;
    3. for every coefficient array returned by ``pywt.wavedec``: its mean,
       then its std (wavelet features are interleaved mean/std pairs).

    With the defaults that is 7 + len(bands) + 2 * (level + 1) values per
    channel.

    Parameters
    ----------
    eeg_trial : 2-D array-like
        One row per channel, one column per time sample.
    fs : float
        Sampling rate in Hz, forwarded to ``scipy.signal.welch``.
    band_ranges : dict, optional
        Maps band name to ``(low_hz, high_hz)``. Defaults to the module-level
        ``bands``. Bands are treated as half-open, ``low <= f < high``, so a
        frequency bin sitting on a shared edge is counted in one band only.
    wavelet : str, optional
        Wavelet name passed to ``pywt.wavedec``.
    level : int, optional
        Decomposition level passed to ``pywt.wavedec``.

    Returns
    -------
    list
        Feature values for all channels, concatenated channel after channel.
    """
    if band_ranges is None:
        band_ranges = bands

    features = []
    for channel_data in eeg_trial:
        channel_data = np.asarray(channel_data, dtype=float)

        # Statistical features
        features += [
            np.mean(channel_data),
            np.std(channel_data),
            skew(channel_data),
            kurtosis(channel_data),
            np.min(channel_data),
            np.max(channel_data),
            np.median(channel_data),
        ]

        # Spectral features using Welch PSD.
        # Welch's default 256-sample segment yields fs/256 = 4 Hz bins at
        # 1024 Hz, which cannot resolve the 0.5-4 Hz delta band. Two-second
        # segments give 0.5 Hz bins (capped at the signal length).
        nperseg = min(len(channel_data), int(2 * fs))
        freqs, psd = welch(channel_data, fs, nperseg=nperseg)
        bin_width = freqs[1] - freqs[0] if len(freqs) > 1 else 1.0
        for low, high in band_ranges.values():
            # Half-open interval: the bin at a shared edge (4, 8, 12, 30 Hz)
            # belongs to the upper band only, so no power is counted twice.
            in_band = (freqs >= low) & (freqs < high)
            # Density times bin width = power, independent of the resolution.
            features.append(np.sum(psd[in_band]) * bin_width)

        # Wavelet features: mean and std of every coefficient array
        coeffs = pywt.wavedec(channel_data, wavelet, level=level)
        for c in coeffs:
            features.append(np.mean(c))
            features.append(np.std(c))
    return features

# Run the extractor once on the simulated trial
trial_features = extract_features(eeg_data, fs)

# Column names, channel by channel, in the order extract_features emits values:
# 7 statistics, one power per band, then a (mean, std) pair for each of the
# 6 wavelet coefficient sets from the level-5 decomposition
stat_suffixes = ("mean", "std", "skew", "kurt", "min", "max", "median")
feature_names = []
for ch in range(70):
    feature_names.extend(f"ch{ch}_{stat}" for stat in stat_suffixes)
    feature_names.extend(f"ch{ch}_{band}_power" for band in bands)
    feature_names.extend(
        f"ch{ch}_w{w}_{stat}" for w in range(6) for stat in ("mean", "std")
    )

# One-row frame with a placeholder label (1 = Schizophrenia)
df_features = pd.DataFrame([trial_features], columns=feature_names)
df_features["label"] = 1

# Save to CSV (Kaggle-safe path)
csv_path = "./eeg_features_example.csv"
df_features.to_csv(csv_path, index=False)

print(f"CSV file saved at: {csv_path}")


In [None]:
# Section: Helper - Generate Column Names
def generate_column_names(n_channels=70, band_names=None, n_wavelet_sets=6):
    """Return feature column names in the order ``extract_features`` emits values.

    Per channel: 7 statistics, one ``<band>_power`` per band, then for each
    wavelet coefficient set a ``w<i>_mean`` immediately followed by
    ``w<i>_std``. The mean/std pairs are interleaved because
    ``extract_features`` appends mean and std together for each coefficient
    array; listing all means before all stds would mislabel the columns.

    Parameters
    ----------
    n_channels : int, optional
        Number of channels to name (default 70).
    band_names : iterable of str, optional
        Band names in feature order. Defaults to the keys of the module-level
        ``bands``.
    n_wavelet_sets : int, optional
        Number of wavelet coefficient sets per channel (default 6, i.e. a
        5-level decomposition).

    Returns
    -------
    list of str
    """
    if band_names is None:
        band_names = bands
    band_names = list(band_names)
    stat_names = ("mean", "std", "skew", "kurt", "min", "max", "median")

    names = []
    for ch in range(n_channels):
        names += [f"ch{ch}_{stat}" for stat in stat_names]
        names += [f"ch{ch}_{band}_power" for band in band_names]
        for w in range(n_wavelet_sets):
            names += [f"ch{ch}_w{w}_mean", f"ch{ch}_w{w}_std"]
    return names


In [None]:
# Three parallel lists: one entry per extracted trial.
all_features = []
labels = []
subject_ids = []

trial_length = 6144  # Number of rows per trial
valid_subjects = range(1, 82)

# Subject -> group label, built once instead of filtering the table per trial.
# drop_duplicates keeps the first row per subject, matching a `.values[0]` lookup.
group_by_subject = (
    df_demographic.drop_duplicates("subject").set_index("subject")["group"].to_dict()
)

for subject_id in valid_subjects:
    file_path = f"/kaggle/input/button-tone-sz/{subject_id}.csv/{subject_id}.csv"

    if not os.path.exists(file_path):
        continue

    # Resolve the label before extracting anything so a missing demographic
    # row can never leave the three lists with different lengths.
    if subject_id not in group_by_subject:
        print(f"❌ Error for subject {subject_id}: no row in the demographic table")
        continue
    subject_label = group_by_subject[subject_id]

    try:
        recording = pd.read_csv(file_path, header=None)
        eeg = recording.iloc[:, 4:].to_numpy()  # Remove metadata: keep only EEG channels (cols 4 onward)

        # Reshape into fixed-length trials
        n_trials = eeg.shape[0] // trial_length
        leftover = eeg.shape[0] - n_trials * trial_length
        if leftover:
            print(f"⚠️ Subject {subject_id}: dropping {leftover} trailing rows that do not fill a trial")
        # Explicit channel count keeps the reshape valid even when n_trials == 0.
        subject_trials = eeg[:n_trials * trial_length].reshape(n_trials, trial_length, eeg.shape[1])

        # Each trial is transposed to (channels, time) before feature extraction.
        subject_features = [extract_features(trial.T, fs) for trial in subject_trials]
    except Exception as e:
        # Best effort: report the subject and move on; nothing was appended yet.
        print(f"❌ Error for subject {subject_id}: {e}")
        continue

    # Commit only after the whole subject succeeded, keeping the lists aligned.
    all_features.extend(subject_features)
    labels.extend([subject_label] * len(subject_features))
    subject_ids.extend([subject_id] * len(subject_features))


In [None]:
# Section: persist the full feature table (features + label + subject) to CSV
df_features = pd.DataFrame(all_features, columns=generate_column_names()).assign(
    label=labels,
    subject=subject_ids,
)

df_features.to_csv("eeg_all_features.csv", index=False)
print("✔️ Saved as eeg_all_features.csv")


In [None]:
# Feature matrix with default integer column names plus the binary target
# (0 = Control, 1 = Schizophrenia)
df_all_features = pd.DataFrame(all_features).assign(label=labels)


In [None]:
# Confirm the extraction loop actually produced feature vectors and labels
n_vectors, n_labels = len(all_features), len(labels)
print(f"✅ Total EEG feature vectors extracted: {n_vectors}")
print(f"✅ Total labels collected: {n_labels}")

# Peek at the first few labels
print(f"🔍 First 5 labels: {labels[:5]}")


In [None]:
import os

data_dir = "/kaggle/input/button-tone-sz"
# Candidate entries end in ".csv"; sorted because os.listdir order is arbitrary.
available_dirs = sorted(name for name in os.listdir(data_dir) if name.endswith(".csv"))

valid_subjects = []

for dirname in available_dirs:
    stem = dirname[:-len(".csv")]
    try:
        subject_id = int(stem)
    except ValueError:
        # Non-numeric names (e.g. time.csv, demographic.csv) are not subjects.
        # Only ValueError is skipped so real errors are not silently swallowed.
        continue
    # A subject folder "<id>.csv" is expected to contain a file "<id>.csv".
    file_path = os.path.join(data_dir, dirname, f"{subject_id}.csv")
    if os.path.exists(file_path):
        valid_subjects.append(subject_id)

valid_subjects.sort()
print(f"✅ Total valid EEG files found: {len(valid_subjects)}")


In [None]:
# ——— Build full paths to each subject’s CSV ———
eeg_files = []
for dirname in available_dirs:
    # Folder "<name>.csv" is expected to hold a file with the same name
    candidate = os.path.join(data_dir, dirname, f"{dirname.replace('.csv','')}.csv")
    if os.path.exists(candidate):
        eeg_files.append(candidate)
print(f"✅ Total EEG files found: {len(eeg_files)}")


In [None]:
# Make sure the three parallel lists are populated and aligned before saving
n_features, n_labels, n_subjects = len(all_features), len(labels), len(subject_ids)
print(n_features, n_labels, n_subjects)  # should all be > 0 and equal
if n_features == 0:
    raise ValueError("No feature vectors were extracted; nothing to save.")
if not (n_features == n_labels == n_subjects):
    raise ValueError(
        f"Misaligned lists: {n_features} feature vectors, {n_labels} labels, {n_subjects} subject ids"
    )

# Save the data: one row per trial, features followed by label and subject
df_features = pd.DataFrame(all_features, columns=generate_column_names())
df_features["label"] = labels
df_features["subject"] = subject_ids

df_features.to_csv("eeg_all_features.csv", index=False)
print("✔️ Saved eeg_all_features.csv with shape:", df_features.shape)


In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

df_all_features = pd.read_csv("eeg_all_features.csv")
X = df_all_features.drop(columns=["label", "subject"])
y = df_all_features["label"]
groups = df_all_features["subject"]

# Each subject contributes many trials that share one label. A trial-level
# split puts the same subject in both train and test, which leaks subject
# identity and inflates the scores. Hold out 20% of the *subjects* instead.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print(f"✅ Training samples: {len(X_train)}")
print(f"✅ Testing samples: {len(X_test)}")


In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Load the full feature dataset (if not already loaded)
df_all_features = pd.read_csv("eeg_all_features.csv")

# Check the dataset is non-empty before deriving anything from it
if len(df_all_features) == 0:
    raise ValueError("❌ No data found in eeg_all_features.csv. Cannot split.")

# Separate features (X), target label (y) and the subject each trial came from
X = df_all_features.drop(columns=["label", "subject"])  # Drop subject column too
y = df_all_features["label"]
groups = df_all_features["subject"]

# Subject-wise split (about 80% of subjects train, 20% test). Every subject
# contributes many trials with the same label, so a trial-level split would
# put the same subject on both sides and leak subject identity into the test set.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Grouped splitting is not stratified; AUC is undefined on a one-class test set
if y_test.nunique() < 2:
    raise ValueError("Test split contains a single class; change random_state or test_size.")

print(f"✅ Training samples: {len(X_train)}")
print(f"✅ Testing samples: {len(X_test)}")


# ML MODELS

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt


### Random forest

In [None]:
# Fit a 100-tree random forest on the training split (fit returns the estimator)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out samples
y_pred = rf_clf.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("🔍 Accuracy:", test_accuracy)
print("📊 Classification Report:\n", test_report)


In [None]:
# Confusion matrix of the random-forest predictions on the test split
cm = confusion_matrix(y_test, y_pred)
class_names = ["Control", "Schizophrenia"]

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("🧠 Confusion Matrix - Random Forest")
plt.show()


In [None]:
# Predicted probability of the positive class (label=1) for each test sample
y_proba = rf_clf.predict_proba(X_test)[:, 1]

# ROC curve points and the area under it
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, color="darkorange", label=f"AUC = {auc_score:.2f}")
ax.plot([0, 1], [0, 1], color="navy", linestyle="--")  # chance diagonal
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("📈 ROC Curve - Random Forest")
ax.legend(loc="lower right")
ax.grid()
plt.show()

print(f"✅ AUC Score: {auc_score:.4f}")


### Feature Importance (Interpretability)

In [None]:
# Pair every feature name with the forest's importance, largest first
importances = rf_clf.feature_importances_
feature_names = X.columns
feature_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
)

# Bar chart of the 20 most important features
top_features = feature_df.head(20)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=top_features, palette="viridis", ax=ax)
ax.set_title("🔍 Top 20 Important Features - Random Forest")
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature Name")
fig.tight_layout()
plt.show()


## Comparison of ML Models

In [None]:
# Define Evaluation Function
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
import time

def evaluate_model(model, model_name):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator
        Must provide ``fit`` and ``predict``, plus either ``predict_proba``
        or ``decision_function`` for the AUC score.
    model_name : str
        Label stored under the "Model" key of the result.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "AUC", "F1" (weighted-average F1) and
        "Time (s)" (wall-clock fit time rounded to 2 decimals).
    """
    fit_started = time.time()
    model.fit(X_train, y_train)
    fit_finished = time.time()

    predictions = model.predict(X_test)
    # Prefer class-1 probabilities; fall back to raw decision scores.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    accuracy = accuracy_score(y_test, predictions)
    auc_value = roc_auc_score(y_test, scores)
    weighted_avg = classification_report(y_test, predictions, output_dict=True)["weighted avg"]

    summary = {}
    summary["Model"] = model_name
    summary["Accuracy"] = accuracy
    summary["AUC"] = auc_value
    summary["F1"] = weighted_avg["f1-score"]
    summary["Time (s)"] = round(fit_finished - fit_started, 2)
    return summary


In [None]:
# train all the models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Define all models.
# The RBF SVM and logistic regression are sensitive to feature scale, and the
# features here mix very different ranges (raw statistics, band powers,
# wavelet moments). Each gets its own StandardScaler, fitted inside the
# pipeline on the training data only. Tree-based models need no scaling.
models = [
    (RandomForestClassifier(random_state=42), "Random Forest"),
    (
        make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True, random_state=42)),
        "SVM",
    ),
    (
        make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=42)),
        "Logistic Regression",
    ),
    (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), "XGBoost"),
]

# Evaluate and store results (one summary dict per model)
results = [evaluate_model(model, name) for model, name in models]


In [None]:
# Compare All Models

# One row per model, best AUC first
results_df = pd.DataFrame(results).sort_values(by="AUC", ascending=False)
print(results_df)

# Horizontal bar chart of AUC per model
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="AUC", y="Model", data=results_df, palette="mako", ax=ax)
ax.set_title("📈 Model Comparison (AUC Score)")
ax.set_xlabel("AUC Score")
ax.set_ylabel("Model")
fig.tight_layout()
plt.show()


# Deep Learning Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn mean/variance on the training split only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))


In [None]:
# Fully connected binary classifier: 128 -> 64 -> 32 -> 1 (sigmoid)
n_inputs = X_train_scaled.shape[1]

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n_inputs,)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(32, activation='relu'))

# Single sigmoid unit: output is a value in (0, 1) for binary classification
model.add(Dense(1, activation='sigmoid'))


In [None]:
# Adam + binary cross-entropy; track accuracy and an AUC metric named 'auc'
tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tracked_metrics)


In [None]:
# Stop once validation loss has not improved for 5 epochs and roll back to
# the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(X_train_scaled, y_train, **fit_options)


In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy, auc)
test_scores = model.evaluate(X_test_scaled, y_test)
loss, accuracy, auc = test_scores
print(f"✅ Test Accuracy: {accuracy:.4f}")
print(f"📈 Test AUC: {auc:.4f}")



In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: training curve against validation curve.
# Each spec is (history key, train label, val label, title, y-axis label).
curve_specs = [
    ('accuracy', 'Train Acc', 'Val Acc', 'Model Accuracy', 'Accuracy'),
    ('auc', 'Train AUC', 'Val AUC', 'Model AUC', 'AUC'),
]
for metric_key, train_label, val_label, title, ylabel in curve_specs:
    plt.plot(history.history[metric_key], label=train_label)
    plt.plot(history.history[f'val_{metric_key}'], label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid()
    plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# The network ends in a single sigmoid unit, so each prediction is one
# probability for label 1. argmax over that length-1 axis is always 0, so the
# classes must come from thresholding at 0.5 instead.
# The model was trained on standardized inputs, so predict on X_test_scaled.
y_prob_dl = model.predict(X_test_scaled).ravel()
y_pred_dl = (y_prob_dl > 0.5).astype("int32")

cm = confusion_matrix(y_test, y_pred_dl)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix (Deep Learning)")
plt.show()


# Feature extraction with an autoencoder and training with a DL classifier

In [None]:
# Step-by-step: Feature Extraction via Autoencoder + DL Training

import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# 1. Load your feature dataset
features_df = pd.read_csv("eeg_all_features.csv")
X = features_df.drop(columns=["label", "subject"])
y = features_df["label"]
groups = features_df["subject"]

# 2. Subject-wise train / validation / test split.
#    Every subject contributes many trials with the same label, so splitting
#    at trial level would place the same subject on both sides. The test
#    subjects are held out first; validation subjects are then carved out of
#    the remainder so early stopping never sees the test set.
outer_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
dev_idx, test_idx = next(outer_split.split(X, y, groups=groups))

inner_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_rel, val_rel = next(
    inner_split.split(X.iloc[dev_idx], y.iloc[dev_idx], groups=groups.iloc[dev_idx])
)
train_idx, val_idx = dev_idx[train_rel], dev_idx[val_rel]

# 3. Standardize features using statistics from the training rows only, so
#    nothing about the validation/test distribution leaks into preprocessing
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_idx])
X_val = scaler.transform(X.iloc[val_idx])
X_test = scaler.transform(X.iloc[test_idx])
y_train = y.iloc[train_idx].to_numpy()
y_val = y.iloc[val_idx].to_numpy()
y_test = y.iloc[test_idx].to_numpy()

# 4. Autoencoder model for feature learning
input_dim = X_train.shape[1]
encoding_dim = 32  # compressed feature size

input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation="relu")(input_layer)
encoded = Dense(encoding_dim, activation="relu")(encoded)

decoded = Dense(64, activation="relu")(encoded)
decoded = Dense(input_dim, activation="linear")(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer=Adam(1e-3), loss="mse")
# The autoencoder reconstructs its own input, so inputs double as targets
autoencoder.fit(X_train, X_train,
                epochs=50,
                batch_size=64,
                shuffle=True,
                validation_data=(X_val, X_val),
                callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
                verbose=1)

# 5. Extract compressed features from the bottleneck layer
encoder = Model(input_layer, encoded)
X_train_encoded = encoder.predict(X_train)
X_val_encoded = encoder.predict(X_val)
X_test_encoded = encoder.predict(X_test)

# 6. Train Deep Learning model using compressed features
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout

model = Sequential([
    Dense(64, activation='relu', input_shape=(encoding_dim,)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

# Early stopping monitors the validation subjects; the test set stays untouched
# until the final evaluation, so the reported numbers are not tuned on it
history = model.fit(
    X_train_encoded, y_train,
    validation_data=(X_val_encoded, y_val),
    epochs=50,
    batch_size=64,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

# 7. Evaluate on the held-out test subjects
loss, accuracy, auc = model.evaluate(X_test_encoded, y_test, verbose=0)
print(f"\n✅ DL Model on Autoencoder Features")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc:.4f}")