# Generate hard test cases

#### Motivation
Since we have achieved a high baseline (97.8% F1 score) on the seizure dataset, the potential for improvement on the overall dataset is limited. Therefore, we will create challenging test cases where seizure classification is more difficult to evaluate the model's robustness.

#### Methodologies
Use SVM and ANN(Artificial Neural Network) to generate hard test cases

1. **Misclassified Data**:
   - Identify data misclassified by SVM and ANN.

2. **Data Near the Decision Boundary**:
   - **SVM**: Data whose distance to the hyperplanes is within a certain threshold.
   - **ANN**: Data whose predicted probability is around 0.5.

3. **Support Vectors (SVM)**:
   - Identify data points classified as support vectors by the SVM.

In [94]:
# Importing necessary libraries
import os
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, accuracy_score, classification_report

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense


In [91]:
# Read data
df = pd.read_csv("data/data_cleaned.csv")
X = df.loc[:, 'X1':'X178']  # All feature columns
y = df['y']  # Labels (non-seizure/seizure)

In [95]:
# Necassary functions

def evaluate_classification_metrics(y_ground_truth, y_pred, y_pred_prob):
    # Calculate accuracy
    accuracy = accuracy_score(y_ground_truth, y_pred)
    
    # Calculate precision
    precision = precision_score(y_ground_truth, y_pred)
    
    # Calculate recall
    recall = recall_score(y_ground_truth, y_pred)
    
    # Calculate F1 score
    f1 = f1_score(y_ground_truth, y_pred, average='weighted')
    
    if len(set(y_ground_truth)) > 1:
        # Check if ROC-AUC can be calculated (i.e., both classes are present)
        roc_auc = roc_auc_score(y_ground_truth, y_pred_prob)
        # Calculate Cohen's Kappa
        kappa = cohen_kappa_score(y_ground_truth, y_pred)
    else:
        roc_auc = None  # Not computable, only one class in y_true
        kappa = None
    
    # Calculate metrics for seizure class (y_label=1)
    precision_seizure = precision_score(y_ground_truth, y_pred, pos_label=1)
    recall_seizure = recall_score(y_ground_truth, y_pred, pos_label=1)
    f1_seizure = f1_score(y_ground_truth, y_pred, pos_label=1)
    
    # Calculate metrics for non-seizure class (y_label=0)
    precision_non_seizure = precision_score(y_ground_truth, y_pred, pos_label=0)
    recall_non_seizure = recall_score(y_ground_truth, y_pred, pos_label=0)
    f1_non_seizure = f1_score(y_ground_truth, y_pred, pos_label=0)
    
    print(f'\nSeizure (y=1):')
    print(f'  Precision: {precision_seizure * 100:.2f} %')
    print(f'  Recall: {recall_seizure * 100:.2f} %')
    print(f'  F1 Score: {f1_seizure * 100:.2f} %')
    
    print(f'\nNon-Seizure (y=0):')
    print(f'  Precision: {precision_non_seizure * 100:.2f} %')
    print(f'  Recall: {recall_non_seizure * 100:.2f} %')
    print(f'  F1 Score: {f1_non_seizure * 100:.2f} %')
    
    # Print metrics
    print(f'\nOverall:')
    print(f'  Accuracy: {accuracy * 100:.2f} %')
    print(f'  Precision: {precision * 100:.2f} %')
    print(f'  Recall: {recall * 100:.2f} %')
    print(f'  F1 Score: {f1 * 100:.2f} %')
    if roc_auc is not None:
        print(f'  ROC-AUC: {roc_auc * 100:.2f} %')
    if kappa is not None:
        print(f'  Cohen\'s Kappa: {kappa * 100:.2f} %')

In [96]:
# Define ANN

def ann(X_train, y_train):
    # Initializing the ANN
    classifier = Sequential()
    
    # Adding the input layer and the first hidden layer
    classifier.add(Dense(units=80, kernel_initializer='uniform', activation='relu', input_dim=178))
    
    # Adding the second hidden layer
    classifier.add(Dense(units=80, kernel_initializer='uniform', activation='relu'))
    
    # Adding the output layer
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Fitting the ANN to the training set without printing the epoch output
    classifier.fit(X_train, y_train, batch_size=10, epochs=100, verbose=0)
    
    return classifier

In [97]:
# Print the metrics on the hard test cases
verbose = True

## 1. Misclassified Data

In [98]:
def get_error_points(X_test, y_test, y_pred, y_pred_prob):
    # Identify error cases (misclassified points)
    error_mask = y_test != y_pred
    X_error = X_test[error_mask]
    y_error_true = y_test[error_mask]
    y_error_pred = y_pred[error_mask]
    y_error_prob = y_pred_prob[error_mask]
    
    if verbose:
        evaluate_classification_metrics(y_error_true, y_error_pred, y_error_prob)
        
    return X_error, y_error_true

### SVM

In [99]:
def misclassified_data_svm():
    if verbose:
        print ("\n--------------------misclassified_data_svm--------------------")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = SVC(class_weight='balanced', probability=True, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)[:, 1]
    X_error, y_error = get_error_points(X_test, y_test, y_pred, y_pred_prob)
    return X_error, y_error

X_error, y_error = misclassified_data_svm()
print(f"\nNumber of error cases: {len(y_error)}")
print(f"Label distribution in error cases (actual labels):\n {pd.Series(y_error).value_counts()}")


--------------------misclassified_data_svm--------------------

Seizure (y=1):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Non-Seizure (y=0):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Overall:
  Accuracy: 0.00 %
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %
  ROC-AUC: 0.00 %
  Cohen's Kappa: -73.86 %

Number of error cases: 49
Label distribution in error cases (actual labels):
 0    34
1    15
Name: y, dtype: int64


### ANN

In [101]:
def misclassified_data_ann():
    if verbose:
        print ("\n--------------------misclassified_data_ann--------------------")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    ann_clf = ann(X_train, y_train)
    y_pred_prob = ann_clf.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int).ravel()
    X_error, y_error = get_error_points(X_test, y_test, y_pred, y_pred_prob)
    return X_error, y_error

X_error, y_error = misclassified_data_ann()
print(f"\nNumber of error cases: {len(y_error)}")
print(f"Label distribution in error cases (actual labels):\n{pd.Series(y_error).value_counts()}")


--------------------misclassified_data_ann--------------------


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445us/step

Seizure (y=1):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Non-Seizure (y=0):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Overall:
  Accuracy: 0.00 %
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %
  ROC-AUC: 0.00 %
  Cohen's Kappa: -50.97 %

Number of error cases: 93
Label distribution in error cases (actual labels):
1    73
0    20
Name: y, dtype: int64


## 2. Data Near the Decision Boundary
### SVM
- Points within a distance to the hyperplanes.
- Points are from the test data.

In [102]:
def data_near_decision_boundary_svm():
    # Define distance threshold from hyperplanes
    distance_threshold = 0.3

    # Train a SVM
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = SVC(class_weight='balanced', probability=True, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Get the decision function (distance from the decision boundary)
    decision_distances = clf.decision_function(X_test)
    # Find the cases where the absolute value of the distance is close to 0 (i.e., near the boundary)
    boundary_cases = np.where(np.abs(decision_distances) < distance_threshold)[0]
    
    # Get the vectors and labels near the decision boundary
    X_near_boundary = X_test.iloc[boundary_cases]
    y_near_boundary = y_test.iloc[boundary_cases]
    y_pred_near_boundary = y_pred[boundary_cases]
    decision_distances_near_boundary = decision_distances[boundary_cases]  # Filter decision distances
    
    if verbose:
        print ("\n--------------------data_near_decision_boundary_svm--------------------")
        evaluate_classification_metrics(y_near_boundary, y_pred_near_boundary, decision_distances_near_boundary)

    return X_near_boundary, y_near_boundary

In [110]:
X_near_boundary, y_near_boundary = data_near_decision_boundary_svm()
print(f"\nNumber of points near the decision boundary: {y_near_boundary.shape[0]}")
print(f"Label distribution in points near the decision boundary (actual labels):\n{pd.Series(y_near_boundary).value_counts()}")


--------------------data_near_decision_boundary_svm--------------------

Seizure (y=1):
  Precision: 50.00 %
  Recall: 70.59 %
  F1 Score: 58.54 %

Non-Seizure (y=0):
  Precision: 73.68 %
  Recall: 53.85 %
  F1 Score: 62.22 %

Overall:
  Accuracy: 60.47 %
  Precision: 50.00 %
  Recall: 70.59 %
  F1 Score: 60.77 %
  ROC-AUC: 76.02 %
  Cohen's Kappa: 22.81 %

Number of points near the decision boundary: 43
Label distribution in points near the decision boundary (actual labels):
0    26
1    17
Name: y, dtype: int64


### ANN
- Get points where the predicted probability is near the decision threshold (e.g., near 0.5).
- prob=0 means non-seizure, prob=1 means seizure, prob=0.5 means not sure.
- Assuming distance threshold is 0.2, any data points whose probabilities is within 0.3 to 0.7 will be considered near decision boundary.

In [105]:
def data_near_decision_boundary_ann():

    # Define the distance threshold
    distance_threshold = 0.2

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    ann_clf = ann(X_train, y_train)
    y_pred_prob = ann_clf.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int).ravel()
    boundary_cases = np.where(np.abs(y_pred_prob - 0.5) < distance_threshold)[0]
    
    X_near_boundary = X_test.iloc[boundary_cases]
    y_near_boundary = y_test.iloc[boundary_cases]
    y_pred_near_boundary = y_pred[boundary_cases]
    decision_prob_near_boundary = y_pred_prob[boundary_cases]
    
    if verbose:
        print ("\n--------------------data_near_decision_boundary_ann--------------------")
        evaluate_classification_metrics(y_near_boundary, y_pred_near_boundary, decision_prob_near_boundary)
    return X_near_boundary, y_near_boundary

In [106]:
X_near_boundary, y_near_boundary = data_near_decision_boundary_ann()
print(f"\nNumber of points near the decision boundary: {y_near_boundary.shape[0]}")
print(f"Label distribution in points near the decision boundary (actual labels):\n{pd.Series(y_near_boundary).value_counts()}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 429us/step

--------------------data_near_decision_boundary_ann--------------------

Seizure (y=1):
  Precision: 18.18 %
  Recall: 25.00 %
  F1 Score: 21.05 %

Non-Seizure (y=0):
  Precision: 45.45 %
  Recall: 35.71 %
  F1 Score: 40.00 %

Overall:
  Accuracy: 31.82 %
  Precision: 18.18 %
  Recall: 25.00 %
  F1 Score: 33.11 %
  ROC-AUC: 24.11 %
  Cohen's Kappa: -36.36 %

Number of points near the decision boundary: 22
Label distribution in points near the decision boundary (actual labels):
0    14
1     8
Name: y, dtype: int64


## 3. Support Vectors (SVM)
- Support vectors are the data points that lie closest to the decision boundary. These points have the highest influence on determining the position of the boundary.
- These points are from training data.

In [116]:
def data_support_vectors():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = SVC(class_weight='balanced', probability=True, random_state=42)
    clf.fit(X_train, y_train)

    support_vectors_indices = clf.support_
    support_vectors = clf.support_vectors_
    X_support_vectors = X_train.iloc[support_vectors_indices]
    y_support_vectors = y_train.iloc[support_vectors_indices]

    if verbose:
        print ("\n--------------------support_vectors_svm--------------------")
        y_pred_within_support_vectors = clf.predict(support_vectors)
        y_pred_prob_within_support_vectors = clf.decision_function(support_vectors)
        evaluate_classification_metrics(y_support_vectors, y_pred_within_support_vectors, y_pred_prob_within_support_vectors)
        
    return X_support_vectors, y_support_vectors

In [117]:
X_support_vectors, y_support_vectors = data_support_vectors()
print(f"\nNumber of support vectors: {y_support_vectors.shape[0]}")
# Convert y_support_vectors to Pandas Series for label distribution
y_support_vectors_series = pd.Series(y_support_vectors)
print("Label distribution of support vectors (actual labels):\n", y_support_vectors_series.value_counts())  


--------------------support_vectors_svm--------------------





Seizure (y=1):
  Precision: 62.12 %
  Recall: 90.11 %
  F1 Score: 73.54 %

Non-Seizure (y=0):
  Precision: 96.24 %
  Recall: 82.16 %
  F1 Score: 88.65 %

Overall:
  Accuracy: 84.11 %
  Precision: 62.12 %
  Recall: 90.11 %
  F1 Score: 84.95 %
  ROC-AUC: 87.44 %
  Cohen's Kappa: 62.73 %

Number of support vectors: 1114
Label distribution of support vectors (actual labels):
 0    841
1    273
Name: y, dtype: int64


## Function to get hard test cases

In [166]:
def get_hard_test_cases(method_list):
    total_test_data = []
    for method in method_list:
        result = eval(method + '()')

        # Get hard test data
        if isinstance(result, tuple):
            X_test, y_test = result
            y_test = pd.Series(y_test, name='y')
            combined_df = pd.concat([X_test, y_test], axis=1)
        total_test_data.append(combined_df)
        total_test_data_df = pd.concat(total_test_data)

    # Drop duplicate rows
    hard_test_df_cleaned = total_test_data_df.drop_duplicates()
    print ("\nNumber of hard test cases:", hard_test_df_cleaned.shape[0])
    print ("Distribution of label: ", hard_test_df_cleaned['y'].value_counts())

    # Turn rest of the dataset into training data
    train_data = df.loc[~df.index.isin(hard_test_df_cleaned.index), df.columns[df.columns.get_loc('X1'):df.columns.get_loc('X178') + 1].tolist() + ['y']]
    hard_test_df_cleaned.sort_index(inplace=True)

    return train_data, hard_test_df_cleaned

In [None]:
method_list = [
                "misclassified_data_svm",
                "misclassified_data_ann",
                "data_near_decision_boundary_svm",
                "data_near_decision_boundary_ann",
                "data_support_vectors",
                ]
train_data, test_data = get_hard_test_cases(method_list)

os.makedirs("data/hard_test_cases", exist_ok = True)
train_data.to_csv('data/hard_test_cases/train.csv')
test_data.to_csv('datahard_test_cases/test.csv')


--------------------misclassified_data_svm--------------------

Seizure (y=1):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Non-Seizure (y=0):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Overall:
  Accuracy: 0.00 %
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %
  ROC-AUC: 0.00 %
  Cohen's Kappa: -73.86 %

Number of hard test cases: 49
Distribution of label:  0    34
1    15
Name: y, dtype: int64


In [169]:
# Evaluate performance on the hard test cases using SVM

train_data = pd.read_csv('hard_test_cases/train.csv')
test_data = pd.read_csv('hard_test_cases/test.csv')

X_train = train_data.drop(columns=['y']).values
y_train = train_data['y'].values
X_test = test_data.drop(columns=['y']).values
y_test = test_data['y'].values

# Train the SVM model
clf = SVC(class_weight='balanced', probability=True, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
decision_distances = clf.decision_function(X_test)

evaluate_classification_metrics(y_test, y_pred, decision_distances)


Seizure (y=1):
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 0.00 %

Non-Seizure (y=0):
  Precision: 31.82 %
  Recall: 20.59 %
  F1 Score: 25.00 %

Overall:
  Accuracy: 14.29 %
  Precision: 0.00 %
  Recall: 0.00 %
  F1 Score: 17.35 %
  ROC-AUC: 3.92 %
  Cohen's Kappa: -64.90 %
