In [None]:
import os
import pandas as pd
import numpy as np
from datetime import timedelta
from glob import glob

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, accuracy_score, confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, accuracy_score, confusion_matrix
)
import numpy as np
from sklearn.metrics import roc_auc_score

In [None]:
# Input files: the submitted predictions and the ground-truth error annotations.
y_pred_path, y_true_path = (
    "./submissions/y_pred_challenge1.csv",
    "./y_true_df_challenge1.csv",
)

# Read both CSVs with identical options; low_memory=False makes pandas parse
# each file in one pass rather than in chunks.
y_pred_df, y_true_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (y_pred_path, y_true_path)
)

In [None]:
def evaluate_error(y_pred_df, y_true_df):
    """
    Count detected error events (true positives) and spurious detections
    (false positives).

    NOTE: a later cell of this notebook defines `evaluate_error` again; after a
    top-to-bottom run, that later definition is the one bound to this name.

    Parameters
    ----------
    y_pred_df : pd.DataFrame
        Must contain 'task', 'trial', 'start', 'end', 'y_pred_err'.
        Rows with ``y_pred_err == 1`` are the positive predictions; their
        'start'/'end' values are compared against error times in seconds.
    y_true_df : pd.DataFrame
        Must contain 'task', 'trial', 'error_onset', 'error_offset'.
        Onset/offset may be anything ``pd.to_timedelta`` accepts
        (e.g. Timedelta objects or strings read back from a CSV).

    Returns
    -------
    tp : int
        Number of true error events overlapped by at least one positive
        prediction of the same task and trial.
    fp : int
        Number of positive predictions that overlap no true error event.
    total_errors : int
        Number of true error events in the evaluated trials.
    """
    # One second of slack is added on each side of every true error window.
    tolerance_s = 1

    # Make sure y_true only includes trials present in the predictions.
    # .copy() so the dtype conversion below never writes into the caller's frame.
    evaluation_trials = y_pred_df['trial'].astype(int).unique()
    y_true_df = y_true_df.loc[y_true_df['trial'].isin(evaluation_trials)].copy()
    y_true_df['error_onset'] = pd.to_timedelta(y_true_df['error_onset'])
    y_true_df['error_offset'] = pd.to_timedelta(y_true_df['error_offset'])

    # Positive predictions, each flagged once it overlaps some true error.
    # .copy() avoids assigning a new column onto a slice of y_pred_df.
    pos_pred = y_pred_df.loc[y_pred_df['y_pred_err'] == 1].copy()
    pos_pred['overlap_error'] = 0

    tp = 0
    total_errors = len(y_true_df)

    for _, row in y_true_df.iterrows():
        error_onset = row['error_onset'].total_seconds() - tolerance_s
        error_offset = row['error_offset'].total_seconds() + tolerance_s

        same_trial = (pos_pred['task'] == row['task']) & (pos_pred['trial'] == row['trial'])
        starts_inside = pos_pred['start'].between(error_onset, error_offset)
        ends_inside = pos_pred['end'].between(error_onset, error_offset)
        spans_error = (pos_pred['start'] <= error_onset) & (pos_pred['end'] >= error_offset)

        # The task/trial filter must apply to every overlap clause, hence the
        # explicit parentheses around the OR-ed conditions.
        detected = same_trial & (starts_inside | ends_inside | spans_error)

        # Flag only the 'overlap_error' column; the other columns stay intact
        # so later iterations still compare against the real task/trial/times.
        pos_pred.loc[detected, 'overlap_error'] = 1
        if detected.any():
            tp = tp + 1

    # A prediction is a false positive if it overlaps no actual error. This is
    # counted even when the evaluated trials contain no errors at all.
    fp = int((pos_pred['overlap_error'] == 0).sum())

    # The percentage is only defined when there is at least one true error.
    if total_errors > 0:
        print(f"True Positive: {tp} ({tp / total_errors * 100}/%)")
        print(f"False Positive: {fp}")

    return tp, fp, total_errors

In [None]:
def evaluate_error(y_pred_df, y_true_df):
    """
    Count detected error events and report event-level precision/recall/F1.

    Parameters
    ----------
    y_pred_df : pd.DataFrame
        Must contain 'task', 'trial', 'start', 'end', 'y_pred_err'.
        Rows with ``y_pred_err == 1`` are the positive predictions; their
        'start'/'end' values are compared against error times in seconds.
    y_true_df : pd.DataFrame
        Must contain 'task', 'trial', 'error_onset', 'error_offset'.
        Onset/offset may be anything ``pd.to_timedelta`` accepts
        (e.g. Timedelta objects or strings read back from a CSV).

    Returns
    -------
    tp : int
        Number of true error events overlapped by at least one positive
        prediction of the same task and trial.
    fp : int
        Number of positive predictions that overlap no true error event.
    total_errors : int
        Number of true error events in the evaluated trials.
    """
    print("Starting Error Evaluation")

    # One second of slack is added on each side of every true error window.
    tolerance_s = 1

    # Make sure y_true only includes trials present in the predictions.
    # .copy() so the dtype conversion below never writes into the caller's frame.
    evaluation_trials = y_pred_df['trial'].astype(int).unique()
    y_true_df = y_true_df.loc[y_true_df['trial'].isin(evaluation_trials)].copy()

    # Positive predictions, each flagged once it overlaps some true error.
    # .copy() avoids assigning a new column onto a slice of y_pred_df.
    pos_pred = y_pred_df.loc[y_pred_df['y_pred_err'] == 1].copy()
    pos_pred['overlap_error'] = 0

    tp = 0
    total_errors = len(y_true_df)
    print(total_errors)

    y_true_df['error_onset'] = pd.to_timedelta(y_true_df['error_onset'])
    y_true_df['error_offset'] = pd.to_timedelta(y_true_df['error_offset'])

    for _, row in y_true_df.iterrows():
        error_onset = row['error_onset'].total_seconds() - tolerance_s
        error_offset = row['error_offset'].total_seconds() + tolerance_s

        same_trial = (pos_pred['task'] == row['task']) & (pos_pred['trial'] == row['trial'])
        starts_inside = pos_pred['start'].between(error_onset, error_offset)
        ends_inside = pos_pred['end'].between(error_onset, error_offset)
        spans_error = (pos_pred['start'] <= error_onset) & (pos_pred['end'] >= error_offset)

        # The task/trial filter must apply to every overlap clause, hence the
        # explicit parentheses around the OR-ed conditions.
        detected = same_trial & (starts_inside | ends_inside | spans_error)

        # Flag only the 'overlap_error' column; the other columns stay intact
        # so later iterations still compare against the real task/trial/times.
        pos_pred.loc[detected, 'overlap_error'] = 1
        if detected.any():
            tp = tp + 1

    # A prediction is a false positive if it overlaps no actual error. This is
    # counted even when the evaluated trials contain no errors at all.
    fp = int((pos_pred['overlap_error'] == 0).sum())

    # Percentages and F1 are only meaningful when there is at least one true error.
    if total_errors > 0:
        print(f"True Positive: {tp} ({tp / total_errors * 100}/%)")
        print(f"False Positive: {fp}")

        # tp counts detected error events while fp counts unmatched predictions.
        # Each ratio falls back to 0.0 when its denominator is zero.
        fn = total_errors - tp
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        print(f"F1: {f1}")

    return tp, fp, total_errors

In [None]:
# The submission file names its prediction column 'y_pred_reaction', while
# evaluate_error reads 'y_pred_err'. rename() ignores missing labels, so
# re-running this cell is harmless.
y_pred_df = y_pred_df.rename({'y_pred_reaction': 'y_pred_err'}, axis='columns')

# Score the predictions against the annotated errors.
tp, fp, total_error = evaluate_error(y_pred_df, y_true_df)