In [9]:
import scipy.io
import pandas as pd
import numpy as np 
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_val_predict
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, roc_auc_score
import scipy.stats as stats
from mat_preproc import preproc

from joblib import Parallel, delayed
import pickle as pkl

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set_style("darkgrid")


# Load the MATLAB export for the SN-vs-MN classifier (file suffix _1).
# scipy.io.loadmat returns a dict mapping variable names to arrays.
data = scipy.io.loadmat('data_CRM_SN_vs_MN_imbalLDA_order_proj_1.mat')

# List the variable names stored in the .mat file (plus loadmat's own
# '__header__', '__version__' and '__globals__' entries).
print(data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'user_class_min_1', 'user_feat_1', 'user_prob_1', 'user_resp_1', 'user_source_1', 'user_tr_order_1', 'user_train_prob_1', 'user_weights_1'])


# Encodings for Each Label

## *source information*

1. SC (Source Correct)
2. CR (Correct Rejection)
3. SI (Source Incorrect)
4. M (Miss)
5. FA  (False Alarm)

## *label for the source response*

1. RS (Remember Source)
2. RO (Remember Other)
3. F (Familiarity)
4. MN (Maybe New) 
5. SN (Sure New)


In [10]:
# Short codes for the five source-information categories; list position + 1
# is the numeric code given in the "source information" legend.
source_info = [
    "SC",  # 1: Source Correct
    "CR",  # 2: Correct Rejection
    "SI",  # 3: Source Incorrect
    "M",   # 4: Miss
    "FA",  # 5: False Alarm
]

# Short codes for the five source-response labels, numbered the same way.
response_info = [
    "RS",  # 1: Remember Source
    "RO",  # 2: Remember Other
    "F",   # 3: Familiarity
    "MN",  # 4: Maybe New
    "SN",  # 5: Sure New
]

# Tick order along the x-axis of the projection graph. Each tuple is
# presumably a (source code, response code) pair -- TODO confirm; the rows
# below are grouped by the second element.
x_axis = [
    (1, 1), (3, 1), (5, 1),
    (1, 2), (5, 2),
    (1, 3), (3, 3), (5, 3),
    (4, 4), (2, 4),
    (4, 5), (2, 5),
]

In [11]:
def cal_acc(X, y, subject, n_resamples=10, random_state=None):
    """
    Leave-one-group-out accuracy of a shrinkage LDA, scored on class-balanced
    bootstrap resamples of every held-out fold (Kueida's evaluation method).

    For each group in ``subject`` an LDA is fit on all other groups. The
    held-out fold is then balanced ``n_resamples`` times: the smaller class
    is kept (the positive class is always kept whole on ties), and the other
    class is sampled *with replacement* down to the same size. The accuracy
    on every balanced resample is recorded.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Class labels. Entries equal to 1 form the positive class, everything
        else the negative class. The balance assertion below additionally
        requires the labels of a balanced resample to sum to zero, i.e. a
        +1 / -1 coding.
    subject : array-like of shape (n_samples,)
        Group label of every sample; one fold is held out per distinct value.
    n_resamples : int, default=10
        Number of balanced resamples drawn from each held-out fold.
    random_state : int or None, default=None
        Seed for the resampling. ``None`` draws from NumPy's global random
        state (the historical behaviour). When repeating this function to
        build a distribution, pass a different seed on every call.

    Returns
    -------
    float
        Mean accuracy over all folds and all resamples.

    Raises
    ------
    ValueError
        If ``n_resamples`` is smaller than 1, or if a held-out fold contains
        only one class (no balanced test set can be built from it).
    """
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    # The module itself exposes ``choice`` on the global state, so the
    # unseeded path is unchanged; a seed gets its own private stream.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    logo = LeaveOneGroupOut()
    scores = []
    for train_idx, test_idx in logo.split(X, y, subject):
        X_test, y_test = X[test_idx, :], y[test_idx]
        lda = LinearDiscriminantAnalysis(shrinkage="auto", solver="eigen")
        lda.fit(X[train_idx, :], y[train_idx])

        # Positions, relative to the held-out fold, of each class.
        is_pos = y_test == 1
        pos_idx, neg_idx = np.flatnonzero(is_pos), np.flatnonzero(~is_pos)
        pos_len, neg_len = len(pos_idx), len(neg_idx)
        if min(pos_len, neg_len) == 0:
            # Fail early with a readable message instead of scoring an
            # empty test set further down.
            raise ValueError(
                "held-out fold contains a single class; cannot build a "
                "balanced test set"
            )

        # Model and fold are fixed across resamples: predict once and only
        # re-index the per-trial hits inside the loop.
        is_correct = lda.predict(X_test) == y_test

        acc = []
        for _ in range(n_resamples):
            if pos_len > neg_len:
                # More positives than negatives: bootstrap the positives
                # down to the number of negatives.
                pos_chosen = rng.choice(pos_idx, neg_len, replace=True)
                neg_chosen = neg_idx
            else:
                # Otherwise (including ties) keep all positives and
                # bootstrap the negatives to the same count.
                pos_chosen = pos_idx
                neg_chosen = rng.choice(neg_idx, pos_len, replace=True)
            filter_test_idx = np.concatenate([pos_chosen, neg_chosen])
            # With +1 / -1 labels a balanced sample sums to zero.
            assert y_test[filter_test_idx].sum() == 0, \
                "balanced resample does not sum to zero; labels must be +1 / -1"
            acc.append(is_correct[filter_test_idx].mean())
        scores.append(np.array(acc))
    return np.array(scores).mean()

In [12]:
def simulate_acc_SN_vs_MN(trail_num, iter_num):
    """
    Build the empirical accuracy distribution for the SN-vs-MN classifier
    and pickle it.

    The data file for ``trail_num`` is loaded through the project's
    ``preproc`` helper. Two positive/negative index pairs are selected with
    ``filter_index`` and merged into one binary problem. ``cal_acc`` is then
    evaluated ``int(iter_num)`` times across 8 joblib workers.

    Parameters
    ----------
    trail_num : int
        Suffix of the input ``.mat`` file; also forwarded to ``preproc`` and
        used in the output file name.
    iter_num : int or float
        Number of ``cal_acc`` repetitions; truncated with ``int``.

    Returns
    -------
    None
        The list of accuracies is written to
        ``SN_vs_MN_empirical_accs_leftout_boot_<trail_num>.pkl``.
    """
    clf_name = "SN_vs_MN"
    loader = preproc(f"data_CRM_SN_vs_MN_imbalLDA_order_proj_{trail_num}.mat",
                     trail_num)

    # The four arguments are presumably (source, response) codes of the
    # positive class followed by those of the negative class -- TODO confirm
    # against preproc.filter_index.
    pos_a, neg_a = loader.filter_index(2, 5, 2, 4)
    pos_b, neg_b = loader.filter_index(4, 5, 4, 4)
    positives, negatives = loader.merge_two_class(pos_a, neg_a, pos_b, neg_b)

    features, labels, groups = loader.get_data_by_index(
        positives, negatives, eliminate_trails=True
    )

    # One independent cal_acc evaluation per repetition, fanned out to 8 workers.
    n_runs = int(iter_num)
    jobs = (delayed(cal_acc)(features, labels, groups) for _ in range(n_runs))
    accuracies = Parallel(n_jobs=8)(jobs)

    with open(f"{clf_name}_empirical_accs_leftout_boot_{trail_num}.pkl", "wb") as out_file:
        pkl.dump(accuracies, out_file)

# Run the SN-vs-MN simulation for trail_num 1 and 2 with 1e4 (10,000)
# repetitions each; the float is truncated with int() inside the function.
# Each call writes its own SN_vs_MN_empirical_accs_leftout_boot_<trail_num>.pkl.
simulate_acc_SN_vs_MN(1, 1e4)
simulate_acc_SN_vs_MN(2, 1e4)

In [13]:
def simulate_acc_F_vs_CR(trail_num, iter_num):
    """
    Build the empirical accuracy distribution for the F-vs-CR classifier
    and pickle it.

    The data file for ``trail_num`` is loaded through the project's
    ``preproc`` helper. Two positive/negative index pairs are selected with
    ``filter_index`` and merged into one binary problem. ``cal_acc`` is then
    evaluated ``int(iter_num)`` times across 8 joblib workers.

    Parameters
    ----------
    trail_num : int
        Suffix of the input ``.mat`` file; also forwarded to ``preproc`` and
        used in the output file name.
    iter_num : int or float
        Number of ``cal_acc`` repetitions; truncated with ``int``.

    Returns
    -------
    None
        The list of accuracies is written to
        ``F_vs_CR_empirical_accs_leftout_boot_<trail_num>.pkl``.
    """
    clf_name = "F_vs_CR"
    loader = preproc(f"data_F_vs_CR_imbalLDA_order_proj_{trail_num}.mat",
                     trail_num)

    # The four arguments are presumably (source, response) codes of the
    # positive class followed by those of the negative class -- TODO confirm
    # against preproc.filter_index.
    pos_a, neg_a = loader.filter_index(1, 3, 2, 4)
    pos_b, neg_b = loader.filter_index(3, 3, 2, 5)
    positives, negatives = loader.merge_two_class(pos_a, neg_a, pos_b, neg_b)

    features, labels, groups = loader.get_data_by_index(
        positives, negatives, eliminate_trails=True
    )

    # One independent cal_acc evaluation per repetition, fanned out to 8 workers.
    n_runs = int(iter_num)
    jobs = (delayed(cal_acc)(features, labels, groups) for _ in range(n_runs))
    accuracies = Parallel(n_jobs=8)(jobs)

    with open(f"{clf_name}_empirical_accs_leftout_boot_{trail_num}.pkl", "wb") as out_file:
        pkl.dump(accuracies, out_file)

# Run the F-vs-CR simulation for trail_num 1 and 2 with 1e4 (10,000)
# repetitions each; the float is truncated with int() inside the function.
# Each call writes its own F_vs_CR_empirical_accs_leftout_boot_<trail_num>.pkl.
simulate_acc_F_vs_CR(1, 1e4)
simulate_acc_F_vs_CR(2, 1e4)