In [31]:
import scipy.io
import pandas as pd
import numpy as np 
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_val_predict
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
import scipy.stats as stats

In [2]:
# Read every variable stored in the MATLAB file into a dict keyed by
# variable name.
data = scipy.io.loadmat(
    'data_CRM_SN_vs_MN_imbalLDA_order_proj_1.mat'
)

# Show which variables the file contains.
print(data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'user_class_min_1', 'user_feat_1', 'user_prob_1', 'user_resp_1', 'user_source_1', 'user_tr_order_1', 'user_train_prob_1', 'user_weights_1'])


In [3]:
# Shape of the first entry's response-label array.
data["user_resp_1"][0, 0].shape

(432, 1)

In [4]:
# Each variable carries an unnecessary leading dimension, so element 0 is
# taken from every one of them.
#
#   tr_order     - user trial order
#   proj_score   - projection scores
#   source_label - source label
#   resp_label   - response label
#   behav_feat   - features used for training (channels grouped and
#                  averaged over the windows)
tr_order, proj_score, source_label, resp_label, behav_feat = (
    data[key][0]
    for key in (
        'user_tr_order_1',
        'user_prob_1',
        'user_source_1',
        'user_resp_1',
        'user_feat_1',
    )
)

# Number of response labels in the first entry.
len(resp_label[0])

432

In [5]:
# Trial-order array shapes for the first two entries (they differ in length).
tuple(trials.shape for trials in tr_order[:2])

((432, 1), (134, 1))

# Encodings for Each Label

## *source information*

1. SC (Source Correct)
2. CR (Correct Rejection)
3. SI (Source Incorrect)
4. Miss 
5. FA  (False Alarm)

## *label for the source response*

1. RS (Remember Source)
2. RO (Remember Other)
3. F (Familiarity)
4. MN (Maybe New) 
5. SN (Sure New)


# Preprocessing

In [6]:
file_path = 'data_CRM_SN_vs_MN_imbalLDA_order_proj_1.mat'

# wrapper class for data preprocessing
class preproc:
    """
    Load the per-subject arrays from a MATLAB file and build flat
    (X, y, subject) datasets for binary classification.

    Classes are selected by a (source label, response label) pair of
    integer codes; see the encoding tables earlier in the notebook.
    `source_info` and `response_info` list the corresponding names in
    code order.
    """

    # class attributes
    source_info = ["SC", "CR", "SI", "M", "FA"]
    response_info = ["RS", "RO", "F", "MN", "SN"]

    def __init__(self, file_path):
        """
        Parameters:
        -----------
        file_path : str
            path of the MATLAB (.mat) file to load
        """
        data = scipy.io.loadmat(file_path)

        # every variable has an unnecessary leading dimension, hence the [0]

        # user trial order
        self.tr_order = data['user_tr_order_1'][0]

        # projection scores
        self.proj_score = data['user_prob_1'][0]

        # source and response label
        self.source_label = data['user_source_1'][0]
        self.resp_label = data['user_resp_1'][0]

        # features, group the channels and average over the windows,
        # those are called features, and we use the features for training
        self.behav_feat = data['user_feat_1'][0]

        self.data = data

    @staticmethod
    def _as_nested_array(masks):
        """
        Pack a list of 1-d boolean masks into a 1-d object array.

        `np.array(masks, dtype=object)` would produce a 2-d object array
        whenever every mask happens to have the same length; filling a
        pre-allocated object array keeps one mask per element regardless.
        """
        nested = np.empty(len(masks), dtype=object)
        for i, mask in enumerate(masks):
            nested[i] = np.asarray(mask, dtype=bool)
        return nested

    def filter_index(self, pos_source_label: int, pos_resp_label: int,
                           neg_source_label: int, neg_resp_label: int):
        """
        Locate the trials of a positive and a negative class for every subject.

        Only boolean masks are returned (not the data itself) so that
        several sub-classes can be merged with `merge_two_class` before
        the data is extracted with `get_data_by_index`.

        Parameters:
        -----------
        pos_source_label : int
            the positive class's source label.
            for details, please refer to the above encodings
        pos_resp_label : int
            the positive class's response label
        neg_source_label : int
            the negative class's source label
        neg_resp_label : int
            the negative class's response label

        Returns:
        --------
        pos_idx : np.ndarray
            1-d object array with one boolean mask per subject marking
            the positive class.
        neg_idx : np.ndarray
            1-d object array with one boolean mask per subject marking
            the negative class.
        """
        pos_idx, neg_idx = [], []

        for source, response in zip(self.source_label, self.resp_label):
            source, response = source.flatten(), response.flatten()
            # a trial belongs to a class when both its source label and
            # its response label match
            pos_idx.append((source == pos_source_label) & (response == pos_resp_label))
            neg_idx.append((source == neg_source_label) & (response == neg_resp_label))

        return self._as_nested_array(pos_idx), self._as_nested_array(neg_idx)

    def merge_two_class(self, pos1, neg1, pos2, neg2):
        """
        Apply logical OR to two positive classes and two negative classes,
        subject by subject, merging class 1 and class 2.

        Parameters:
        -----------
        pos1 : np.ndarray([Object])
            the positive class 1 index array
        neg1 : np.ndarray([Object])
            the negative class 1 index array
        pos2 : np.ndarray([Object])
            the positive class 2 index array
        neg2 : np.ndarray([Object])
            the negative class 2 index array

        Returns:
        --------
        pos_idx : np.ndarray
            the merged nested boolean array that indicates the location of
            the positive class.
        neg_idx : np.ndarray
            the merged nested boolean array that indicates the location of
            the negative class.
        """
        pos_idx, neg_idx = [], []
        for p1, n1, p2, n2 in zip(pos1, neg1, pos2, neg2):
            pos_idx.append(np.asarray(p1, dtype=bool) | np.asarray(p2, dtype=bool))
            neg_idx.append(np.asarray(n1, dtype=bool) | np.asarray(n2, dtype=bool))
        return self._as_nested_array(pos_idx), self._as_nested_array(neg_idx)

    def get_data_by_index(self, pos_idx, neg_idx):
        """
        Given positive and negative index arrays, select the matching rows
        of every subject's feature matrix and flatten them into one dataset.

        For each subject the positive rows come first, followed by the
        negative rows.

        Parameters:
        -----------
        pos_idx : np.ndarray
            the nested boolean array that indicates the position of the
            positive class
        neg_idx : np.ndarray
            the nested boolean array that indicates the position of the
            negative class

        Returns:
        --------
        X : np.ndarray
            the stacked feature rows
        y : np.ndarray
            the ground truth label as floats, 1 for positive and -1 for negative
        subject : np.ndarray
            the subject number (as float) that corresponds to each row of X and y

        Raises:
        -------
        ValueError
            if the subjects' feature matrices do not all have the same
            number of columns
        """
        X_parts, y_parts, subject_parts = [], [], []

        for subject_num, (pos, neg, behavior_feature) in enumerate(
            zip(pos_idx, neg_idx, self.behav_feat)
        ):
            # object-dtype masks cannot be used as indexers, force plain bool
            pos = np.asarray(pos, dtype=bool).ravel()
            neg = np.asarray(neg, dtype=bool).ravel()
            # the num of pos and neg class is their count of True
            pos_len, neg_len = int(pos.sum()), int(neg.sum())

            X_parts.append(behavior_feature[pos, :])
            X_parts.append(behavior_feature[neg, :])
            # float labels / ids match what accumulating with np.append
            # onto an empty float array produced
            y_parts.append(np.repeat(1.0, pos_len))
            y_parts.append(np.repeat(-1.0, neg_len))
            subject_parts.append(np.repeat(float(subject_num), pos_len + neg_len))

        if not X_parts:
            # no subjects at all: empty arrays
            return np.array([]), np.array([]), np.array([])

        # a single stack at the end is linear in the number of rows and
        # raises if the feature widths disagree instead of dropping data
        return np.vstack(X_parts), np.concatenate(y_parts), np.concatenate(subject_parts)

In [7]:
data_pre = preproc(file_path)
# `preproc` defines no `prepare_labels`; build the dataset from the two
# methods it does provide.
# positive class: source 2 (CR) with response 5 (SN)
# negative class: source 2 (CR) with response 4 (MN)
pos_idx, neg_idx = data_pre.filter_index(2, 5, 2, 4)
X, y, group = data_pre.get_data_by_index(pos_idx, neg_idx)

In [10]:
# multiple group
data_preproc = preproc(file_path)
pos1, neg1 = data_preproc.filter_index(2,5,2,4)
pos2, neg2 = data_preproc.filter_index(4,5,4,4)
pos_idx, neg_idx = data_preproc.merge_two_class(pos1, neg1, pos2, neg2)
X, y, subject = data_preproc.get_data_by_index(pos_idx, neg_idx)
X.shape

(3898, 72)

In [10]:
# one subject id per row of X
np.shape(subject)

(3898,)

In [11]:
# one label per row of X
np.shape(y)

(3898,)

# Training - `CRSN` vs. `CRMN`

This has limited performance because `CRSN` and `CRMN` are only subsets of the `SN` and `MN` categories. To train on the full `SN` and `MN` classes, we need to merge `CRSN` with `MSN` (together, all of `SN`) and `CRMN` with `MMN` (together, all of `MN`).

In [39]:
data_pre = preproc(file_path)
# positive class: source 2 (CR) with response 5 (SN)
# negative class: source 2 (CR) with response 4 (MN)
pos_idx, neg_idx = data_pre.filter_index(2,5,2,4)
# use the instance created in this cell, not one left over from an earlier cell
X, y, subject = data_pre.get_data_by_index(pos_idx, neg_idx)
# the cross-validation cells of this section pass `group`; bind it here so
# it always matches the X and y built above instead of a stale variable
group = subject

# one fold per subject
logo = LeaveOneGroupOut()

print(f"there are {logo.get_n_splits(X, y, group)} groups")

there are 26 groups


In [40]:
# Shrinkage LDA, scored fold by fold with the leave-one-group-out splitter.
LDA = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
scores = cross_val_score(LDA, X, y, groups=group, cv=logo)

# average of the per-fold scores
scores.mean()

0.5778343140636092

In [41]:
# Same model, but collect the out-of-fold prediction of every sample and
# compute a single accuracy over all of them.
LDA = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
pred = cross_val_predict(LDA, X, y, groups=group, cv=logo)

accuracy_score(y, pred)

0.5973876296580868

# Training - SN vs. MN

In [42]:
# Build the full SN-vs-MN dataset by merging the CR and Miss sub-classes.
data_preproc = preproc(file_path)
pos1, neg1 = data_preproc.filter_index(2, 5, 2, 4)  # source 2 (CR): SN vs MN
pos2, neg2 = data_preproc.filter_index(4, 5, 4, 4)  # source 4 (Miss): SN vs MN
pos_idx, neg_idx = data_preproc.merge_two_class(pos1, neg1, pos2, neg2)
X, y, subject = data_preproc.get_data_by_index(pos_idx, neg_idx)

# Manual leave-one-group-out loop: fit on all groups but one, score on the
# held-out group.
logo = LeaveOneGroupOut()
scores = []
for train_idx, test_idx in logo.split(X, y, subject):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    LDA = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto').fit(X_train, y_train)
    scores.append(LDA.score(X_test, y_test))

# one score per held-out group
scores = np.array(scores)

In [56]:
# LDA without shrinkage (svd solver), scored fold by fold.
LDA = LinearDiscriminantAnalysis(solver='svd', shrinkage=None)
scores = cross_val_score(LDA, X, y, groups=subject, cv=logo)

# average of the per-fold scores
scores.mean()

0.5490567526922828

In [51]:
# LDA without shrinkage (lsqr solver); a single accuracy over the
# out-of-fold predictions of every sample.
LDA = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None)
pred = cross_val_predict(LDA, X, y, groups=subject, cv=logo)

accuracy_score(y, pred)

0.5695228322216521