In [1]:
import scipy.io
import pandas as pd
import numpy as np 
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_val_predict
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
import scipy.stats as stats
from mat_preproc import preproc

In [2]:
# Provenance: the .mat file is generated by the MATLAB script
# CRM_SN_vs_MN_imbalLDA_1450ms_order_proj.m

# Load the MATLAB file with all the data; loadmat returns a dict keyed by
# variable name.
data = scipy.io.loadmat('data_CRM_SN_vs_MN_imbalLDA_order_proj_1.mat')

# List the available keys (the MATLAB variables plus loadmat's '__...__'
# metadata entries) so the names used in later cells can be checked.
print(data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'user_class_min_1', 'user_feat_1', 'user_prob_1', 'user_resp_1', 'user_source_1', 'user_tr_order_1', 'user_train_prob_1', 'user_weights_1'])


In [3]:
# Every variable in the loaded dict carries an unnecessary leading dimension,
# so element 0 of each entry is taken to drop it.
tr_order, proj_score, source_label, resp_label, behav_feat = (
    data[key][0]
    for key in (
        'user_tr_order_1',  # user trial order
        'user_prob_1',      # projection scores
        'user_source_1',    # source labels
        'user_resp_1',      # response labels
        'user_feat_1',      # features used for training: channels are grouped
                            # and averaged over the windows
    )
)

# Number of response labels in the first entry.
len(resp_label[0])

432

In [4]:
# Compare the trial-order shapes of the first two entries (presumably one
# entry per user - confirm); the entries need not have the same length.
tr_order[0].shape, tr_order[1].shape

((432, 1), (134, 1))

# Encodings for Each Label

## *source information*

1. SC (Source Correct)
2. CR (Correct Rejection)
3. SI (Source Incorrect)
4. M (Miss)
5. FA  (False Alarm)

## *label for the source response*

1. RS (Remember Source)
2. RO (Remember Other)
3. F (Familiarity)
4. MN (Maybe New) 
5. SN (Sure New)


In [23]:
# Names for the integer label codes. The codes are 1-based, so code k maps to
# list position k-1.
source_info = [
    "SC",
    "CR",
    "SI",
    "M",
    "FA",
]
response_info = [
    "RS",
    "RO",
    "F",
    "MN",
    "SN",
]

# Order of the (source, response) code pairs along the x-axis of the
# projection graph, written one response code per row.
x_axis = [
    (1, 1), (3, 1), (5, 1),
    (1, 2), (5, 2),
    (1, 3), (3, 3), (5, 3),
    (4, 4), (2, 4),
    (4, 5), (2, 5),
]

In [24]:
# Print a readable "<source>-<response>" label for every pair in x_axis.
# The codes are 1-based, hence the -1 when indexing into the name lists.
for source, resp in x_axis:
    print(f"{source_info[source-1]}-{response_info[resp-1]}")

SC-RS
SI-RS
FA-RS
SC-RO
FA-RO
SC-F
SI-F
FA-F
M-MN
CR-MN
M-SN
CR-SN


# Preprocessing

In [7]:
# Build a two-class dataset from multiple (source, response) groups.
# NOTE(review): `preproc` comes from the project module mat_preproc; the
# argument meanings noted below are inferred from the label tables and
# headings in this notebook - confirm against that module.
file_path = "data_CRM_SN_vs_MN_imbalLDA_order_proj_1.mat"
data_preproc = preproc(file_path)
# Presumably CR-SN (2,5) as the positive class vs. CR-MN (2,4) as the negative.
pos1, neg1 = data_preproc.filter_index(2,5,2,4)
# Presumably M-SN (4,5) as the positive class vs. M-MN (4,4) as the negative.
pos2, neg2 = data_preproc.filter_index(4,5,4,4)
# Presumably pools the two positive sets and the two negative sets.
pos_idx, neg_idx = data_preproc.merge_two_class(pos1, neg1, pos2, neg2)
X, y, subject = data_preproc.get_data_by_index(pos_idx, neg_idx)
# Display the feature-matrix shape as a sanity check.
X.shape

(3813, 72)

In [8]:
# Shape of the per-row group labels that are later passed as `groups` to the
# cross-validation splitter.
subject.shape

(3813,)

In [9]:
# Shape of the class-label vector that goes with X.
y.shape

(3813,)

# Training - `CRSN` vs. `CRMN`

Performance here is limited because `CRSN` and `CRMN` are only subsets of the `SN` and `MN` categories. To train on the full `SN` and `MN` classes, we need to merge `CRSN` with `MSN`, and `CRMN` with `MMN`.

In [10]:
# Fresh preproc instance, restricted to a single pair of groups.
data_pre = preproc(file_path)
# Presumably CR-SN (positive) vs. CR-MN (negative), matching this section's
# heading - confirm the argument order in mat_preproc.
pos_idx, neg_idx = data_pre.filter_index(2,5,2,4)
X, y, subject = data_pre.get_data_by_index(pos_idx, neg_idx)

# Cross-validation splitter that holds out one group per fold.
logo = LeaveOneGroupOut()

# With `subject` as the grouping, the number of splits equals the number of
# distinct group labels.
print(f"there are {logo.get_n_splits(X, y, subject)} groups")

there are 24 groups


In [11]:
# LDA using the least-squares solver with shrinkage fixed at 0.
LDA = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0)
# One score per held-out group, averaged for display.
scores = cross_val_score(LDA, X, y, groups=subject, cv=logo)
np.mean(scores)

0.5788368295042321

In [12]:
# LDA using the eigen solver with automatically chosen shrinkage.
LDA = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
# Collect the held-out prediction for every row, then score all rows at once
# (pooled accuracy rather than a mean of per-group scores).
pred = cross_val_predict(LDA, X, y, groups=subject, cv=logo)
accuracy_score(y_true=y, y_pred=pred)

0.5932203389830508

# Training - SN vs. MN

In [19]:
data_preproc = preproc(file_path)
# Build the full SN and MN classes by pooling two source types. Going by the
# label tables in this notebook, (2,5,2,4) is presumably CR-SN vs. CR-MN and
# (4,5,4,4) is presumably M-SN vs. M-MN - confirm against mat_preproc.
pos1, neg1 = data_preproc.filter_index(2,5,2,4)
pos2, neg2 = data_preproc.filter_index(4,5,4,4)

pos_idx, neg_idx = data_preproc.merge_two_class(pos1, neg1, pos2, neg2)
X, y, subject = data_preproc.get_data_by_index(pos_idx, neg_idx)

logo = LeaveOneGroupOut()

# Leave-one-group-out cross-validation written out by hand: a new LDA is fit
# on every training split and scored on the held-out group.
scores = []

for train_idx, test_idx in logo.split(X, y, subject):
    X_train, y_train = X[train_idx,:], y[train_idx]
    X_test, y_test = X[test_idx,:], y[test_idx]
    LDA = LinearDiscriminantAnalysis(shrinkage = 'auto', solver = 'eigen')
    LDA.fit(X_train, y_train)
    scores.append(LDA.score(X_test, y_test))
scores = np.array(scores)

# End on the summary value so the cell actually displays its result; a
# trailing assignment on its own shows nothing.
scores.mean()

0.5421

In [20]:
# LDA using the eigen solver with a fixed shrinkage coefficient of 0.176.
LDA = LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.176)
# One score per held-out group, averaged for display.
scores = cross_val_score(LDA, X, y, groups=subject, cv=logo)
np.mean(scores)

0.5503701949440657

In [21]:
# Re-check the size of the feature matrix currently in use.
X.shape

(3813, 72)

In [22]:
# LDA using the eigen solver with automatically chosen shrinkage.
LDA = LinearDiscriminantAnalysis(solver='eigen', shrinkage="auto")
# Collect the held-out prediction for every row, then score all rows at once
# (pooled accuracy rather than a mean of per-group scores).
pred = cross_val_predict(LDA, X, y, groups=subject, cv=logo)
accuracy_score(y_true=y, y_pred=pred)

0.5680566483084186

# Testing the multiclass projection onto this classifier

In [None]:
# Fit the SN vs. MN classifier on all of X, y.
# The estimator is constructed here explicitly (auto shrinkage, eigen solver)
# instead of reusing whatever `LDA` was last left in the kernel, so the fitted
# model does not depend on the order in which earlier cells were run.
LDA = LinearDiscriminantAnalysis(shrinkage = "auto", solver = 'eigen').fit(X, y)

In [25]:
# Show the (source, response) code pairs used for the projection x-axis.
x_axis

[(1, 1),
 (3, 1),
 (5, 1),
 (1, 2),
 (5, 2),
 (1, 3),
 (3, 3),
 (5, 3),
 (4, 4),
 (2, 4),
 (4, 5),
 (2, 5)]