In [1]:
import scipy.io
#import mne
#import mat73
import pandas as pd
import numpy as np 
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
import scipy.stats as stats

In [2]:
#Load in the MATLAB file with all the data
data = scipy.io.loadmat('data_CRM_SN_vs_MN_imbalLDA_order_proj_1.mat')

In [3]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'user_class_min_1', 'user_feat_1', 'user_prob_1', 'user_resp_1', 'user_source_1', 'user_tr_order_1', 'user_train_prob_1', 'user_weights_1'])

In [4]:
len(data['user_tr_order_1'][0])

26

In [5]:
#Had to index them like this because there was an unnecessary dimension 
tr_order = data['user_tr_order_1'][0]
proj_score = data['user_prob_1'][0]
source_label = data['user_source_1'][0]
resp_label = data['user_resp_1'][0]
behav_feat = data['user_feat_1'][0]
len(resp_label[0])

432

In [6]:
behav_feat[0]

array([[ -7.23661128, -13.93739628, -22.81788437, ...,  -9.40298223,
        -11.77818306, -18.10076694],
       [ -2.63865315,   0.3358343 ,  -1.92867504, ...,  14.19302948,
          9.98168734,  17.5732571 ],
       [ -4.28406267,  -9.36639654, -16.71320915, ...,  -2.0318536 ,
         -3.8530683 ,   3.86939731],
       ...,
       [ -3.88435706,  -4.79033675,  -5.75699235, ...,  14.81179504,
         17.04872137,  13.36880608],
       [-13.37804033,  -7.12282845,  -1.5593848 , ...,  11.92473593,
         15.36710888,  26.49037254],
       [  0.18273821,  -0.71073707,  -6.72626149, ...,  -3.00038489,
         -4.9075405 , -15.84218536]])

In [7]:
#Checked the size of the features, and noticed how jagged the data was 
for x in behav_feat:
    print(x.shape)

(432, 72)
(134, 72)
(521, 72)
(107, 72)
(487, 72)
(497, 72)
(349, 72)
(301, 72)
(161, 72)
(359, 72)
(433, 72)
(551, 72)
(331, 72)
(498, 72)
(567, 72)
(473, 72)
(306, 72)
(526, 72)
(231, 72)
(486, 72)
(259, 72)
(424, 72)
(399, 72)
(468, 72)
(515, 72)
(495, 72)


In [8]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [9]:
#Function to help prepared the appropriate labels
def prepare_labels(pos_source_label, pos_resp_label, neg_source_label, neg_response_label):
    temp_data_x = []
    temp_data_y = []
    temp_data_s = []
    s = 0
    for source, response, behavior_feature in zip(source_label,resp_label,behav_feat):
        pos_index = (
            (source.flatten()==pos_source_label) &
            (response.flatten()==pos_resp_label)
                    )
        neg_index = (
            (source.flatten()==neg_source_label) & 
            (response.flatten()==neg_response_label)
        )
        temp_data_x.append(behavior_feature[pos_index,:])
        temp_data_x.append(behavior_feature[neg_index,:])
        temp_data_y.append(['SN' for x in behavior_feature[pos_index,:]])
        temp_data_y.append(['MN' for x in behavior_feature[neg_index,:]])
        temp_data_s.append([s for x in behavior_feature[pos_index,:]])
        temp_data_s.append([s for x in behavior_feature[neg_index,:]])
        s = s + 1 
    data_x = np.vstack(temp_data_x)
    data_y = np.concatenate(temp_data_y)
    data_s = np.concatenate(temp_data_s)
    return data_x, data_y, data_s

In [10]:
data_x, data_y, data_s = prepare_labels(2,5,2,4)

In [11]:
#Check the shape of the data
data_s.shape

(2603,)

In [12]:
#Training Function 
def train(pos_source_label, pos_resp_label, neg_source_label, neg_response_label, pos_label, neg_label):
    temp_data_x = []
    temp_data_y = []
    temp_data_s = []
    s = 0
    for i,j,k in zip(source_label,resp_label,behav_feat):
        pos_index = (i.flatten()==pos_source_label) & (j.flatten()==pos_resp_label) 
        neg_index = (i.flatten()==neg_source_label) & (j.flatten()==neg_response_label)
        temp_data_x.append(k[pos_index,:])
        temp_data_x.append(k[neg_index,:])
        temp_data_y.append([pos_label for x in k[pos_index,:]])
        temp_data_y.append([neg_label for x in k[neg_index,:]])
        temp_data_s.append([s for x in k[pos_index,:]])
        temp_data_s.append([s for x in k[neg_index,:]])
        s = s + 1 
    data_x = np.vstack(temp_data_x)
    data_y = np.concatenate(temp_data_y)
    data_s = np.concatenate(temp_data_s)
    crossval = LeaveOneGroupOut()
    clf = LinearDiscriminantAnalysis(shrinkage = 'auto', solver = 'eigen')
    true = []
    pred = []
    trained = []
    cvit = 0
    
    scores

    for train_index, test_index in crossval.split(data_x, data_y, groups = data_s):
        cvit = cvit + 1
        clf.fit(data_x[train_index], data_y[train_index])
        pred.append(clf.predict(data_x[test_index]))
        true.append(data_y[test_index])
        trained.append(clf)
        
    pred_mod = flatten(pred)
    true_mod = flatten(true)
    acc_score = accuracy_score(true_mod, pred_mod)
    return acc_score

In [13]:
acc_score = train(2,5,2,4,'SN','MN')

In [14]:
acc_score

0.5973876296580868

In [15]:
#Need to find a way to check this because I have modularize the above function
y = clf.decision_function(data_x[test_index])
w_norm = np.linalg.norm(clf.coef_)
dist = y #/ w_norm
stats.describe(dist)

NameError: name 'clf' is not defined

In [28]:
#implement the same for F vs RS// didnt change it into function format
temp_data_x_F_RS= []
temp_data_y_F_RS = []
temp_data_s_F_RS = []
s_F_RS = 0
for i,j,k in zip(source_label,resp_label,behav_feat):
    pos_index = (i.flatten()==1) & (j.flatten()==1) 
    neg_index = ((i.flatten()==1) | (i.flatten()==3)) & (j.flatten()==3)
    temp_data_x_F_RS.append(k[pos_index,:])
    temp_data_x_F_RS.append(k[neg_index,:])
    temp_data_y_F_RS.append(['RS' for x in k[pos_index,:]])
    temp_data_y_F_RS.append(['F' for x in k[neg_index,:]])
    temp_data_s_F_RS.append([s_F_RS for x in k[pos_index,:]])
    temp_data_s_F_RS.append([s_F_RS for x in k[neg_index,:]])
    s_F_RS = s_F_RS + 1 
data_x_F_RS = np.vstack(temp_data_x_F_RS)
data_y_F_RS = np.concatenate(temp_data_y_F_RS)
data_s_F_RS = np.concatenate(temp_data_s_F_RS)

In [30]:
crossval_F_RS = LeaveOneGroupOut()
clf_F_RS = LinearDiscriminantAnalysis(shrinkage = 'auto', solver = 'eigen')
true_F_RS = []
pred_F_RS = []
trained_F_RS = []
cvit_F_RS = 0

for train_index_F_RS, test_index_F_RS in crossval_F_RS.split(data_x_F_RS, data_y_F_RS, groups = data_s_F_RS):
    cvit_F_RS = cvit_F_RS + 1
    clf_F_RS.fit(data_x_F_RS[train_index_F_RS], data_y_F_RS[train_index_F_RS])
    pred_F_RS.append(clf_F_RS.predict(data_x_F_RS[test_index_F_RS]))
    true_F_RS.append(data_y_F_RS[test_index_F_RS])
    trained_F_RS.append(clf_F_RS)
    print('Pred: ', pred_F_RS)
    print('True: ', true_F_RS)

Pred:  [array(['RS', 'F', 'RS', 'RS', 'F', 'F', 'RS', 'RS', 'RS', 'RS', 'RS',
       'RS', 'RS', 'RS', 'RS', 'F', 'RS', 'F', 'RS', 'F', 'F', 'RS', 'RS',
       'F', 'RS', 'RS', 'F', 'RS', 'RS', 'F', 'RS', 'F', 'RS', 'RS', 'RS',
       'RS', 'F', 'RS', 'F', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS',
       'F', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS',
       'RS', 'RS', 'RS', 'F', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS',
       'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'F', 'F', 'RS', 'RS',
       'RS', 'RS', 'F', 'RS', 'F', 'RS', 'F', 'RS', 'F', 'RS', 'RS', 'F',
       'RS', 'RS', 'F', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS',
       'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'F', 'RS', 'RS', 'RS', 'F',
       'RS', 'RS', 'RS', 'RS', 'F', 'F', 'F', 'RS', 'RS', 'RS', 'RS', 'F',
       'RS', 'RS', 'F', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS',
       'RS', 'F', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'RS', 'F', 'RS',
       'F', 'RS', 'RS', 'RS', 'RS'], dtype='<U2')]


In [31]:
pred_mod_F_RS = flatten(pred_F_RS)
true_mod_F_RS = flatten(true_F_RS)
len(pred_mod_F_RS)
acc_score_F_RS = accuracy_score(true_mod_F_RS, pred_mod_F_RS)
acc_score_F_RS

0.7350196463654224