# LDA Projections onto the Discriminant Learned from the Complete Dataset

In [None]:
# Build from source
!pip install -e '../../[dev]'

Obtaining file:///Users/scottyang/Desktop/SP23_Working/EEG/EEG-Familiarity
Installing collected packages: EEG-Familiarity
  Attempting uninstall: EEG-Familiarity
    Found existing installation: EEG-Familiarity 0.0.1
    Uninstalling EEG-Familiarity-0.0.1:
      Successfully uninstalled EEG-Familiarity-0.0.1
  Running setup.py develop for EEG-Familiarity
Successfully installed EEG-Familiarity-0.0.1


In [None]:
import scipy.io
import pandas as pd
import numpy as np 
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import ledoit_wolf
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, roc_auc_score
import scipy.stats as stats

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import seaborn as sns
sns.set_style("darkgrid")

from EEG_Familiarity.preproc import preproc

# Short label codes for the two category sets used in this notebook.
# NOTE(review): presumably "source" and "response" classes of a trial - confirm.
source_info = [
    "SC",
    "CR",
    "SI",
    "M",
    "FA",
]
response_info = [
    "RS",
    "RO",
    "F",
    "MN",
    "SN",
]

# Ordering of the groups along the x-axis of the projection graph.
# NOTE(review): each pair looks like 1-based (source, response) codes into the
# two lists above - confirm against `preproc`.
x_axis = [
    (1, 1), (3, 1), (5, 1),
    (1, 2), (5, 2),
    (1, 3), (3, 3), (5, 3),
    (4, 4), (2, 4),
    (4, 5), (2, 5),
]

In [None]:
def cal_acc_balanced(clf, trail_num):
    """
    Leave-one-group-out LDA accuracy measured on class-balanced test folds.

    The groups are the values of ``subject`` returned by the preprocessing
    object. For every fold an LDA (eigen solver, no shrinkage) is fit on the
    training samples. The held-out samples are then randomly subsampled
    10 times so that both classes have the same count, and the accuracy of
    each balanced draw is recorded.

    Parameters
    ----------
    clf : str
        Which classifier to evaluate: ``"SN_MN"`` or ``"F_CR"``.
    trail_num
        Experiment identifier. It selects the file
        ``data_imbalLDA_{trail_num}.mat`` (relative to the working directory)
        and is forwarded to ``preproc``.

    Returns
    -------
    Mean accuracy over all folds and all balanced draws (NumPy scalar).

    Raises
    ------
    ValueError
        If ``clf`` is not a supported name. Raised before any data is loaded.
    """
    # Validate the classifier name first so that a typo fails immediately
    # instead of after (or instead of) reading the data file.
    if clf == "SN_MN":
        first_args, second_args = (2, 5, 2, 4), (4, 5, 4, 4)
    elif clf == "F_CR":
        first_args, second_args = (1, 3, 2, 4), (3, 3, 2, 5)
    else:
        raise ValueError("Unknown Classifier. Should be either `SN_MN` or `F_CR`")

    file_path = f"data_imbalLDA_{trail_num}.mat"
    data_preproc = preproc(file_path, trail_num)

    # Two (positive, negative) index pairs are selected and merged into a
    # single binary problem.
    pos1, neg1 = data_preproc.filter_index(*first_args)
    pos2, neg2 = data_preproc.filter_index(*second_args)
    pos_idx, neg_idx = data_preproc.merge_two_class(pos1, neg1, pos2, neg2)
    X, y, subject = data_preproc.get_data_by_index(pos_idx, neg_idx)

    logo = LeaveOneGroupOut()
    scores = []

    for train_idx, test_idx in logo.split(X, y, subject):
        X_test, y_test = X[test_idx, :], y[test_idx]
        LDA = LinearDiscriminantAnalysis(shrinkage=None, solver="eigen")
        LDA.fit(X[train_idx, :], y[train_idx])

        # Positions *within the test fold* of each class (label 1 is the
        # positive class). Distinct names keep the dataset-level
        # `pos_idx` / `neg_idx` above from being overwritten.
        test_pos = np.flatnonzero(y_test == 1)
        test_neg = np.flatnonzero(y_test != 1)
        pos_len, neg_len = len(test_pos), len(test_neg)

        acc = []
        for _ in range(10):
            # Randomly drop samples from the larger class so that both
            # classes contribute the same number of test points.
            if pos_len > neg_len:
                pos_chosen = np.random.choice(test_pos, neg_len, replace=False)
                neg_chosen = test_neg
            else:
                pos_chosen = test_pos
                neg_chosen = np.random.choice(test_neg, pos_len, replace=False)
            keep = np.concatenate([pos_chosen, neg_chosen])
            X_test_balanced, y_test_balanced = X_test[keep, :], y_test[keep]
            # Sanity check that the draw is balanced.
            # NOTE(review): this relies on labels being +1 / -1 so that a
            # balanced draw sums to zero - confirm against `preproc`.
            assert sum(y_test_balanced) == 0
            acc.append(LDA.score(X_test_balanced, y_test_balanced))
        scores.append(np.array(acc))

    return np.array(scores).mean()

In [None]:
# Projection of multiple groups onto the discriminant for one experiment.
# `clf` and `exp` only label the figure; they are defined here because no
# visible cell defines them, so `plt.title` would otherwise raise NameError.
clf = "SN_MN"
exp = 1

file_path = "../../../EEG-Familiarity-Prediction/data_imbalLDA_1.mat"

data_preproc = preproc(file_path, experiment_num=exp)
# Same filter arguments as the `SN_MN` branch of `cal_acc_balanced`.
pos1, neg1 = data_preproc.filter_index(2, 5, 2, 4)
pos2, neg2 = data_preproc.filter_index(4, 5, 4, 4)

pos_idx, neg_idx = data_preproc.merge_two_class(pos1, neg1, pos2, neg2)
X, y, subject = data_preproc.get_data_by_index(pos_idx, neg_idx)

LDA = LinearDiscriminantAnalysis(shrinkage=None, solver="eigen")

# From here on `pos_idx` / `neg_idx` are rebound to short lists.
# NOTE(review): these look like positions in `x_axis` ((4, 5), (2, 5) and
# (4, 4), (2, 4)) of the groups selected above - confirm.
pos_idx = [10, 11]
neg_idx = [8, 9]
# NOTE(review): the recorded run of this call failed with
# "ValueError: Found input variables with inconsistent numbers of samples:
# [2, 3813, 3813]", i.e. a length-2 argument was treated as sample data.
# Check the argument order against the `generate_projections` signature.
data_preproc.generate_projections(LDA, pos_idx, neg_idx, X, y, subject)
plt.title(f"{clf} Projection Experiment {exp}", fontsize=12)

ValueError: Found input variables with inconsistent numbers of samples: [2, 3813, 3813]

In [None]:
# Dimensions of the feature matrix passed to `generate_projections`.
X.shape

(3813, 72)

In [None]:
# Dimensions of the label vector passed to `generate_projections`.
y.shape

(3813,)

In [None]:
# Dimensions of the per-sample subject (group) vector passed to `generate_projections`.
subject.shape

(3813,)