# Notebook #3: PCA Rotation

This notebook runs Principal Components Analysis on the response-locked data,
and uses the rotation vectors to transform the stimulus- and response-locked epochs for further analysis.

In [None]:
# coding: utf-8
import os
import sys
from glob import glob
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import mne
from mne.decoding import UnsupervisedSpatialFilter
# from sklearn.decomposition import PCA, FastICA, FactorAnalysis, IncrementalPCA

import sys
sys.path.append('src')
import eegf # My generic EEG Functions
import functions # Specific to this analysis
from functions import topomap, do_rt_comparison, do_threeway_comparison, do_twoway_comparison, do_component

# Global matplotlib styling: one base font size with every text element sized
# relative to it, a default figure size, and axes drawn without the top/right
# spines and with black edges.
mpl.rcParams.update({
    'font.size': 20,
    'axes.titlesize': 'medium',
    'axes.labelsize': 'medium',
    'xtick.labelsize': 'medium',
    'ytick.labelsize': 'medium',
    'legend.fontsize': 'medium',
    'figure.titlesize': 'medium',
    'figure.figsize': (8, 6),
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.edgecolor': 'k',
})

# Scale factor used throughout the notebook (thresholds and plotting).
million = 10 ** 6

# Participant IDs 1001-1021; 1007 is not in the list.
subjects = [subject_id for subject_id in range(1001, 1022) if subject_id != 1007]

# Trial-level metadata table.
data = pd.read_csv('data/all_trial_metadata.csv')

# Make sure the output directory used by the savefig/savetxt calls below exists
# (-p: no error if it is already there).
%mkdir -p data/pca_rotation

# Response-locked data


In [None]:
# Read the response-locked epochs from disk, then build a second version by
# passing them through eegf.surface_laplacian (project helper; the meaning of
# m=5 is defined there -- TODO confirm).
response_epochs = mne.read_epochs('data/response_epo.fif')
response_epochs_csd = eegf.surface_laplacian(response_epochs, m=5)

In [None]:
# data.groupby('participant')['action'].sum().plot(label='Originals')
# response_epochs.metadata.groupby('participant').count()['cb'].plot(label='EEG')
# plt.legend()
# plt.title('How many trials do we have?')

In [None]:
def plot_erp(epochs, ch=9):
    """Plot one channel of `epochs` with eegf.plot_mean_sem.

    Parameters
    ----------
    epochs : object exposing ``get_data()`` and ``times``
        The epochs to plot (the notebook passes the objects returned by
        ``mne.read_epochs`` / ``eegf.surface_laplacian``).
    ch : int
        Index along the second axis of ``epochs.get_data()`` selecting the
        channel to plot. Defaults to 9.
    """
    time_axis = epochs.times
    single_channel = epochs.get_data()[:, ch]
    eegf.plot_mean_sem(single_channel, time_axis)


plot_erp(response_epochs)

In [None]:
# Same plot (default channel index) for the surface-Laplacian version of the epochs.
plot_erp(response_epochs_csd)

In [None]:
# Flag outlier trials on the response-locked data (threshold = 120 / million),
# then keep only the unflagged trials in both the raw and the CSD epochs so
# the two stay trial-aligned.
resp_rej = functions.find_outlier_trials(response_epochs, thresh=120. / million)
keep_resp = np.logical_not(resp_rej)
response_epochs = response_epochs[keep_resp]
response_epochs_csd = response_epochs_csd[keep_resp]

In [None]:
# Number of response-locked epochs left after the rejection mask was applied.
len(response_epochs_csd)

In [None]:
# Re-plot the default channel now that flagged trials have been removed.
plot_erp(response_epochs_csd)

In [None]:
# Joint plot of the response-locked average; [-1, 0] is the second argument of
# eegf.plot_joint (passed as `times=` elsewhere in this notebook).
# The trailing `;` hides the cell's return-value repr.
eegf.plot_joint(response_epochs.average(), [-1, 0]);

In [None]:
# Joint plot of the surface-Laplacian response-locked average, with
# -1, -.05 and 0 passed as the time points.
eegf.plot_joint(response_epochs_csd.average(), [-1, -.05, 0]);

In [None]:
# Read the stimulus-locked ("trial") epochs and baseline-correct them with the
# (-.1, 0) interval. The surface-Laplacian copy is derived from the baselined
# epochs and then baselined again with the same interval.
trial_epochs = mne.read_epochs('data/trial_epo.fif').apply_baseline((-.1, 0))
trial_epochs_csd = eegf.surface_laplacian(trial_epochs, m=5).apply_baseline((-.1, 0))

In [None]:
# Plot channel index 26 of the surface-Laplacian stimulus-locked epochs.
plot_erp(trial_epochs_csd, 26)

In [None]:
# Same outlier rejection as for the response-locked data: flag trials on the
# non-CSD epochs (threshold = 120 / million) and drop them from both versions.
trial_rej = functions.find_outlier_trials(trial_epochs, thresh=120. / million)
keep_trial = np.logical_not(trial_rej)
trial_epochs = trial_epochs[keep_trial]
trial_epochs_csd = trial_epochs_csd[keep_trial]

In [None]:
# Joint plot of the stimulus-locked average at the five listed time points.
eegf.plot_joint(trial_epochs.average(), [.125, .2, .27, .5, .625]);

In [None]:
# Same joint plot for the surface-Laplacian stimulus-locked average.
eegf.plot_joint(trial_epochs_csd.average(), [.125, .2, .27, .5, .625]);

In [None]:
# Side-by-side comparison of two channels in the stimulus- and response-locked
# epochs. Only the first 32 channels are indexed and plotted.
chan_indices = {name: idx for idx, name in enumerate(trial_epochs.ch_names[:32])}

plt.figure(figsize=(20, 5))
panels = [(trial_epochs, 'Stimulus-locked'),
          (response_epochs, 'Response-locked')]
for i, (epochs, title) in enumerate(panels, start=1):
    X = epochs.get_data()[:, :32] * million # Trial x Channel x Time
    plt.subplot(1, 2, i)
    for ch in ['Fcz', 'Pz']:
        ch_i = chan_indices[ch]
        eegf.plot_mean_sem(X[:, ch_i], epochs.times, label=ch)
    plt.title(title)
    plt.legend()
    eegf.flipy()
    plt.xlabel('Time (s)')
    # Greek mu + "V". Written as an escape so the label cannot be corrupted by
    # a file-encoding round trip (it was previously stored as the mojibake
    # sequence produced by reading UTF-8 as Latin-1).
    plt.ylabel('\u03bcV')
plt.show()

In [None]:
# Behavioural trial table, restricted to rows where `action` equals 1, plus
# the metadata frame attached to the (already filtered) response-locked epochs.
original_data = pd.read_csv('data/all_trial_metadata.csv')
is_action_trial = original_data['action'] == 1
original_response_data = original_data.loc[is_action_trial]
resp_data = response_epochs.metadata

In [None]:
# Per-participant counts of non-null `cb` entries in the behavioural table
# and in the surviving EEG epochs.
n_orig = original_response_data.groupby('participant').count()['cb']
eeg_counts = resp_data.groupby('participant').count()['cb']

# The `original` column is attached by position, not by participant label.
# NOTE(review): this assumes both tables contain the same participants in the
# same sorted order -- confirm.
n_eeg = eeg_counts.reset_index().rename({'cb': 'eeg'}, axis=1)
n_eeg = n_eeg.assign(original=n_orig.values).set_index('participant')

proportion_kept = (n_eeg['eeg'] / n_eeg['original']).astype(float)
print('Trials dropped:')
# Mean of the per-participant drop rates ...
print(np.mean(1 - proportion_kept))
# ... and the pooled drop rate over all trials.
print(1 - n_eeg['eeg'].sum() / n_eeg['original'].sum().astype(float))



## PCA - CSD

In [None]:
# Joint plot of the surface-Laplacian response-locked average at -1, -.1 and 0,
# shown before the PCA for reference.
eegf.plot_joint(response_epochs_csd.average(), times=[-1, -.1, 0]);

In [None]:
# Build the channel-by-channel covariance matrix that feeds the PCA.
# Input: surface-Laplacian response-locked epochs, first 32 channels only.
# (Alternatives tried earlier: the stimulus-locked CSD epochs, and the non-CSD
# response-locked epochs, either cropped to the same window or uncropped.)
respX = response_epochs_csd.get_data()[:, :32]

# Only the window selected by crop(-.2, 0) is used to estimate the covariance.
X = response_epochs_csd.copy().crop(-.2, 0).get_data()[:, :32]
cov_method = 'avg' # Almost no difference!
if cov_method == 'avg':
    # One covariance matrix per epoch, averaged across epochs.
    # np.cov already removes each row's mean, so subtracting the epoch's
    # grand mean first changes nothing beyond floating-point rounding.
    # It is kept so the numbers match earlier runs.
    covariance_csd = np.array([np.cov(epoch - epoch.mean()) for epoch in X])
    cov = covariance_csd.mean(0)
elif cov_method == 'concat':
    # A single covariance over all epochs joined end-to-end along time.
    cX = X.swapaxes(0, 1).reshape(X.shape[1], -1)
    cov = np.cov(cX)
else:
    # Fail loudly instead of leaving `cov` undefined (or stale from an earlier run).
    raise ValueError("cov_method must be 'avg' or 'concat', got %r" % (cov_method,))

sns.heatmap(cov, cmap='seismic', center=0)

In [None]:
# Shape of the cropped array the covariance was computed from.
X.shape

In [None]:
# Eigendecomposition of the covariance matrix.
# `cov` is real and symmetric, so use the symmetric solver: it guarantees real
# eigenvalues and orthonormal eigenvectors, which the general-purpose
# np.linalg.eig does not. eigh returns eigenvalues in ascending order, so they
# are re-sorted largest-first; eigenvectors are the COLUMNS of eig_vecs.
eig_vals, eig_vecs = np.linalg.eigh(cov)
ix = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[ix]
eig_vecs = eig_vecs[:, ix]

# Eigenvalues, relative to the mean eigenvalue
print('Eigenvalues:', eig_vals[:4] / eig_vals.mean())

## Variance explained: each eigenvalue as a share of the eigenvalue total
ve = eig_vals / eig_vals.sum()
print('Variance explained:', ve[:4])

## Residual variance explained: share of the variance not already accounted
## for by the preceding components
print('Residual variance explained:')
for i in range(4):
    print('\t', ve[i] / (1 - ve[:i].sum()))

In [None]:
# Bring in the two project helpers used in the next cells and print the
# docstring of correct_rotation_signs for reference.
from functions import plot_weight_topomaps, correct_rotation_signs
help(correct_rotation_signs)

In [None]:
# Topomaps of the first nine eigenvectors (rows of eig_vecs.T), as returned by
# the decomposition, i.e. before any sign correction.
plot_weight_topomaps(eig_vecs.T[:9], response_epochs.info);
plt.suptitle('Original weights')

In [None]:
# Run the eigenvectors through correct_rotation_signs (project helper; -2 and 0
# are presumably the bounds of a time window in seconds -- TODO confirm against
# its docstring), plot the result, then rebind eig_vecs so every later cell
# uses the sign-corrected vectors.
eig_vecs_flipped = correct_rotation_signs(eig_vecs, response_epochs_csd, -2, 0)
plot_weight_topomaps(eig_vecs_flipped.T[:9], response_epochs.info)
plt.suptitle('Flipped weights')
eig_vecs = eig_vecs_flipped

In [None]:
# 4 x 8 grid with one topomap per eigenvector column (32 panels), numbered from 1.
plt.figure(figsize=(14, 8))
for panel_number, component_weights in enumerate(eig_vecs.T[:32], start=1):
    ax = plt.subplot(4, 8, panel_number)
    topomap(component_weights, response_epochs.info, axes=ax)
    plt.title('Component %i' % panel_number)
plt.suptitle('All unrotated components')
plt.savefig('data/pca_rotation/pca_full.png')
plt.show()

In [None]:
# Scree plot: percent variance explained per component, numbered from 1.
component_numbers = range(1, len(ve) + 1)
plt.plot(component_numbers, ve * 100, '-o')

# Dashed reference line at the mean share of variance, i.e. where a component's
# eigenvalue equals the mean eigenvalue (labelled 'eigenvalue = 1').
mean_pct = ve.mean() * 100
x_lo, x_hi = plt.xlim()
plt.hlines(mean_pct, x_lo, x_hi, linestyle='dashed')
plt.text(19, .5 + mean_pct, 'eigenvalue = 1')

plt.ylabel('% variance explained')
plt.xlabel('Component')
plt.xticks(range(1, 32, 2))
plt.xlim(0, 33)
plt.savefig('data/pca_rotation/explained.png')
plt.show()

In [None]:
# Number of leading principal components kept for the rotation and figures below.
n_to_retain = 9

In [None]:
# Switch for the per-component figure loop below; also lower the base font
# size for those figures and create their output directory.
do_big_computations = True
mpl.rcParams['font.size'] = 14
%mkdir  -p figures/components/pca

In [None]:
# Rotation matrix = the first n_to_retain eigenvector columns. Both CSD epoch
# sets are passed through functions.rotate_epochs with the same matrix.
rotation_matrix = eig_vecs[:, :n_to_retain]
trial_epochs_pca = functions.rotate_epochs(trial_epochs_csd, rotation_matrix)
response_epochs_pca = functions.rotate_epochs(response_epochs_csd, rotation_matrix)

In [None]:
# One summary figure per retained PCA component, saved as SVG.
# Component numbering here (titles and file names) is zero-based.
if do_big_computations:
    for comp in range(n_to_retain):
        print(comp, end=', ')  # progress indicator
        component_fig = do_component(
            trial_epochs_pca,
            response_epochs_pca,
            ch=comp,
            weights=eig_vecs[:, comp],
            info=trial_epochs.info,
            title='PCA Component #%i' % comp,
            neg_up=False,
        )
        component_fig.savefig('figures/components/pca/c%i.svg' % comp)
        # Release the figure so memory does not accumulate across iterations.
        plt.close()

In [None]:
## Save to R
# export_to_df (project helper) is called once per epoch set with chans=[1]
# and a CSV path (presumably the file it writes -- TODO confirm); the
# returned values are kept in df1 / df2.
from functions import export_to_df
df1 = export_to_df(trial_epochs_pca,    chans=[1], filename='data/long_trial_pca.csv')
df2 = export_to_df(response_epochs_pca, chans=[1], filename='data/long_response_pca.csv')

In [None]:
# Persist the covariance matrix and its (sorted, sign-corrected) eigen-
# decomposition as comma-separated text files.
pca_outputs = [
    ('data/pca_rotation/resp_cov.csv', cov),
    ('data/pca_rotation/eig_vecs.csv', eig_vecs),
    ('data/pca_rotation/eig_vals.csv', eig_vals),
]
for out_path, array in pca_outputs:
    np.savetxt(out_path, array, delimiter=',')

# Varimax

In [None]:
# Varimax rotation of the retained components. The input to functions.varimax
# is each retained eigenvector column multiplied by its own eigenvalue
# (broadcast across rows).
# NOTE(review): PCA loadings are conventionally eigenvector * sqrt(eigenvalue);
# here the eigenvalue itself is used -- confirm this scaling is intentional.
n_to_retain = 9 # 3, 5, 8, or 9
varimax_vectors = functions.varimax(eig_vals[:n_to_retain] * eig_vecs[:, :n_to_retain], method='varimax')


In [None]:
# Topomaps of the varimax output as returned, before sign correction.
plot_weight_topomaps(varimax_vectors, response_epochs.info);
plt.suptitle('Original varimax weights')


In [None]:
# Sign-correct the varimax vectors with the same helper used for the raw
# eigenvectors. They are transposed on the way in and out, so the helper
# receives the same orientation as eig_vecs did. The figure is saved, then
# varimax_vectors is rebound to the corrected version for all later cells.
varimax_vectors_flipped = correct_rotation_signs(varimax_vectors.T, response_epochs_csd, -2, 0).T
plot_weight_topomaps(varimax_vectors_flipped, response_epochs.info)
plt.suptitle('Flipped varimax weights')
plt.savefig('data/pca_rotation/vmax_topo.svg')
varimax_vectors = varimax_vectors_flipped

In [None]:
# Sum of squared weights for each rotated component, normalised to sum to 1.
# This is NOT the variance explained.
r_eigvals = (varimax_vectors ** 2).sum(axis=1)
r_eigvals_prop = r_eigvals / r_eigvals.sum()

# Share of total variance carried by the retained unrotated components; used
# to scale the proportions above.
unrotated_share = eig_vals / eig_vals.sum()
retained_prop = unrotated_share[:n_to_retain].sum()

plt.plot(r_eigvals_prop * retained_prop, '-o')
x_lo, x_hi = plt.xlim()
plt.hlines(0, x_lo, x_hi)

In [None]:
# Same setup as for the PCA figures: enable the per-component figure loop, set
# the base font size, and create the varimax output directory.
do_big_computations = True
mpl.rcParams['font.size'] = 14
%mkdir  -p figures/components/vmax

In [None]:
# Rotation matrix for the varimax solution: transpose of varimax_vectors,
# divided by its largest (signed) entry. Applied to both CSD epoch sets.
L = varimax_vectors.T
L = L / L.max()
trial_epochs_vmax = functions.rotate_epochs(trial_epochs_csd, L)
response_epochs_vmax = functions.rotate_epochs(response_epochs_csd, L)

In [None]:
# One summary figure per varimax component, saved as SVG.
# Component numbering here (titles and file names) is zero-based.
if do_big_computations:
    for comp in range(n_to_retain):
        print(comp, end=', ')  # progress indicator
        component_fig = do_component(
            trial_epochs_vmax,
            response_epochs_vmax,
            ch=comp,
            weights=varimax_vectors[comp],
            info=trial_epochs.info,
            title='Varimax Component #%i' % comp,
            neg_up=False,
        )
        component_fig.savefig('figures/components/vmax/c%i.svg' % comp)
        # Release the figure so memory does not accumulate across iterations.
        plt.close()

In [None]:

## Save to R
# Same export as for the unrotated PCA data, now for the varimax-rotated
# epochs: chans=[1], one CSV path per epoch set. The response-locked file is
# read back in the check cell that follows.
from functions import export_to_df
df1 = export_to_df(trial_epochs_vmax,    chans=[1], filename='data/long_trial_vmax.csv')
df2 = export_to_df(response_epochs_vmax, chans=[1], filename='data/long_response_vmax.csv')

# np.savetxt('data/pca_rotation/resp_cov.csv', cov, delimiter=',')
# np.savetxt('data/pca_rotation/eig_vecs.csv', eig_vecs, delimiter=',')
# np.savetxt('data/pca_rotation/eig_vals.csv', eig_vals, delimiter=',')

In [None]:
# Double-check exported data
# Overlay the in-memory across-trial mean of channel index 1 (scaled by
# `million`) with the per-time mean of column 'ch1' read back from the
# exported CSV. The two curves are expected to coincide.
# Note: this rebinds X, which previously held the cropped PCA input array.
X = response_epochs_vmax.get_data()[:, 1] * million
plt.plot(response_epochs_vmax.times, X.mean(0), label='Original')

csv = pd.read_csv('data/long_response_vmax.csv')
m = csv.groupby('time')['ch1'].mean() * million
m.plot(label='Exported')
plt.legend();

## Full PCA Plot

In [None]:
# Indices of the varimax components shown in the summary figure.
components_to_plot = [2, 6, 1]
# Varimax-rotated data for both epoch sets, multiplied by `million`.
trialXR = trial_epochs_vmax.get_data() * million
respXR = response_epochs_vmax.get_data() * million

In [None]:
# Topomaps for just the selected rows of varimax_vectors.
v = varimax_vectors[components_to_plot, :]
plot_weight_topomaps(v, response_epochs.info);
# plt.savefig('figures/pca_prt1.svg')

In [None]:
# Four stacked panels: RT histogram above the stimulus-locked component traces,
# then the sign-flipped RT histogram above the response-locked traces.
# Each histogram shares x-limits with the trace panel beneath it.
fig, axes = plt.subplots(figsize=(12, 14), ncols=1, nrows=4,
                         gridspec_kw={'height_ratios': [.2, 1, .2, 1]})

plt.sca(axes[0])
plt.hist(resp_data['rt'], bins=100)
plt.xticks([])
plt.xlim(-.5, 2.)

plt.sca(axes[1])
# trialXR / respXR were already multiplied by `million` when they were built,
# so they are plotted as-is; scaling them a second time inflated the y-axis by
# a further factor of 1e6.
for i in components_to_plot:
    eegf.plot_mean_sem(trialXR[:, i], trial_epochs.times, label='C %i' % (i))
plt.legend(loc='upper left')
plt.vlines(0, linestyle='--', *plt.ylim())
plt.xlim(-.5, 2.)
plt.xlabel('Time from onset (s)')

plt.sca(axes[2])
# RTs negated so the histogram lines up with the response-locked time axis.
plt.hist(-1 * resp_data['rt'], bins=100)
plt.xticks([])
plt.xlim(-2, .5)

plt.sca(axes[3])
for i in components_to_plot:
    eegf.plot_mean_sem(respXR[:, i], response_epochs.times, label='C %i' % (i))
plt.vlines(0, linestyle='--', *plt.ylim())
plt.xlim(-2, .5)
plt.xlabel('Time to action (s)')
# plt.savefig('figures/pca_prt2.svg')
plt.tight_layout()
plt.show()