# part0: imports

In [25]:
import os, sys, pathlib
from pprint import pprint 
from importlib import reload
import logging
logging.basicConfig(level=logging.ERROR)
import warnings
warnings.simplefilter("ignore")


import pandas as pd
import numpy as np
import xarray as xr
from sklearn.decomposition import PCA
import scipy.linalg as linalg

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
from matplotlib.ticker import MaxNLocator


from tools import utilityTools as utility
from tools import dataTools as dt
import pyaldata as pyal

%matplotlib inline

root = pathlib.Path("/data")

# Compare different epochs

the idea is to see whether canonical axes between 2 animals provide a higher VAF for time epochs in the trial that they have not been trained on, compared to, for example, M1-PMd axes in a single animal.

Check the VAF for a single session as a test

In [2]:
animal= 'Chewie'
fname = root / animal / "Chewie_CO_CS_2016-10-21.mat"

df = pyal.mat2dataframe(fname, shift_idx_fields=True)

preprocessing

In [3]:
def get_target_id(trial):
    return int(np.round((trial.target_direction + np.pi) / (0.25*np.pi))) - 1

df = pyal.select_trials(df, df.result== 'R')
df = pyal.select_trials(df, df.epoch=='BL')
df = pyal.remove_low_firing_neurons(df, "M1_spikes", 1)
df = pyal.remove_low_firing_neurons(df, "PMd_spikes", 1)

df = pyal.add_firing_rates(df, 'smooth')

df["target_id"] = df.apply(get_target_id, axis=1)  # add a field `target_id` with int values



pca to m1 and pmd

In [4]:
M1_rates = np.concatenate(df.M1_rates.values, axis=0)
pca_model = PCA(n_components=20, svd_solver='full');
pca_model.fit(M1_rates);
df = pyal.apply_dim_reduce_model(df, pca_model, 'M1_rates', 'M1_pca');

PMd_rates = np.concatenate(df.PMd_rates.values, axis=0)
pca_model = PCA(n_components=20, svd_solver='full');
pca_model.fit(PMd_rates);
df = pyal.apply_dim_reduce_model(df, pca_model, 'PMd_rates', 'PMd_pca');


applying PCA for 300ms past movement onset

In [5]:
df_ = pyal.restrict_to_interval(df, start_point_name='idx_movement_on', rel_start=0, rel_end=30)

M1_rates = np.concatenate(df_.M1_rates.values, axis=0)
M1_model = PCA(n_components=20, svd_solver='full');
M1_model.fit(M1_rates);
df_ = pyal.apply_dim_reduce_model(df_, M1_model, 'M1_rates', 'M1_pca');

PMd_rates = np.concatenate(df_.PMd_rates.values, axis=0)
PMd_model = PCA(n_components=20, svd_solver='full');
PMd_model.fit(PMd_rates);
df_ = pyal.apply_dim_reduce_model(df_, PMd_model, 'PMd_rates', 'PMd_pca');

to make life easy, just limit everything to 1 target

In [6]:
df_= pyal.select_trials(df_, df_.target_direction ==0)

CCA on m1-pmd

In [23]:
d0 = np.concatenate(df_['M1_pca'].values, axis=0)
d1 = np.concatenate(df_['PMd_pca'].values, axis=0)

n_samples = min ([d0.shape[0], d1.shape[0]])
d0 = d0[:n_samples,:]
d1 = d1[:n_samples,:]

A, B, r, _, _ = dt.canoncorr(d0, d1, fullReturn=True)

In [8]:
print(f'the CCs are:{r}')

the CCs are:[0.94779339 0.87838562 0.83821908 0.73555619 0.70545989 0.66681631
 0.61031016 0.5606342  0.51480629 0.46384192 0.42750414 0.42011013
 0.32919582 0.29094555 0.28618524 0.23505218 0.22347631 0.11351897
 0.10931743 0.06073484]


---

From Bence's code [here](https://github.com/BeNeuroLab/pysubspaces/blob/4b7c7512ccdff8fdbd1334cb6c3acc8f5bdbf3d6/pysubspaces/subspaces.py#L29).

In [9]:
def variance_in_subspace(X, W):
    """
    Variance in a given subspace
    Parameters
    ----------
    X : 2D np.ndarray
        n_samples x n_features data array
    W : 2D np.ndarray
        n_components x n_features projection matrix
    Returns
    -------
    variance in the subspace
    """
    return np.trace(W @ np.cov(X.T) @ W.T)

In [22]:
variance_in_subspace (M1_rates, A.T @ M1_model.components_)/ np.var(M1_rates)


0.36386542693890445

but what is the total variance? what does it mean?
based on the Nat Comm paper and some playing: 
$$\%VAF=\frac{var(X)-var(X-XC^TA^T(A^TA)^{-1}AC)}{var(X)}$$
where:
- $A$ is the CCA output, projection matric
- $C$ is the `PCA_model.components_`
- $X$ is the data matrix, $T\times n$ with $T$ time points and $n$ neurons

In [26]:
C = M1_model.components_
X = M1_rates

VAF = np.var(X) - np.var(X-X@C.T@A.T@linalg.inv(A.T@A)@A@C)

VAF /= np.var(X)

VAF

-84.77620263749426

all the preprocessing steps

In [None]:
def prep_pyalData (df):
    df_= pyal.add_firing_rates(df, 'smooth')
    df_= pyal.select_trials(df_, df_.result== 'R')
    df_= pyal.select_trials(df_, df_.epoch=='BL')
    df_= pyal.select_trials(df_, df_.target_direction < -2)
    
    df_ = pyal.remove_low_firing_neurons(df_, "M1_spikes", 1)
    df_ = pyal.remove_low_firing_neurons(df_, "PMd_spikes", 1)
    
    df_ = pyal.restrict_to_interval(df_, start_point_name='idx_movement_on', rel_start=0, rel_end=30)
    
    M1_rates = np.concatenate(df_.M1_rates.values, axis=0)
    pca_model = PCA(n_components=20, svd_solver='full');
    pca_model.fit(M1_rates);
    df_ = pyal.apply_dim_reduce_model(df_, pca_model, 'M1_rates', 'M1_pca');

    PMd_rates = np.concatenate(df_.PMd_rates.values, axis=0)
    pca_model = PCA(n_components=20, svd_solver='full');
    pca_model.fit(PMd_rates);
    df_ = pyal.apply_dim_reduce_model(df_, pca_model, 'PMd_rates', 'PMd_pca');

    
    return df_

Prep both sessions

In [None]:
df_data_ = [prep_pyalData(df) for  df in df_data]

plot the CCs across animals, to one target

In [None]:
d0_M1 = np.concatenate(df_data_[0].M1_pca.values, axis=0)
d1_M1 = np.concatenate(df_data_[1].M1_pca.values, axis=0)

# same number of timepoints in both matrices
n_samples_M1 = min ([d0_M1.shape[0], d1_M1.shape[0]])
d0_M1_ = d0_M1[:n_samples_M1,:]
d1_M1_ = d1_M1[:n_samples_M1,:]

CC_M1 = dt.canoncorr(d0_M1_, d1_M1_)

_,ax = plt.subplots()

ax.plot(CC_M1, 'r-o', label='aligned')

ax.set_xlabel('components')
ax.set_ylabel('correlation')
ax.legend()
ax.set_ylim([0,1])
ax.set_title('CCA between M1 of two monkeys');

#### BE WARE!

results depend on the initial number of PCs, i.e., features in the data matrix.
For 10 PCs, the max CC will be 0.75 instead of 0.85

---
Next, let's add to the plot above, within monkey M1 CCs.

In [None]:
d0_PMd = np.concatenate(df_data_[0].PMd_pca.values, axis=0)
d1_PMd = np.concatenate(df_data_[1].PMd_pca.values, axis=0)

# same number of timepoints in both matrices
n_samples_d0 = min ([d0_M1.shape[0], d0_PMd.shape[0]])
n_samples_d1 = min ([d1_M1.shape[0], d1_PMd.shape[0]])
d0_PMd_ = d0_PMd[:n_samples_d0,:]
d1_PMd_ = d1_PMd[:n_samples_d1,:]
n_samples_PMd = min ([d0_PMd.shape[0], d1_PMd.shape[0]])
d0_PMd_ = d0_PMd[:n_samples_PMd,:]
d1_PMd_ = d1_PMd[:n_samples_PMd,:]



CC_d0 = dt.canoncorr(d0_M1_, d0_PMd_)
CC_d1 = dt.canoncorr(d1_M1_, d1_PMd_)
CC_PMd= dt.canoncorr(d0_PMd_, d1_PMd_)

_,ax = plt.subplots()

ax.plot(CC_M1, label='M1 across')
ax.plot(CC_PMd, label='PMd across')
ax.plot(CC_d0,'--', label='Monkey1 within')
ax.plot(CC_d1,'--', label='Monkey2 within')

ax.set_xlabel('components')
ax.set_ylabel('canonical correlation')
ax.legend()
ax.set_ylim([0,1])
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_title('CCA --- to a single target', usetex=True);

average over every target

In [None]:
def get_target_id(trial):
    return int(np.round((trial.target_direction + np.pi) / (0.25*np.pi))) - 1

def prep_pyalData_all_targets (df, n_components=20):
    df["target_id"] = df.apply(get_target_id, axis=1)  # add a field `target_id` with int values

    df_= pyal.add_firing_rates(df, 'smooth')
    df_= pyal.select_trials(df_, df_.result== 'R')
    df_= pyal.select_trials(df_, df_.epoch=='BL')
    
    df_ = pyal.remove_low_firing_neurons(df_, "M1_spikes", 1)
    df_ = pyal.remove_low_firing_neurons(df_, "PMd_spikes", 1)
    
    df_ = pyal.restrict_to_interval(df_, start_point_name='idx_movement_on', rel_start=0, rel_end=30)
    
    M1_rates = np.concatenate(df_.M1_rates.values, axis=0)
    pca_model = PCA(n_components=n_components, svd_solver='full');
    pca_model.fit(M1_rates);
    df_ = pyal.apply_dim_reduce_model(df_, pca_model, 'M1_rates', 'M1_pca');

    PMd_rates = np.concatenate(df_.PMd_rates.values, axis=0)
    pca_model = PCA(n_components=n_components, svd_solver='full');
    pca_model.fit(PMd_rates);
    df_ = pyal.apply_dim_reduce_model(df_, pca_model, 'PMd_rates', 'PMd_pca');

    
    return df_


df_data__ = [prep_pyalData_all_targets(df) for  df in df_data]

In [None]:
fig,ax = plt.subplots()

CC=[]
for target in range(8):
    dfs = [pyal.select_trials(df_,df_.target_id==target) for df_ in df_data__]
    
    # monkey0: m1~pmd
    CC.append(CCA_pyal(dfs[0],'M1_pca', dfs[0],'PMd_pca'))
    # monkey1: m1~pmd
    CC.append(CCA_pyal(dfs[1],'M1_pca', dfs[1],'PMd_pca'))
    # m1: mon1~mon2
    CC.append(CCA_pyal(dfs[0],'M1_pca', dfs[1],'M1_pca'))
    # pmd: mon1~mon2
    CC.append(CCA_pyal(dfs[0],'PMd_pca', dfs[1],'PMd_pca'))

CC= np.array(CC)

utility.shaded_errorbar(ax,CC[0::8,:].T, ls='--',label='monkey0: m1~pmd')
utility.shaded_errorbar(ax,CC[1::8,:].T,ls='--',label='monkey1: m1~pmd')
utility.shaded_errorbar(ax,CC[2::8,:].T,label='m1: mon0~mon1')
utility.shaded_errorbar(ax,CC[3::8,:].T,label='pmd: mon0~mon1')

ax.set_xlabel('components')
ax.set_ylabel('canonical correlation')
ax.legend()
ax.set_ylim([0,1])
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_title('CCA --- average of all targets', usetex=True);

let's also add inter target CCA in individual animals

In [None]:
fig,ax = plt.subplots()

target_CC_M1=[]
target_CC_PMd=[]
for target0 in range(8):
    dfs0 = [pyal.select_trials(df_,df_.target_id==target0) for df_ in df_data__]
    for target1 in range(target0,8):
        dfs1 = [pyal.select_trials(df_,df_.target_id==target1) for df_ in df_data__]
        
        target_CC_M1.append(CCA_pyal(dfs0[0],'M1_pca', dfs1[0],'M1_pca'))
        target_CC_M1.append(CCA_pyal(dfs0[1],'M1_pca', dfs1[1],'M1_pca'))

        target_CC_PMd.append(CCA_pyal(dfs0[0],'PMd_pca', dfs1[0],'PMd_pca'))
        target_CC_PMd.append(CCA_pyal(dfs0[1],'PMd_pca', dfs1[1],'PMd_pca'))


target_CC_M1= np.array(target_CC_M1)
target_CC_PMd= np.array(target_CC_PMd)

# utility.shaded_errorbar(ax,CC[0::8,:].T,ls='--',label='monkey0: m1~pmd')
# utility.shaded_errorbar(ax,CC[1::8,:].T,ls='--',label='monkey1: m1~pmd')
# utility.shaded_errorbar(ax,CC[2::8,:].T,label='m1: mon1~mon2')
# utility.shaded_errorbar(ax,CC[3::8,:].T,label='pmd: mon1~mon2')

utility.shaded_errorbar(ax,target_CC_M1.T,ls='--',marker='o',label='m1: inter-target')
utility.shaded_errorbar(ax,target_CC_PMd.T,ls='--',marker='o',label='pmd: inter-target')


ax.set_xlabel('components')
ax.set_ylabel('canonical correlation')
ax.legend()
ax.set_ylim([0,1])
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_title('CCA --- average of all target pairs', usetex=True);

## Same animal, different sessions

similar to the NN 2020 paper

In [None]:
pprint(goodFiles)

In [None]:
fileList2 = {'C':['/data/Chewie/Chewie_CO_CS_2016-10-14.mat', '/data/Chewie/Chewie_CO_CS_2016-10-21.mat'],
             'M':['/data/Mihili/Mihili_CO_CS_2015-05-12.mat', '/data/Mihili/Mihili_CO_CS_2015-05-11.mat']}
n_components = 10


df_data__ = {key:[] for key in fileList2}
for animal,fList in fileList2.items():
    for fname in fList:
        df = pyal.mat2dataframe(fname, shift_idx_fields=True)
        df = prep_pyalData_all_targets(df,n_components=n_components)
        df_data__[animal].append(df)

In [None]:
fig,ax = plt.subplots()

CC = {key:[] for key in fileList2}
for animal in df_data__:
    for target in range(8):
        dfs = [pyal.select_trials(df_,df_.target_id==target) for df_ in df_data__[animal]]

        CC[animal].append(CCA_pyal(dfs[0],'M1_pca', dfs[1],'M1_pca'))

CC0= np.array(CC['C'])
CC1= np.array(CC['M'])

utility.shaded_errorbar(ax, CC0.T, ls='--',label='monkeyC')
utility.shaded_errorbar(ax, CC1.T, ls='--',label='monkeyM')

ax.set_xlabel('components')
ax.set_ylabel('canonical correlation')
ax.legend()
ax.set_ylim([0,1.1])
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_title(f'M1 CCA in 2 sessions', usetex=True);

check the date between the 2 sessions

In [None]:
from datetime import datetime
for animal, dfList in df_data__.items():
    date = []
    for df in dfList:
        date.append(datetime.strptime(df.date[0], "%m-%d-%Y"))
    print(f'For {animal=} the gap is:{date[1] - date[0]}')


check the number of neurons in the `MIHILI` dataset

In [None]:
df_data__['M'][0].M1_rates[0].shape, df_data__['M'][1].M1_rates[0].shape

maybe fewer neurons in `M` is the reason for its better performance!

It seems like it! with 60 components:

![image.png](attachment:fc93787a-ad8f-49ef-8a19-5f460bdd33e3.png)

# Ideas:
+ **Explained variance**:
Juan said:
>it'd be interesting to look at the variance explained by the shared dimensions though (both within monkey and across).

Calculating the explained variance could be through the method outlined in the paper: "Gallego, Nat Comm 2018" which I guess is the following process of encoding and decoding:
    
$$\% VAF = VAF\frac{X \rightarrow PC \rightarrow CC_{M0\sim M1} \rightarrow PC' \rightarrow X'}
{X \rightarrow PC \rightarrow X'}$$

- [x] **Fewer PCs**: So far, CCA has converged in every case.
Might be due to 20 PCs as input which gives CCA all the information it can ask for!
> check this [file](./CCA-fewPCs.ipynb).
Nothing too exciting!

+ **stability**: use the same transformations for aligning later sessions, hoping M1-PMd drops in accuracy and M1-M1 across animals, doesn't?!
    + doesn't make a lot of sense, because whatever the PMd-M1 relations are, they should remain similar later on during the same task

+ **non-movement epochs**: If the idea is that CCA reveals the underlying organisation, canonical axes should generalise across different time points in a trial;
i.e., post stimulus and post movement, ...
