# part0: imports

In [None]:
import os, sys, pathlib
from pprint import pprint 
from importlib import reload
import logging
from typing import Callable
logging.basicConfig(level=logging.ERROR)
import warnings
warnings.simplefilter("ignore")



import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import scipy.linalg as linalg

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
from matplotlib.ticker import MaxNLocator


from tools import utilityTools as utility
from tools import dataTools as dt
import pyaldata as pyal

%matplotlib inline
reload(dt)

# Global params
# Root directory of the recordings; session files are resolved below as root/<animal>/<file name>.
root = pathlib.Path("/data")

BIN_SIZE = .03  # sec; bin width the data are re-binned to before analysis
WINDOW_prep = (-.39, .06)  # sec; (start, end) of the preparation window, relative to `idx_movement_on`
WINDOW_exec = (-.12, .42)  # sec; (start, end) of the execution window, relative to `idx_movement_on`

In [None]:
def get_target_id(trial):
    """
    Convert a trial's reach direction into an integer target index in 0..7.

    Parameters
    ----------
    `trial`: any object exposing `target_direction` in radians
        (e.g. one row of a trial dataframe when used with `df.apply(..., axis=1)`).

    Returns
    -------
    int in [0, 7]. Directions are binned in steps of pi/4:
    -3pi/4 -> 0, -pi/2 -> 1, ..., 0 -> 3, ..., +pi -> 7.
    """
    # -pi and +pi are the same physical direction. Without the wrap-around,
    # -pi would map to -1 and silently drop out of every `range(8)` loop.
    return (int(np.round((trial.target_direction + np.pi) / (0.25*np.pi))) - 1) % 8

In [None]:
# Executes the dataset-selection notebook inside this kernel's namespace.
# NOTE(review): presumably this is what defines `GoodDataList`, which the cells below index by area and animal - confirm.
%run dataset_selection.ipynb

## Control

Let's find a lower bound: canonical correlations computed without matching the animal, the area, the epoch, or the target!

### Details
- **animal**: $C_L$  is Chewie
- **window**: For *M1* it is $-120ms \sim +420ms$ and for *PMd* from $-390ms  \sim +60ms$
- **dim**: it is 10 for M1 and 15 for PMd
- **preprocessing**: 
    - remove neurons with an overall firing rate below 1 Hz
    - bin to 30ms
    - square root transform
    - smooth by Gaussian kernel, $\sigma=50ms$
    
I'm including only one session per animal

In [None]:
areas = ('M1', 'PMd')
m1_area, pmd_area = areas

# Collect one (M1 session, PMd session) pair for every combination of two
# different animals, always taking the first listed session of each animal.
pairFileList = []
for animal1 in GoodDataList[m1_area]:
    for animal2 in GoodDataList[pmd_area]:
        has_second_array = '2' in animal1 or '2' in animal2  # to remove Chewie2
        if animal1 != animal2 and not has_second_array:
            path1 = root / animal1 / GoodDataList[m1_area][animal1][0]
            path2 = root / animal2 / GoodDataList[pmd_area][animal2][0]
            session_pair = tuple(pyal.mat2dataframe(path, shift_idx_fields=True)
                                 for path in (path1, path2))
            pairFileList.append(session_pair)

print(f'{len(pairFileList)=}')

In [None]:
def prep_general(df, area='M1'):
    """
    General preprocessing of one pyaldata session.

    Steps, in order:
      1. add an integer `target_id` column (computed by `get_target_id`);
      2. remove neurons of `{area}_spikes` firing below 1 Hz overall;
      3. keep only trials with `result == 'R'` and `epoch == 'BL'`;
      4. check the data are binned at 10 ms and re-bin them to `BIN_SIZE`;
      5. square-root transform `{area}_spikes`;
      6. add smoothed firing rates (Gaussian kernel, std = 0.05 s).

    Parameters
    ----------
    `df`: pd.DataFrame of one session, as loaded by `pyal.mat2dataframe`.
        NOTE: the `target_id` column is written onto `df` itself, so the
        caller's frame gains that column too.
    `area`: prefix of the spike field to process, e.g. 'M1' or 'PMd'.

    Returns
    -------
    The preprocessed dataframe.
    """
    df["target_id"] = df.apply(get_target_id, axis=1)  # add a field `target_id` with int values

    df_ = pyal.remove_low_firing_neurons(df, f"{area}_spikes", 1)

    # Keep chaining on `df_`: selecting from `df` here would throw away the
    # low-firing-neuron removal done on the previous line.
    df_ = pyal.select_trials(df_, df_.result == 'R')
    df_ = pyal.select_trials(df_, df_.epoch == 'BL')

    # Validate the session that is actually being processed, not a global list.
    assert np.all(df_.bin_size == .01), 'bin size is not consistent!'
    # round() before int(): a float ratio such as 2.9999999999999996 must not truncate to 2
    df_ = pyal.combine_time_bins(df_, int(round(BIN_SIZE / .01)))

    df_ = pyal.sqrt_transform_signal(df_, f"{area}_spikes")

    df_ = pyal.add_firing_rates(df_, 'smooth', std=0.05)

    return df_


# NOTE(review): `df_data_M1` is not assigned in any visible cell of this notebook;
# it has to exist (a list of raw session dataframes) before this cell is run.
df_M1_ready = [prep_general(df) for  df in df_data_M1]

# Epoch functions cut a fixed window around movement onset. The window limits
# are converted from seconds to bins with round() instead of a bare int():
# int() truncates, so a ratio that lands just below an integer because of
# floating-point error (e.g. 12.999999999999998) would lose a whole bin.
prep_epoch = pyal.generate_epoch_fun(start_point_name='idx_movement_on',
                                     rel_start=int(round(WINDOW_prep[0]/BIN_SIZE)),
                                     rel_end=int(round(WINDOW_prep[1]/BIN_SIZE))
                                    )
exec_epoch = pyal.generate_epoch_fun(start_point_name='idx_movement_on', 
                                     rel_start=int(round(WINDOW_exec[0]/BIN_SIZE)),
                                     rel_end=int(round(WINDOW_exec[1]/BIN_SIZE))
                                    )

Finding the minimum number of trials per target across all datasets

and some other parameters

collecting all the data in a matrix, `AllData`: $sessions \times targets \times  trials \times time \times PCs$

In [None]:
def get_data_array(data_list: list, epoch , area: str ='M1', n_components: int =10, n_targets: int =8) -> np.ndarray:
    """
    Applies PCA to the data and return a data matrix of the shape: sessions x targets x  trials x time x PCs

    For every dataset the `{area}_rates` field is restricted to `epoch`, a PCA
    model is fitted on the mean-centred, trial-concatenated rates, and the
    projected trials are stored per target. Each target contributes the same
    number of trials in every session: the minimum trial count found for any
    target in any dataset. Which trials are kept is drawn with a fixed seed.

    Parameters
    ----------
    `data_list`: list of pd.dataFrame datasets from pyal-data;
        each must already contain `target_id`, `trial_id` and `{area}_rates`
    `epoch`: an epoch function of the type `pyal.generate_epoch_fun`

    `area`: area, either: 'M1', or 'S1', or 'PMd'
    `n_components`: number of principal components to keep
    `n_targets`: number of targets; `target_id` is expected in `range(n_targets)`

    Returns
    -------
    `AllData`: np.array

    Raises
    ------
    ValueError: if `data_list` is empty

    Example
    -------
    AllData = get_data_array(data_list, execution_epoch, n_components=10)
    all_data = np.reshape(AllData, (-1,10))
    """
    if len(data_list) == 0:
        raise ValueError('`data_list` is empty: at least one dataset is required')

    field = f'{area}_rates'

    # smallest number of trials available for any target in any dataset
    n_shared_trial = min(int((df.target_id == target).sum())
                         for df in data_list
                         for target in range(n_targets))

    rng = np.random.default_rng(12345)
    AllData = None  # allocated once the number of time points is known
    for session, df in enumerate(data_list):
        df_ = pyal.restrict_to_interval(df, epoch_fun=epoch)
        rates = np.concatenate(df_[field].values, axis=0)
        rates -= np.mean(rates, axis=0)
        rates_model = PCA(n_components=n_components, svd_solver='full').fit(rates)
        df_ = pyal.apply_dim_reduce_model(df_, rates_model, field, '_pca')

        if AllData is None:
            # Positional access (`.values[0]`), so this does not depend on the
            # frame still carrying an index label 0.
            n_timepoints = int(df_[field].values[0].shape[0])
            # pre-allocating the data matrix
            AllData = np.empty((len(data_list), n_targets, n_shared_trial, n_timepoints, n_components))

        for target in range(n_targets):
            df__ = pyal.select_trials(df_, df_.target_id == target)
            # Shuffle a private copy: `to_numpy()` may hand back a view of the
            # column, and shuffling that in place would scramble `trial_id`.
            all_id = df__.trial_id.to_numpy(copy=True)
            rng.shuffle(all_id)
            # select the right number of trials to each target
            kept_ids = all_id[:n_shared_trial]
            df__ = pyal.select_trials(df__, df__.trial_id.isin(kept_ids))
            for trial, trial_rates in enumerate(df__._pca):
                AllData[session, target, trial, :, :] = trial_rates

    return AllData

do the CCA calculation

In [None]:
n_iter = 100

# NOTE(review): `AllData` is never assigned at top level in the visible cells;
# it must be built beforehand with `get_data_array(...)`.
# The sizes are read back from the array (sessions x targets x trials x time x PCs)
# because `n_shared_trial` and `n_components` only exist inside `get_data_array`.
n_shared_trial = AllData.shape[2]
n_components = AllData.shape[-1]
rng = np.random.default_rng(12345)

half = n_shared_trial // 2
trialList = np.arange(n_shared_trial)

CCs=[]
for session, sessionData in enumerate(AllData):
    r = []
    for n in range(n_iter):
        rng.shuffle(trialList)
        # non-overlapping randomised trials, both halves exactly `half` long.
        # (slicing the tail with `[-n//2:-1]` gives one trial fewer than the
        # first half whenever the trial count is even)
        trial1 = trialList[:half]
        trial2 = trialList[half:2*half]
        data1 = np.reshape(sessionData[:,trial1,:,:], (-1,n_components))
        data2 = np.reshape(sessionData[:,trial2,:,:], (-1,n_components))
        # summary statistic: mean of the 4 leading canonical correlations
        r.append(np.mean(dt.canoncorr(data1, data2)[:4]))
    CCs.append(r)
CCs = np.array(CCs).T  # -> n_iter x sessions

In [None]:
# one column of dots per session, one dot per random split
axes = plt.gca()
axes.plot(CCs.T, '.')
axes.set_ylim([0, 1])

the average distribution of CCs for each session

In [None]:
n_iter = 100

# NOTE(review): `AllData` is never assigned at top level in the visible cells;
# it must be built beforehand with `get_data_array(...)`.
# The sizes are read back from the array (sessions x targets x trials x time x PCs)
# because `n_shared_trial` and `n_components` only exist inside `get_data_array`.
n_shared_trial = AllData.shape[2]
n_components = AllData.shape[-1]
rng = np.random.default_rng(12345)

half = n_shared_trial // 2
trialList = np.arange(n_shared_trial)

CCs=[]
for session, sessionData in enumerate(AllData):
    r = []
    for n in range(n_iter):
        rng.shuffle(trialList)
        # non-overlapping randomised trials, both halves exactly `half` long.
        # (slicing the tail with `[-n//2:-1]` gives one trial fewer than the
        # first half whenever the trial count is even)
        trial1 = trialList[:half]
        trial2 = trialList[half:2*half]
        data1 = np.reshape(sessionData[:,trial1,:,:], (-1,n_components))
        data2 = np.reshape(sessionData[:,trial2,:,:], (-1,n_components))
        # keep the full set of canonical correlations for this split
        r.append(dt.canoncorr(data1, data2))
    CCs.append(np.array(r))
CCs = np.array(CCs).T  # transposed from sessions x n_iter x CCs -> CCs x n_iter x sessions

In [None]:
# `CCs` is indexed as (canonical correlation, iteration, session) after the
# transpose in the previous cell. Loop over however many sessions there are
# instead of a hard-coded 4, which breaks with fewer sessions and silently
# skips the rest with more.
for i in range(CCs.shape[-1]):
    # average over the random splits -> one curve per session
    plt.plot(np.mean(CCs[:,:,i],axis=1))