# In this notebook we compare 4 different NLP models

We chose:
 - GloVe as a baseline model
 - LSTM 
 - GPT2 (as a decoder transformer model)
 - BERT (as an encoder transformer model)

For each model, we extracted weights from the model, either dynamic or static, as a way of representing the artificial brain activations that result from natural language processing.
These activations were shaped by the results of the section 'ActivationExtractionProtocol'.

In [1]:
import warnings
warnings.simplefilter(action='ignore')

import os
import gc
import glob
import itertools
from tqdm import tqdm
from itertools import combinations
from joblib import Parallel, delayed

import umap
import scipy
import hdbscan
import nistats
import numpy as np
import pandas as pd
from sklearn import manifold
from sklearn.decomposition import PCA, FastICA
from sklearn.neighbors import kneighbors_graph
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.cluster import AgglomerativeClustering, KMeans

%matplotlib inline
import matplotlib
import seaborn as sns
import matplotlib.cm as cmx
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import nibabel as nib
import nilearn
from nilearn.image import load_img, mean_img, index_img, threshold_img, math_img, smooth_img, new_img_like
from nilearn.input_data import NiftiMapsMasker, NiftiMasker, NiftiLabelsMasker, MultiNiftiMasker
from nistats.second_level_model import SecondLevelModel
from nistats.thresholding import map_threshold
from nilearn import plotting
from nilearn import datasets
from scipy.stats import norm
from nilearn.surface import vol_to_surf

import utils 
import reporting
from logger import Logger
from linguistics_info import *

### Paths

In [None]:
# Language of the dataset under study; it is interpolated into the data paths defined in the next cell.
language = "english"

In [None]:
# Project root (note the trailing slash); every other location below is built from it.
PROJECT_PATH = "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/"

# Language-specific locations, derived from the root and the `language` variable.
OUTPUT_PATH = f"{PROJECT_PATH}derivatives/fMRI/maps/{language}"
INPUT_PATH = f"{PROJECT_PATH}data/stimuli-representations/{language}"
FMRIDATA_PATH = f"{PROJECT_PATH}data/fMRI/{language}"
MASKER_PATH = f"{PROJECT_PATH}global_masker_95%_{language}"


### Atlas

In [None]:
# Load the atlas through the project's `reporting` helper. The returned pair is
# unpacked into the atlas maps and their labels, both of which are passed to the
# ROI-level analyses further down.
(atlas_maps, labels) = reporting.load_atlas()

## Explainable ceiling

Before looking at the results of the comparison, we investigated how much of the signal could be explained once noise in the activations and variability among subjects are put aside.

In [2]:
# Force a full garbage-collection pass before loading the brain maps.
# gc.collect() returns the number of unreachable objects it found, which is
# what the cell displays as its output.
gc.collect()

28

In [None]:
# Folder holding the FastSRM outputs that are globbed in the next cell.
path_to_fast_srm_data = (
    "/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/"
    "oldstuff/fastsrm/"
)

In [None]:
# Gather every `R2_sub-*.npy` array found under the `sub-*` folders, in sorted order.
r2_pattern = os.path.join(path_to_fast_srm_data, 'sub-*/R2_sub-*.npy')
files = sorted(glob.glob(r2_pattern))

# Map each array back to image space with the masker.
# NOTE(review): `new_masker` is not defined in any cell visible above this one, so
# this cell fails on a fresh kernel. Presumably it is a fitted masker built from
# MASKER_PATH -- confirm and add the cell that creates it.
data = [new_masker.inverse_transform(np.load(npy_path)) for npy_path in files]

# Average the resulting images into a single map.
img = mean_img(data)

In [None]:
# Glass-brain view of the averaged R2 map computed in the previous cell.
# plot_abs=False keeps the sign of the values instead of plotting magnitudes.
plotting.plot_glass_brain(
    img,
    title='Predictable R2 value for each voxel with FastSRM',
    display_mode='lzry',
    colorbar=True,
    plot_abs=False,
)
plt.show()

We computed the explained variance by means of a cross-validated R2 value for each voxel.
Cross-validation was done over sessions and then over sampled test sets (26 test sets of 5 subjects each, sampled with replacement) from all subjects.
We therefore have X session-cross-validated R2 brain maps.
We average these maps and transform the resulting values to Pearson coefficients by taking the square root.

In [None]:
# Voxel-wise square root of the averaged R2 map, giving the Pearson-coefficient map.
# NOTE(review): voxels whose R2 is negative become NaN under np.sqrt -- confirm
# that this is the intended treatment.
pearson_img = math_img("np.sqrt(img)", img=img)

In [None]:
# Glass-brain view of the Pearson-coefficient map.
# Fix: the original cell plotted `img` (the R2 map) under a "Pearson" title, so the
# figure duplicated the previous one and `pearson_img` was never displayed.
plotting.plot_glass_brain(
    pearson_img,
    colorbar=True,
    display_mode='lzry',
    plot_abs=False,
    title='Predictable Pearson value for each voxel with FastSRM',
)
plt.show()

## Model comparison

Based on the results of the ActivationExtractionProtocol section, we compare the following models:
- GloVe
- LSTM (1 hidden layer - 300 units)
- GPT-2 (small version - all hidden layers + PCA with 300 components - 20 sentences of pre-context - norm infinity normalization before the pipeline - norm infinity normalization before the ridge regression)
- BERT (small-version - cased - all hidden layers + PCA with 300 components - 7 sentences of pre-context - norm infinity normalization before the ridge regression)

In [None]:
# Name templates of the compared models, in the order GloVe, LSTM, GPT-2, BERT.
# Each template carries a '{}' placeholder; a later cell strips the '_{}' part to
# obtain the key under which the model's data is stored.
model_names = [
    "glove_300_{}",
    "LSTM_embedding-size_600_nhid_300_nlayers_1_dropout_02_wiki_kristina_english_{}_all-hidden-layers",
    "gpt2_pre-20_1_norm-inf_norm-inf_{}_hidden-all-layers_pca_300",
    "bert-base-cased_pre-7_1_post-0_norm-None_norm-inf_temporal-shifting-0_{}_hidden-all-layers_pca_300",
]

# Short display names used in the plot legends; same order as `model_names`.
legend_names = [
    "GloVe",
    "LSTM-E600-H300-L1",
    "GPT2-scaled-Hpca-pre-20",
    "BERT-Hpca-pre-7-post-0",
]

In [None]:
# Build the table that will be fed to the ANOVA, from the compared models and the atlas.
# Fix: `language` and `OUTPUT_PATH` are now taken from the configuration cells instead
# of being hardcoded to the English values. MASKER_PATH is already derived from
# `language`, so the hardcoded values could silently mix languages if `language` changed.
anova_comparison = reporting.prepare_data_for_anova(
    model_names,
    atlas_maps,
    labels,
    MASKER_PATH,
    object_of_interest='Pearson_coeff',
    language=language,
    OUTPUT_PATH=OUTPUT_PATH,
)

In [None]:
# Destination folder for the ANOVA tables, located under the project root.
saving_folder = f"{PROJECT_PATH}derivatives/fMRI/anovas/"
# NOTE(review): `check_folder` is not imported by name; presumably it comes from the
# star import and creates the folder when it is missing -- confirm.
check_folder(saving_folder)

In [None]:
# Write the ANOVA input table to disk.
# Fix: the original referenced `saving_path`, which is never defined (NameError);
# the folder prepared in the previous cell is `saving_folder`.
anova_comparison.to_csv(os.path.join(saving_folder, 'anova_comparison.csv'))

TODO: Do the ANOVA analysis in R.

In [None]:
# Fetch the data of every available model, then keep only the compared models.
data_full = reporting.get_model_data(model_names, language, OUTPUT_PATH)

# Stored keys are the name templates with their '_{}' placeholder removed.
compared_keys = [template.replace('_{}', '') for template in model_names]
data_model_comparison = {key: data_full[key] for key in compared_keys}

# Aggregate per region of interest.
# Fix: `language` and `PROJECT_PATH` now come from the configuration cells rather than
# hardcoded literals, consistent with the `get_model_data` call above.
data_to_plot = reporting.get_data_per_roi(
    data_model_comparison,
    atlas_maps,
    labels,
    analysis=None,
    language=language,
    object_of_interest='Pearson_coeff',
    PROJECT_PATH=PROJECT_PATH,
)

# Per-ROI summary statistics used by the plots below.
mean = data_to_plot['mean']
third_quartile = data_to_plot['third_quartile']
maximum = data_to_plot['maximum']


In [None]:
# Third-quartile values per ROI, restricted to the ROIs returned by load_syntactic_roi().
clever_plot(
    third_quartile,
    labels,
    legend_names,
    roi_filter=load_syntactic_roi(),
    save_folder=None,
)

In [None]:
# Third-quartile values per ROI, restricted to the ROIs returned by load_language_roi().
clever_plot(
    third_quartile,
    labels,
    legend_names,
    roi_filter=load_language_roi(),
    save_folder=None,
)

In [None]:
# Third-quartile values per ROI, restricted to the ROIs returned by load_interesting_rois().
clever_plot(
    third_quartile,
    labels,
    legend_names,
    roi_filter=load_interesting_rois(),
    save_folder=None,
)

In [None]:
# Third-quartile values per ROI, restricted to the ROIs returned by load_intriguing_rois().
clever_plot(
    third_quartile,
    labels,
    legend_names,
    roi_filter=load_intriguing_rois(),
    save_folder=None,
)

In [None]:
# Vertical per-ROI plot of the third-quartile Pearson coefficients for the compared
# models. save_folder=None is passed, so no output folder is supplied to the helper.
reporting.vertical_plot(
    third_quartile,
    labels,
    'Third_Quartile_Pearson-coeff_per_ROI',
    save_folder=None,
    object_of_interest='Pearson_coeff',
    legend_names=legend_names,
    syntactic_roi=load_syntactic_roi(),
    language_roi=load_language_roi(),
    model_name='Model_comparison',
    title=None,
    xlabel='Pearson_coeff value',
    ylabel='Regions of interest (ROI)',
    figsize=(9, 18),
    count=False,
)