In [21]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import re
import glob
import h5py
from bids import BIDSLayout
import nibabel as nib
from PIL import Image
import json

from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torchvision.models import  ResNet50_Weights
import torchvision.transforms as transforms

# Make the project package importable: `dir2` is the parent of the current
# working directory and `dir1` is the directory one level above that.
dir2 = os.path.abspath('..')
dir1 = os.path.dirname(dir2)

# Append only once so re-running the cell does not grow sys.path.
if dir1 not in sys.path:
    sys.path.append(dir1)
from tc2see import load_data

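### Global Variables

Reference copy of an ROI dictionary (ROI name → list of atlas label values). Note that it differs from the `ROIs` dictionary actually defined in the next code cell: here several regions are keyed by their numeric label (e.g. `"140"`, `"11"`) and combined ROIs such as `V1_V2_V3_V4` are included.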

```
ROIs = {
    "FFC": [18],"V1": [1],"V2": [4],"V3": [5],"V3A": [13],"V3B": [19],"V3CD": [158],"V4": [6],"V6": [3],"V7": [16],
    "V8": [7], "VMV1": [153],"VMV2": [160],"VMV3": [154],"LO1": [20],"LO2": [21],"PIT": [22],"VVC": [163], "140": [140], "11":[11],
    "85": [85], "83":[83], "82": [82], "87": [87], "V1_V2_V3_V4": [1,4,5,6], "V1_V2": [1,4], "PIT_FFC_VVC": [22, 18, 163]
}
```

In [22]:
# Subject numbers 5..39, with subject 13 left out.
subjects = [sub for sub in range(5, 40) if sub != 13]
# Two-character, zero-padded subject labels ("05", ..., "39").
subject_strs = [f"{sub:02d}" for sub in subjects]

# ROI name -> list of atlas label values that make up the ROI.
ROIs = {
    "FFC": [18],
    "V1": [1],
    "V2": [4],
    "V3": [5],
    "V3A": [13],
    "V3B": [19],
    "V3CD": [158],
    "V4": [6],
    "V6": [3],
    "V7": [16],
    "V8": [7],
    "VMV1": [153],
    "VMV2": [160],
    "VMV3": [154],
    "LO1": [20],
    "LO2": [21],
    "PIT": [22],
    "VVC": [163],
    "PH": [140],
    "PEF": [11],
    "IFSa": [82],
    "p9-46v": [83],
    "a9-46v": [85],
    "9a": [87],
}

In [23]:
# --- Dataset locations -------------------------------------------------------
# `dataset_path` is kept as an alias of `dataset_root`; both name the same folder.
dataset_root = dataset_path = Path('E:\\fmri_processing\\results')
dataset_layout = BIDSLayout(dataset_path / 'TC2See')
derivatives_path = dataset_path / 'derivatives_TC2See'
data_path = derivatives_path / 'fmriprep'

# --- Acquisition / dataset constants -----------------------------------------
tc2see_version, tr, num_runs = 3, 2, 6  # `tr` is presumably in seconds — TODO confirm

# --- Stimulus lookup ---------------------------------------------------------
# The HDF5 handle is opened read-only and left open at module level.
stimulus_images = h5py.File(derivatives_path / 'stimulus-images.hdf5', 'r')
# Position in the `stimulus_names` attribute -> stimulus name.
stimulus_id_map = dict(enumerate(stimulus_images.attrs['stimulus_names']))
images_dir = Path("E:/Decoding/bird_data/bird_images/docs/cropped")

# --- Keyword arguments shared by every `load_data` call ------------------------
# NOTE(review): `tr_offset` evaluates to 3.0 only because `num_runs` is 6. If the
# offset is meant to be a fixed delay divided by the TR, it should not be tied to
# the number of runs — confirm the intended numerator.
load_data_params = {
    'path': data_path / f'tc2see-v{tc2see_version}-fsaverage-surfs.hdf5',
    'tr_offset': num_runs / tr,
    'run_normalize': 'linear_trend',
    'interpolation': False,
}

### Representational Similarity Analysis

#### Create masks for relevant ROIs or ROI combinations

In [24]:
# Load the left/right hemisphere annotation files; element 0 of each result is
# the array that the ROI label values are matched against.
glasser_L = nib.freesurfer.io.read_annot("E:/fmri_processing/results/visualization/atlas/lh.HCPMMP1.annot")
glasser_R = nib.freesurfer.io.read_annot("E:/fmri_processing/results/visualization/atlas/rh.HCPMMP1.annot")

# For every ROI build one boolean mask: True where the annotation value is one
# of the ROI's label values, with the left hemisphere first and the right
# hemisphere appended after it.
ROI_masks = {
    roi_name: np.concatenate(
        [np.isin(glasser_L[0], label_values), np.isin(glasser_R[0], label_values)],
        axis=0,
    )
    for roi_name, label_values in ROIs.items()
}

#### Save ROI and layer representations to files if they haven't been saved already

In [30]:
# Export one parquet table per (subject, ROI): the BOLD columns selected by the
# ROI mask plus the stimulus id and stimulus category of every row.
roi_output_root = Path("E:/Decoding/fmri-preprocessing/img_bold_arrays")
roi_file_name = "roi_bold_for_imgs.parquet"

for subject_str in subject_strs:
    subject_roi_dir = roi_output_root / f"sub_{subject_str}" / "img_roi_dfs" / "ROIs"

    # Decide what is still missing by looking at the parquet file itself, not at
    # its directory: an interrupted run can leave the directory behind without
    # the file, and a directory check would then skip that ROI forever.
    missing_rois = {
        ROI: ROI_mask
        for ROI, ROI_mask in ROI_masks.items()
        if not (subject_roi_dir / ROI / roi_file_name).exists()
    }
    if not missing_rois:
        # Everything for this subject is already on disk; skip the data load.
        continue

    bold_run, stimulus_ids = load_data(
        **load_data_params,
        subject = f'sub-{subject_str}',
        run_ids = list(range(num_runs))
    )
    bold_run = pd.DataFrame(bold_run)
    # Column labels are converted to strings before writing
    # (presumably because the parquet writer needs string column names).
    bold_run.columns = bold_run.columns.astype(str)

    bold_run['stimulus_ids'] = stimulus_ids
    # Any stimulus whose name does not contain "Sparrow" is labelled "Warbler".
    bold_run['stimulus_category'] = bold_run['stimulus_ids'].apply(lambda x: "Sparrow" if "Sparrow" in stimulus_id_map[x] else "Warbler")

    # Keep the label columns aside so the ROI mask is applied to BOLD columns only.
    numeric_bold_run = bold_run.drop(columns=['stimulus_ids', 'stimulus_category'])
    stim_ids = bold_run['stimulus_ids'].values
    stim_cats = bold_run['stimulus_category'].values

    # Save ROI Representations
    for ROI, ROI_mask in missing_rois.items():
        roi_path = subject_roi_dir / ROI
        roi_path.mkdir(parents=True, exist_ok=True)
        bold_file_name = roi_path / roi_file_name

        bold_run_tmp = numeric_bold_run.loc[:, ROI_mask].copy()
        bold_run_tmp['stimulus_ids'] = stim_ids
        bold_run_tmp['stimulus_category'] = stim_cats

        # Write to a temporary name and rename afterwards, so the final file
        # name only ever refers to a completely written table.
        partial_file = bold_file_name.with_name(bold_file_name.name + ".tmp")
        bold_run_tmp.to_parquet(partial_file, index=False)
        partial_file.replace(bold_file_name)

[133 183 182  22 253 131 130 180 184  21 180 251  24 250 254 181 131 130
 182 212 183 212  22 210 181 183  20 184 132 134 253 214 251  22  24 214
 210 250  23 131 211 213  20 251 180  23  20 212 213 250  24 211 181 184
 252  21 134 253 213 210 254 182 211 214  23  21 130 133 254 132 252 132
 133 252 134  93 190 192  31 290  92  91 191 194  33 191 293  30 292 291
 193  92  91 192 121 190 121  31 122 193 190  34 194  90  94 290 123 293
  31  30 123 122 292  32  92 120 124  34 293 191  32  34 121 124 292  30
 120 193 194 294  33  94 290 124 122 291 192 120 123  32  33  91  93 291
  90 294  90  93 294  94 114 163 160  41 273 113 112 164 161  42 164 271
  43 270 274 162 113 112 160 204 163 204  41 200 162 163  40 161 111 110
 273 202 271  41  43 202 200 270  44 113 203 201  40 271 164  44  40 204
 201 270  43 203 162 161 272  42 110 273 201 200 274 160 203 202  44  42
 112 114 274 111 272 111 114 272 110  50 220 222 142 263  53  51 223 224
 144 223 261 143 262 260 221  53  51 222  64 220  6

KeyboardInterrupt: 

In [27]:
# NOTE: this entire cell is commented out and does not run.
# What the disabled code would do: for each subject it loads all runs, averages
# the rows that share a stimulus id, sorts the rows by stimulus category, and
# saves each ROI as a .npy array under "all_runs_avg_duplicates".

# for subject_str in subject_strs:
#     sparrow_file_paths = []
#     warbler_file_paths = []

#     bold_run, stimulus_ids = load_data(
#         **load_data_params,
#         subject = f'sub-{subject_str}',
#         run_ids = list(range(num_runs))
#     )

#     bold_run = pd.DataFrame(bold_run)
#     bold_run['stimulus_ids'] = stimulus_ids
#     bold_run['stimulus_category'] = bold_run['stimulus_ids'].apply(lambda x: "S" if "Sparrow" in stimulus_id_map[x] else "W")

#     numerical_cols = bold_run.select_dtypes(include='number').columns
#     categorical_cols = bold_run.select_dtypes(exclude='number').columns

#     # Create the aggregation dictionary
#     agg_dict = {col: 'mean' for col in numerical_cols}
#     agg_dict.update({col: 'first' for col in categorical_cols})


#     # NOTE(review): both branches below test 'average'; the second one was
#     # presumably meant to be a different option — confirm before re-enabling.
#     # if duplicate_behaviour == 'average':
#     #     bold_run = bold_run.groupby('stimulus_ids').agg(agg_dict)
#     # elif duplicate_behaviour == 'average':
#     #     bold_run = bold_run

#     bold_run = bold_run.groupby('stimulus_ids').agg(agg_dict)
#     bold_run = bold_run.sort_values(by='stimulus_category')
    
#     bold_run = bold_run.drop(columns=['stimulus_ids', 'stimulus_category'])
#     bold_run = bold_run.to_numpy()
        
#     # Save ROI Representations
#     for ROI, ROI_mask in ROI_masks.items():
#         roi_path = Path("E:/Decoding/fmri-preprocessing/img_bold_arrays") / f"sub_{subject_str}" / f"all_runs_avg_duplicates" / "ROIs" / ROI

#         if not roi_path.exists():
#             roi_path.mkdir(parents=True, exist_ok=True)
#             bold_file_name = roi_path / "roi_bold_for_imgs.npy"
#             bold_run_tmp = bold_run.copy()

#             masked_bold_run = bold_run_tmp[:, ROI_mask]

#             np.save(bold_file_name, masked_bold_run)