In [3]:
# CLIP model for fMRI text mapping
## using simple Ridge regression
# worked on algonauts dataset 

In [4]:
import os
import argparse
import math
import torch
import torchvision
# from torch.utils.tensorboard import SummaryWriter
from torchvision.transforms import ToTensor, Compose, Normalize
from tqdm import tqdm
import random
import numpy as np
import timm
import numpy as np
from PIL import Image
import pandas as pd

from einops import repeat, rearrange
from einops.layers.torch import Rearrange
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from timm.models.layers import trunc_normal_
from timm.models.vision_transformer import Block
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from lightly.loss import BarlowTwinsLoss
from lightly.loss import NTXentLoss
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from pycocotools.coco import COCO

from matplotlib import pyplot as plt
%matplotlib inline

# import wandb

In [5]:
def setup_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

setup_seed()

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Things dataset

In [7]:
# # load image features
# print('IMAGES -------')
# loaded = torch.load('../resources/clip_img_feat_224')
# train_feat_img = loaded['train_feat']
# test_feat_img = loaded['test_feat']
# train_feat_img = train_feat_img.squeeze(1)
# test_feat_img = test_feat_img.squeeze(1)
# train_feat_img = train_feat_img.detach().cpu().numpy()
# test_feat_img = test_feat_img.detach().cpu().numpy()
# print(train_feat_img.shape, test_feat_img.shape)
# print(np.max(train_feat_img), np.min(train_feat_img))
# scaler1 = MinMaxScaler()
# train_feat_img = scaler1.fit_transform(train_feat_img)
# test_feat_img = scaler1.transform(test_feat_img)
# print(np.max(train_feat_img), np.min(train_feat_img))
# train_feat_img = train_feat_img.astype('float32')
# test_feat_img = test_feat_img.astype('float32')

# # # load text features
# # print('TEXT -------')
# # loaded = torch.load('../resources/clip_txt_feat_224')
# # train_feat_img = loaded['train_feat']
# # test_feat_img = loaded['test_feat']
# # train_feat_img = train_feat_img.squeeze(1)
# # test_feat_img = test_feat_img.squeeze(1)
# # train_feat_img = train_feat_img.detach().cpu().numpy()
# # test_feat_img = test_feat_img.detach().cpu().numpy()
# # print(train_feat_img.shape, test_feat_img.shape)
# # print(np.max(train_feat_img), np.min(train_feat_img))
# # scaler1 = MinMaxScaler()
# # train_feat_img = scaler1.fit_transform(train_feat_img)
# # test_feat_img = scaler1.transform(test_feat_img)
# # print(np.max(train_feat_img), np.min(train_feat_img))
# # train_feat_img = train_feat_img.astype('float32')
# # test_feat_img = test_feat_img.astype('float32')


# # FMRI
# print('FMRI ----------')
# train_file = np.load('../resources/sub2-fmri-train.npy')
# test_file = np.load('../resources/sub2-fmri-test.npy')
# train_file = train_file.T
# test_file = test_file.T
# print(train_file.shape, test_file.shape)
# print(np.max(train_file), np.min(train_file))
# scaler2 = MinMaxScaler()
# train_file = scaler2.fit_transform(train_file)
# test_file = scaler2.transform(test_file)
# print(np.max(train_file), np.min(train_file))
# train_feat_fmri = train_file.astype('float32')
# test_feat_fmri = test_file.astype('float32')

# # # FMRI embeddings
# # train_file = np.load('../resources/ae_train_fmri_embed.npy')
# # test_file = np.load('../resources/ae_test_fmri_embed.npy')
# # print(train_file.shape, test_file.shape)
# # print(np.max(train_file), np.min(train_file))
# # train_file = train_file.astype('float32')
# # test_file = test_file.astype('float32')

### Algonauts NSD dataset

In [None]:
data_dir = '/home/Documents/research/projects/algonauts_2023/resources/algonauts_2023_challenge_data/'
data_dir = os.path.join(data_dir, 'subj01')

In [9]:
# # load image features
# loaded = torch.load(data_dir+'/clip_nsd_img_feat_224')
# feat_img = loaded['train_feat']
# feat_img = feat_img.squeeze(1)
# feat_img = feat_img.detach().cpu().numpy()
# print(feat_img.shape)

In [10]:
# # train test split because given test set does not have fMRI
# train_feat_img = feat_img[:7000]
# test_feat_img = feat_img[7000:]
# print(train_feat_img.shape, test_feat_img.shape)

In [11]:
# # normalize
# print(np.max(train_feat_img), np.min(test_feat_img))
# scaler1 = MinMaxScaler()
# train_feat_img = scaler1.fit_transform(train_feat_img)
# test_feat_img = scaler1.transform(test_feat_img)
# print(np.max(train_feat_img), np.min(train_feat_img))
# train_feat_img = train_feat_img.astype('float32')
# test_feat_img = test_feat_img.astype('float32')

In [12]:
nsd_captions = pd.read_csv("/home/Documents/research/projects/algonauts_2023/resources/nsd_captions.csv")
nsd_captions.shape

(49205, 6)

In [13]:
nsd_captions.head()

Unnamed: 0,image_id,nsd_id,coco_id,coco_caption_id,coco_caption,split
0,train-0001_nsd-00013.png,13,24610,162385,A disorderly living area is free from decorati...,Test
1,train-0001_nsd-00013.png,13,24610,168364,"A small living space with a couch, desk, chair...",Test
2,train-0001_nsd-00013.png,13,24610,170512,There is a laptop computer in a room with a co...,Test
3,train-0001_nsd-00013.png,13,24610,175837,A simple room with minimal decor and furniture.,Test
4,train-0001_nsd-00013.png,13,24610,178792,A small room with a computer and bookcase.,Test


In [14]:
def extract_nth_rows(dataframe, n):
    # Extract every nth row
    return dataframe.iloc[n-1::n]

nsd_captions_subset = extract_nth_rows(nsd_captions, 5)
nsd_captions_subset.shape

(9841, 6)

In [15]:
captions_list = nsd_captions_subset.coco_caption.tolist()
train_captions = captions_list[:7000]
test_captions = captions_list[7000:]

In [16]:
# load text feat
text_feat = np.load(data_dir+'/clip_nsd_caption_feat.npy')
text_feat = text_feat.squeeze(1)
text_feat.shape

(9841, 512)

In [17]:
# train test split because given test set does not have fMRI
train_feat_txt = text_feat[:7000]
test_feat_txt = text_feat[7000:]
print(train_feat_txt.shape, test_feat_txt.shape)

(7000, 512) (2841, 512)


In [18]:
# normalize
print(np.max(train_feat_txt), np.min(test_feat_txt))
scaler2 = MinMaxScaler()
train_feat_txt = scaler2.fit_transform(train_feat_txt)
test_feat_txt = scaler2.transform(test_feat_txt)
print(np.max(train_feat_txt), np.min(test_feat_txt))
train_feat_txt = train_feat_txt.astype('float32')
test_feat_txt = test_feat_txt.astype('float32')
# train_feat_txt, test_feat_txt

8.483438 -3.940516
1.0000001 -0.1722239


In [19]:
# load fMRI training data
fmri_dir = os.path.join(data_dir, 'training_split', 'training_fmri')
lh_fmri = np.load(os.path.join(fmri_dir, 'lh_training_fmri.npy'))
rh_fmri = np.load(os.path.join(fmri_dir, 'rh_training_fmri.npy'))

print('LH training fMRI data shape:')
print(lh_fmri.shape)
print('(Training stimulus images × LH vertices)')

print('\nRH training fMRI data shape:')
print(rh_fmri.shape)
print('(Training stimulus images × RH vertices)')

LH training fMRI data shape:
(9841, 19004)
(Training stimulus images × LH vertices)

RH training fMRI data shape:
(9841, 20544)
(Training stimulus images × RH vertices)


In [20]:
train_feat_fmri_lh = lh_fmri[:7000]
test_feat_fmri_lh = lh_fmri[7000:]
train_feat_fmri_rh = rh_fmri[:7000]
test_feat_fmri_rh = rh_fmri[7000:]
print(train_feat_fmri_lh.shape, test_feat_fmri_lh.shape)

(7000, 19004) (2841, 19004)


In [21]:
# normalize
print(np.max(train_feat_fmri_lh), np.min(train_feat_fmri_lh))
scaler3 = MinMaxScaler()
train_feat_fmri_lh = scaler3.fit_transform(train_feat_fmri_lh)
test_feat_fmri_lh = scaler3.transform(test_feat_fmri_lh)
print(np.max(train_feat_fmri_lh), np.min(train_feat_fmri_lh))
train_feat_fmri_lh = train_feat_fmri_lh.astype('float32')
test_feat_fmri_lh = test_feat_fmri_lh.astype('float32')

6.2208066 -5.5488534
1.0000001 0.0


In [22]:
print(np.max(train_feat_fmri_rh), np.min(train_feat_fmri_rh))
scaler4 = MinMaxScaler()
train_feat_fmri_rh = scaler4.fit_transform(train_feat_fmri_rh)
test_feat_fmri_rh = scaler4.transform(test_feat_fmri_rh)
print(np.max(train_feat_fmri_rh), np.min(train_feat_fmri_rh))
train_feat_fmri_rh = train_feat_fmri_rh.astype('float32')
test_feat_fmri_rh = test_feat_fmri_rh.astype('float32')

6.2803955 -6.224722
1.0000001 0.0


#### Model on text features

In [23]:
# train_feat_txt, test_feat_txt

In [29]:
%%time

clf_lh = Ridge(alpha=100)
# clf = ExtraTreesRegressor(n_estimators=500)
clf_lh.fit(train_feat_fmri_lh, train_feat_txt)
train_feat_img_pred_lh = clf_lh.predict(train_feat_fmri_lh)
test_feat_img_pred_lh = clf_lh.predict(test_feat_fmri_lh)
print(train_feat_img_pred_lh.shape, test_feat_img_pred_lh.shape)

(7000, 512) (2841, 512)
CPU times: user 26.4 s, sys: 2.26 s, total: 28.7 s
Wall time: 8.53 s


In [30]:
%%time

clf_rh = Ridge(alpha=100)
# clf = ExtraTreesRegressor(n_estimators=500)
clf_rh.fit(train_feat_fmri_rh, train_feat_txt)
train_feat_img_pred_rh = clf_rh.predict(train_feat_fmri_rh)
test_feat_img_pred_rh = clf_rh.predict(test_feat_fmri_rh)
print(train_feat_img_pred_rh.shape, test_feat_img_pred_rh.shape)

(7000, 512) (2841, 512)
CPU times: user 28.2 s, sys: 2.33 s, total: 30.5 s
Wall time: 9.07 s


In [31]:
train_feat_img_pred = np.mean([train_feat_img_pred_lh, train_feat_img_pred_rh],axis=0)
test_feat_img_pred = np.mean([test_feat_img_pred_lh, test_feat_img_pred_rh],axis=0)
print(train_feat_img_pred.shape, test_feat_img_pred.shape)

(7000, 512) (2841, 512)


In [32]:
# # normalize predictions
# test_feat_img_pred = scaler2.transform(test_feat_img_pred)      # didnt work
# print(np.max(test_feat_img_pred), np.min(test_feat_img_pred))
# test_feat_img_pred = test_feat_img_pred.astype('float32')
# print(np.max(test_feat_img_pred), np.min(test_feat_img_pred))

In [33]:
print(mean_squared_error(test_feat_txt, test_feat_img_pred))
print(mean_squared_error(train_feat_txt, train_feat_img_pred))

0.016338972
0.0082784835


In [35]:
clf_rh_weights = np.mean(clf_rh.coef_, axis=0)
clf_rh_weights.shape

(20544,)

In [36]:
clf_lh_weights = np.mean(clf_lh.coef_, axis=0)
clf_lh_weights.shape

(19004,)

### testing

In [37]:
test_num = 6

In [38]:
sample_test_img_embed = test_feat_img_pred[test_num]
sample_test_img_embed = sample_test_img_embed.reshape(1, -1)
sample_test_img_embed.shape

(1, 512)

In [39]:
similarities = cosine_similarity(sample_test_img_embed, train_feat_txt)
similarities.shape

(1, 7000)

In [40]:
pred_inx = 0
indices = np.argsort(similarities.flatten())[-1:]
print("predicted labels:")
for idx in indices:
    print(idx)
    pred_inx = idx
pred_inx

predicted labels:
4306


4306

In [41]:
print('Actual - ', test_captions[test_num])
print('predicted - ', train_captions[pred_inx])

Actual -  A man that is sitting down holding bread.
predicted -  A man is sitting with another and holding a plate of food.
