In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Single source of truth for the local checkpoint directory, so the model,
# image processor and tokenizer can never be loaded from different locations.
MODEL_DIR = "./backbone.nosync/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(MODEL_DIR)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Settings forwarded to `model.generate` by the cells below.
max_length = 16
num_beams = 4
gen_kwargs = {
    "max_length": max_length,
    "num_beams": num_beams,
    # The cells below read `.sequences` and `.encoder_hidden_states` from the
    # generate result, which requires both of the following flags.
    "return_dict_in_generate": True,
    "output_hidden_states": True,
}

# ['a woman in a hospital bed with a woman in a hospital bed']
# predict_step(['doctor.e16ba4e4.jpg'])


In [None]:
def predict_step(image_paths):
  """
  Generate one caption per image with the notebook-level captioning model.

  Relies on the globals `feature_extractor`, `model`, `tokenizer`, `device`
  and `gen_kwargs` defined in the setup cell.

  Args:
      image_paths: iterable of paths to image files that PIL can open.

  Returns:
      list of str, one whitespace-stripped caption per input path
      (an empty list when no paths are given).
  """
  image_paths = list(image_paths)
  if not image_paths:
    # Nothing to caption; avoid handing an empty batch to the image processor.
    return []

  images = []
  for image_path in image_paths:
    # `Image.open` is lazy and keeps the file handle open. The context manager
    # closes it, while `convert` returns a fully loaded RGB copy that remains
    # usable after the file is closed.
    with Image.open(image_path) as i_image:
      images.append(i_image.convert(mode="RGB"))

  pixel_values = feature_extractor(
      images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  outputs = model.generate(pixel_values, **gen_kwargs)
  # Report how many encoder hidden-state tensors the generate call returned.
  print(len(outputs.encoder_hidden_states))

  preds = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
  return [pred.strip() for pred in preds]


# Example call: caption one image from the subj01 test split; the returned
# list of captions is displayed as this cell's output.
predict_step(['../../data.nosync/subj01/test_split/test_images/test-0011_nsd-04735.png'])

In [None]:
import re
import os
import os.path as osp

import cv2
import numpy as np
from torch.utils.data import Dataset


class Algonauts2023Raw(Dataset):
    """
        Load original data for Algonauts2023 dataset

        Each sample is an image read from disk (converted to RGB, float32) paired with
        the fMRI row selected by the index embedded in the image file name. For the
        test split no fMRI is available and 0 is returned in its place.
    """

    # the first run of four digits in a file name is the 1-based sample index
    _INDEX_PATTERN = re.compile(r"\d{4}")

    # fMRI file loaded for each supported hemisphere
    _FMRI_FILES = {"L": "lh_training_fmri.npy", "R": "rh_training_fmri.npy"}

    def __init__(self, data_path: str, hemisphere: str = "L", transform=None, train: bool = True, return_img_ids: bool = False):
        """
            Initialize a torch.utils.data.Dataset object for algonauts2023 dataset

            Args:
                data_path,              str, path to the algonauts2023 dataset which contains only ONE subject
                hemisphere,             str, select which hemisphere of the brain to be modeled
                                            can ONLY select "L" or "R"
                                            and ONLY applicable when train is TRUE
                transform,              callable applied to each image after the RGB conversion
                                            (e.g. torchvision transforms), skipped when falsy
                train,                  bool, training data will be loaded if True. Test data otherwise.
                return_img_ids,         bool, return image ids, only used for feature extraction

            Raises:
                ValueError,             if train is True and hemisphere is neither "L" nor "R",
                                            or if a test file name carries no 4-digit index
        """

        # collect data paths
        path_struct = osp.join(data_path, "{}_split")
        self.dataset = list()
        self.transform = transform
        self.train = train
        self.return_img_ids = return_img_ids

        if train:
            # fail early: without this check self.fmri would silently stay undefined
            # and the first __getitem__ call would die with an AttributeError
            if hemisphere not in self._FMRI_FILES:
                raise ValueError(
                    'hemisphere must be "L" or "R", got {!r}'.format(hemisphere))

            shared_path = osp.join(
                path_struct.format("training"), "training_{}")
            self.fmri = np.load(osp.join(
                shared_path.format("fmri"), self._FMRI_FILES[hemisphere]))
            self.feature_path = shared_path.format("images")

        else:
            self.feature_path = osp.join(
                path_struct.format("test"), "test_images")

        self.dataset = list(os.listdir(self.feature_path))

        # sorted in ascending order if not train set
        if not train:
            self.dataset = sorted(self.dataset, key=self._sample_index)

    @classmethod
    def _sample_index(cls, file_name: str) -> int:
        """
            Parse the zero-based sample index from an image file name

            Arg:
                file_name,      str, image file name containing a 4-digit, 1-based index

            Returns:
                index,          int, the parsed index minus one

            Raises:
                ValueError,     if the name contains no run of four digits
        """

        match = cls._INDEX_PATTERN.search(file_name)
        if match is None:
            raise ValueError(
                "no 4-digit sample index in file name {!r}".format(file_name))
        return int(match.group(0)) - 1

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index: int):
        """
            Load designated sample

            Arg:
                index,          int, sample id

            Returns:
                image,          np.ndarray, the 3d numpy array of the image used to retrieve fmri data
                                    (or whatever self.transform returns for it)
                fmri,           np.ndarray, the hemisphere FMRI data generated by the image,
                                    0 when the test split is loaded
                img_ids,        str, image file name, only returned when return_img_ids is True

            Raises:
                IOError,        if the image file cannot be read
        """

        feat_file = self.dataset[index]
        feat_idx = self._sample_index(feat_file)

        img_path = osp.join(self.feature_path, feat_file)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising
            raise IOError("failed to read image: {}".format(img_path))
        img = img.astype(np.float32)

        # convert BGR to RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform:
            img = self.transform(img)

        # no fMRI exists for the test split; 0 is used as a placeholder
        fmri = self.fmri[feat_idx] if self.train else 0

        if self.return_img_ids:
            return img, fmri, feat_file
        return img, fmri


In [None]:
# Subject whose data this notebook processes.
subj = "subj01"

# Root directory of that subject's data.
path = f"../../data.nosync/{subj}"

# Directory that receives the per-image hidden-state arrays saved by the
# feature-extraction loop.
save = f"../../data.nosync/{subj}/training_split/training_features/vit-gpt2-image-captioning/decoder-raw"


In [None]:
# Training split with the class defaults (train=True, hemisphere="L"); each
# sample also carries its image file name so outputs can be saved per image.
dset = Algonauts2023Raw(path, return_img_ids=True)

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np

model.eval()

# Create the output directory once, up front, instead of re-checking it for
# every single sample inside the loop.
os.makedirs(save, exist_ok=True)

# The batch of file names is called `img_ids` (not `id`) so the builtin `id`
# is not shadowed for the rest of the kernel session.
for img, _, img_ids in tqdm(DataLoader(dset, batch_size=16)):

    pixel_values = feature_extractor(
        images=img, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # `gen_kwargs` asks for hidden states, so the generate result exposes the
    # encoder hidden states as a sequence of tensors.
    # NOTE(review): assumes dim 0 of each tensor indexes the samples of this
    # batch (not batch * num_beams) — confirm against the installed version.
    encoder_states = model.generate(
        pixel_values, **gen_kwargs).encoder_hidden_states

    for i, img_id in enumerate(img_ids):
        # Stack this sample's slice of every hidden-state tensor into one array.
        hs = torch.stack([state[i] for state in encoder_states])
        hs = hs.cpu().numpy().astype(np.float32)

        # File name: the image name up to its first dot, plus ".npy".
        np.save(os.path.join(save, img_id.split(".")[0] + ".npy"), hs)


In [None]:
import os
import numpy as np
from tqdm import tqdm

avg_save = "../../data.nosync/{}/training_split/training_features/vit-gpt2-image-captioning/encoder-last-768".format(
    subj)

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(avg_save, exist_ok=True)

# Only the saved arrays: stray directory entries (e.g. hidden OS files) would
# make np.load fail.
raw_files = [name for name in os.listdir(save) if name.endswith(".npy")]

for x in tqdm(raw_files):
    dat = np.load(os.path.join(save, x))

    # Keep index 0 along axis 1 for every entry on axis 0, flattened to 1-D.
    # NOTE(review): presumably axis 0 is the hidden-state (layer) index and
    # axis 1 the token position, so this keeps position 0 of EVERY layer; the
    # directory name "encoder-last-768" suggests only the last layer may have
    # been intended (that would be dat[-1, 0]) — confirm before relying on it.
    dat = dat[:, 0].reshape(-1)

    # Alternative pooling (disabled): dat = np.mean(dat, axis=1).reshape(-1)
    np.save(os.path.join(avg_save, x), dat)


In [None]:
import os
import numpy as np

# `os.listdir` returns entries in arbitrary order and may include non-array
# files; keep only the saved arrays and sort them so the row order of `feats`
# is reproducible across runs and machines.
files = sorted(name for name in os.listdir(save) if name.endswith(".npy"))

# One flattened row per file, in the same order as `files` (later cells rely on
# that correspondence when writing the reduced features back out).
feats = np.vstack(
    [np.load(os.path.join(save, name)).reshape(-1) for name in files])

In [None]:
from sklearn.decomposition import PCA

# Fixed seed: for large inputs scikit-learn may select a randomized SVD solver,
# which would otherwise make the fitted components differ from run to run.
pca = PCA(n_components=64, random_state=0)

pca.fit(X=feats)

In [None]:
# Project every row of `feats` onto the components fitted above.
reduced_feats = pca.transform(feats)

In [None]:
# Display the shape of the projected features as a quick sanity check.
reduced_feats.shape

In [None]:
pca_save = "../../data.nosync/{}/training_split/training_features/vit-gpt2-image-captioning/encoder-pca-64".format(
    subj)

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(pca_save, exist_ok=True)

# zip() silently stops at the shorter input, so a stale `files` or
# `reduced_feats` left over from another run would drop or mislabel rows.
assert len(reduced_feats) == len(files), \
    "reduced_feats and files are out of sync; re-run the loading and PCA cells"

# Write each reduced row under the same file name as its raw source array.
for row, file_name in zip(reduced_feats, files):
    np.save(os.path.join(pca_save, file_name), row.astype(np.float32))
