# Extracting Features with ViT
- https://huggingface.co/docs/transformers/model_doc/vit#vision-transformer-vit
- https://arxiv.org/abs/2010.11929
- Only a subset of the features is extracted, because the kernel times out

In [None]:
import os
import torch
import csv

import pandas as pd
import numpy as np
import torchvision.transforms as transforms

from PIL import Image
from transformers import ViTImageProcessor, ViTFeatureExtractor, ViTModel
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# The image processor and the encoder must come from the same checkpoint,
# so the identifier is named once instead of being repeated as a literal.
VIT_CHECKPOINT = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT)
model = ViTModel.from_pretrained(VIT_CHECKPOINT)

In [None]:
# Path fragments that are concatenated (in this order) to reach the dataset
# files; each one ends with a slash so plain string joining works.
BASE = "/Users/brinkley97/Documents/development/"
CLASS_PATH = "classes/csci_535_multimodal_probabilistic_learning/"
DATASET_PATH = "datasets/project/"

In [None]:
def load_data(file):
    """
    Read a CSV into a DataFrame and hand back a copy of it.

    Parameters:
    file -- path or file-like object accepted by pd.read_csv

    Return:
    py DataFrame (a copy of the table that was read)
    """
    return pd.read_csv(file).copy()

In [None]:
# Location of the CSV that is loaded into dataset_paths_copy.
file_paths = ''.join((
    BASE,
    CLASS_PATH,
    DATASET_PATH,
    'crema_d/paths_to_crema_d_emotion_folder_images.csv',
))
# file_paths

In [None]:
# Table read from the CSV at file_paths; its columns are indexed by emotion
# code (e.g. 'dis', 'fea') further down.
dataset_paths_copy = load_data(file=file_paths)
# dataset_paths_copy

In [None]:
def extract_features(specific_dataset_frames):
    """
    Run every frame through the pretrained ViT and collect its hidden states.

    Parameters:
    specific_dataset_frames -- py list (of paths to the frame images to process)

    Return:
    extracted_features_per_video -- py list (one `last_hidden_state` tensor
        per frame, in the same order as the input paths)
    """

    extracted_features_per_video = []

    # Features are only read, never back-propagated through. Without no_grad
    # every stored tensor would keep its whole autograd graph alive, which
    # makes memory grow with each processed frame.
    with torch.no_grad():
        for frame_idx, frame_path in enumerate(specific_dataset_frames):
            # Progress trace: index within this chunk and the file being read.
            print(frame_idx, frame_path)

            # The context manager releases the file handle once the frame has
            # been converted to model inputs.
            with Image.open(frame_path) as frame:
                inputs = processor(images=frame, return_tensors="pt")

            outputs = model(**inputs)

            # last_hidden_states == representation (1 and 2 with GradCam)
            extracted_features_per_video.append(outputs.last_hidden_state)

    return extracted_features_per_video

# CREMA-D

In [None]:
# ang_crema_d = list(dataset_paths_copy['ang'].dropna())
# len(ang_crema_d)

In [None]:
# ang_crema_d[10:21]

In [None]:
# ang_crema_d[20:30]

In [None]:
# ang_vit_features_crema_d_1000 = extract_features(ang_crema_d[0:1000])
# ang_vit_features_crema_d_2000 = extract_features(ang_crema_d[1000:2000])
# ang_vit_features_crema_d_3000 = extract_features(ang_crema_d[2000:3000])
# ang_vit_features_crema_d_4000 = extract_features(ang_crema_d[3000:4000])
# ang_vit_features_crema_d_5000 = extract_features(ang_crema_d[4000:5000])
# ang_vit_features_crema_d_6000 = extract_features(ang_crema_d[5000:6000])
# ang_vit_features_crema_d_7000 = extract_features(ang_crema_d[6000:7000])
# ang_vit_features_crema_d_8000 = extract_features(ang_crema_d[7000:8000])
# ang_vit_features_crema_d_9000 = extract_features(ang_crema_d[8000:9000])
# ang_vit_features_crema_d_10000 = extract_features(ang_crema_d[9000:10000])
# ang_vit_features_crema_d_11000 = extract_features(ang_crema_d[10000:11000])

# need to run


In [None]:
# ang_vit_features_crema_d

In [None]:
# save_extracted_features_crema_d = BASE + CLASS_PATH + DATASET_PATH + 'crema_d/extracted_features_crema_d/ang_12000_extracted_features_crema_d.pt'
# torch.save(ang_vit_features_crema_d_12000, save_extracted_features_crema_d)

In [None]:
# torch.load(save_extracted_features_crema_d)

In [None]:
# Entries of the 'dis' column with missing (NaN) cells dropped.
dis_crema_d = [*dataset_paths_copy['dis'].dropna()]
# len(dis_crema_d)

In [None]:
# dis_vit_features_crema_d_1000 = extract_features(dis_crema_d[0:1000])
# dis_vit_features_crema_d_2000 = extract_features(dis_crema_d[1000:2000])
# dis_vit_features_crema_d_3000 = extract_features(dis_crema_d[2000:3000])
# dis_vit_features_crema_d_4000 = extract_features(dis_crema_d[3000:4000])
# dis_vit_features_crema_d_5000 = extract_features(dis_crema_d[4000:5000])
# dis_vit_features_crema_d_6000 = extract_features(dis_crema_d[5000:6000])
# dis_vit_features_crema_d_7000 = extract_features(dis_crema_d[6000:7000])
# dis_vit_features_crema_d_8000 = extract_features(dis_crema_d[7000:8000])
# dis_vit_features_crema_d_9000 = extract_features(dis_crema_d[8000:9000])

# need to run
# dis_vit_features_crema_d_10000 = extract_features(dis_crema_d[9000:10000])
# dis_vit_features_crema_d_11000 = extract_features(dis_crema_d[10000:11000])


In [None]:
# save_extracted_features_crema_d = BASE + CLASS_PATH + DATASET_PATH + 'crema_d/extracted_features_crema_d/dis/dis_9000_extracted_features_crema_d.pt'
# torch.save(dis_vit_features_crema_d_9000, save_extracted_features_crema_d)

In [None]:
# Entries of the 'fea' column with missing (NaN) cells dropped.
fea_crema_d = [*dataset_paths_copy['fea'].dropna()]
# Cell output: how many entries remain.
len(fea_crema_d)

In [None]:
# fea_vit_features_crema_d_1000 = extract_features(fea_crema_d[0:1000])
# fea_vit_features_crema_d_2000 = extract_features(fea_crema_d[1000:2000])
# fea_vit_features_crema_d_3000 = extract_features(fea_crema_d[2000:3000])
# fea_vit_features_crema_d_4000 = extract_features(fea_crema_d[3000:4000])
# One 1000-entry chunk (indices 4000-4999) of the 'fea' paths.
fea_vit_features_crema_d_5000 = extract_features(
    specific_dataset_frames=fea_crema_d[4000:5000],
)

# need to run
# fea_vit_features_crema_d_6000 = extract_features(fea_crema_d[5000:6000])
# fea_vit_features_crema_d_7000 = extract_features(fea_crema_d[6000:7000])
# fea_vit_features_crema_d_8000 = extract_features(fea_crema_d[7000:8000])
# fea_vit_features_crema_d_9000 = extract_features(fea_crema_d[8000:9000])
# fea_vit_features_crema_d_10000 = extract_features(fea_crema_d[9000:10000])
# fea_vit_features_crema_d_11000 = extract_features(fea_crema_d[10000:11000])


In [None]:
# Destination file for the features held in fea_vit_features_crema_d_5000.
save_extracted_features_crema_d = BASE + CLASS_PATH + DATASET_PATH + 'crema_d/extracted_features_crema_d/fea/fea_5000_extracted_features_crema_d.pt'
# torch.save does not create missing folders; make sure the target directory
# exists so a finished extraction run is not followed by a path error.
os.makedirs(os.path.dirname(save_extracted_features_crema_d), exist_ok=True)
torch.save(fea_vit_features_crema_d_5000, save_extracted_features_crema_d)