# Extracting Features with ViT
- https://huggingface.co/docs/transformers/model_doc/vit#vision-transformer-vit
- https://arxiv.org/abs/2010.11929

In [1]:
import os
import torch
import csv

import pandas as pd
import numpy as np
# import torchvision.transforms as transforms

# from PIL import Image
# from transformers import ViTImageProcessor, ViTFeatureExtractor, ViTModel
# from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Path fragments that are concatenated later to locate the dataset files.
# NOTE(review): BASE is an absolute path on one specific machine; adjust it before running elsewhere.
BASE = '/Users/brinkley97/Documents/development/'
CLASS_PATH = 'classes/csci_535_multimodal_probabilistic_learning/'
DATASET_PATH = 'datasets/'

In [None]:
# Load the image processor and the ViT model from the same pretrained checkpoint.
# NOTE(review): ViTImageProcessor and ViTModel are only named in a commented-out import
# in the import cell; that import has to be active for this cell to run.
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

In [None]:
def load_data(file):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    file -- path or file-like object accepted by pd.read_csv

    Return:
    a copy of the DataFrame parsed from the CSV
    """
    return pd.read_csv(file).copy()

In [None]:
# Full path of the CSV that is loaded into dataset_paths_copy.
file_paths = BASE + CLASS_PATH + DATASET_PATH + 'cremaD_mspR_mspT.csv'

In [None]:
# Load the CSV of dataset paths and display it.
dataset_paths_copy = load_data(file_paths)
dataset_paths_copy

In [None]:
def extract_features(specific_dataset_frames):
    """
    Preprocess every frame image with the module-level ViT `processor`.

    This function only opens and preprocesses the images; it does not run the model.

    Parameters:
    specific_dataset_frames -- iterable of image file paths (e.g. a py list or a pandas Series)

    Return:
    x -- py list with one entry per frame, in input order; each entry is whatever
         `processor(images=..., return_tensors="pt")` returns for that frame

    Note:
    Relies on the names `Image` (presumably PIL.Image) and `processor` being
    defined at module level before this function is called.
    """
    x = []

    # Iterate over the values themselves: indexing through range(len(...)) performs a
    # label-based lookup on a pandas Series and breaks when the index is not 0..n-1.
    for specific_frame_path in specific_dataset_frames:
        # The context manager releases the image's file handle once preprocessing is done.
        with Image.open(specific_frame_path) as specific_frame:
            inputs = processor(images=specific_frame, return_tensors="pt")
        x.append(inputs)

    return x

# CREMA-D

## Sample

In [None]:
# Load the CREMA-D sample CSV through load_data and display it.
sample_crema_d = BASE + CLASS_PATH + DATASET_PATH + 'sampleCremaD.csv'
load_sample_crema_d = load_data(sample_crema_d)
load_sample_crema_d

In [None]:
# Select the "CREMA-D Paths" column of the sample table and display it.
crema_d_paths = load_sample_crema_d["CREMA-D Paths"]
crema_d_paths

In [None]:
# Convert the selected column into a plain Python list before passing it to extract_features.
update_crema_d_paths = list(crema_d_paths)
# update_crema_d_paths

In [None]:
# Run extract_features over the sample paths; the result is the list that function returns.
x_crema_d = extract_features(update_crema_d_paths)

In [None]:
type(x_crema_d)

In [None]:
import pickle

In [None]:
# Pickle x_crema_d to disk.
# NOTE: this rebinds sample_crema_d, which earlier held the sample CSV path, to the pickle path.
# Despite the .txt suffix, the file is written in binary mode and contains pickle data.
sample_crema_d = BASE + CLASS_PATH + DATASET_PATH + 'inputs-sampleCremaD.txt'

with open(sample_crema_d, 'wb') as f:
    pickle.dump(x_crema_d,f)  

In [None]:
# Read the pickle file at sample_crema_d back into memory.
reading_sample_crema_d = pd.read_pickle(sample_crema_d)

In [None]:
# reading_sample_crema_d

In [None]:
# Run each preprocessed frame through the ViT model and collect its last hidden state.
# Inference runs under torch.no_grad(): without it every stored tensor carries
# requires_grad=True and keeps its autograd graph alive, which wastes memory and
# bloats whatever is later serialized.
all_last_hidden_states = []
with torch.no_grad():
    for inputs in reading_sample_crema_d:
        outputs = model(**inputs)
        # last_hidden_states == representation (1 and 2 with GradCam)
        last_hidden_states = outputs.last_hidden_state
        all_last_hidden_states.append(last_hidden_states)

In [None]:
# all_last_hidden_states

In [None]:
# Path of the file that holds the pickled hidden states.
# NOTE(review): the dump below is commented out, so this file must already exist on disk
# for any later read of this path to succeed.
sample_crema_d_hidden_states = BASE + CLASS_PATH + DATASET_PATH + 'hiddenStates-sampleCremaD.txt'

# with open(sample_crema_d_hidden_states, 'wb') as f:
#     pickle.dump(all_last_hidden_states, f)  

In [None]:
# Read the pickle file at sample_crema_d_hidden_states back into memory.
reading_sample_crema_d_hidden_states = pd.read_pickle(sample_crema_d_hidden_states)

In [None]:
# reading_sample_crema_d_hidden_states

In [None]:
# Grow one (frames, 197, 768) tensor by stacking each loaded hidden state onto it,
# and snapshot the running tensor after every step.
# NOTE(review): save_efs therefore holds cumulative prefixes (entry i stacks frames 0..i),
# not one tensor per frame -- confirm this is what the consumer of save_efs expects.
save_efs = []
extracted_features_per_video = torch.empty((0, 197, 768))
for hs in reading_sample_crema_d_hidden_states:
    extracted_features_per_video = torch.vstack((extracted_features_per_video, hs))
    save_efs.append(extracted_features_per_video)

In [None]:
# save_efs

In [4]:
# Path of the file used to store and reload the extracted features.
sample_crema_d_extracted_features = BASE + CLASS_PATH + DATASET_PATH + 'extractedFeatures-sampleCremaD.txt'

In [None]:

# Serialize save_efs with torch.save; the file must be read back with torch.load.
torch.save(save_efs, sample_crema_d_extracted_features)

# Alternative kept for reference: plain pickle instead of torch.save.
# with open(sample_crema_d_extracted_features, 'wb') as f:
#     pickle.dump(save_efs, f)  

In [None]:
# The features file is written with torch.save, whose default on-disk format is not a
# plain pickle stream, so it has to be read with torch.load rather than pd.read_pickle.
reading_sample_crema_d_efs = torch.load(sample_crema_d_extracted_features)

In [5]:
# Load the object stored at sample_crema_d_extracted_features with torch.load.
fe_crema_d = torch.load(sample_crema_d_extracted_features)

In [6]:
fe_crema_d

[tensor([[[ 0.1296,  0.1780, -0.1627,  ..., -0.4221,  0.1744, -0.0066],
          [-0.0081,  0.0847, -0.3951,  ..., -0.3195,  0.1200, -0.0123],
          [-0.0699,  0.1132, -0.4551,  ..., -0.3730,  0.2579,  0.0280],
          ...,
          [-0.0357,  0.0704, -0.1042,  ..., -0.1797,  0.3018,  0.0598],
          [-0.0190,  0.1394, -0.1630,  ..., -0.2395,  0.2587,  0.1013],
          [-0.1109,  0.1001, -0.1008,  ..., -0.3130,  0.2538,  0.0512]]],
        requires_grad=True),
 tensor([[[ 0.1296,  0.1780, -0.1627,  ..., -0.4221,  0.1744, -0.0066],
          [-0.0081,  0.0847, -0.3951,  ..., -0.3195,  0.1200, -0.0123],
          [-0.0699,  0.1132, -0.4551,  ..., -0.3730,  0.2579,  0.0280],
          ...,
          [-0.0357,  0.0704, -0.1042,  ..., -0.1797,  0.3018,  0.0598],
          [-0.0190,  0.1394, -0.1630,  ..., -0.2395,  0.2587,  0.1013],
          [-0.1109,  0.1001, -0.1008,  ..., -0.3130,  0.2538,  0.0512]],
 
         [[ 0.1527,  0.1561, -0.1705,  ..., -0.3990,  0.1612, -0.0234],


## Actual

In [None]:
# Select the "CREMA-D Paths" column of the full dataset table and display it.
# NOTE: this rebinds crema_d_paths, which earlier held the sample column.
crema_d_paths = dataset_paths_copy["CREMA-D Paths"]
crema_d_paths

In [None]:
# extract_features returns a single list (one entry per frame), so unpacking it into
# two names raises ValueError for any input that does not have exactly two frames.
# The column is converted to a list first, matching how the sample run calls it.
x_crema_d = extract_features(list(crema_d_paths))

# MSP

In [None]:
# Path fragment, relative to DATASET_PATH, under which the r_frames/ and t_frames/ folders are looked up.
MSP_DATASET_PATH = 'msp/videos/r_and_t_frames/'

## R

In [None]:
# Full path of the r_frames/ folder.
R_FRAMES_PATH = BASE + CLASS_PATH + DATASET_PATH + MSP_DATASET_PATH + 'r_frames/'
# R_FRAMES_PATH

In [None]:
# NOTE(review): get_video_path_for_feature_extraction is not defined in the visible part
# of this notebook; it must be defined or imported before this cell runs.
path_msp_r = get_video_path_for_feature_extraction(R_FRAMES_PATH)
# path_msp_r

In [None]:
# Wrap path_msp_r in a single-column DataFrame named 'MSP R Paths'.
msp_r_df = pd.DataFrame(path_msp_r, columns=['MSP R Paths'])

## T

In [None]:
# Full path of the t_frames/ folder.
T_FRAMES_PATH = BASE + CLASS_PATH + DATASET_PATH + MSP_DATASET_PATH + 't_frames/'
# T_FRAMES_PATH

In [None]:
# NOTE(review): get_video_path_for_feature_extraction is not defined in the visible part
# of this notebook; it must be defined or imported before this cell runs.
path_msp_t = get_video_path_for_feature_extraction(T_FRAMES_PATH)
# path_msp_t

In [None]:
# Wrap path_msp_t in a single-column DataFrame named 'MSP T Paths'.
msp_t_df = pd.DataFrame(path_msp_t, columns=['MSP T Paths'])