# Extracting Features with ViT
- https://huggingface.co/docs/transformers/model_doc/vit#vision-transformer-vit
- https://arxiv.org/abs/2010.11929

In [24]:
import os
import torch
import csv

import pandas as pd
import numpy as np

from PIL import Image
from transformers import ViTImageProcessor, ViTFeatureExtractor, ViTModel
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
BASE = '/Users/brinkley97/Documents/development/'
CLASS_PATH = 'classes/csci_535_multimodal_probabilistic_learning/'
DATASET_PATH = 'datasets/'

In [3]:
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

In [8]:
# faces_in_specific_folder_path
# all_faces_in_specific_folder
def extract_features(faces_in_specific_folder_path, faces_file_names):
    """
    Parameters:
    faces_in_specific_folder_path -- py str (of all faces in a specific folder)
    faces_file_names -- py list (of all the file names in a specific folder)
    
    Return:
    extracted_features -- py
    """
    
    extracted_features_per_video = torch.empty((0, 197, 768))
        
    for faces_file_names_idx in range(len(faces_file_names)):
        path_to_specific_face = faces_in_specific_folder_path + faces_file_names[faces_file_names_idx]
        
        '''
        Start ViT
        '''
        specific_frame = Image.open(path_to_specific_face)
        inputs = processor(images=specific_frame, return_tensors="pt")
        # print(inputs)
        outputs = model(**inputs)
        
        # last_hidden_states == representation (1 and 2 with GradCam)
        last_hidden_states = outputs.last_hidden_state
        extracted_features_per_video = torch.vstack((extracted_features_per_video, last_hidden_states))
        
        
    return extracted_features_per_video

In [9]:
def get_video_path(path_to_faces):
    """
    
    Parameters:
    path_to_faces -- str (of a single path to all saved cropped faces)
    
    Function calls: 
    extract_features
    
    Return
    folder, features -- tuple (of the folder the extracted features are from and the extracted features)
    """
    list_folders_with_faces_name = []
    store_features_from_faces = []
    
    
    count_number_files_in_dir = 0 
    face_folder_files = os.listdir(path_to_faces)
    
    for face_folder_file_idx in range(len(face_folder_files)):
        specific_face_folder = face_folder_files[face_folder_file_idx]
        path_to_faces_in_specific_folder = path_to_faces + specific_face_folder + "/"
        
        folder_exists = os.path.isdir(path_to_faces_in_specific_folder)
        
        if folder_exists == True:
            list_folders_with_faces_name.append(specific_face_folder)
            all_faces_in_specific_folder = os.listdir(path_to_faces_in_specific_folder)
            
            features_extracted_from_specific_face_folder = extract_features(path_to_faces_in_specific_folder, all_faces_in_specific_folder)
            store_features_from_faces.append(features_extracted_from_specific_face_folder)
            
            count_number_files_in_dir += 1
        else:
            pass
        
    return list_folders_with_faces_name, store_features_from_faces

# CREMA-D

In [10]:
CREMA_D_PATH = BASE + CLASS_PATH + DATASET_PATH + 'git_lfs/CREMA-D/sample_VideoFlash/all_faces/'

In [11]:
folder_of_fe_crema_d, fe_crema_d = get_video_path(CREMA_D_PATH)
# fe_crema_d

In [12]:
folder_of_fe_crema_d

['1001_DFA_HAP_XX_frames_to_cropped_face',
 '1001_DFA_NEU_XX_frames_to_cropped_face',
 '1001_DFA_SAD_XX_frames_to_cropped_face',
 '1001_DFA_DIS_XX_frames_to_cropped_face',
 '1001_DFA_ANG_XX_frames_to_cropped_face',
 '1001_DFA_FEA_XX_frames_to_cropped_face']

In [21]:
# the folders of cropped faces to extract features from; #frames in single folder should match #features extracted
# size of extracted features (all should be 196, 768) 
fe_crema_d[5].shape

torch.Size([65, 197, 768])

In [26]:
def create_storage_for_file_and_extracted_features(folder_names, features_extracted):
    '''
    Parameters:
    folder_names -- py list (of file names, so 1 file per face)
    features_extracted -- py list (features from corresponding file)
    
    '''
    
    file_name_with_features_extracted_dict = {}

    for folder_name_idx in range(len(folder_names)):
        speficic_folder = folder_names[folder_name_idx]
        file_name_with_features_extracted_dict[speficic_folder] = features_extracted[folder_name_idx]

    return file_name_with_features_extracted_dict

In [27]:
file_with_fe_crema_d = create_storage_for_file_and_extracted_features(folder_of_fe_crema_d, fe_crema_d)
# file_with_fe_crema_d

In [40]:
def save_dictionary(dictionary_to_save, save_crema_d_dict_path):
    '''
    Parameters:
    dictionary_to_save -- py dic (of folder_names with corresponding features_extracted)
    
    '''
    save_location = save_dict_path + 'crema_d_extracted_features.csv'
    # print(save_location)
    
    write_to_csv = csv.writer(open(save_location, "w"))

    # loop over dictionary keys and values
    for key, val in dictionary_to_save.items():

        # write every key and value to file
        write_to_csv.writerow([key, val])


In [41]:
save_crema_d_dict_path = BASE + CLASS_PATH + DATASET_PATH + 'git_lfs/CREMA-D/sample_VideoFlash/' 
# save_crema_d_dict_path

In [42]:
save_dictionary(file_with_fe_crema_d, save_crema_d_dict_path)

In [None]:
# fe_np = get_video_path(sub_set_r_video_files)

In [None]:
# list_frames_extracted = fe_np[0]
# len(list_frames_extracted[1]), len(list_frames_extracted[2])

In [None]:
# folder_with_frames_name = fe_np[1]
# folder_with_frames_name[0]

In [None]:
# dicts = {}
# # keys = range(4)... folder_with_frames_name
# # values = ["Hi", "I", "am", "John"]... list_frames_extracted
# for i in range(len(folder_with_frames_name)):
#     print(folder_with_frames_name[i])
#     folder_with_frames_name[i] = list_frames_extracted[i]
# print(dicts)

In [None]:
np.shape(fe_crema_d[4]['pixel_values'])

In [None]:
# len(list_frames_extracted[3])

In [None]:
fe_crema_d[0][12].shape

In [None]:
# 132, 197, 768 => 1,2048

In [None]:
folder_names = fe_np[1]

In [None]:
folder_name_with_frame_dict = {}
for folder_name in range(len(folder_names)):
    folder = folder_names[folder_name]
    folder_name_with_frame_dict[folder] = fe_np[0]
    
# folder_name_with_frame_dict

In [None]:
# folder_name_with_frame_dict['MSP-IMPROV-S01A-F02-R-FF01_frames']

In [None]:
# folder_name_with_frame_dict['MSP-IMPROV-S01A-F03-R-FF01_frames']

In [None]:
# MSP_DATASET_PATH = 'msp/videos/'
# MSP_VIDEO_FILES = 'full_r_and_t_mspVideoPaths.csv'
# MSP_video_file_paths = BASE + CLASS_PATH + DATASET_PATH + MSP_DATASET_PATH + MSP_VIDEO_FILES

# r_and_t_video_files = BASE + CLASS_PATH + DATASET_PATH + MSP_DATASET_PATH + 'r_and_t_frames/'
# print(r_and_t_video_files)
# sub_set_r_video_files = r_and_t_video_files + 'sub_set_r_frames/'
# print(sub_set_r_video_files)
# sub_set_t_video_files = r_and_t_video_files + 'sub_set_t_frames/'
# print(sub_set_t_video_files)
# save_r_cropped_frames = BASE + CLASS_PATH + DATASET_PATH + MSP_DATASET_PATH + 'face_r_frames/'
# r_and_t_video_files