In [6]:
import numpy as np
import pickle as pkl
import json
import torch
from torch.utils.data import Dataset
import librosa
import os
import sys
import cv2
import shutil
from scipy.signal.windows import gaussian
from matplotlib import pyplot as plt
import csv
from datetime import datetime
import soundfile as sf
# import utility functions
sys.path.insert(0, '/Users/evanpan/Documents/GitHub/EvansToolBox/Utils')
sys.path.insert(0, '/Users/evanpan/Desktop/openpose/python/')
sys.path.insert(0, '/scratch/ondemand27/evanpan/EvansToolBox/Utils/')
sys.path.insert(0, '/scratch/ondemand27/evanpan/Gaze_project/')
from Signal_processing_utils import dx_dt
from Geometry_Util import rotation_angles_frome_positions

# Test per-shot datasets

In [None]:
# Root folder of the per-shot processed dataset consumed by the dataset classes below.
# NOTE(review): machine-specific absolute path under /Volumes; the usage cell further down
# fails with FileNotFoundError when this location is not available.
processed_dataset = "/Volumes/EVAN_DISK/MASC/shot_processed_dataset/"

In [10]:
class ShotDataSet_Selftape111(Dataset):
    """Per-video dataset: one item is the full audio pair plus annotations of a video.

    `processed_data_path` must contain `metadata.json` (a JSON object whose
    "data" entry is a list of records with at least "name" and "fps") and the
    sub-folders `audio/`, `gaze/`, `head/` and `blinks/`.
    """

    def __init__(self, processed_data_path):
        """Read the video list from `<processed_data_path>/metadata.json`.

        Args:
            processed_data_path: root folder of the processed dataset.

        Raises:
            FileNotFoundError: if `metadata.json` does not exist.
        """
        # save dataset root path
        self.data_root_path = processed_data_path

        # load video names
        video_names_path = os.path.join(processed_data_path, "metadata.json")
        with open(video_names_path, mode='r') as f:
            self.video_metadata = json.load(f)["data"]

    @staticmethod
    def _load_pickle(path):
        """Unpickle `path` and close the file handle afterwards."""
        # NOTE(review): pickle executes arbitrary code on load; only use trusted dataset files.
        with open(path, "rb") as f:
            return pkl.load(f)

    def __len__(self):
        """Number of videos listed in the metadata."""
        return len(self.video_metadata)

    def __getitem__(self, idx):
        """Load the audio and annotations of video number `idx`.

        Returns:
            `[sr, audio_onscreen, audio_offscreen], [fps, gaze, head, blinks]`
            where the audio comes from `audio/<name>_0.wav` (on-screen) and
            `audio/<name>_1.wav` (off-screen) and `sr` is the rate reported by
            `librosa.load` for the off-screen file.
        """
        record = self.video_metadata[idx]
        file_name = record["name"]
        fps = record["fps"]
        output_audio_onscreen_path = os.path.join(self.data_root_path, "audio", file_name+"_{}.wav".format(0))
        output_audio_offscreen_path = os.path.join(self.data_root_path, "audio", file_name+"_{}.wav".format(1))

        # annotations are stored as one pickle per video and per modality
        gaze = self._load_pickle(os.path.join(self.data_root_path, "gaze", file_name+".pkl"))
        head = self._load_pickle(os.path.join(self.data_root_path, "head", file_name+".pkl"))
        blinks = self._load_pickle(os.path.join(self.data_root_path, "blinks", file_name+".pkl"))

        audio_onscreen, sr = librosa.load(output_audio_onscreen_path)
        audio_offscreen, sr = librosa.load(output_audio_offscreen_path)
        return [sr, audio_onscreen, audio_offscreen], [fps, gaze, head, blinks]

In [11]:
class SegmentDataset_SelfTape111(Dataset):
    """Fixed-length, overlapping clips cut out of every video of the dataset.

    Each video is covered by windows of `win_length` seconds that advance by
    `stride_length` seconds, plus one extra window aligned with the end of the
    video. Window bounds are stored both in annotation frames ("video_range",
    derived from the video's fps) and in audio samples ("audio_range", derived
    from the video's "sr" metadata entry).
    """

    def __init__(self, processed_data_path, win_length=10, stride_length=5):
        """Index all clips described by `<processed_data_path>/metadata.json`.

        Args:
            processed_data_path: root folder of the processed dataset.
            win_length: clip length in seconds.
            stride_length: hop between consecutive clips in seconds.

        Raises:
            FileNotFoundError: if `metadata.json` does not exist.
        """
        # save dataset root path
        self.data_root_path = processed_data_path
        # load video names
        video_names_path = os.path.join(processed_data_path, "metadata.json")
        with open(video_names_path, mode='r') as f:
            self.video_metadata = json.load(f)["data"]

        clip_metadata = []
        for metadata in self.video_metadata:
            fps = metadata["fps"]  # this depends on the video
            sr = metadata["sr"]  # expected to be the same for every video
            video_length = metadata["annotation_length"]
            audio_length = metadata["audio_length"]
            # window and stride sizes in annotation frames (fps) and audio samples (sr)
            win_size_audio_per_segment = win_length * sr
            win_size_video_per_segment = int(np.round(win_length * fps))
            stride_length_audio_per_segment = stride_length * sr
            stride_length_video_per_segment = int(np.round(stride_length * fps))

            # number of complete windows that fit; can be <= 0 for short videos,
            # in which case the loop below simply does not run
            window_count = np.floor((video_length - (win_size_video_per_segment - stride_length_video_per_segment)) / stride_length_video_per_segment)
            for w in range(0, int(window_count)):
                video_window_start = stride_length_video_per_segment * w
                audio_window_start = stride_length_audio_per_segment * w
                clip_metadata.append({"video_range": [video_window_start, video_window_start + win_size_video_per_segment],
                                      "audio_range": [audio_window_start, audio_window_start + win_size_audio_per_segment],
                                      "fps": fps,
                                      "sr": sr,
                                      "file_name": metadata["name"]})

            # Final window aligned with the end of the video. The start is clamped at 0:
            # a negative start would be read as "count from the end" when slicing and
            # silently drop the beginning of a video shorter than one window.
            clip_metadata.append({"video_range": [max(0, video_length - win_size_video_per_segment), video_length],
                                  "audio_range": [max(0, audio_length - win_size_audio_per_segment), audio_length],
                                  "fps": fps,
                                  "sr": sr,
                                  "file_name": metadata["name"]})
        self.clip_metadata = clip_metadata
        # kept as an attribute because __len__ (and possibly external code) reads it
        self.count = len(clip_metadata)

    @staticmethod
    def _load_pickle(path):
        """Unpickle `path` and close the file handle afterwards."""
        # NOTE(review): pickle executes arbitrary code on load; only use trusted dataset files.
        with open(path, "rb") as f:
            return pkl.load(f)

    def __len__(self):
        """Total number of clips over all videos."""
        return self.count

    def __getitem__(self, idx):
        """Load clip number `idx`.

        Returns:
            `[sr, audio_onscreen, audio_offscreen], [fps, gaze, head, blinks]`,
            every sequence sliced to the clip's window.
        """
        clip = self.clip_metadata[idx]
        file_name = clip["file_name"]
        fps = clip["fps"]
        v_start, v_end = clip["video_range"]
        a_start, a_end = clip["audio_range"]
        output_audio_onscreen_path = os.path.join(self.data_root_path, "audio", file_name+"_{}.wav".format(0))
        output_audio_offscreen_path = os.path.join(self.data_root_path, "audio", file_name+"_{}.wav".format(1))

        gaze = self._load_pickle(os.path.join(self.data_root_path, "gaze", file_name+".pkl"))[v_start:v_end]
        head = self._load_pickle(os.path.join(self.data_root_path, "head", file_name+".pkl"))[v_start:v_end]
        blinks = self._load_pickle(os.path.join(self.data_root_path, "blinks", file_name+".pkl"))[v_start:v_end]

        # "audio_range" is expressed in samples at the metadata sampling rate, so the
        # audio is loaded at that rate instead of librosa's default resampling target.
        audio_onscreen, sr = librosa.load(output_audio_onscreen_path, sr=clip["sr"])
        audio_offscreen, sr = librosa.load(output_audio_offscreen_path, sr=clip["sr"])
        audio_onscreen = audio_onscreen[a_start:a_end]
        audio_offscreen = audio_offscreen[a_start:a_end]
        return [sr, audio_onscreen, audio_offscreen], [fps, gaze, head, blinks]

In [12]:
# usage
# Build the windowed dataset from `processed_dataset` and unpack its first clip.
# The constructor reads `<processed_dataset>/metadata.json`, so this cell raises
# FileNotFoundError when that folder is not reachable from the current machine.
k = SegmentDataset_SelfTape111(processed_dataset)
[sr, audio_onscreen, audio_offscreen], [fps, gaze, head, blinks] = k[0]

FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/EVAN_DISK/MASC/shot_processed_dataset/metadata.json'

In [13]:
# Dataset for deep learning
class Aversion_SelfTap111(Dataset):
    def __init__(self, processed_data_path, videos_included=None):
        self.filler = np.array([-36.04365338911715,0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715,-36.04365338911715])
        # save dataset root path
        self.data_root_path = processed_data_path
        # load video names
        video_names_path = os.path.join(*[self.data_root_path, "video_to_window_metadata.json"])
        self.metadata = json.load(open(video_names_path, "r"))
        self.all_files_in_set = []
        if videos_included is None:
            videos_included = list(self.metadata.keys())
        for i in videos_included:
            self.all_files_in_set = self.all_files_in_set + self.metadata[i]

    def __len__(self):
        return len(self.all_files_in_set)
    def __getitem__(self, idx):
        onscreen_audio_feature_path = os.path.join(*[self.data_root_path, "audio", "clip_{}_speaker_{}.npy".format(idx, 0)])
        offscreen_audio_feature_path = os.path.join(*[self.data_root_path, "audio", "clip_{}_speaker_{}.npy".format(idx, 1)])
        onscreen_text_feature_path = os.path.join(*[self.data_root_path, "text", "clip_{}_speaker_{}.npy".format(idx, 0)])
        offscreen_text_feature_path = os.path.join(*[self.data_root_path, "text", "clip_{}_speaker_{}.npy".format(idx, 1)])
        aversion_label_path = os.path.join(*[self.data_root_path, "aversion_label", "clip_{}.npy".format(idx)])
        # see if we need to concat any thing
        input_audio_on_screen = np.load(onscreen_audio_feature_path)
        input_audio_off_screen = np.load(offscreen_audio_feature_path)
        input_text_on_screen = np.load(onscreen_text_feature_path)
        input_text_off_screen = np.load(offscreen_text_feature_path)
        # output_target
        output_target = np.load(aversion_label_path)
        print(input_text_on_screen.shape)
        if input_audio_on_screen.shape[0] < input_text_on_screen.shape[0]:
            missing_frames = input_text_on_screen.shape[0] - input_audio_on_screen.shape[0]
            padding = np.tile(np.expand_dims(self.filler, axis=0), [missing_frames, 1])
            input_audio_on_screen = np.concatenate([input_audio_on_screen, padding], axis=0)
            input_audio_off_screen = np.concatenate([input_audio_off_screen, padding], axis=0)

        input_vector_onscreen = np.concatenate([input_audio_on_screen, input_text_on_screen], axis=1)
        input_vector_offscreen = np.concatenate([input_audio_off_screen, input_text_off_screen], axis=1)
        # input_vector = np.concatenate([input_vector_onscreen, input_vector_offscreen], axis=1)
        return input_vector_onscreen, input_vector_offscreen, output_target

# Test runtime parsing of pre-processed datasets

In [14]:
# Smoke test: build the clip-level dataset over every video listed in the metadata
# file and read each item once.
# NOTE(review): cluster-specific absolute path; in the recorded run the loop stopped with
# FileNotFoundError on audio/clip_4168_speaker_0.npy.
processed_data_path = "/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset/"
meta_data = os.path.join(*[processed_data_path, "video_to_window_metadata.json"])
# `meta_data` is rebound from the file path to the parsed JSON content
meta_data = json.load(open(meta_data))
video_include = list(meta_data.keys())
k = Aversion_SelfTap111(processed_data_path, video_include)
for i in range(0, len(k)):
    
    # X, Y, Z = on-screen inputs, off-screen inputs, aversion labels (see __getitem__)
    X, Y, Z = k[i]


(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)
(250, 772)

FileNotFoundError: [Errno 2] No such file or directory: '/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset/audio/clip_4168_speaker_0.npy'

In [20]:
import random
class Runtime_parsing_Aversion_SelfTape111(Dataset):
    """Sliding-window dataset over whole-video feature sequences kept in memory.

    Every selected video is loaded once (input features, aversion labels and a
    smoothed derivative of the labels). `parse_dataset` cuts each video into
    half-overlapping windows of `window_length` frames that start at a fresh
    random offset, and `__getitem__` serves one window, padded to full length.
    Passing `prev_dataset` reuses the arrays of an existing instance and only
    re-draws the windows (e.g. once per epoch).
    """

    def __init__(self, processed_data_path, videos_included=None, prev_dataset=None, pos_labels=True, long_aversion_only=False, shuffle=True, window_length=250, with_gaze=False, normalize_MFCC=False, apply_frequency_mask=False, apply_time_mask=False):
        """Load the data from disk, or share the data of `prev_dataset`.

        Args:
            processed_data_path: root folder of the processed dataset (ignored
                when `prev_dataset` is given).
            videos_included: video names to load; `None` loads every key of
                `video_to_window_metadata.json`.
            prev_dataset: existing instance whose loaded arrays and settings are
                reused; only `window_length` is taken from the arguments.
            pos_labels: append the word part-of-speech features to the inputs.
            long_aversion_only: read labels from `long_aversion_label/` instead
                of `aversion_label/`.
            shuffle: stored on the instance; not read by this class.
            window_length: number of frames per item.
            with_gaze: also load and return gaze and interlocutor direction.
            normalize_MFCC: standardise the MFCC features per video.
            apply_frequency_mask: mask random MFCC channels in `__getitem__`.
            apply_time_mask: mask random frames of the MFCCs in `__getitem__`.
        """
        # Same effect as the deprecated torch.set_default_tensor_type(torch.DoubleTensor).
        torch.set_default_dtype(torch.float64)
        if prev_dataset is None:
            self.data_root_path = processed_data_path
            self.shuffle = shuffle
            self.pos_labels = pos_labels
            self.window_length = window_length
            self.with_gaze = with_gaze
            self.long_aversion_only = long_aversion_only
            video_names_path = os.path.join(self.data_root_path, "video_to_window_metadata.json")
            with open(video_names_path, "r") as f:
                self.metadata = json.load(f)
            if videos_included is None:
                videos_included = list(self.metadata.keys())
            self.all_files_in_set = videos_included
            self.gaussian_window = gaussian(5, 1)
            self.normalize_MFCC = normalize_MFCC
            self.apply_frequency_mask = apply_frequency_mask
            self.apply_time_mask = apply_time_mask
            # load all input features and aversion labels to memory
            self.input_features = []
            self.aversion_labels = []
            self.velocity_labels = []
            self.gaze_labels = []
            self.interlocutor_positions = []
            self.load_IO_features_to_memory()
            # map from dataset index to [video index, [window start, window end]]
            self.map = {}
            self.dataset_length = 0
            self.parse_dataset()
            # filler row for one speaker's 26 audio columns, used when padding short windows
            self.filler = np.array([-36.04365338911715,0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0, 0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0, 0.0])
            # full padding row: [audio filler, zeros for the remaining features] for each speaker
            self.filler_back = np.concatenate([self.filler, np.zeros(6), self.filler, np.zeros(6)])
            if self.pos_labels:
                self.filler_back = np.concatenate([self.filler, np.zeros(20), self.filler, np.zeros(20)])
        else:
            self.data_root_path = prev_dataset.data_root_path
            self.shuffle = prev_dataset.shuffle
            self.pos_labels = prev_dataset.pos_labels
            # the window length is the one setting that is NOT inherited
            self.window_length = window_length
            self.long_aversion_only = prev_dataset.long_aversion_only
            self.all_files_in_set = prev_dataset.all_files_in_set
            self.gaussian_window = prev_dataset.gaussian_window
            self.input_features = prev_dataset.input_features
            self.aversion_labels = prev_dataset.aversion_labels
            self.velocity_labels = prev_dataset.velocity_labels
            # needed by __getitem__ when with_gaze is set
            self.gaze_labels = prev_dataset.gaze_labels
            self.interlocutor_positions = prev_dataset.interlocutor_positions
            self.with_gaze = prev_dataset.with_gaze
            self.map = prev_dataset.map
            self.dataset_length = prev_dataset.dataset_length
            self.filler = prev_dataset.filler
            self.filler_back = prev_dataset.filler_back
            self.normalize_MFCC = prev_dataset.normalize_MFCC
            self.apply_time_mask = prev_dataset.apply_time_mask
            self.apply_frequency_mask = prev_dataset.apply_frequency_mask
            self.parse_dataset()

    @staticmethod
    def _load_pickle(path):
        """Unpickle `path` and close the file handle afterwards."""
        # NOTE(review): pickle executes arbitrary code on load; only use trusted dataset files.
        with open(path, "rb") as f:
            return pkl.load(f)

    def __len__(self):
        """Number of windows produced by the last `parse_dataset` call."""
        return self.dataset_length

    def parse_dataset(self):
        """Rebuild `self.map` with freshly randomised windows for every video.

        Videos with 25 frames or fewer are skipped. Every other video gets
        half-overlapping windows starting at a random offset, plus one final
        window that ends on the last frame of the video.
        """
        self.map = {}
        self.dataset_length = 0
        counter = 0
        stride_length_video_per_segment = int(np.round(self.window_length/2))
        for i in range(len(self.input_features)):
            total_frames = self.input_features[i].shape[0]
            # random start so that different passes cut the video differently
            random_offset = np.random.randint(0, max(1, self.window_length // 2))
            # frames left when starting at the random offset
            video_length = total_frames - random_offset
            window_count = np.floor((video_length - (self.window_length - stride_length_video_per_segment)) / stride_length_video_per_segment)
            if total_frames <= 25:
                continue
            if video_length <= 0:
                continue
            # add all the windows except the last window
            for w in range(0, int(window_count)):
                video_window_start = stride_length_video_per_segment * w + random_offset
                self.map[counter] = [i, [video_window_start, video_window_start + self.window_length]]
                counter = counter + 1
            # Last window: aligned with the true end of the video. Using the offset-reduced
            # length here would leave the final `random_offset` frames out of every window.
            self.map[counter] = [i, [max(0, total_frames - self.window_length), total_frames]]
            counter += 1
        self.dataset_length = counter

    def time_mask(self, spec, T=30, num_masks=1, replace_with_zero=False):
        """Return a copy of `spec` with random runs of rows (axis 0) masked.

        Each mask covers fewer than `T` consecutive rows (possibly none) and is
        filled with 0 or with the mean of the copy. Returns early when a mask
        of width 0 is drawn.
        """
        cloned = spec.clone()
        len_spectro = cloned.shape[0]
        for i in range(0, num_masks):
            # upper bound on the mask width (exclusive)
            t = random.randrange(0, T)
            t_zero = random.randrange(0, len_spectro - t)
            # avoids randrange error if values are equal and range is empty
            if (t_zero == t_zero + t): return cloned
            mask_end = random.randrange(t_zero, t_zero + t)
            if (replace_with_zero): cloned[t_zero:mask_end] = 0
            else: cloned[t_zero:mask_end] = cloned.mean()
        return cloned

    def freq_mask(self, spec, F=5, num_masks=1, replace_with_zero=False):
        """Return a copy of `spec` with random runs of columns (axis 1) masked.

        Each mask covers fewer than `F` consecutive columns (possibly none) and
        is filled with 0 or with the mean of the copy. Returns early when a
        mask of width 0 is drawn.
        """
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]
        for i in range(0, num_masks):
            f = random.randrange(0, F)
            f_zero = random.randrange(0, num_mel_channels - f)
            # avoids randrange error if values are equal and range is empty
            if (f_zero == f_zero + f): return cloned

            mask_end = random.randrange(f_zero, f_zero + f)
            if (replace_with_zero): cloned[:, f_zero:mask_end] = 0
            else: cloned[:, f_zero:mask_end] = cloned.mean()
        return cloned

    def load_IO_features_to_memory(self):
        """Read features and labels of every selected video into the instance lists.

        Videos whose on-screen MFCC has 50 frames or fewer are skipped entirely,
        so all per-video lists stay index-aligned.
        """
        label_dir = "long_aversion_label" if self.long_aversion_only else "aversion_label"
        for file_name in self.all_files_in_set:
            # get the aversion labels from the disk
            output_aversion_label_path = os.path.join(self.data_root_path, label_dir, file_name+".pkl")
            gaze_label = None
            interlocutor_position = None
            if self.with_gaze:
                gaze_label = self._load_pickle(os.path.join(self.data_root_path, "gaze", file_name+".pkl"))
                # NOTE(review): this path ignores self.data_root_path and the folder name
                # "tinterlocutor_direction" looks like a typo; both kept verbatim, confirm.
                interlocutor_position_path = os.path.join("/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset/", "tinterlocutor_direction", file_name+".pkl")
                interlocutor_position = self._load_pickle(interlocutor_position_path)
            output_aversion_label = self._load_pickle(output_aversion_label_path)

            # load the input features from the disk (suffix _0: on-screen, _1: off-screen)
            on_screen_sentence_timing = self._load_pickle(os.path.join(self.data_root_path, "sentence_timing", file_name+"_0.pkl"))
            off_screen_sentence_timing = self._load_pickle(os.path.join(self.data_root_path, "sentence_timing", file_name+"_1.pkl"))
            on_screen_mfcc = self._load_pickle(os.path.join(self.data_root_path, "audio", file_name+"_0.pkl"))
            off_screen_mfcc = self._load_pickle(os.path.join(self.data_root_path, "audio", file_name+"_1.pkl"))
            if self.normalize_MFCC:
                # NOTE(review): for numpy arrays `+` is an elementwise sum, so these are the
                # statistics of (on + off), not of both streams pooled together; the result
                # is then not zero-mean / unit-std per stream. Left unchanged; confirm intent.
                mean = np.mean(on_screen_mfcc + off_screen_mfcc, axis=0)
                std = np.std(on_screen_mfcc + off_screen_mfcc, axis=0)
                std = np.where(std <= 1E-8, 1, std)
                on_screen_mfcc = (on_screen_mfcc - mean) / std
                off_screen_mfcc = (off_screen_mfcc - mean) / std
            if self.pos_labels:
                on_screen_pos = self._load_pickle(os.path.join(self.data_root_path, "word_POS", file_name+"_0.pkl"))
                off_screen_pos = self._load_pickle(os.path.join(self.data_root_path, "word_POS", file_name+"_1.pkl"))
            if on_screen_mfcc.shape[0] <= 50:
                continue
            # get input features
            input_features_on_screen = np.concatenate([on_screen_mfcc, on_screen_sentence_timing], axis=1)
            input_features_off_screen = np.concatenate([off_screen_mfcc, off_screen_sentence_timing], axis=1)
            if self.pos_labels: # the last 14 features are the POS tags
                input_features_on_screen = np.concatenate([input_features_on_screen, on_screen_pos], axis=1)
                input_features_off_screen = np.concatenate([input_features_off_screen, off_screen_pos], axis=1)
            input_feature = np.concatenate([input_features_on_screen, input_features_off_screen], axis=1)
            # label derivative, smoothed with the gaussian window
            vel_output_target = dx_dt(output_aversion_label)
            vel_output_target = np.correlate(vel_output_target, self.gaussian_window, mode="same")
            self.input_features.append(input_feature)
            self.aversion_labels.append(output_aversion_label)
            self.velocity_labels.append(vel_output_target)
            # appended only for videos that were kept, so indices match input_features
            if self.with_gaze:
                self.gaze_labels.append(gaze_label)
                self.interlocutor_positions.append(interlocutor_position)

    def _apply_masks(self, input_feature):
        """Apply the enabled time/frequency masks to both speakers' audio columns.

        Works on a copy: `torch.from_numpy(...).double()` shares memory with the
        stored numpy features when they are already float64, so masking in
        place would permanently alter the dataset.
        """
        if not (self.apply_time_mask or self.apply_frequency_mask):
            return input_feature
        input_feature = input_feature.clone()
        audio_width = self.filler.shape[0]
        # the second speaker's block starts halfway (column 46 with POS labels, 32 without)
        second_speaker_start = input_feature.shape[1] // 2
        for first_col in (0, second_speaker_start):
            last_col = first_col + audio_width
            audio_block = input_feature[:, first_col:last_col]
            if self.apply_time_mask:
                audio_block = self.time_mask(audio_block)
            if self.apply_frequency_mask:
                audio_block = self.freq_mask(audio_block)
            input_feature[:, first_col:last_col] = audio_block
        return input_feature

    def __getitem__(self, idx):
        """Return window `idx` as tensors.

        Returns:
            `input_feature, [aversion_label, velocity_label]`, or, when
            `with_gaze` is set, `input_feature, [aversion_label, gaze,
            interlocutor_position, velocity_label]`. Inputs and the two label
            sequences are padded to `window_length` frames; `gaze` is the raw,
            unpadded slice and `interlocutor_position` the whole per-video entry.
        """
        video_index, (window_start, window_end) = self.map[idx]
        input_feature = self.input_features[video_index][window_start:window_end]
        aversion_label = self.aversion_labels[video_index][window_start:window_end]
        velocity_label = self.velocity_labels[video_index][window_start:window_end]
        # pad short windows up to window_length frames
        if input_feature.shape[0] < self.window_length:
            missing_frames = self.window_length - input_feature.shape[0]
            padding = np.tile(np.expand_dims(self.filler_back, axis=0), [missing_frames, 1])
            input_feature = np.concatenate([input_feature, padding], axis=0)
            # hold the last label value and use zero velocity over the padded frames
            final_aversion_frame = aversion_label[-1]
            repeated_final_aversion_frame = np.tile(np.expand_dims(final_aversion_frame, axis=0), [missing_frames])
            aversion_label = np.concatenate([aversion_label, repeated_final_aversion_frame], axis=0)
            velocity_label = np.concatenate([velocity_label, np.zeros(missing_frames)], axis=0)

        input_feature = self._apply_masks(torch.from_numpy(input_feature).double())
        aversion_label = torch.from_numpy(aversion_label).double()
        velocity_label = torch.from_numpy(velocity_label).double()
        if self.with_gaze:
            gaze = self.gaze_labels[video_index][window_start:window_end]
            return input_feature, [aversion_label, gaze, self.interlocutor_positions[video_index], velocity_label]
        return input_feature, [aversion_label, velocity_label]
  

In [None]:
import random
class Runtime_parsing_Aversion_SelfTape111_validation_leak(Dataset):
    def __init__(self, processed_data_path, videos_included=None, prev_dataset=None, pos_labels=True, long_aversion_only=False, shuffle=True, window_length=250, with_gaze=False, normalize_MFCC=False, apply_frequency_mask=False, apply_time_mask=False, percent_leaked=0.10):
        torch.set_default_tensor_type(torch.DoubleTensor)
        if prev_dataset is None:        
            self.data_root_path = processed_data_path
            self.shuffle = shuffle
            self.pos_labels = pos_labels
            self.percent_leaked = percent_leaked
            self.window_length = window_length
            self.with_gaze = with_gaze
            self.long_aversion_only = long_aversion_only
            video_names_path = os.path.join(*[self.data_root_path, "video_to_window_metadata.json"])
            self.metadata = json.load(open(video_names_path, "r"))
            self.all_files_in_set = []
            if videos_included is None:
                videos_included = list(self.metadata.keys())
            self.all_files_val_and_trian = list(self.metadata.keys())
            self.train_set = []
            self.all_files_in_set = videos_included
            self.gaussian_window = gaussian(5, 1)
            self.normalize_MFCC = normalize_MFCC
            self.apply_frequency_mask = apply_frequency_mask
            self.apply_time_mask = apply_time_mask
            # load all input features and aversionl labels to memory
            self.input_features = []
            self.aversion_labels = []
            self.velocity_labels = []
            self.gaze_labels = []
            self.interlocutor_positions = []
            self.load_IO_features_to_memory()
            # generate a map to map the index of the dataset to the video
            self.map = {}
            self.dataset_length = 0
            self.parse_dataset()
            # generate filler for input features:
            self.filler = np.array([-36.04365338911715,0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0, 0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0, 0.0])
            self.filler_back = np.concatenate([self.filler, np.zeros(6), self.filler, np.zeros(6)])
            if self.pos_labels:
                self.filler_back = np.concatenate([self.filler, np.zeros(20), self.filler, np.zeros(20)])
        else:
            self.data_root_path = prev_dataset.data_root_path
            self.shuffle = prev_dataset.shuffle
            self.pos_labels = prev_dataset.pos_labels
            self.window_length = prev_dataset.window_length
            self.window_length = window_length
            self.long_aversion_only = prev_dataset.long_aversion_only
            self.all_files_in_set = prev_dataset.all_files_in_set
            self.gaussian_window = prev_dataset.gaussian_window
            self.input_features = prev_dataset.input_features
            self.aversion_labels = prev_dataset.aversion_labels
            self.velocity_labels = prev_dataset.velocity_labels
            self.with_gaze = prev_dataset.with_gaze
            self.map = prev_dataset.map
            self.dataset_length = prev_dataset.dataset_length
            self.filler = prev_dataset.filler
            self.filler_back = prev_dataset.filler_back
            self.normalize_MFCC = prev_dataset.normalize_MFCC
            self.apply_time_mask = prev_dataset.apply_time_mask
            self.apply_frequency_mask = prev_dataset.apply_frequency_mask
            self.all_files_val_and_trian = prev_dataset.all_files_val_and_trian
            self.train_set = prev_dataset.train_set
            self.percent_leaked = prev_dataset.percent_leaked
            self.parse_dataset()
    def __len__(self):
        return self.dataset_length
    def parse_dataset(self):
        self.map = {}
        self.dataset_length = 0
        counter = 0
        for i in range(len(self.input_features)):
            # for randomly cutting the video
            random_offset = np.random.randint(0, self.window_length/2)
            # code starts here
            video_length = self.input_features[i].shape[0] - random_offset # if we start going through the video from the random offset, we will have this many frames left
            stride_length_video_per_segment = int(np.round(self.window_length/2))
            window_count = np.floor((video_length - (self.window_length - stride_length_video_per_segment)) / stride_length_video_per_segment)
            if self.input_features[i].shape[0] <= 25:
                continue
            if video_length <= 0:
                continue
            # add all the windows except the last window
            if i in self.train_set:
                for w in range(0, int(window_count)):
                    # start will be some offset away from the start
                    video_window_start = stride_length_video_per_segment * w + random_offset
                    video_window_end = video_window_start + self.window_length
                    window_range = [video_window_start, video_window_end]
                    self.map[counter] = [i, window_range]
                    counter = counter + 1
                self.map[counter] = [i, [max(0, video_length-self.window_length), video_length]]
                counter += 1
            else:
                for w in range(0, int(window_count)):
                    if w/float(window_count) <= self.percent_leaked:
                        # start will be some offset away from the start
                        video_window_start = stride_length_video_per_segment * w + random_offset
                        video_window_end = video_window_start + self.window_length
                        window_range = [video_window_start, video_window_end]
                        self.map[counter] = [i, window_range]
                        counter = counter + 1
        self.dataset_length = counter
    def time_mask(self, spec, T=30, num_masks=1, replace_with_zero=False):
        cloned = spec.clone()
        len_spectro = cloned.shape[0]
        for i in range(0, num_masks):
            # I only have 250 ish samples so I'm masking 20 max
            t = random.randrange(0, T)
            t_zero = random.randrange(0, len_spectro - t)
            # avoids randrange error if values are equal and range is empty
            if (t_zero == t_zero + t): return cloned
            mask_end = random.randrange(t_zero, t_zero + t)
            if (replace_with_zero): cloned[t_zero:mask_end] = 0
            else: cloned[t_zero:mask_end] = cloned.mean()
        return cloned
    def freq_mask(self, spec, F=5, num_masks=1, replace_with_zero=False):
        """Return a copy of ``spec`` with random runs of columns masked out.

        For every mask a width bound ``f`` is drawn from ``[0, F)``, a start
        column ``f_zero`` from ``[0, columns - f)`` and an end column from
        ``[f_zero, f_zero + f)``; columns ``f_zero:mask_end`` are overwritten, so
        a mask covers at most ``f - 1`` columns and may be empty.

        Args:
            spec: 2-D tensor whose second axis is masked; it is cloned, not
                modified.
            F: exclusive upper bound for the drawn width bound ``f``.
            num_masks: number of masks to attempt.
            replace_with_zero: fill with 0 when True, otherwise with the mean of
                the working copy at the time the mask is applied.

        Returns:
            The masked clone of ``spec``.
        """
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]
        for _ in range(num_masks):
            f = random.randrange(0, F)
            # A width that does not fit inside the tensor would make the next
            # randrange raise ValueError (empty range); skip this mask instead.
            if f >= num_mel_channels:
                continue
            f_zero = random.randrange(0, num_mel_channels - f)
            # f == 0 would make the range below empty. Move on to the next mask
            # rather than returning, so one empty draw does not cancel the rest.
            if f == 0:
                continue
            mask_end = random.randrange(f_zero, f_zero + f)
            if replace_with_zero:
                cloned[:, f_zero:mask_end] = 0
            else:
                cloned[:, f_zero:mask_end] = cloned.mean()
        return cloned
    def load_IO_features_to_memory(self):
        """Load features and labels of every usable clip from disk into memory.

        Iterates over ``self.all_files_val_and_trian``. Clips whose on-screen
        MFCC has 50 frames or fewer are skipped entirely. For each kept clip the
        method appends, at the same list position:

        * ``self.input_features``: on-screen features followed by off-screen
          features along axis 1 (MFCC, sentence timing and, when
          ``self.pos_labels`` is set, POS features for each side);
        * ``self.aversion_labels``: the aversion label array;
        * ``self.velocity_labels``: ``dx_dt`` of the aversion label correlated
          with ``self.gaussian_window``;
        * ``self.gaze_labels`` / ``self.interlocutor_positions`` when
          ``self.with_gaze`` is set.

        The position of every kept clip that is also in ``self.all_files_in_set``
        is recorded in ``self.train_set``.
        """
        def load_pickle(path):
            # The context manager closes the handle right after reading.
            # NOTE: unpickling executes code from the file; use trusted data only.
            with open(path, "rb") as pickle_file:
                return pkl.load(pickle_file)

        label_dir = "long_aversion_label" if self.long_aversion_only else "aversion_label"
        train_files = set(self.all_files_in_set)  # constant-time membership test
        counter = 0
        for file_name in self.all_files_val_and_trian:
            on_screen_mfcc = load_pickle(os.path.join(self.data_root_path, "audio", file_name+"_0.pkl"))
            off_screen_mfcc = load_pickle(os.path.join(self.data_root_path, "audio", file_name+"_1.pkl"))
            # Decide whether to skip BEFORE recording anything for this clip, so
            # train_set, gaze_labels and interlocutor_positions stay index-aligned
            # with input_features.
            if len(on_screen_mfcc) <= 50:
                continue

            # get the aversion labels from the disk
            output_aversion_label = load_pickle(os.path.join(self.data_root_path, label_dir, file_name+".pkl"))

            # get the remaining input features from the disk
            on_screen_sentence_timing = load_pickle(os.path.join(self.data_root_path, "sentence_timing", file_name+"_0.pkl"))
            off_screen_sentence_timing = load_pickle(os.path.join(self.data_root_path, "sentence_timing", file_name+"_1.pkl"))
            if self.normalize_MFCC:
                # Statistics are taken over the frames of both speakers pooled
                # together (not over their element-wise sum), per coefficient.
                pooled_mfcc = np.concatenate([on_screen_mfcc, off_screen_mfcc], axis=0)
                mean = np.mean(pooled_mfcc, axis=0)
                std = np.std(pooled_mfcc, axis=0)
                std = np.where(std <= 1E-8, 1, std)  # leave constant coefficients unscaled
                on_screen_mfcc = (on_screen_mfcc - mean) / std
                off_screen_mfcc = (off_screen_mfcc - mean) / std

            # get input features
            input_features_on_screen = np.concatenate([on_screen_mfcc, on_screen_sentence_timing], axis=1)
            input_features_off_screen = np.concatenate([off_screen_mfcc, off_screen_sentence_timing], axis=1)
            if self.pos_labels:  # the last 14 features are the POS tags
                on_screen_pos = load_pickle(os.path.join(self.data_root_path, "word_POS", file_name+"_0.pkl"))
                off_screen_pos = load_pickle(os.path.join(self.data_root_path, "word_POS", file_name+"_1.pkl"))
                input_features_on_screen = np.concatenate([input_features_on_screen, on_screen_pos], axis=1)
                input_features_off_screen = np.concatenate([input_features_off_screen, off_screen_pos], axis=1)
            input_feature = np.concatenate([input_features_on_screen, input_features_off_screen], axis=1)

            vel_output_target = dx_dt(output_aversion_label)
            vel_output_target = np.correlate(vel_output_target, self.gaussian_window, mode="same")

            if self.with_gaze:
                gaze_label_path = os.path.join(self.data_root_path, "gaze", file_name+".pkl")
                # NOTE(review): absolute path that ignores self.data_root_path, and the
                # directory name "tinterlocutor_direction" looks like a typo - confirm.
                interlocutor_position_path = os.path.join("/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset/", "tinterlocutor_direction", file_name+".pkl")
                self.gaze_labels.append(load_pickle(gaze_label_path))
                self.interlocutor_positions.append(load_pickle(interlocutor_position_path))

            if file_name in train_files:
                self.train_set.append(counter)
            self.input_features.append(input_feature)
            self.aversion_labels.append(output_aversion_label)
            self.velocity_labels.append(vel_output_target)
            counter += 1
    def __getitem__(self, idx):
        # pad all audio to 250 frames
        input_feature = self.input_features[self.map[idx][0]][self.map[idx][1][0]:self.map[idx][1][1]]
        aversion_label = self.aversion_labels[self.map[idx][0]][self.map[idx][1][0]:self.map[idx][1][1]]
        velocity_label = self.velocity_labels[self.map[idx][0]][self.map[idx][1][0]:self.map[idx][1][1]]
        # print(self.window_length, self.map[idx][1][0], self.map[idx][1][1], self.input_features[self.map[idx][0]].shape, self.aversion_labels[self.map[idx][0]].shape, self.velocity_labels[self.map[idx][0]].shape)
        if input_feature.shape[0] < self.window_length:
            missing_frames = self.window_length - input_feature.shape[0]
            padding = np.tile(np.expand_dims(self.filler_back, axis=0), [missing_frames, 1])
            input_feature = np.concatenate([input_feature, padding], axis=0)
            final_aversion_frame = aversion_label[-1]
            repeated_final_aversion_frame = np.tile(np.expand_dims(final_aversion_frame, axis=0), [missing_frames])
            aversion_label = np.concatenate([aversion_label, repeated_final_aversion_frame], axis=0)
            velocity_label = np.concatenate([velocity_label, np.zeros(missing_frames)], axis=0)  

        
        input_feature = torch.from_numpy(input_feature).double()
        if self.apply_time_mask and self.apply_frequency_mask:
            input_feature[:, 0:26] = self.freq_mask(self.time_mask(input_feature[:, 0:26]))
            input_feature[:, 46:72] = self.freq_mask(self.time_mask(input_feature[:, 46:72]))
        elif self.apply_time_mask:
            input_feature[:, 0:26] = self.time_mask(input_feature[:, 0:26])
            input_feature[:, 46:72] = self.time_mask(input_feature[:, 46:72])
        elif self.apply_frequency_mask:
            input_feature[:, 0:26] = self.freq_mask(input_feature[:, 0:26])
            input_feature[:, 46:72] = self.freq_mask(input_feature[:, 46:72])
        aversion_label = torch.from_numpy(aversion_label).double()
        velocity_label = torch.from_numpy(velocity_label).double()
        if self.with_gaze:
            gaze = self.gaze_labels[self.map[idx][0]][self.map[idx][1][0]:self.map[idx][1][1]]
            return input_feature, [aversion_label, gaze, self.interlocutor_positions[self.map[idx][0]], velocity_label] 
        return input_feature, [aversion_label, velocity_label]      

In [54]:
processed_data_path = "/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset_real_time_aug/"
# Read the video -> window metadata; the context manager closes the file promptly.
with open(os.path.join(processed_data_path, "video_to_window_metadata.json")) as metadata_file:
    meta_data = json.load(metadata_file)
all_videos = list(meta_data.keys())
training_set = []
testing_set = []
# Split by video name (first 90% train, rest test) so that windows cut from the
# same entry never end up on both sides of the split.
for i in range(0, len(all_videos)):
    if i / len(all_videos) < 0.9:
        training_set.append(all_videos[i])
    else:
        testing_set.append(all_videos[i])


video_include = list(meta_data.keys())
k_train_leak = Runtime_parsing_Aversion_SelfTape111_validation_leak(processed_data_path, training_set, pos_labels=True, normalize_MFCC=True, apply_frequency_mask=True, percent_leaked=0.1)
print(k_train_leak.train_set)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

# Test run time parsing of raw_audio datasets

In [33]:
import random
class Runtime_parsing_Aversion_SelfTape111_with_word_vec(Dataset):
    """Windowed gaze-aversion dataset whose inputs include word embeddings.

    All clips are loaded into memory once and then cut into windows of
    ``window_length`` frames (see ``parse_dataset``). For each speaker (files
    suffixed ``_0`` for on-screen and ``_1`` for off-screen) the per-frame
    features are concatenated as::

        [word embedding | MFCC | sentence timing | POS tags (optional)]

    and the on-screen vector is followed by the off-screen vector. The padding
    vector ``filler_back`` assumes 768 embedding columns, 26 MFCC columns and
    6 (without POS) or 20 (with POS) trailing columns per speaker.

    Items are ``(input_feature, [aversion_label, velocity_label])`` or, with
    ``with_gaze=True``, ``(input_feature, [aversion_label, gaze,
    interlocutor_position, velocity_label])``.
    """

    # Per-speaker layout constants used for padding and for locating the MFCC
    # columns that the time/frequency masks act on.
    WORD_VEC_DIM = 768
    N_MFCC = 26

    def __init__(self, processed_data_path, videos_included=None, prev_dataset=None, pos_labels=True, long_aversion_only=False, shuffle=True, window_length=250, with_gaze=False, normalize_MFCC=False, apply_frequency_mask=False, apply_time_mask=False, word_vec_location="word_embedding_WavLM"):
        """Build the dataset from disk, or re-window an already loaded one.

        Args:
            processed_data_path: dataset root directory (ignored when
                ``prev_dataset`` is given).
            videos_included: clip names to load; defaults to every key of
                ``video_to_window_metadata.json``.
            prev_dataset: an existing instance whose loaded arrays and settings
                are shared; only ``window_length`` is taken from this call and
                the windows are re-drawn.
            pos_labels: append the POS features to each speaker's features.
            long_aversion_only: read labels from ``long_aversion_label``
                instead of ``aversion_label``.
            shuffle: stored on the instance; not used by this class itself.
            window_length: number of frames per returned window.
            with_gaze: also load and return gaze and interlocutor direction.
            normalize_MFCC: standardise the MFCC columns per clip.
            apply_frequency_mask: apply ``freq_mask`` to the MFCC columns.
            apply_time_mask: apply ``time_mask`` to the MFCC columns.
            word_vec_location: sub-directory holding the word embeddings.
        """
        # NOTE: process-wide side effect - new float tensors default to double.
        torch.set_default_tensor_type(torch.DoubleTensor)
        if prev_dataset is None:
            self.data_root_path = processed_data_path
            self.shuffle = shuffle
            self.pos_labels = pos_labels
            self.window_length = window_length
            self.with_gaze = with_gaze
            self.long_aversion_only = long_aversion_only
            video_names_path = os.path.join(self.data_root_path, "video_to_window_metadata.json")
            with open(video_names_path, "r") as metadata_file:
                self.metadata = json.load(metadata_file)
            if videos_included is None:
                videos_included = list(self.metadata.keys())
            self.all_files_in_set = videos_included
            self.gaussian_window = gaussian(5, 1)
            self.normalize_MFCC = normalize_MFCC
            self.apply_frequency_mask = apply_frequency_mask
            self.apply_time_mask = apply_time_mask
            self.word_vec_location = word_vec_location
            # load all input features and aversion labels to memory
            self.input_features = []
            self.aversion_labels = []
            self.velocity_labels = []
            self.gaze_labels = []
            self.interlocutor_positions = []
            self.load_IO_features_to_memory()
            # generate a map from dataset index to (clip, window range)
            self.map = {}
            self.dataset_length = 0
            self.parse_dataset()
            # filler used to pad the MFCC columns of short windows
            # (presumably the MFCC of silence - TODO confirm)
            self.filler = np.array([-36.04365338911715,0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0, 0.0,0.0,0.0,0.0,0.0,-3.432169450445466e-14,0.0,0.0,0.0,9.64028691651994e-15,0.0,0.0, 0.0])
            # zeros for the embedding and for the columns after the MFCC
            # (6 without POS features, 20 with them), identical for both speakers
            trailing_zeros = np.zeros(20 if self.pos_labels else 6)
            one_speaker_filler = np.concatenate([np.zeros(self.WORD_VEC_DIM), self.filler, trailing_zeros])
            self.filler_back = np.concatenate([one_speaker_filler, one_speaker_filler])
        else:
            self.data_root_path = prev_dataset.data_root_path
            self.shuffle = prev_dataset.shuffle
            self.pos_labels = prev_dataset.pos_labels
            # the window length is the one setting taken from this call
            self.window_length = window_length
            self.long_aversion_only = prev_dataset.long_aversion_only
            self.all_files_in_set = prev_dataset.all_files_in_set
            self.gaussian_window = prev_dataset.gaussian_window
            self.input_features = prev_dataset.input_features
            self.aversion_labels = prev_dataset.aversion_labels
            self.velocity_labels = prev_dataset.velocity_labels
            self.with_gaze = prev_dataset.with_gaze
            # needed by __getitem__ when with_gaze is set
            self.gaze_labels = getattr(prev_dataset, "gaze_labels", [])
            self.interlocutor_positions = getattr(prev_dataset, "interlocutor_positions", [])
            self.filler = prev_dataset.filler
            self.filler_back = prev_dataset.filler_back
            self.normalize_MFCC = prev_dataset.normalize_MFCC
            self.apply_time_mask = prev_dataset.apply_time_mask
            self.apply_frequency_mask = prev_dataset.apply_frequency_mask
            self.word_vec_location = prev_dataset.word_vec_location
            # builds a fresh map / dataset_length for the new window length
            self.parse_dataset()

    def __len__(self):
        """Number of windows produced by the last ``parse_dataset`` call."""
        return self.dataset_length

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file handle afterwards."""
        # NOTE: unpickling executes code from the file; use trusted data only.
        with open(path, "rb") as pickle_file:
            return pkl.load(pickle_file)

    def parse_dataset(self):
        """(Re)build ``self.map``: dataset index -> ``[clip_index, [start, end]]``.

        Each clip gets a random start offset in ``[0, window_length / 2)`` and
        is covered by windows of ``window_length`` frames placed every half
        window, followed by one extra final window. Clips with 25 frames or
        fewer are skipped. Uses ``np.random``, so the map differs between calls
        unless the NumPy global seed is fixed.
        """
        self.map = {}
        self.dataset_length = 0
        counter = 0
        for i in range(len(self.input_features)):
            # for randomly cutting the video
            random_offset = np.random.randint(0, self.window_length/2)
            # frames left if we start walking the clip from the random offset
            video_length = self.input_features[i].shape[0] - random_offset
            stride_length_video_per_segment = int(np.round(self.window_length/2))
            window_count = np.floor((video_length - (self.window_length - stride_length_video_per_segment)) / stride_length_video_per_segment)
            if self.input_features[i].shape[0] <= 25:
                continue
            if video_length <= 0:
                continue
            # add all the windows except the last window
            for w in range(0, int(window_count)):
                # start will be some offset away from the start
                video_window_start = stride_length_video_per_segment * w + random_offset
                video_window_end = video_window_start + self.window_length
                self.map[counter] = [i, [video_window_start, video_window_end]]
                counter = counter + 1
            # NOTE(review): video_length excludes random_offset, so this final
            # window ends random_offset frames before the end of the clip -
            # confirm whether the true tail [len - window, len] was intended.
            self.map[counter] = [i, [max(0, video_length-self.window_length), video_length]]
            counter += 1
        self.dataset_length = counter

    def time_mask(self, spec, T=30, num_masks=1, replace_with_zero=False):
        """Return a copy of ``spec`` with random runs of rows masked out.

        Per mask: ``t`` is drawn from ``[0, T)``, ``t_zero`` from
        ``[0, rows - t)`` and the end row from ``[t_zero, t_zero + t)``. The
        masked rows are set to 0 (``replace_with_zero``) or to the mean of the
        working copy. ``spec`` itself is not modified.
        """
        cloned = spec.clone()
        len_spectro = cloned.shape[0]
        for _ in range(num_masks):
            t = random.randrange(0, T)
            # a width that does not fit would make the next randrange raise
            if t >= len_spectro:
                continue
            t_zero = random.randrange(0, len_spectro - t)
            # empty range below; skip this mask but keep trying the others
            if t == 0:
                continue
            mask_end = random.randrange(t_zero, t_zero + t)
            if replace_with_zero:
                cloned[t_zero:mask_end] = 0
            else:
                cloned[t_zero:mask_end] = cloned.mean()
        return cloned

    def freq_mask(self, spec, F=5, num_masks=1, replace_with_zero=False):
        """Return a copy of ``spec`` with random runs of columns masked out.

        Per mask: ``f`` is drawn from ``[0, F)``, ``f_zero`` from
        ``[0, columns - f)`` and the end column from ``[f_zero, f_zero + f)``.
        The masked columns are set to 0 (``replace_with_zero``) or to the mean
        of the working copy. ``spec`` itself is not modified.
        """
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]
        for _ in range(num_masks):
            f = random.randrange(0, F)
            # a width that does not fit would make the next randrange raise
            if f >= num_mel_channels:
                continue
            f_zero = random.randrange(0, num_mel_channels - f)
            # empty range below; skip this mask but keep trying the others
            if f == 0:
                continue
            mask_end = random.randrange(f_zero, f_zero + f)
            if replace_with_zero:
                cloned[:, f_zero:mask_end] = 0
            else:
                cloned[:, f_zero:mask_end] = cloned.mean()
        return cloned

    def load_IO_features_to_memory(self):
        """Load features and labels of every usable clip from disk into memory.

        Clips whose on-screen MFCC has 50 frames or fewer are skipped before
        anything is recorded, so ``input_features``, ``aversion_labels``,
        ``velocity_labels`` and (with ``with_gaze``) ``gaze_labels`` /
        ``interlocutor_positions`` always share the same indexing.
        """
        root = self.data_root_path
        label_dir = "long_aversion_label" if self.long_aversion_only else "aversion_label"
        for file_name in self.all_files_in_set:
            print(file_name)
            on_screen_mfcc = self._load_pickle(os.path.join(root, "audio", file_name+"_0.pkl"))
            off_screen_mfcc = self._load_pickle(os.path.join(root, "audio", file_name+"_1.pkl"))
            if len(on_screen_mfcc) <= 50:
                continue

            # get the aversion labels from the disk
            output_aversion_label = self._load_pickle(os.path.join(root, label_dir, file_name+".pkl"))

            # load the remaining input features from the disk
            on_screen_sentence_timing = self._load_pickle(os.path.join(root, "sentence_timing", file_name+"_0.pkl"))
            off_screen_sentence_timing = self._load_pickle(os.path.join(root, "sentence_timing", file_name+"_1.pkl"))
            on_screen_word_vec = self._load_pickle(os.path.join(root, self.word_vec_location, file_name+"_0.pkl"))
            off_screen_word_vec = self._load_pickle(os.path.join(root, self.word_vec_location, file_name+"_1.pkl"))
            if self.normalize_MFCC:
                # Statistics are taken over the frames of both speakers pooled
                # together (not over their element-wise sum), per coefficient.
                pooled_mfcc = np.concatenate([on_screen_mfcc, off_screen_mfcc], axis=0)
                mean = np.mean(pooled_mfcc, axis=0)
                std = np.std(pooled_mfcc, axis=0)
                std = np.where(std <= 1E-8, 1, std)  # leave constant coefficients unscaled
                on_screen_mfcc = (on_screen_mfcc - mean) / std
                off_screen_mfcc = (off_screen_mfcc - mean) / std

            # assemble [word embedding | MFCC | sentence timing | POS] per speaker
            input_features_on_screen = np.concatenate([on_screen_word_vec, on_screen_mfcc, on_screen_sentence_timing], axis=1)
            input_features_off_screen = np.concatenate([off_screen_word_vec, off_screen_mfcc, off_screen_sentence_timing], axis=1)
            if self.pos_labels:  # the last 14 features are the POS tags
                on_screen_pos = self._load_pickle(os.path.join(root, "word_POS", file_name+"_0.pkl"))
                off_screen_pos = self._load_pickle(os.path.join(root, "word_POS", file_name+"_1.pkl"))
                input_features_on_screen = np.concatenate([input_features_on_screen, on_screen_pos], axis=1)
                input_features_off_screen = np.concatenate([input_features_off_screen, off_screen_pos], axis=1)
            input_feature = np.concatenate([input_features_on_screen, input_features_off_screen], axis=1)

            vel_output_target = dx_dt(output_aversion_label)
            vel_output_target = np.correlate(vel_output_target, self.gaussian_window, mode="same")

            if self.with_gaze:
                gaze_label_path = os.path.join(root, "gaze", file_name+".pkl")
                # NOTE(review): absolute path that ignores data_root_path, and the
                # directory name "tinterlocutor_direction" looks like a typo - confirm.
                interlocutor_position_path = os.path.join("/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset/", "tinterlocutor_direction", file_name+".pkl")
                self.gaze_labels.append(self._load_pickle(gaze_label_path))
                self.interlocutor_positions.append(self._load_pickle(interlocutor_position_path))

            self.input_features.append(input_feature)
            self.aversion_labels.append(output_aversion_label)
            self.velocity_labels.append(vel_output_target)

    def _augment_mfcc(self, mfcc_block):
        """Apply the enabled masks (time first, then frequency) to one MFCC block."""
        if self.apply_time_mask:
            mfcc_block = self.time_mask(mfcc_block)
        if self.apply_frequency_mask:
            mfcc_block = self.freq_mask(mfcc_block)
        return mfcc_block

    def __getitem__(self, idx):
        """Return window ``idx`` as ``(input_feature, targets)``.

        Short windows are padded at the end up to ``window_length`` frames:
        features with ``filler_back``, aversion labels with their last value
        and velocity labels with zeros. When masking is enabled it is applied
        to the MFCC columns of each speaker, which follow that speaker's
        ``WORD_VEC_DIM`` embedding columns, on a private copy of the window.
        """
        video_idx, (start, end) = self.map[idx]
        input_feature = self.input_features[video_idx][start:end]
        aversion_label = self.aversion_labels[video_idx][start:end]
        velocity_label = self.velocity_labels[video_idx][start:end]
        if input_feature.shape[0] < self.window_length:
            missing_frames = self.window_length - input_feature.shape[0]
            padding = np.tile(np.expand_dims(self.filler_back, axis=0), [missing_frames, 1])
            input_feature = np.concatenate([input_feature, padding], axis=0)
            # hold the last observed aversion value over the padded frames
            final_aversion_frame = aversion_label[-1]
            repeated_final_aversion_frame = np.tile(np.expand_dims(final_aversion_frame, axis=0), [missing_frames])
            aversion_label = np.concatenate([aversion_label, repeated_final_aversion_frame], axis=0)
            velocity_label = np.concatenate([velocity_label, np.zeros(missing_frames)], axis=0)

        input_feature = torch.from_numpy(input_feature).double()
        if self.apply_time_mask or self.apply_frequency_mask:
            # from_numpy shares memory with the cached array (and .double() is a
            # no-op for float64), so mask a private copy; otherwise every
            # augmentation would be written back into self.input_features.
            input_feature = input_feature.clone()
            # Each speaker occupies half of the columns; the MFCC block sits
            # right after that speaker's word embedding.
            speaker_width = input_feature.shape[1] // 2
            for speaker_start in (0, speaker_width):
                mfcc_start = speaker_start + self.WORD_VEC_DIM
                mfcc_end = mfcc_start + self.N_MFCC
                input_feature[:, mfcc_start:mfcc_end] = self._augment_mfcc(input_feature[:, mfcc_start:mfcc_end])
        aversion_label = torch.from_numpy(aversion_label).double()
        velocity_label = torch.from_numpy(velocity_label).double()
        if self.with_gaze:
            # NOTE(review): the gaze slice is neither padded nor converted to a tensor.
            gaze = self.gaze_labels[video_idx][start:end]
            return input_feature, [aversion_label, gaze, self.interlocutor_positions[video_idx], velocity_label]
        return input_feature, [aversion_label, velocity_label]
  

In [34]:
processed_data_path = "/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset_real_time_aug/"
# Read the video -> window metadata; the context manager closes the file promptly.
with open(os.path.join(processed_data_path, "video_to_window_metadata.json")) as metadata_file:
    meta_data = json.load(metadata_file)
all_videos = list(meta_data.keys())
training_set = []
testing_set = []
# Split by video name (first 90% train, rest test) so that windows cut from the
# same entry never end up on both sides of the split.
for i in range(0, len(all_videos)):
    if i / len(all_videos) < 0.9:
        training_set.append(all_videos[i])
    else:
        testing_set.append(all_videos[i])


video_include = list(meta_data.keys())
k_train_leak = Runtime_parsing_Aversion_SelfTape111_with_word_vec(processed_data_path, training_set, pos_labels=True, normalize_MFCC=True, apply_frequency_mask=False)

Nicholas Sparrow - Self Tape -The Rock_0
Nicholas Sparrow - Self Tape -The Rock_1
Ronen Rubinstein Self Tape_0
Ronen Rubinstein Self Tape_1
‘SWEATER’ DANIEL SELF-TAPE - ZACK FERNANDEZ_0
dacre montgomery audition tape_0
A self tape I_m very very proud of_0
A self tape I_m very very proud of_1
A self tape I_m very very proud of_2
Natalia Dyer - Stranger Things ＂Nancy Wheeler＂  Audition Tape_0
Harrison Green self tape reel 2021_0
Harrison Green self tape reel 2021_1
Harrison Green self tape reel 2021_2
Harrison Green self tape reel 2021_3
Dramatic Audition Self-Tape “Shameless”_0
SELF-TAPE THAT GOT ME BOOKED ｜ Indie Short Film Audition_0
Dramatic Self Tape Reel_0
Dramatic Self Tape Reel_1
Dramatic Self Tape Reel_2
Dramatic Self Tape Reel_3
Self Tape Audition_0
The Audition That Got Me ACCEPTED Into Drama School!_0
Therapist - Acting - Audition - Self-tape - by Thain Wesley_0
Therapist - Acting - Audition - Self-tape - by Thain Wesley_1
Fabricio Suarez self-tape for Comedic Series_0
Self-T

In [35]:
# Smoke test: fetch the first window of the dataset built in the previous cell.
# NOTE(review): the recorded output is a ValueError (feature width 1628 vs. padding
# width 92), while filler_back in the class definition is 2 * (768 + 26 + 20) = 1628
# wide with pos_labels=True - the output looks stale; re-run after Restart & Run All.
k_train_leak[0]

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1628 and the array at index 1 has size 92