In [3]:
import numpy as np
import pickle as pkl
import json
import torch
import librosa
import os
import sys
import cv2
import shutil
from matplotlib import pyplot as plt
import csv
import shutil
from datetime import datetime
import soundfile as sf
import whisper_timestamped
# import utility functions
sys.path.insert(0, '/Users/evanpan/Documents/GitHub/EvansToolBox/Utils')
# sys.path.insert(0, "C:/Users/evansamaa/Documents/GitHub/EvansToolBox")

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [4]:
def rotation_angles_frome_positions(arr):
    """
    Turn position vector(s) into (azimuth, elevation) angles in degrees.

    Every vector is scaled to unit length, then the arcsine of its first
    component gives the azimuth and the arcsine of its second component gives
    the elevation:
        azimuth: +right, -left
        elevation: +up, -down
    The vectors are assumed to be given in world coordinates, seen from the
    origin.
    :param arr: array with shape (N, 3), or a single vector with shape (3, )
    :return: array with shape (N, 2), or (2, ) for a single vector
    """
    is_batch = len(arr.shape) == 2
    if is_batch:
        # one length per row, kept as a column so the division broadcasts
        norm = np.sqrt(np.sum(arr * arr, axis=1, keepdims=True))
    else:
        norm = np.sqrt(np.sum(arr * arr))
    unit = arr / norm  # new array: the caller's data is never modified

    if is_batch:
        azimuth = np.arcsin(unit[:, 0])
        elevation = np.arcsin(unit[:, 1])
        angles = np.stack([azimuth, elevation], axis=1)
    else:
        azimuth = np.arcsin(unit[0])
        elevation = np.arcsin(unit[1])
        angles = np.stack([azimuth, elevation])
    # radians -> degrees
    return angles * 180 / np.pi
def get_valid_shots(shots, fps, shot_length_mininmum=5):
    """
    Keep only the shots that last at least ``shot_length_mininmum`` seconds.

    Time codes are compared as ``timedelta`` objects rather than through
    ``datetime.timestamp()``: strptime yields naive year-1900 datetimes, whose
    epoch timestamps are huge floats (losing sub-microsecond precision, so a
    shot of exactly the minimum length could be rejected) and cannot be
    computed at all on platforms that do not support pre-epoch local times.

    :param shots: sequence of shots, where shots[i][0] and shots[i][1] are the
        start and end time codes of shot i as 'H:M:S.f' strings
        (e.g. "00:01:02.5")
    :param fps: frame rate used to convert seconds into frame indices
    :param shot_length_mininmum: minimum shot duration in seconds (inclusive)
    :return: (valid_shots_time, valid_shots_frames); the first is a list of
        [start_seconds, end_seconds], the second a list of
        [start_frame, end_frame] (seconds * fps, rounded to the nearest int),
        both in the order of the input shots
    """
    time_format = '%H:%M:%S.%f'
    zero = datetime.strptime("00:00:00.0", time_format)

    valid_shots_time, valid_shots_frames = [], []
    for shot in shots:
        start = datetime.strptime(shot[0], time_format)
        end = datetime.strptime(shot[1], time_format)
        if (end - start).total_seconds() >= shot_length_mininmum:
            # seconds elapsed since the beginning of the video
            start_t = (start - zero).total_seconds()
            end_t = (end - zero).total_seconds()
            valid_shots_time.append([start_t, end_t])
            valid_shots_frames.append([int(np.round(start_t * fps)), int(np.round(end_t * fps))])

    return valid_shots_time, valid_shots_frames
def load_head_and_gaze_angles(all_gaze_data, all_head_data):
    """
    Derive per-frame eye and head angles (degrees) from per-frame rotation matrices.

    :param all_gaze_data: mapping that provides ["RAW_GAZE"]["ROTMAT"] and
        ["RAW_GAZE"]["EULER"]
    :param all_head_data: mapping that provides ["HEAD"]["ROTMAT"], ["BBOX"]
        and ["BLINKS"]
        (both ROTMAT entries are indexed per frame and multiplied with a
        3-vector - presumably arrays of shape (N, 3, 3); TODO confirm)
    :return: (eye_angle_per_frame, head_angle_per_frame)
        eye_angle_per_frame: one (azimuth, elevation) pair per gaze frame
        head_angle_per_frame: one row of 3 values per head frame - azimuth,
        elevation, and the angle obtained from the rotated up-vector
        (described as "rotation angle in z direction")
    """
    # ---- head ----
    head_rotmats = all_head_data["HEAD"]["ROTMAT"]
    _head_bboxes = all_head_data["BBOX"]  # looked up but not used below
    num_head_frames = head_rotmats.shape[0]

    # azimuth / elevation: rotate a forward-pointing vector with every frame's matrix
    forward = np.array([0, 0, 100])
    head_angle_xy_per_frame = np.array([
        rotation_angles_frome_positions((head_rotmats[k] @ forward)[:])
        for k in range(num_head_frames)
    ])

    # third angle: rotate an up-pointing vector, reorder its components to
    # (y, z, x) and keep the second of the two angles the helper returns
    up = np.array([0, 100, 0])
    third_angles = []
    for k in range(num_head_frames):
        rotated_up = head_rotmats[k] @ up
        reordered = np.array([rotated_up[1], rotated_up[2], rotated_up[0]])
        third_angles.append(rotation_angles_frome_positions(reordered)[1])
    head_angle_z_per_frame = np.expand_dims(np.array(third_angles), axis=1)

    head_angle_per_frame = np.concatenate(
        [head_angle_xy_per_frame, head_angle_z_per_frame], axis=1
    )

    # ---- gaze ----
    raw_gaze = all_gaze_data["RAW_GAZE"]
    _gaze_euler = raw_gaze["EULER"]  # looked up but not used below
    gaze_rotmats = raw_gaze["ROTMAT"]
    _blinks = all_head_data["BLINKS"]  # looked up but not used below

    gaze_forward = np.array([0, 0, 100])
    eye_lines = []
    for k in range(gaze_rotmats.shape[0]):
        rotated = gaze_rotmats[k] @ gaze_forward
        # rescale so that the third component equals 100
        eye_lines.append(rotated / rotated[2] * 100)
    eye_angle_per_frame = rotation_angles_frome_positions(np.array(eye_lines)[:])

    return eye_angle_per_frame, head_angle_per_frame

# Input Block

In [5]:
# Dataset locations. Exactly one pair is active; the external-disk pair is kept
# commented out as an alternative instead of being assigned and then silently
# overwritten.
# input_folder = "/Volumes/EVAN_DISK/MASC/Ribhav_processed_dataset/"
# output_folder = "/Volumes/EVAN_DISK/MASC/deep_learning_processed_dataset/"
# NOTE(review): the active input path spells the dataset "Ribshabh" while the
# alternative spells it "Ribhav" - confirm which folder name is correct.
input_folder = "/scratch/ondemand27/evanpan/data/Ribshabh_processed_dataset/"
output_folder = "/scratch/ondemand27/evanpan/data/deep_learning_processed_dataset/"

# When True, the cells below wipe and recreate output_folder and regenerate
# local_metadata.json inside input_folder.
redo = False

# Not referenced by the visible cells - presumably consumed by later
# processing; TODO confirm units (frames vs. seconds) for the window settings.
target_fps = 24
window_length = 20
stride_length = 10

In [6]:
# Path of the index file listing every exported shot; it is written once the
# per-video export loop has finished and read back before transcription.
output_json_path = os.path.join(output_folder, "metadata.json")

# Folder Structure Meta Data

In [14]:
# First deal with the output folder structure.
# When redo is set, everything inside output_folder is removed and the empty
# sub-folder layout is created again.
if redo:
    # Check for an existing folder explicitly instead of using a bare `except`
    # around mkdir, which also swallowed unrelated errors (e.g. permissions).
    if os.path.isdir(output_folder):
        shutil.rmtree(output_folder)
    # makedirs also creates missing parent folders
    os.makedirs(output_folder)

    # One sub-folder per data stream:
    #   audio          - will have MFCC, intensity
    #   text           - the text
    #   gaze           - stored per time-stamp
    #   head           - also per time-stamp
    #   fixation       - the gaze fixation
    #   blinks
    #   aversion_label - also one per time frame
    stream_folders = ["audio", "text", "gaze", "head", "fixation", "blinks", "aversion_label"]
    # the "t"-prefixed set is temporary, the un-prefixed set is permanent
    for prefix in ("t", ""):
        for stream in stream_folders:
            os.mkdir(os.path.join(output_folder, prefix + stream))


In [30]:
# Generate the metadata file: probe every video in <input_folder>/video with
# OpenCV and store its fps, frame size and frame count in
# <input_folder>/local_metadata.json, keyed by the video file name.
if redo:
    video_list_path = os.path.join(input_folder, "video")
    video_list = os.listdir(video_list_path)  # never contains "." or ".."
    all_metadata = {}
    for video in video_list:
        # skip "._*" entries (presumably macOS sidecar files - TODO confirm)
        if video.startswith("._"):
            continue
        cap = cv2.VideoCapture(os.path.join(video_list_path, video))
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
            height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # release the decoder so handles do not pile up across the loop
            cap.release()
        metadata = {"fps": fps,
                    "width": width,
                    "height": height,
                    "frame_count": frame_count}
        all_metadata[video] = metadata
    video_metadata_path = os.path.join(input_folder, "local_metadata.json")
    # the context manager guarantees the file is flushed and closed before
    # the next cell reads it back
    with open(video_metadata_path, "w") as metadata_file:
        json.dump(all_metadata, metadata_file)

In [7]:

# Obtain all the video names from the metadata file; this cell also works when
# redo is False, as long as local_metadata.json already exists.
video_metadata_path = os.path.join(input_folder, "local_metadata.json")
with open(video_metadata_path) as metadata_file:
    video_metadatas = json.load(metadata_file)
video_names = list(video_metadatas.keys())

In [25]:
# Export loop: for every video, cut the gaze / head / blink annotations and the
# audio into the valid shots, split each shot's audio into an on-screen-speaker
# track and an off-screen track, and write everything to the temporary
# ("t"-prefixed) output folders. Parallel lists collect one entry per exported
# shot; they are turned into the index file afterwards.


def _timecode_to_seconds(timecode):
    """Convert an 'H:M:S.f' time-code string into seconds since 00:00:00.

    Uses timedelta arithmetic instead of datetime.timestamp(), which is both
    imprecise and platform dependent for the year-1900 datetimes strptime
    produces.
    """
    time_format = '%H:%M:%S.%f'
    elapsed = datetime.strptime(timecode, time_format) - datetime.strptime("00:00:00.0", time_format)
    return elapsed.total_seconds()


metadata = {}
output_file_names = []
output_file_fps = []
output_file_sr = []
output_file_audio_length = []
output_file_annotation_length = []
output_file_video_interval = []
output_file_audio_interval = []
for i in range(0, len(video_names)):
    print("currently on video {}, {}".format(i, video_names[i]))
    TESTING = True  # not read anywhere in this cell
    # load the data for one video
    file_name_video = video_names[i]
    # NOTE(review): everything after the FIRST dot is dropped, so a video name
    # that itself contains a dot is truncated - confirm this matches how the
    # input files were named.
    file_name = file_name_video.split(".")[0]
    metadata = video_metadatas[file_name_video]
    fps = metadata["fps"]

    # get file_paths
    audio_path = os.path.join(input_folder, "audio", file_name + ".wav")
    # annotation
    gaze_direction_path = os.path.join(input_folder, "ETHGaze-Mod", file_name + ".pkl")
    head_direction_path = os.path.join(input_folder, "pose", file_name + ".pkl")
    diarization_path = os.path.join(input_folder, "tracklets", file_name + "_Speakers.json")

    # gaze / head annotations; a video whose annotations cannot be loaded is skipped
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    try:
        with open(gaze_direction_path, "rb") as gaze_file:
            all_gaze_data = pkl.load(gaze_file)
        with open(head_direction_path, "rb") as head_file:
            all_head_data = pkl.load(head_file)
        # gaze holds (azimuth, elevation) per frame, neck holds 3 angles per frame
        gaze, neck = load_head_and_gaze_angles(all_gaze_data, all_head_data)
        blinks = all_head_data["BLINKS"]  # per frame, 1 = eye closed, 0 = eye open
    except Exception as err:
        # report the real cause: it is not necessarily a missing file
        print("failed for video: {}, could not load gaze/head annotations: {!r}".format(i, err))
        continue

    # shots: the cut list lives either in shots/<name>/ or in shots/<name>/<name>/
    shot_path = os.path.join(input_folder, "shots", file_name, "shot_cuts.json")
    try:
        with open(shot_path) as shot_file:
            shots = json.load(shot_file)["shots"]
    except Exception:
        shot_path = os.path.join(input_folder, "shots", file_name, file_name, "shot_cuts.json")
        with open(shot_path) as shot_file:
            shots = json.load(shot_file)["shots"]

    # audio
    try:
        audio, sr = librosa.load(str(audio_path))
    except Exception as err:
        print("failed for video: {}, could not load audio: {!r}".format(i, err))
        continue
    # {speaker_id: [{"start": t, "end": t, "start_frame": frame, "end_frame": frame}]}
    with open(diarization_path) as diarization_file:
        speaker = json.load(diarization_file)["aligned"]

    # obtain the valid shots (at least 5 seconds long) for this video
    valid_shots_time, valid_shots_frame = get_valid_shots(shots, fps, 5)

    # cut all the annotation data into the shots
    gazes_per_shot = []
    head_per_shot = []
    blink_per_shot = []
    for j in range(0, len(valid_shots_time)):
        frame_range = valid_shots_frame[j]
        gazes_per_shot.append(gaze[frame_range[0]:frame_range[1]])
        head_per_shot.append(neck[frame_range[0]:frame_range[1]])
        blink_per_shot.append(blinks[frame_range[0]:frame_range[1]])
        output_file_video_interval.append(frame_range)

    # identify the speaker in each shot, ignoring the off-screen pseudo speaker
    speaker_ids = [speaker_id for speaker_id in speaker.keys() if speaker_id != "OFF-SCREEN"]
    speaker_id_per_shot = []
    if len(speaker_ids) > 0:
        for j in range(0, len(valid_shots_time)):
            shot_frames = valid_shots_frame[j]
            # per speaker: how many of their speech turns intersect this shot
            overlap_counts = [0] * len(speaker_ids)
            for speaker_index in range(0, len(speaker_ids)):
                for speech_turn in speaker[speaker_ids[speaker_index]]:
                    latest_start = np.maximum(speech_turn["start_frame"], shot_frames[0])
                    earliest_end = np.minimum(speech_turn["end_frame"], shot_frames[1])
                    if latest_start <= earliest_end:
                        overlap_counts[speaker_index] += 1
            # the speaker with the most intersecting turns is taken as on-screen
            speaker_id_per_shot.append(speaker_ids[np.argmax(overlap_counts)])

    # parse audio for each shot (2 tracks per shot: on-screen speaker / everything else)
    auds = []
    for j in range(0, len(valid_shots_time)):
        shot_range_time = valid_shots_time[j]
        # restrict the audio to the shot range
        audio_start = int(shot_range_time[0] * sr)
        audio_end = np.minimum(int(shot_range_time[1] * sr), audio.shape[0])
        audio_of_shot = audio[audio_start:audio_end]
        output_file_audio_interval.append([int(audio_start), int(audio_end)])

        if len(speaker_ids) == 0:
            # no diarized on-screen speaker: the whole shot goes to the on-screen track
            on_screen_bitmap = np.ones(audio_of_shot.shape)
        else:
            on_screen_bitmap = np.zeros(audio_of_shot.shape)
            num_samples = audio_of_shot.shape[0]
            for speech_turn in speaker[speaker_id_per_shot[j]]:
                # turn boundaries in seconds since the start of the video
                turn_start = _timecode_to_seconds(speech_turn["start"])
                turn_end = _timecode_to_seconds(speech_turn["end"])
                # convert to sample indices relative to the start of the shot and
                # clamp them to the shot: unclamped, a turn beginning before the
                # shot yields negative indices, which numpy interprets as offsets
                # from the END of the array and would mark the wrong samples
                turn_start_sample = int(np.clip(int(turn_start * sr) - audio_start, 0, num_samples))
                turn_end_sample = int(np.clip(int(turn_end * sr) - audio_start, 0, num_samples))
                # assign (not accumulate) so overlapping turns keep the mask binary
                on_screen_bitmap[turn_start_sample:turn_end_sample] = 1
        off_screen_bitmap = 1 - on_screen_bitmap
        audio_on_screen = audio_of_shot * on_screen_bitmap
        audio_off_screen = audio_of_shot * off_screen_bitmap
        auds.append([audio_on_screen, audio_off_screen])

    # write one set of files per shot and record its index entry
    for j in range(0, len(auds)):
        output_audio_onscreen_path = os.path.join(output_folder, "taudio", file_name + "_{}_{}.wav".format(j, 0))
        output_audio_offscreen_path = os.path.join(output_folder, "taudio", file_name + "_{}_{}.wav".format(j, 1))
        output_gaze_path = os.path.join(output_folder, "tgaze", file_name + "_{}.pkl".format(j))
        output_head_path = os.path.join(output_folder, "thead", file_name + "_{}.pkl".format(j))
        output_blinks_path = os.path.join(output_folder, "tblinks", file_name + "_{}.pkl".format(j))
        # annotation files
        with open(output_gaze_path, "wb") as out_file:
            pkl.dump(gazes_per_shot[j], out_file)
        with open(output_head_path, "wb") as out_file:
            pkl.dump(head_per_shot[j], out_file)
        with open(output_blinks_path, "wb") as out_file:
            pkl.dump(blink_per_shot[j], out_file)
        # audio files
        sf.write(output_audio_onscreen_path, auds[j][0], sr)
        sf.write(output_audio_offscreen_path, auds[j][1], sr)
        output_file_names.append(file_name + "_{}".format(j))
        output_file_fps.append(fps)
        output_file_sr.append(sr)
        output_file_audio_length.append(int(auds[j][0].shape[0]))
        output_file_annotation_length.append(int(gazes_per_shot[j].shape[0]))
# Assemble the index of all exported shots (one record per shot, built from the
# parallel lists filled by the export loop) and write it to output_json_path.
output_json = {"data": []}
for i in range(0, len(output_file_names)):
    output_json["data"].append({"name": output_file_names[i],
                                "fps": output_file_fps[i],
                                "sr": output_file_sr[i],
                                "audio_length": output_file_audio_length[i],
                                "annotation_length": output_file_annotation_length[i],
                                "audio_range": output_file_audio_interval[i],
                                "video_range": output_file_video_interval[i]})
# close the file deterministically: a later cell reads this index back
with open(output_json_path, "w") as index_file:
    json.dump(output_json, index_file)

currently on video 0, Ronen Rubinstein Self Tape.mp4
currently on video 1, ‘SWEATER’ DANIEL SELF-TAPE - ZACK FERNANDEZ.mp4
currently on video 2, dacre montgomery audition tape.mp4
currently on video 3, A self tape I_m very very proud of.mp4
currently on video 4, Natalia Dyer - Stranger Things ＂Nancy Wheeler＂  Audition Tape.mp4
currently on video 5, Harrison Green self tape reel 2021.mp4
currently on video 6, Dramatic Audition Self-Tape “Shameless”.mp4
currently on video 7, SELF-TAPE THAT GOT ME BOOKED ｜ Indie Short Film Audition.mp4
currently on video 8, Dramatic Self Tape Reel.mp4
currently on video 9, Self Tape Audition.mp4
currently on video 10, The Audition That Got Me ACCEPTED Into Drama School!.mp4
currently on video 11, Therapist - Acting - Audition - Self-tape - by Thain Wesley.mp4
currently on video 12, Fabricio Suarez self-tape for Comedic Series.mp4
currently on video 13, Self-Tape Demo Reel.mp4
currently on video 14, Kelsey Boze Self Tape Reel.mp4
currently on video 15, Ste

# Generate text transcripts

In [8]:
# Debug check: show the XDG_CACHE_HOME environment variable (None when unset).
# NOTE(review): presumably inspected to see where whisper would cache models by
# default; the model below is loaded with an explicit download_root instead.
print(os.getenv("XDG_CACHE_HOME")) 

None


In [12]:
# Load the English "base" whisper model; its weights are downloaded into
# <output_folder>/whisper.
model_word = whisper_timestamped.load_model("base.en", download_root=os.path.join(output_folder, "whisper"))
# Re-read the shot index from disk so this section can run without re-running
# the export loop.
with open(output_json_path, "r") as index_file:
    output_json = json.load(index_file)

100%|███████████████████████████████████████| 139M/139M [00:06<00:00, 21.5MiB/s]


In [13]:
# Transcribe both audio tracks of every exported shot and store the word-level
# alignment as <output_folder>/ttext/<shot name>_<track>.json.
for i in range(0, len(output_json["data"])):
    file_name = output_json["data"][i]["name"]
    # track 0 = on-screen speaker audio, track 1 = off-screen audio.
    # The loop variable is deliberately not called `speaker`, so it does not
    # overwrite the diarization dict of that name used by the export loop.
    for track in range(0, 2):
        file_path = os.path.join(output_folder, "taudio", file_name + "_{}.wav".format(track))
        output_text_file_path = os.path.join(output_folder, "ttext", file_name + "_{}.json".format(track))
        # get word alignment result
        result_word = whisper_timestamped.transcribe(model_word, file_path, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), vad=True)
        # flatten the per-segment word lists into one list for the whole clip
        word_alignment = []
        for segment in result_word["segments"]:
            word_alignment.extend(segment["words"])
        with open(output_text_file_path, "w") as transcript_file:
            json.dump({"text": word_alignment}, transcript_file)

100%|██████████| 4807/4807 [00:03<00:00, 1590.35frames/s]
100%|██████████| 3166/3166 [00:01<00:00, 1932.72frames/s]
100%|██████████| 4364/4364 [00:02<00:00, 1620.49frames/s]
100%|██████████| 3174/3174 [00:01<00:00, 1856.01frames/s]
100%|██████████| 1227/1227 [00:01<00:00, 921.71frames/s]
  0%|          | 0/2300 [00:01<?, ?frames/s]
100%|██████████| 3113/3113 [00:02<00:00, 1513.28frames/s]
 19%|█▉        | 697/3697 [00:00<00:04, 744.90frames/s]
100%|██████████| 3109/3109 [00:01<00:00, 1579.79frames/s]
100%|██████████| 5019/5019 [00:03<00:00, 1407.00frames/s]
100%|██████████| 4920/4920 [00:03<00:00, 1260.68frames/s]
100%|██████████| 4378/4378 [00:03<00:00, 1435.42frames/s]
100%|██████████| 735/735 [00:00<00:00, 1671.86frames/s]
100%|██████████| 191/191 [00:00<00:00, 943.83frames/s]
100%|██████████| 715/715 [00:00<00:00, 1642.40frames/s]
100%|██████████| 2660/2660 [00:01<00:00, 1863.28frames/s]
100%|██████████| 2155/2155 [00:01<00:00, 1745.69frames/s]
100%|██████████| 2491/2491 [00:01<00:

In [26]:
# Debug inspection: display the transcript path most recently assigned by the
# transcription loop.
output_text_file_path

'/Volumes/EVAN_DISK/MASC/deep_learning_processed_dataset/ttext/Ronen Rubinstein Self Tape_0_0.json'