In [1]:
import os

from utils import PATHS, FPS

In [30]:
mode = "train"
VIDEO_DIR = PATHS["train_video_clips_dir"]

video_id = "vfjywN5CN0Y"
seg_id = "0900_0960"


# Number of frames per segment
T = 2 * FPS


START = 900
END = 960

video_dir = os.path.join(VIDEO_DIR, video_id, seg_id)
audio_file = os.path.join(PATHS["orig_audios"], mode, video_id+".wav")
assert os.path.exists(audio_file), f"Audio file {audio_file} does not exist."

os.listdir(video_dir)

['vfjywN5CN0Y_0900_0960_1',
 'vfjywN5CN0Y_0900_0960_14',
 'vfjywN5CN0Y_0900_0960_2',
 'vfjywN5CN0Y_0900_0960_20',
 'vfjywN5CN0Y_0900_0960_21',
 'vfjywN5CN0Y_0900_0960_22',
 'vfjywN5CN0Y_0900_0960_23',
 'vfjywN5CN0Y_0900_0960_3',
 'vfjywN5CN0Y_0900_0960_8',
 'vfjywN5CN0Y_0900_0960_9']

In [31]:
entities_to_num = {}
id_to_entities = []
transitions = []

for entity_id in os.listdir(video_dir):
    image_files = os.listdir(os.path.join(video_dir, entity_id))
    image_files.sort()
    
    # If the entity_id is not in the dictionary, add it
    if entity_id not in entities_to_num:
        entities_to_num[entity_id] = len(entities_to_num)
        id_to_entities.append((entity_id))
        
    # Frames are named as <time in s>.<time in ms>.jpg
    # Get the start of the first frame and the end of the last frame
    first_frame = image_files[0]
    last_frame = image_files[-1]
    start_time = float(".".join(first_frame.split(".")[:-1]))
    end_time = float(".".join(last_frame.split(".")[:-1]))
        
    # Add the transition to the list
    # Positive transition at start_time and negative transition at end_time
    transitions.append((start_time, entities_to_num[entity_id], 1))
    transitions.append((end_time, entities_to_num[entity_id], -1))
    

# Sort the transitions by time
transitions.sort(key=lambda x: x[0])

transitions    

[(900.0, 0, 1),
 (902.04, 0, -1),
 (902.08, 2, 1),
 (903.4501, 2, -1),
 (903.87, 7, 1),
 (906.4501, 7, -1),
 (917.7, 8, 1),
 (924.12, 8, -1),
 (924.16, 9, 1),
 (930.33, 9, -1),
 (933.29, 1, 1),
 (935.5, 1, -1),
 (935.79, 3, 1),
 (937.8701, 3, -1),
 (937.9101, 4, 1),
 (947.87, 4, -1),
 (950.91, 5, 1),
 (951.54, 6, 1),
 (954.37, 5, -1),
 (954.37, 6, -1)]

In [47]:
# Create segments (start_time, end_time, active_entities)

segments = []
active_entities = [0] * len(entities_to_num)

for i, transition in enumerate(transitions[:-1]):
    time, entity_id, transition_type = transition
    if transition_type == 1:
        active_entities[entity_id] = 1
    else:
        active_entities[entity_id] = 0
    end_time = transitions[i + 1][0]
    if sum(active_entities) > 0:
        segments.append((time, end_time, active_entities.copy()))
        
segments

[(900.0, 902.04, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 (902.08, 903.4501, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 (903.87, 906.4501, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 (917.7, 924.12, [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 (924.16, 930.33, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 (933.29, 935.5, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 (935.79, 937.8701, [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 (937.9101, 947.87, [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 (950.91, 951.54, [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 (951.54, 954.37, [0, 0, 0, 0, 0, 1, 1, 0, 0, 0]),
 (954.37, 954.37, [0, 0, 0, 0, 0, 0, 1, 0, 0, 0])]

In [48]:
segments_to_save = []
for segment in segments:
    start_time, end_time, active_entities = segment
    duration = end_time - start_time
    num_frames = int(duration * FPS)
    
    # If the segment is very short (< T/10), discard it
    if num_frames < T / 10:
        continue
    
    # If the segment is shorter than T frames, save it as is
    if num_frames < T:
        segments_to_save.append((start_time, end_time, active_entities.copy()))
        continue
    
    # Split into segments of T frames
    t = 0
    while t < num_frames:
        start = start_time + t / FPS
        end = start_time + (t + T) / FPS
        segments_to_save.append((start, end, active_entities.copy()))
        t += T
            
segments = segments_to_save
segments

[(900.0, 902.0, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 (902.08, 903.4501, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 (903.87, 905.87, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 (905.87, 907.87, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 (917.7, 919.7, [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 (919.7, 921.7, [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 (921.7, 923.7, [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 (923.7, 925.7, [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 (924.16, 926.16, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 (926.16, 928.16, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 (928.16, 930.16, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 (930.16, 932.16, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 (933.29, 935.29, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 (935.29, 937.29, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 (935.79, 937.79, [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 (937.79, 939.79, [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 (937.9101, 939.9101, [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 (939.9101, 941.9101, [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 (941.9101, 943.9101, [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 (943.9101, 945.9101, [0, 0

In [49]:
import pandas as pd

segments_df = pd.DataFrame(columns=["start_time", "end_time", "audio_file", "frames_paths"])


for segment in segments:
    start_time, end_time, active_entities = segment
        
    # Get the frames that fall in the segment for each entity
    frames_segment = []
    for entity_id in range(len(active_entities)):
        if active_entities[entity_id] == 0:
            continue
        entity = id_to_entities[entity_id]
        entity_dir = os.path.join(video_dir, entity)
        # Get the frames that fall in the segment
        image_files = []
        for image_file in os.listdir(entity_dir):
            # Get the time of the frame
            time = float(".".join(image_file.split(".")[:-1]))
            label = int(image_file.split("_")[-1].split(".")[0])
            if start_time - .5/FPS <= time <= end_time + .5/FPS:
                image_files.append((os.path.join(entity_dir, image_file), label))
            image_files.sort(key=lambda x: x[0])
        frames_segment.append(image_files.copy())
    
    # Save the segment in the dataframe
    segments_df.loc[len(segments_df)] = [start_time, end_time, audio_file, frames_segment]


In [51]:
segments_df

Unnamed: 0,start_time,end_time,audio_file,frames_paths
0,900.0,902.0,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
1,902.08,903.4501,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
2,903.87,905.87,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
3,905.87,907.87,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
4,917.7,919.7,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
5,919.7,921.7,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
6,921.7,923.7,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
7,923.7,925.7,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
8,924.16,926.16,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...
9,926.16,928.16,d:\Documents\GitHub\VisualSAD\data\orig_audios...,[[(d:\Documents\GitHub\VisualSAD\data\clips_vi...


In [60]:
i = 0

start_time, end_time, audio_file, frames_paths = segments_df.iloc[i]

In [68]:
import torchaudio

sample_rate = 16000
%timeit audio_signal, _ = torchaudio.load(audio_file, frame_offset=int(start_time * sample_rate), num_frames=int((end_time - start_time) * sample_rate))

368 μs ± 25.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
import torch


targets = torch.zeros((T, len(frames_paths)), dtype=torch.bool)
images = torch.zeros((T, len(frames_paths), 3, 224, 224), dtype=torch.uint8)