In [64]:
import numpy as np
import pandas as pd
import os
import h5py
from tqdm import tqdm
import time
from datetime import timedelta
import pickle

In [32]:
# Samples per second used to turn timestamps (in seconds) into indices into the audio arrays.
# NOTE(review): presumably the rate at which the audio in the HDF5 file was stored — confirm.
SAMPLE_RATE = 24_000

# EPIC-KITCHENS-100 annotation files, relative to the notebook's working directory.
ANNOTATIONS_DIR = "../data/epic-kitchens-100-annotations"
ANNOTATIONS_PATH = f"{ANNOTATIONS_DIR}/EPIC_100_validation.pkl"
VERBS_PATH = f"{ANNOTATIONS_DIR}/EPIC_100_verb_classes.csv"

# The audio archive lives at a cluster-specific absolute path. Allow overriding it through the
# EPIC_AUDIO_HDF5 environment variable so the notebook runs unchanged on other machines;
# the original location remains the default.
AUDIO_FILE_PATH = os.environ.get(
    "EPIC_AUDIO_HDF5", "/scratch/work/ptg/EPIC-KITCHENS/EPIC_audio.hdf5"
)
VIDEO_DESC_PATH = f"{ANNOTATIONS_DIR}/EPIC_100_video_info.csv"

## 1. Load the annotations for the validation set

In [5]:
# Validation-split annotations: one row per narrated action, indexed by narration_id.
# NOTE: unpickling can execute arbitrary code — only load annotation files from a trusted source.
df = pd.read_pickle(ANNOTATIONS_PATH)
df.head(10)

Unnamed: 0_level_0,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
narration_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
P01_11_0,P01,P01_11,00:00:00.560,00:00:00.00,00:00:01.89,1,113,take plate,take,0,plate,2,[plate],[2]
P01_11_1,P01,P01_11,00:00:01.700,00:00:01.56,00:00:02.45,93,147,put down plate,put-down,1,plate,2,[plate],[2]
P01_11_10,P01,P01_11,00:00:48.500,00:00:49.15,00:00:50.95,2949,3057,take paper,take,0,paper,49,[paper],[49]
P01_11_100,P01,P01_11,00:05:27.840,00:05:27.28,00:05:31.97,19636,19918,wash cloth,wash,2,cloth,17,[cloth],[17]
P01_11_101,P01,P01_11,00:05:26.840,00:05:27.37,00:05:29.86,19642,19791,take cloth,take,0,cloth,17,[cloth],[17]
P01_11_102,P01,P01_11,00:05:32.500,00:05:30.62,00:05:35.02,19837,20101,squeeze cloth,squeeze,18,cloth,17,[cloth],[17]
P01_11_103,P01,P01_11,00:05:37.000,00:05:35.56,00:05:41.14,20133,20468,wipe counter,wipe,2,counter,42,[counter],[42]
P01_11_104,P01,P01_11,00:05:41.590,00:05:41.24,00:05:48.23,20474,20893,wipe sink,wipe,2,sink,63,[sink],[63]
P01_11_105,P01,P01_11,00:05:50.099,00:05:48.67,00:05:56.10,20920,21366,continue wiping sink,wipe,2,sink,63,[sink],[63]
P01_11_106,P01,P01_11,00:05:57.240,00:05:56.18,00:05:57.82,21370,21469,squeeze cloth,squeeze,18,cloth,17,[cloth],[17]


In [13]:
# Verb class table: the index is the integer class id, "key" is the canonical verb name.
verbs_df = pd.read_csv(VERBS_PATH, index_col=0)

# Class ids (matched against df["verb_class"] later) and their canonical verb names.
all_verb_classes = verbs_df.index
all_verbs = verbs_df["key"]

verbs_df.head(10)

Unnamed: 0_level_0,key,instances,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,take,"['collect-from', 'collect-into', 'draw', 'fetc...",retrieve
1,put,"['create', 'dose', 'lay', 'lay-down', 'lay-on'...",leave
2,wash,"['clean', 'clean-around', 'clean-from', 'clean...",clean
3,open,"['lever-open', 'open', 'open-in', 'open-on', '...",access
4,close,"['close', 'close-off', 'close-with', 'screw-on...",block
5,insert,"['drop-into', 'fit', 'fit-inside', 'insert', '...",leave
6,turn-on,"['activate', 'begin', 'ignite', 'light', 'play...",access
7,cut,"['chop', 'chop-in', 'chop-off', 'chop-up', 'ch...",split
8,turn-off,"['shut-off', 'switch-of', 'switch-off', 'switc...",block
9,pour,"['drizzle', 'drizzle-into', 'drizzle-on', 'pou...",merge


In [33]:
# Per-video metadata (duration, fps, resolution), indexed by video_id.
# NOTE(review): not referenced by the cells visible here — confirm it is still needed.
video_df = pd.read_csv(VIDEO_DESC_PATH, index_col=0)
video_df.head()

Unnamed: 0_level_0,duration,fps,resolution
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P01_01,1652.152817,59.94006,1920x1080
P01_02,502.134967,59.94006,1920x1080
P01_03,118.852067,59.94006,1920x1080
P01_04,105.238467,59.94006,1920x1080
P01_05,1271.988033,59.94006,1920x1080


In [24]:
# Open the HDF5 audio archive read-only. The handle is deliberately left open so that later
# cells can look up per-video audio by video_id; nothing in the visible cells closes it, so
# call audio_dataset.close() once the extraction is finished.
audio_dataset = h5py.File(AUDIO_FILE_PATH, "r")

In [28]:
# Reminder of the annotation columns used by the extraction loop below
# (video_id, start_timestamp, stop_timestamp, verb_class).
df.head()

Unnamed: 0_level_0,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
narration_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
P01_11_0,P01,P01_11,00:00:00.560,00:00:00.00,00:00:01.89,1,113,take plate,take,0,plate,2,[plate],[2]
P01_11_1,P01,P01_11,00:00:01.700,00:00:01.56,00:00:02.45,93,147,put down plate,put-down,1,plate,2,[plate],[2]
P01_11_10,P01,P01_11,00:00:48.500,00:00:49.15,00:00:50.95,2949,3057,take paper,take,0,paper,49,[paper],[49]
P01_11_100,P01,P01_11,00:05:27.840,00:05:27.28,00:05:31.97,19636,19918,wash cloth,wash,2,cloth,17,[cloth],[17]
P01_11_101,P01,P01_11,00:05:26.840,00:05:27.37,00:05:29.86,19642,19791,take cloth,take,0,cloth,17,[cloth],[17]


## 2. Loop over all verbs and extract the waveform of every action

In [45]:
def timestamp_to_sec(timestamp):
    """Convert an ``HH:MM:SS[.fraction]`` timestamp string to seconds.

    Args:
        timestamp: Time string such as ``"00:05:27.28"``. Everything after the
            first dot is read as the decimal fraction of a second (any number
            of digits); the fraction may be absent or empty.

    Returns:
        float: Number of seconds elapsed since ``00:00:00``.

    Raises:
        ValueError: If the part before the dot does not match ``%H:%M:%S`` or
            the fraction contains non-digit characters.
    """
    time_parts = timestamp.split(".")
    base_time = time_parts[0]
    # Trailing zeros do not change the value of a decimal fraction ("560" == "56").
    fraction_digits = time_parts[1].rstrip("0") if len(time_parts) > 1 else ""

    parsed = time.strptime(base_time, "%H:%M:%S")
    whole_seconds = parsed.tm_hour * 3600 + parsed.tm_min * 60 + parsed.tm_sec

    # The digits are a decimal fraction of a second, so scale them by their own
    # length and add them exactly once. They must not additionally be treated as
    # a microsecond count, which would add a spurious `digits * 1e-6` seconds.
    if fraction_digits:
        fraction = int(fraction_digits) / 10 ** len(fraction_digits)
    else:
        fraction = 0.0

    return float(whole_seconds + fraction)

In [73]:
# Extract the audio segment of every annotated action and store the segments grouped by
# verb: one pickle per verb class, each holding a list of waveform arrays. Verbs with no
# action in `df` still get a pickle containing an empty list.
output_dir = "waveforms"
os.makedirs(output_dir, exist_ok=True)

count = 0
n_verbs = len(all_verb_classes)
for i, verb in enumerate(all_verb_classes, 1):
    verb_actions = df[df["verb_class"] == verb]
    # Single .loc lookup (row, column) instead of chained indexing.
    verb_name = verbs_df.loc[verb, "key"]

    verb_waveforms = []

    for narration_id, action in tqdm(
        verb_actions.iterrows(),
        total=verb_actions.shape[0],
        unit=" action",
        desc=f"[{i}/{n_verbs}] Getting waveforms for '{verb_name}' ({verb})\t",
    ):
        video_audio = audio_dataset[action.video_id]

        # Convert the action boundaries from seconds to sample indices. round() rather
        # than int(): the product is a float, and truncation would drop a whole sample
        # whenever floating-point error lands it just below the intended integer.
        start_idx = round(timestamp_to_sec(action.start_timestamp) * SAMPLE_RATE)
        end_idx = round(timestamp_to_sec(action.stop_timestamp) * SAMPLE_RATE)

        # Slicing an h5py dataset already reads the data into a NumPy array.
        wf = video_audio[start_idx:end_idx]
        verb_waveforms.append(wf)

        count += 1

    # Save this verb's waveforms to its own pickle file.
    with open(os.path.join(output_dir, f"{verb_name}.pkl"), "wb") as f:
        pickle.dump(verb_waveforms, f)

print(f"Done saving {count:,} actions to {output_dir}/!")

[1/97] Getting waveforms for 'take' (0)	: 100%|██████████| 1937/1937 [00:02<00:00, 732.96 action/s]
[2/97] Getting waveforms for 'put' (1)	: 100%|██████████| 1709/1709 [00:02<00:00, 590.22 action/s]
[3/97] Getting waveforms for 'wash' (2)	: 100%|██████████| 1141/1141 [00:01<00:00, 684.15 action/s]
[4/97] Getting waveforms for 'open' (3)	: 100%|██████████| 810/810 [00:01<00:00, 592.72 action/s]
[5/97] Getting waveforms for 'close' (4)	: 100%|██████████| 514/514 [00:00<00:00, 902.68 action/s]
[6/97] Getting waveforms for 'insert' (5)	: 100%|██████████| 608/608 [00:00<00:00, 807.04 action/s]
[7/97] Getting waveforms for 'turn-on' (6)	: 100%|██████████| 303/303 [00:00<00:00, 731.50 action/s]
[8/97] Getting waveforms for 'cut' (7)	: 100%|██████████| 292/292 [00:00<00:00, 730.63 action/s]
[9/97] Getting waveforms for 'turn-off' (8)	: 100%|██████████| 211/211 [00:00<00:00, 755.00 action/s]
[10/97] Getting waveforms for 'pour' (9)	: 100%|██████████| 242/242 [00:00<00:00, 662.63 action/s]
[11/9

[86/97] Getting waveforms for 'choose' (85)	: 100%|██████████| 4/4 [00:00<00:00, 1008.91 action/s]
[87/97] Getting waveforms for 'lock' (86)	: 0 action [00:00, ? action/s]
[88/97] Getting waveforms for 'flatten' (87)	: 0 action [00:00, ? action/s]
[89/97] Getting waveforms for 'switch' (88)	: 0 action [00:00, ? action/s]
[90/97] Getting waveforms for 'carry' (89)	: 100%|██████████| 2/2 [00:00<00:00, 1571.78 action/s]
[91/97] Getting waveforms for 'season' (90)	: 100%|██████████| 7/7 [00:00<00:00, 1372.10 action/s]
[92/97] Getting waveforms for 'unlock' (91)	: 0 action [00:00, ? action/s]
[93/97] Getting waveforms for 'prepare' (92)	: 0 action [00:00, ? action/s]
[94/97] Getting waveforms for 'bake' (93)	: 100%|██████████| 2/2 [00:00<00:00, 1423.25 action/s]
[95/97] Getting waveforms for 'mark' (94)	: 0 action [00:00, ? action/s]
[96/97] Getting waveforms for 'bend' (95)	: 0 action [00:00, ? action/s]
[97/97] Getting waveforms for 'unfreeze' (96)	: 0 action [00:00, ? action/s]

Done saving 9,668 actions to waveforms/!



