# Data curation on `EPIC-KITCHENS-100` annotations


## 0. Install dependencies & clone datasets


In [1]:
%pip install -U pandas
%pip install -U plotly

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
%%bash
# Clone the repositories if they don't exist
DATA_DIR="./data"
EK_REPO_NAME="epic-kitchens-100-annotations"
ES_REPO_NAME="epic-sounds-annotations"

cd .. && make data

## 1. Imports


In [3]:
import pandas as pd

# Use plotly as backend for pandas
pd.options.plotting.backend = "plotly"


## 2. Load annotations


In [3]:
ek_verbs_path = "../data/epic-kitchens-100-annotations/EPIC_100_verb_classes.csv"

verbs_df = pd.read_csv(ek_verbs_path, header=0, index_col=0)
verbs_df.head()

Unnamed: 0_level_0,key,instances,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,take,"['collect-from', 'collect-into', 'draw', 'fetc...",retrieve
1,put,"['create', 'dose', 'lay', 'lay-down', 'lay-on'...",leave
2,wash,"['clean', 'clean-around', 'clean-from', 'clean...",clean
3,open,"['lever-open', 'open', 'open-in', 'open-on', '...",access
4,close,"['close', 'close-off', 'close-with', 'screw-on...",block


In [4]:
verbs = sorted(verbs_df.key.unique())
print(f"Number of verbs: {len(verbs)}")

Number of verbs: 97


We want to select for now only the verbs that relate a clear and dramatic audio event (i.e. something that breaks). Here are some of them:


In [5]:
INTERESTING_VERB_CLASSES = {
    "break",
    "crush",
    "pat",
    "put",
    # "shake",
    "sharpen",
    "smell",
    "throw",
    "water",
}

Make sure all the selected verb classes are well in the dataset (i.e. there are no typos)


In [6]:
assert INTERESTING_VERB_CLASSES.issubset(
    verbs
), f"Some verb classes are not in the list of verbs: {INTERESTING_VERB_CLASSES - set(verbs)}"

Get all the IDs corresponding to the verb classes we want to keep


In [7]:
INTERESTING_VERB_CLASSES_IDS = verbs_df[verbs_df.key.isin(INTERESTING_VERB_CLASSES)].index.to_list()
INTERESTING_VERB_CLASSES_IDS

[1, 13, 30, 32, 49, 58, 62, 75]

## 3. Plot how frequent these verbs show up in the dataset


In [8]:
ek_train_path = "../data/epic-kitchens-100-annotations/EPIC_100_train.pkl"

ek_train_df = pd.read_pickle(ek_train_path)
ek_train_df["start_ts_td"] = pd.to_timedelta(ek_train_df["start_timestamp"])
ek_train_df["stop_ts_td"] = pd.to_timedelta(ek_train_df["stop_timestamp"])
ek_train_df["duration_in_s"] = (ek_train_df["stop_ts_td"] - ek_train_df["start_ts_td"]).dt.total_seconds()
del ek_train_df["start_ts_td"]
del ek_train_df["stop_ts_td"]

ek_train_df.head(20)

Unnamed: 0_level_0,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes,duration_in_s
narration_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
P01_01_0,P01,P01_01,00:00:01.089,00:00:00.14,00:00:03.37,8,202,open door,open,3,door,3,[door],[3],3.23
P01_01_1,P01,P01_01,00:00:02.629,00:00:04.37,00:00:06.17,262,370,turn on light,turn-on,6,light,114,[light],[114],1.8
P01_01_10,P01,P01_01,00:00:23.340,00:00:24.97,00:00:26.20,1498,1572,open drawer,open,3,drawer,8,[drawer],[8],1.23
P01_01_100,P01,P01_01,00:07:57.919,00:07:59.75,00:08:00.88,28785,28852,take cup,take,0,cup,13,[cup],[13],1.13
P01_01_101,P01,P01_01,00:08:00.020,00:08:01.47,00:08:02.21,28888,28932,open cupboard,open,3,cupboard,3,[cupboard],[3],0.74
P01_01_102,P01,P01_01,00:08:01.229,00:08:02.13,00:08:03.00,28927,28980,put cup into cupboard,put-into,5,cup,13,"[cup, cupboard]","[13, 3]",0.87
P01_01_103,P01,P01_01,00:08:03.919,00:08:05.22,00:08:07.21,29113,29232,take container and lid,take,0,container,21,"[container, lid]","[21, 6]",1.99
P01_01_104,P01,P01_01,00:08:07.610,00:08:08.38,00:08:09.12,29302,29347,put container on top of counter,put-on,1,container,21,"[container, top:counter]","[21, 42]",0.74
P01_01_105,P01,P01_01,00:08:09.860,00:08:12.00,00:08:12.74,29520,29564,open container,open,3,container,21,[container],[21],0.74
P01_01_106,P01,P01_01,00:08:12.900,00:08:13.22,00:08:14.37,29593,29662,put container inside container,put-inside,5,container,21,"[container, container]","[21, 21]",1.15


Compute a summed duration for each verb class and sort them by duration


In [9]:
agg_df = ek_train_df.groupby("verb_class").agg({"duration_in_s": "sum"}).sort_values("duration_in_s", ascending=False)
agg_df["verb_class_name"] = agg_df.index.map(lambda x: verbs_df.loc[x].key)
agg_df

Unnamed: 0_level_0,duration_in_s,verb_class_name
verb_class,Unnamed: 1_level_1,Unnamed: 2_level_1
2,38479.65,wash
0,25655.79,take
1,21787.86,put
7,17386.92,cut
10,16973.70,mix
...,...,...
83,16.24,uncover
90,15.22,season
91,11.73,unlock
93,1.43,bake


In [10]:
import plotly.express as px

n = 50
px.bar(
    agg_df[:n],
    y="duration_in_s",
    x="verb_class_name",
    labels={
        "verb_class_name": "Verb class",
        "duration_in_s": "Total aggregated duration (in s)",
    },
    title=f"Top {n} most present verb classes in the whole dataset",
    text="duration_in_s",
    template="plotly_white",
    text_auto=True,
    height=600,
)


In [11]:
agg_df.loc[INTERESTING_VERB_CLASSES_IDS].sort_values("duration_in_s", ascending=False)

Unnamed: 0_level_0,duration_in_s,verb_class_name
verb_class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,21787.86,put
13,1677.13,throw
30,1182.57,break
49,670.44,crush
32,366.08,pat
58,261.4,sharpen
62,78.73,water
75,54.04,smell


Now we plot the distribution of instances of the selected verb classes in the dataset


In [12]:
px.bar(
    agg_df.loc[INTERESTING_VERB_CLASSES_IDS].sort_values(
        "duration_in_s", ascending=False
    ),
    x="verb_class_name",
    y="duration_in_s",
    labels={
        "verb_class_name": "Verb class",
        "duration_in_s": "Total aggregated duration (in s)",
    },
    title="Interesting verb classes in the whole dataset",
    text="duration_in_s",
    template="plotly_white",
    text_auto=True,
    height=600,
)


## 4. Check the proportions of these actions which have a clear audio event associated with them


First we load `EPIC-SOUNDS` dataset


In [13]:
es_train_path = "../data/epic-sounds-annotations/EPIC_Sounds_train.csv"

es_train_df = pd.read_csv(es_train_path, header=0, index_col=0)
es_train_df["start_ts_td"] = pd.to_timedelta(es_train_df["start_timestamp"])
es_train_df["stop_ts_td"] = pd.to_timedelta(es_train_df["stop_timestamp"])
es_train_df["duration_in_s"] = (
    es_train_df["stop_ts_td"] - es_train_df["start_ts_td"]
).dt.total_seconds()
del es_train_df["start_ts_td"]
del es_train_df["stop_ts_td"]

es_train_df.head()


Unnamed: 0_level_0,participant_id,video_id,start_timestamp,stop_timestamp,start_sample,stop_sample,description,class,class_id,duration_in_s
annotation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P01_01_0,P01,P01_01,00:00:02.466,00:00:05.315,59184,127560,clang / clatter,plastic-only collision,6,2.849
P01_01_1,P01,P01_01,00:00:08.981,00:00:09.871,215544,236904,put object on surface,open / close,3,0.89
P01_01_2,P01,P01_01,00:00:10.190,00:00:12.731,244560,305544,footstep,footstep,7,2.541
P01_01_3,P01,P01_01,00:00:14.023,00:00:15.044,336552,361056,scrub / scrape / scour / wipe,scrub / scrape / scour / wipe,1,1.021
P01_01_4,P01,P01_01,00:00:15.148,00:00:19.729,363552,473496,paper rustle,rustle,4,4.581


In [14]:
import os
from tqdm import tqdm


def strict_overlap(row_k: pd.Series, row_s: pd.Series) -> bool:
    """
    Returns `True` only if the sound event and the action are strictly overlapping,
    i.e. the sound event is contained in the action, temporally speaking.

    Parameters
    ----------
    `row_k` : `pd.Series`
        The row from EPIC-KITCHENS
    `row_s` : `pd.Series`
        The row from EPIC-SOUNDS

    Returns
    -------
    `bool`
        `True` if the sound event and the action are strictly overlapping, `False` otherwise
    """
    s_k, e_k = row_k["start_timestamp"], row_k["stop_timestamp"]
    s_s, e_s = row_s["start_timestamp"], row_s["stop_timestamp"]

    overlap = False
    overlap |= (s_k <= s_s) and (e_s <= e_k)

    return overlap


def loose_overlap(row_k: pd.Series, row_s: pd.Series) -> bool:
    """
    Returns `True` only if the sound event and the action are loosely overlapping,
    namely:
    1. The sound event is strictly contained in the action
    2. The action is strictly contained in the sound event
    3. The sound event starts before the action and ends after the action
    4. The action starts before the sound event and ends after the sound event

    Parameters
    ----------
    `row_k` : `pd.Series`
        The row from EPIC-KITCHENS
    `row_s` : `pd.Series`
        The row from EPIC-SOUNDS

    Returns
    -------
    `bool`
        `True` if the EPIC-KITCHEN action as a loosely overlapping corresponding sound event
        in EPIC-SOUNDS, `False` otherwise
    """
    s_k, e_k = row_k["start_timestamp"], row_k["stop_timestamp"]
    s_s, e_s = row_s["start_timestamp"], row_s["stop_timestamp"]

    overlap = False
    overlap |= strict_overlap(row_k, row_s)
    overlap |= strict_overlap(row_s, row_k)
    overlap |= (s_s <= s_k) and (s_k <= e_s)
    overlap |= (s_k <= s_s) and (s_s <= e_k)

    return overlap


# Run this cell only if the output file does not exist
if not os.path.exists("output/ek_es_overlaps.csv"):
    filtered_df = (
        ek_train_df[ek_train_df.verb_class.isin(INTERESTING_VERB_CLASSES_IDS)]
        .reset_index()
        .copy()
    )
    filtered_df["strict_es_overlap"] = False
    filtered_df["loose_es_overlap"] = False

    for i, row_k in tqdm(
        filtered_df.iterrows(), total=len(filtered_df), desc="EPIC-KITCHENS", unit="row"
    ):
        for _, row_s in es_train_df.reset_index().iterrows():
            if row_s["start_timestamp"] > row_k["stop_timestamp"]:
                break
            strict = strict_overlap(row_k=row_k, row_s=row_s)
            loose = loose_overlap(row_k=row_k, row_s=row_s)
            if strict or loose:
                filtered_df.loc[i, "strict_es_overlap"] = strict
                filtered_df.loc[i, "loose_es_overlap"] = loose
                break

    os.makedirs("output", exist_ok=True)
    filtered_df.to_csv("output/ek_es_overlaps.csv")


In [15]:
overlap_df = pd.read_csv("output/ek_es_overlaps.csv", index_col=0, header=0)
overlap_df.head()

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes,duration_in_s,strict_es_overlap,loose_es_overlap
0,P01_01_104,P01,P01_01,00:08:07.610,00:08:08.38,00:08:09.12,29302,29347,put container on top of counter,put-on,1,container,21,"['container', 'top:counter']","[21, 42]",0.74,False,True
1,P01_01_110,P01,P01_01,00:08:25.669,00:08:24.66,00:08:27.92,30279,30475,put plate down,put-down,1,plate,2,['plate'],[2],3.26,True,True
2,P01_01_113,P01,P01_01,00:08:32.200,00:08:32.66,00:08:34.43,30759,30865,put down something,put-down,1,drawer,8,['drawer'],[8],1.77,False,False
3,P01_01_116,P01,P01_01,00:08:37.650,00:08:39.34,00:08:40.25,31160,31215,put down fork,put-down,1,fork,14,['fork'],[14],0.91,False,False
4,P01_01_12,P01,P01_01,00:00:29.539,00:00:36.68,00:00:37.78,2200,2266,put down vegetables,put-down,1,vegetable,94,['vegetable'],[94],1.1,False,True


We now plot again the distribution of instances of the selected verb classes in the dataset, with the percentage of instances that have a clear audio event associated with them


In [16]:
overlap_df["duration_w_strict_overlap"] = overlap_df["duration_in_s"].where(overlap_df["strict_es_overlap"], None)
overlap_df["duration_w_loose_overlap"] = overlap_df["duration_in_s"].where(overlap_df["loose_es_overlap"], None)
overlap_df.head()

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes,duration_in_s,strict_es_overlap,loose_es_overlap,duration_w_strict_overlap,duration_w_loose_overlap
0,P01_01_104,P01,P01_01,00:08:07.610,00:08:08.38,00:08:09.12,29302,29347,put container on top of counter,put-on,1,container,21,"['container', 'top:counter']","[21, 42]",0.74,False,True,,0.74
1,P01_01_110,P01,P01_01,00:08:25.669,00:08:24.66,00:08:27.92,30279,30475,put plate down,put-down,1,plate,2,['plate'],[2],3.26,True,True,3.26,3.26
2,P01_01_113,P01,P01_01,00:08:32.200,00:08:32.66,00:08:34.43,30759,30865,put down something,put-down,1,drawer,8,['drawer'],[8],1.77,False,False,,
3,P01_01_116,P01,P01_01,00:08:37.650,00:08:39.34,00:08:40.25,31160,31215,put down fork,put-down,1,fork,14,['fork'],[14],0.91,False,False,,
4,P01_01_12,P01,P01_01,00:00:29.539,00:00:36.68,00:00:37.78,2200,2266,put down vegetables,put-down,1,vegetable,94,['vegetable'],[94],1.1,False,True,,1.1


In [17]:
agg_df_overlap = (
    overlap_df.groupby("verb_class")
    .agg(
        {
            "duration_w_strict_overlap": "sum",
            "duration_w_loose_overlap": "sum",
            "duration_in_s": "sum",
        }
    )
    .sort_values("duration_w_loose_overlap", ascending=False)
)
agg_df_overlap["verb_class_name"] = agg_df_overlap.index.map(lambda x: verbs_df.loc[x].key)
agg_df_overlap["percentage_loose_overlap"] = (
    agg_df_overlap["duration_w_loose_overlap"] / agg_df_overlap["duration_in_s"] * 100
).round(2)
agg_df_overlap["percentage_strict_overlap"] = (
    agg_df_overlap["duration_w_strict_overlap"] / agg_df_overlap["duration_in_s"] * 100
).round(2)

for col in ["duration_w_strict_overlap", "duration_w_loose_overlap", "duration_in_s"]:
    agg_df_overlap[col] = agg_df_overlap[col].round(2)

agg_df_overlap

Unnamed: 0_level_0,duration_w_strict_overlap,duration_w_loose_overlap,duration_in_s,verb_class_name,percentage_loose_overlap,percentage_strict_overlap
verb_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2141.54,19510.41,21787.86,put,89.55,9.83
13,243.83,1535.2,1677.13,throw,91.54,14.54
30,284.94,1149.39,1182.57,break,97.19,24.09
49,103.36,667.53,670.44,crush,99.57,15.42
32,68.61,348.0,366.08,pat,95.06,18.74
58,85.79,252.94,261.4,sharpen,96.76,32.82
62,4.81,70.55,78.73,water,89.61,6.11
75,4.92,53.01,54.04,smell,98.09,9.1


In [18]:
fig = px.bar(
    agg_df_overlap,
    y=[
        "duration_in_s",
        "duration_w_loose_overlap",
        "duration_w_strict_overlap",
    ],
    x="verb_class_name",
    labels={"verb_class_name": "Verb class", "value": "Total aggregated duration (in s)"},
    barmode="group",
    text="percentage_loose_overlap",
    text_auto=True,
)

# Update legend labels
fig.data[0].name = "Duration (in s)"
fig.data[1].name = "Duration with 'Loose' EPIC-SOUND Overlap"
fig.data[2].name = "Duration with 'Strict' EPIC-SOUND Overlap"

fig.show()

- **"Loose"** overlap means that the audio event is not necessarily contained within the action timestamps, but is at least partially overlapping with it.
- **"Strict"** overlap means that the audio event is fully contained within the action timestamps.


In [19]:
for i in INTERESTING_VERB_CLASSES_IDS:
    print(f"{i}: {verbs_df.loc[i].key}")

1: put
13: throw
30: break
32: pat
49: crush
58: sharpen
62: water
75: smell


In [25]:
overlap_df[overlap_df.verb_class.isin(INTERESTING_VERB_CLASSES_IDS)].query("loose_es_overlap == True and duration_w_loose_overlap < 2 and video_id == 'P10_04'").reset_index(drop=True)

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes,duration_in_s,strict_es_overlap,loose_es_overlap,duration_w_strict_overlap,duration_w_loose_overlap
0,P10_04_111,P10,P10_04,00:12:01.699,00:12:04.87,00:12:06.59,43492,43595,throw away the bottle,throw,13,bottle,15,['bottle'],[15],1.72,False,True,,1.72
1,P10_04_123,P10,P10_04,00:13:52.509,00:13:56.07,00:13:57.58,50164,50254,put the ginger back into the plastic container,put,1,ginger,131,"['ginger', 'container:plastic']","[131, 21]",1.51,False,True,,1.51
2,P10_04_167,P10,P10_04,00:19:05.970,00:19:12.13,00:19:13.65,69127,69219,put the cover on top of the water filter jar,put-on,1,cover,89,"['cover', 'top', 'jar:water:filter']","[89, 42, 66]",1.52,False,True,,1.52
3,P10_04_183,P10,P10_04,00:20:08.870,00:20:16.14,00:20:17.92,72968,73075,put it on one side,put-on,1,side,42,['side'],[42],1.78,False,True,,1.78
4,P10_04_198,P10,P10_04,00:21:31.380,00:21:38.01,00:21:39.85,77880,77991,put the bowl on the utensil shelf,put-on,1,bowl,7,"['bowl', 'shelf:utensil']","[7, 247]",1.84,False,True,,1.84
5,P10_04_206,P10,P10_04,00:22:03.280,00:22:10.66,00:22:12.08,79839,79924,put some liquid on the brusher,put-on,1,liquid,150,"['liquid', 'brusher']","[150, 103]",1.42,False,True,,1.42
6,P10_04_213,P10,P10_04,00:22:42.150,00:22:49.79,00:22:51.13,82187,82267,put the pot on the hob,put-on,1,pot,29,"['pot', 'hob']","[29, 24]",1.34,False,True,,1.34
7,P10_04_215,P10,P10_04,00:22:45.490,00:22:55.85,00:22:57.47,82551,82648,put the bowl of lettuce on the table,put-on,1,bowl:lettuce,7,"['bowl:lettuce', 'table']","[7, 42]",1.62,False,True,,1.62
8,P10_04_234,P10,P10_04,00:25:16.960,00:25:24.50,00:25:25.60,91470,91536,throw away the skin of garlic,throw-away,13,skin:garlic,48,['skin:garlic'],[48],1.1,False,True,,1.1
9,P10_04_278,P10,P10_04,00:30:32.720,00:30:38.91,00:30:40.08,110334,110404,put the pot on the hob,put-on,1,pot,29,"['pot', 'hob']","[29, 24]",1.17,False,True,,1.17
