In [6]:
import os
import json

# Ego4D Data

In [7]:
# Root directory of the Ego4D v1 data, expressed relative to the working directory.
EGO4D_DATA_PATH = os.path.join(os.pardir, "ego4d_data", "v1")

## NLQ Annotations

In [8]:
# Load every NLQ annotation file found in the annotations folder, keyed by split
# ("train", "val" or "test").
nlq_annotations = dict()

NLQ_ANNOTATIONS_PATH = os.path.join(EGO4D_DATA_PATH, "annotations")

# sorted() keeps the processing (and printing) order independent of the file system.
for file_name in sorted(os.listdir(NLQ_ANNOTATIONS_PATH)):
    if not file_name.startswith("nlq"):
        continue
    print("File name:", file_name)

    # Resolve the split explicitly so that an unexpected "nlq*" file cannot
    # silently overwrite the validation annotations.
    split = next((name for name in ("train", "test", "val") if name in file_name), None)
    if split is None:
        print("Skipping file with unrecognized split:", file_name)
        continue

    file_path = os.path.join(NLQ_ANNOTATIONS_PATH, file_name)

    # The annotations contain non-ASCII characters, so do not rely on the
    # platform default encoding when reading them.
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    nlq_annotations[split] = raw_data

print("Annotations split:", nlq_annotations.keys())

File name: nlq_test_unannotated.json
File name: nlq_train.json
File name: nlq_val.json
Annotations split: dict_keys(['test', 'train', 'val'])


### Data Structure Exploration

#### Train


Train annotations are characterized by:
- version: which in this case is v1 as we have chosen 
- date: presumably the date of the last data revision (to be confirmed)
- description: refers to the data benchmark (NLQ) and split (train)
- manifest: path to the split manifest

In [9]:
# Pick the train split and show which top-level fields it exposes.
nlq_annotations_train = nlq_annotations["train"]
print(f"nlq_annotations_train.keys() {nlq_annotations_train.keys()}")

nlq_annotations_train.keys() dict_keys(['version', 'date', 'description', 'manifest', 'videos'])


In [10]:
# Show the metadata fields of the train split, one per line.
print(
    f"nlq_annotations_train['version'] {nlq_annotations_train['version']}",
    f"nlq_annotations_train['date'] {nlq_annotations_train['date']}",
    f"nlq_annotations_train['description'] {nlq_annotations_train['description']}",
    f"nlq_annotations_train['manifest'] {nlq_annotations_train['manifest']}",
    sep="\n",
)

nlq_annotations_train['version'] 1
nlq_annotations_train['date'] 220216
nlq_annotations_train['description'] NLQ Annotations (train)
nlq_annotations_train['manifest'] s3://ego4d-consortium-sharing/public/v1/full_scale/manifest.csv


Each item of the videos list contains information about a specific video:
- video_uid: unique identifier of the video
- clips: list of the video's clips, each carrying its own time/frame boundaries and annotations
- split: train as we are working with the train annotations

In [11]:
# Videos annotated in the train split: container type and number of entries.
nlq_annotations_train_videos = nlq_annotations_train["videos"]

print("type(nlq_annotations_train_videos):", type(nlq_annotations_train_videos))
print("len(nlq_annotations_train_videos):", len(nlq_annotations_train_videos))

type(nlq_annotations_train_videos): <class 'list'>
len(nlq_annotations_train_videos): 754


In [12]:
# Structure of a single video entry (first video of the train split).
video_0 = nlq_annotations_train_videos[0]
print("type(video_0):", type(video_0))
print("video_0.keys():", video_0.keys())

type(video_0): <class 'dict'>
video_0.keys(): dict_keys(['video_uid', 'clips', 'split'])


In [13]:
# Unique identifier of the first train video
video_0_uid = video_0["video_uid"]
print(video_0_uid)

d250521e-5197-44aa-8baa-2f42b24444d2


In [14]:
# Split the first train video is assigned to, as stored in the video entry itself
video_0_split = video_0["split"]
print(video_0_split)

train


Question:
- How many clips does each video contain?

In [15]:
# Clips attached to the first train video: container type and count.
video_0_clips = video_0["clips"]
print(f"type(video_0_clips) {type(video_0_clips)}")
print(f"len(video_0_clips) {len(video_0_clips)}")

type(video_0_clips) <class 'list'>
len(video_0_clips) 1


In [16]:
# First clip of the video and the fields it provides.
video_0_clips_0 = video_0_clips[0]
print(f"type(video_0_clips_0) {type(video_0_clips_0)}")
print(f"video_0_clips_0.keys() {video_0_clips_0.keys()}")

type(video_0_clips_0) <class 'dict'>
video_0_clips_0.keys() dict_keys(['clip_uid', 'video_start_sec', 'video_end_sec', 'video_start_frame', 'video_end_frame', 'clip_start_sec', 'clip_end_sec', 'clip_start_frame', 'clip_end_frame', 'source_clip_uid', 'annotations'])


In [17]:
# Identifiers stored on the clip: its own uid and the uid of its source clip.
print(f"video_0_clips_0['clip_uid'] {video_0_clips_0['clip_uid']}")
print(f"video_0_clips_0['source_clip_uid'] {video_0_clips_0['source_clip_uid']}")

video_0_clips_0['clip_uid'] fae92e70-88aa-4b77-b41a-5879b74c804c
video_0_clips_0['source_clip_uid'] 51e04dae-3ad0-48c1-b94b-c3ba0edaa99e


Exploration of video/clips timesteps and frame indices.

Interesting fact: the **video** timestamps of the clip start with a small offset (about 0.021 s, frame 1 instead of 0), which is not present in the clip-relative timestamps.

**Question:**
- What is the origin of this delay?

In [18]:
# Compare the clip boundaries expressed in video time and in clip-relative time.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote there is only valid syntax from Python 3.12 onwards.
print(">> Video timesteps:")
print(f"  >> video_0_clips_0['video_start_sec']: {video_0_clips_0['video_start_sec']}")
print(f"  >> video_0_clips_0['video_end_sec']: {video_0_clips_0['video_end_sec']}\n")

print(">> Clip timesteps:")
print(f"  >> video_0_clips_0['clip_start_sec']: {video_0_clips_0['clip_start_sec']}")
print(f"  >> video_0_clips_0['clip_end_sec']: {video_0_clips_0['clip_end_sec']}")

>> Video timesteps:
  >> video_0_clips_0['video_start_sec']: 0.0210286
  >> video_0_clips_0['video_end_sec']: 480.0210286

>> Clip timesteps:
  >> video_0_clips_0['clip_start_sec']: 0
  >> video_0_clips_0['clip_end_sec']: 480.0


In [19]:
# Compare the clip boundaries expressed in video frames and in clip-relative frames.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote there is only valid syntax from Python 3.12 onwards.
print(">> Video frames:")
print(f"  >> video_0_clips_0['video_start_frame']: {video_0_clips_0['video_start_frame']}")
print(f"  >> video_0_clips_0['video_end_frame']: {video_0_clips_0['video_end_frame']}\n")

print(">> Clip frames:")
print(f"  >> video_0_clips_0['clip_start_frame']: {video_0_clips_0['clip_start_frame']}")
print(f"  >> video_0_clips_0['clip_end_frame']: {video_0_clips_0['clip_end_frame']}")

>> Video frames:
  >> video_0_clips_0['video_start_frame']: 1
  >> video_0_clips_0['video_end_frame']: 14401

>> Clip frames:
  >> video_0_clips_0['clip_start_frame']: 0
  >> video_0_clips_0['clip_end_frame']: 14400


Clips annotations exploration

**Question:**
- How many annotations per clip?

In [20]:
# Annotations attached to the first clip: container type and count.
video_0_clips_0_annotations = video_0_clips_0["annotations"]
print(f"type(video_0_clips_0_annotations) {type(video_0_clips_0_annotations)}")
print(f"len(video_0_clips_0_annotations) {len(video_0_clips_0_annotations)}")

type(video_0_clips_0_annotations) <class 'list'>
len(video_0_clips_0_annotations) 2


In [21]:
# First annotation of the clip and the fields it provides.
video_0_clips_0_annotations_0 = video_0_clips_0_annotations[0]
print(f"type(video_0_clips_0_annotations_0) {type(video_0_clips_0_annotations_0)}")
print(f"video_0_clips_0_annotations_0.keys() {video_0_clips_0_annotations_0.keys()}")

type(video_0_clips_0_annotations_0) <class 'dict'>
video_0_clips_0_annotations_0.keys() dict_keys(['language_queries', 'annotation_uid'])


In [22]:
# Language queries of the first annotation: container type and count.
video_0_clips_0_annotations_0_language_queries = video_0_clips_0_annotations_0["language_queries"]
print(f"type(video_0_clips_0_annotations_0_language_queries) {type(video_0_clips_0_annotations_0_language_queries)}")
print(f"len(video_0_clips_0_annotations_0_language_queries) {len(video_0_clips_0_annotations_0_language_queries)}")

type(video_0_clips_0_annotations_0_language_queries) <class 'list'>
len(video_0_clips_0_annotations_0_language_queries) 8


In [23]:
# Take the first language query of the annotation; the bare expression on the
# last line makes the notebook display its full content.
video_0_clips_0_annotations_0_language_queries_0 = video_0_clips_0_annotations_0_language_queries[0]
video_0_clips_0_annotations_0_language_queries_0

{'clip_start_sec': 0.0,
 'clip_end_sec': 43.6657,
 'video_start_sec': 0.0210286,
 'video_end_sec': 43.6867286,
 'video_start_frame': 1,
 'video_end_frame': 1311,
 'template': 'Objects: How many X’s? (quantity question)',
 'query': 'How many frying pans can i see on the shelf?',
 'slot_x': 'frying pans',
 'verb_x': '[verb_not_applicable]',
 'slot_y': 'i See on the shelf',
 'verb_y': 'see',
 'raw_tags': ['Objects: How many X’s? (quantity question)',
  'How many frying pans can i see on the shelf?',
  'frying pans',
  '[verb_not_applicable]',
  'i See on the shelf',
  'see']}

#### Validation

In [27]:
# Pick the validation split and show which top-level fields it exposes.
nlq_annotations_val = nlq_annotations["val"]
print(f"nlq_annotations_val.keys() {nlq_annotations_val.keys()}")

nlq_annotations_val.keys() dict_keys(['version', 'date', 'description', 'manifest', 'videos'])


In [28]:
# Show the metadata fields of the validation split, one per line.
print(
    f"nlq_annotations_val['version'] {nlq_annotations_val['version']}",
    f"nlq_annotations_val['date'] {nlq_annotations_val['date']}",
    f"nlq_annotations_val['description'] {nlq_annotations_val['description']}",
    f"nlq_annotations_val['manifest'] {nlq_annotations_val['manifest']}",
    sep="\n",
)

nlq_annotations_val['version'] 1
nlq_annotations_val['date'] 220216
nlq_annotations_val['description'] NLQ Annotations (val)
nlq_annotations_val['manifest'] s3://ego4d-consortium-sharing/public/v1/full_scale/manifest.csv


In [29]:
# Videos annotated in the validation split: container type and number of entries.
nlq_annotations_val_videos = nlq_annotations_val["videos"]

# The labels name the variable that is actually inspected (they previously
# referred to the train list, a copy-paste leftover).
print(f"type(nlq_annotations_val_videos): {type(nlq_annotations_val_videos)}")
print(f"len(nlq_annotations_val_videos): {len(nlq_annotations_val_videos)}")

type(nlq_annotations_train_videos): <class 'list'>
len(nlq_annotations_train_videos): 247


In [30]:
# Structure of a single video entry (first video of the validation split).
video_0 = nlq_annotations_val_videos[0]
print("type(video_0):", type(video_0))
print("video_0.keys():", video_0.keys())

type(video_0): <class 'dict'>
video_0.keys(): dict_keys(['video_uid', 'clips', 'split'])


In [31]:
# Unique identifier of the first validation video
video_0_uid = video_0["video_uid"]
print(video_0_uid)

38737402-19bd-4689-9e74-3af391b15feb


In [32]:
# Split the first validation video is assigned to, as stored in the video entry itself
video_0_split = video_0["split"]
print(video_0_split)

val


In [33]:
# Clips attached to the first validation video: container type and count.
video_0_clips = video_0["clips"]
print(f"type(video_0_clips) {type(video_0_clips)}")
print(f"len(video_0_clips) {len(video_0_clips)}")

type(video_0_clips) <class 'list'>
len(video_0_clips) 1


In [34]:
# First clip of the validation video and the fields it provides.
video_0_clips_0 = video_0_clips[0]
print(f"type(video_0_clips_0) {type(video_0_clips_0)}")
print(f"video_0_clips_0.keys() {video_0_clips_0.keys()}")

type(video_0_clips_0) <class 'dict'>
video_0_clips_0.keys() dict_keys(['clip_uid', 'video_start_sec', 'video_end_sec', 'video_start_frame', 'video_end_frame', 'clip_start_sec', 'clip_end_sec', 'clip_start_frame', 'clip_end_frame', 'source_clip_uid', 'annotations'])


In [35]:
# Identifiers stored on the clip: its own uid and the uid of its source clip.
print(f"video_0_clips_0['clip_uid'] {video_0_clips_0['clip_uid']}")
print(f"video_0_clips_0['source_clip_uid'] {video_0_clips_0['source_clip_uid']}")

video_0_clips_0['clip_uid'] 93231c7e-1cf4-4a20-b1f8-9cc9428915b2
video_0_clips_0['source_clip_uid'] cb3bf9d7-7f6b-4567-9446-45c6b493d721


In [36]:
# Compare the clip boundaries expressed in video time and in clip-relative time.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote there is only valid syntax from Python 3.12 onwards.
print(">> Video timesteps:")
print(f"  >> video_0_clips_0['video_start_sec']: {video_0_clips_0['video_start_sec']}")
print(f"  >> video_0_clips_0['video_end_sec']: {video_0_clips_0['video_end_sec']}\n")

print(">> Clip timesteps:")
print(f"  >> video_0_clips_0['clip_start_sec']: {video_0_clips_0['clip_start_sec']}")
print(f"  >> video_0_clips_0['clip_end_sec']: {video_0_clips_0['clip_end_sec']}")

>> Video timesteps:
  >> video_0_clips_0['video_start_sec']: 630.0
  >> video_0_clips_0['video_end_sec']: 1110.0366739908854

>> Clip timesteps:
  >> video_0_clips_0['clip_start_sec']: 0
  >> video_0_clips_0['clip_end_sec']: 480.03667399088545


In [37]:
# Compare the clip boundaries expressed in video frames and in clip-relative frames.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote there is only valid syntax from Python 3.12 onwards.
print(">> Video frames:")
print(f"  >> video_0_clips_0['video_start_frame']: {video_0_clips_0['video_start_frame']}")
print(f"  >> video_0_clips_0['video_end_frame']: {video_0_clips_0['video_end_frame']}\n")

print(">> Clip frames:")
print(f"  >> video_0_clips_0['clip_start_frame']: {video_0_clips_0['clip_start_frame']}")
print(f"  >> video_0_clips_0['clip_end_frame']: {video_0_clips_0['clip_end_frame']}")

>> Video frames:
  >> video_0_clips_0['video_start_frame']: 35999
  >> video_0_clips_0['video_end_frame']: 35999

>> Clip frames:
  >> video_0_clips_0['clip_start_frame']: 0
  >> video_0_clips_0['clip_end_frame']: 14401


In [38]:
# Annotations attached to the first clip: container type and count.
video_0_clips_0_annotations = video_0_clips_0["annotations"]
print(f"type(video_0_clips_0_annotations) {type(video_0_clips_0_annotations)}")
print(f"len(video_0_clips_0_annotations) {len(video_0_clips_0_annotations)}")

type(video_0_clips_0_annotations) <class 'list'>
len(video_0_clips_0_annotations) 2


In [39]:
# First annotation of the clip and the fields it provides.
video_0_clips_0_annotations_0 = video_0_clips_0_annotations[0]
print(f"type(video_0_clips_0_annotations_0) {type(video_0_clips_0_annotations_0)}")
print(f"video_0_clips_0_annotations_0.keys() {video_0_clips_0_annotations_0.keys()}")

type(video_0_clips_0_annotations_0) <class 'dict'>
video_0_clips_0_annotations_0.keys() dict_keys(['language_queries', 'annotation_uid'])


In [41]:
# Language queries of the first annotation: container type and count.
video_0_clips_0_annotations_0_language_queries = video_0_clips_0_annotations_0["language_queries"]
print(f"type(video_0_clips_0_annotations_0_language_queries) {type(video_0_clips_0_annotations_0_language_queries)}")
print(f"len(video_0_clips_0_annotations_0_language_queries) {len(video_0_clips_0_annotations_0_language_queries)}")

type(video_0_clips_0_annotations_0_language_queries) <class 'list'>
len(video_0_clips_0_annotations_0_language_queries) 4


In [42]:
# Take the first language query of the annotation; the bare expression on the
# last line makes the notebook display its full content.
video_0_clips_0_annotations_0_language_queries_0 = video_0_clips_0_annotations_0_language_queries[0]
video_0_clips_0_annotations_0_language_queries_0

{'clip_start_sec': 425.0,
 'clip_end_sec': 431.0,
 'video_start_sec': 1055.0,
 'video_end_sec': 1061.0,
 'video_start_frame': 35999,
 'video_end_frame': 35999,
 'template': 'Objects: What did I put in X?',
 'query': 'what did I put in the black dustbin?',
 'slot_x': 'back dustbin',
 'verb_x': 'put',
 'raw_tags': ['Objects: What did I put in X?',
  'what did I put in the black dustbin?',
  'back dustbin',
  'put']}

#### Test

In [44]:
# Pick the test split and show which top-level fields it exposes.
nlq_annotations_test = nlq_annotations["test"]
print(f"nlq_annotations_test.keys() {nlq_annotations_test.keys()}")

nlq_annotations_test.keys() dict_keys(['version', 'date', 'description', 'manifest', 'videos'])


In [45]:
# Show the metadata fields of the test split, one per line.
print(
    f"nlq_annotations_test['version'] {nlq_annotations_test['version']}",
    f"nlq_annotations_test['date'] {nlq_annotations_test['date']}",
    f"nlq_annotations_test['description'] {nlq_annotations_test['description']}",
    f"nlq_annotations_test['manifest'] {nlq_annotations_test['manifest']}",
    sep="\n",
)

nlq_annotations_test['version'] 1
nlq_annotations_test['date'] 220216
nlq_annotations_test['description'] NLQ Annotations (test unannotated)
nlq_annotations_test['manifest'] s3://ego4d-consortium-sharing/public/v1/full_scale/manifest.csv


In [46]:
# Videos listed in the test split: container type and number of entries.
nlq_annotations_test_videos = nlq_annotations_test["videos"]

print("type(nlq_annotations_test_videos):", type(nlq_annotations_test_videos))
print("len(nlq_annotations_test_videos):", len(nlq_annotations_test_videos))

type(nlq_annotations_test_videos): <class 'list'>
len(nlq_annotations_test_videos): 258


In [47]:
# Structure of a single video entry (first video of the test split).
video_0 = nlq_annotations_test_videos[0]
print("type(video_0):", type(video_0))
print("video_0.keys():", video_0.keys())

type(video_0): <class 'dict'>
video_0.keys(): dict_keys(['video_uid', 'clips', 'split'])


In [48]:
# Unique identifier of the first test video
video_0_uid = video_0["video_uid"]
print(video_0_uid)

c9c44dea-c37b-461d-aa14-20e934126df5


In [49]:
# video split group
# Split the first test video is assigned to, as stored in the video entry itself
video_0_split = video_0["split"]
print(video_0_split)

test


In [50]:
# Clips attached to the first test video: container type and count.
video_0_clips = video_0["clips"]
print(f"type(video_0_clips) {type(video_0_clips)}")
print(f"len(video_0_clips) {len(video_0_clips)}")

type(video_0_clips) <class 'list'>
len(video_0_clips) 3


In [51]:
# First clip of the test video and the fields it provides.
video_0_clips_0 = video_0_clips[0]
print(f"type(video_0_clips_0) {type(video_0_clips_0)}")
print(f"video_0_clips_0.keys() {video_0_clips_0.keys()}")

type(video_0_clips_0) <class 'dict'>
video_0_clips_0.keys() dict_keys(['clip_uid', 'video_start_sec', 'video_end_sec', 'video_start_frame', 'video_end_frame', 'clip_start_sec', 'clip_end_sec', 'clip_start_frame', 'clip_end_frame', 'source_clip_uid', 'annotations'])


In [52]:
# Identifiers stored on the clip: its own uid and the uid of its source clip.
print(f"video_0_clips_0['clip_uid'] {video_0_clips_0['clip_uid']}")
print(f"video_0_clips_0['source_clip_uid'] {video_0_clips_0['source_clip_uid']}")

video_0_clips_0['clip_uid'] a603669a-57f9-4db4-8a81-0a6720946d45
video_0_clips_0['source_clip_uid'] 4ee7dc88-3d7f-4607-a110-9419fb0eb93d


In [53]:
# Compare the clip boundaries expressed in video time and in clip-relative time.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote there is only valid syntax from Python 3.12 onwards.
print(">> Video timesteps:")
print(f"  >> video_0_clips_0['video_start_sec']: {video_0_clips_0['video_start_sec']}")
print(f"  >> video_0_clips_0['video_end_sec']: {video_0_clips_0['video_end_sec']}\n")

print(">> Clip timesteps:")
print(f"  >> video_0_clips_0['clip_start_sec']: {video_0_clips_0['clip_start_sec']}")
print(f"  >> video_0_clips_0['clip_end_sec']: {video_0_clips_0['clip_end_sec']}")

>> Video timesteps:
  >> video_0_clips_0['video_start_sec']: 1489.0943619333332
  >> video_0_clips_0['video_end_sec']: 1969.1310359242186

>> Clip timesteps:
  >> video_0_clips_0['clip_start_sec']: 0
  >> video_0_clips_0['clip_end_sec']: 480.03667399088545


In [54]:
# Compare the clip boundaries expressed in video frames and in clip-relative frames.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote there is only valid syntax from Python 3.12 onwards.
print(">> Video frames:")
print(f"  >> video_0_clips_0['video_start_frame']: {video_0_clips_0['video_start_frame']}")
print(f"  >> video_0_clips_0['video_end_frame']: {video_0_clips_0['video_end_frame']}\n")

print(">> Clip frames:")
print(f"  >> video_0_clips_0['clip_start_frame']: {video_0_clips_0['clip_start_frame']}")
print(f"  >> video_0_clips_0['clip_end_frame']: {video_0_clips_0['clip_end_frame']}")

>> Video frames:
  >> video_0_clips_0['video_start_frame']: 66429
  >> video_0_clips_0['video_end_frame']: 66429

>> Clip frames:
  >> video_0_clips_0['clip_start_frame']: 0
  >> video_0_clips_0['clip_end_frame']: 14401


In [55]:
# Annotations attached to the first clip: container type and count.
video_0_clips_0_annotations = video_0_clips_0["annotations"]
print(f"type(video_0_clips_0_annotations) {type(video_0_clips_0_annotations)}")
print(f"len(video_0_clips_0_annotations) {len(video_0_clips_0_annotations)}")

type(video_0_clips_0_annotations) <class 'list'>
len(video_0_clips_0_annotations) 2


In [56]:
# First annotation of the clip and the fields it provides.
video_0_clips_0_annotations_0 = video_0_clips_0_annotations[0]
print(f"type(video_0_clips_0_annotations_0) {type(video_0_clips_0_annotations_0)}")
print(f"video_0_clips_0_annotations_0.keys() {video_0_clips_0_annotations_0.keys()}")

type(video_0_clips_0_annotations_0) <class 'dict'>
video_0_clips_0_annotations_0.keys() dict_keys(['language_queries', 'annotation_uid'])


In [57]:
# Language queries of the first annotation: container type and count.
video_0_clips_0_annotations_0_language_queries = video_0_clips_0_annotations_0["language_queries"]
print(f"type(video_0_clips_0_annotations_0_language_queries) {type(video_0_clips_0_annotations_0_language_queries)}")
print(f"len(video_0_clips_0_annotations_0_language_queries) {len(video_0_clips_0_annotations_0_language_queries)}")

type(video_0_clips_0_annotations_0_language_queries) <class 'list'>
len(video_0_clips_0_annotations_0_language_queries) 5


In [58]:
# Take the first language query of the annotation; the bare expression on the
# last line makes the notebook display its full content.
video_0_clips_0_annotations_0_language_queries_0 = video_0_clips_0_annotations_0_language_queries[0]
video_0_clips_0_annotations_0_language_queries_0

{'query': 'What did in put in the sack?'}

### Exploratory Data Analysis

In [None]:
# Explore number of videos per split.
# Each split stores its videos under the "videos" key, so the count is the
# length of that list (the names previously held the whole annotation dict).
n_videos_train = len(nlq_annotations["train"]["videos"])
n_videos_val = len(nlq_annotations["val"]["videos"])
n_videos_test = len(nlq_annotations["test"]["videos"])

print(f"train: {n_videos_train} | val: {n_videos_val} | test: {n_videos_test}")

## Omnivore

In [25]:
# Sub-folder of the Ego4D data root named "omnivore_video_swinl_fp16"
# (presumably pre-extracted Omnivore video features — confirm against the download).
OMNIVORE_DATA_PATH = os.path.join(EGO4D_DATA_PATH, "omnivore_video_swinl_fp16")

# EgoVLP Data

In [26]:
# Folder with the EgoVLP data, relative to the working directory.
EGOVLP_DATA = "egovlp_data"