In [1]:
import torch, torchvision
from torchvision.datasets.video_utils import VideoClips
import os, json
import numpy as np
from torch.utils.data import Dataset, DataLoader
import ffmpeg
from collections import Counter as C

In [7]:
class prober_class(Dataset):
    def __init__(self, paths):
        super(prober_class, self).__init__()
        self.paths = paths

    def __len__(self):
        return len(self.paths)
    
    def __getitem__(self, idx):
        k,p = self.paths[idx]
        try:
            res = ffmpeg.probe(p)
        except:
            res = "Error"
        
        return [res, k]

In [51]:
basepath = '/data/datasets/kinetics700_2020/'

def read_dataset_details(label_path, batch_size, vid_path):
    dataset = json.load(open(basepath + label_path))
    print(list(dataset.items())[0], len(dataset))
    collate = lambda x: x
    loader = DataLoader(prober_class([(k, basepath + vid_path + k + '.mp4') for k, v in dataset.items()]),
                         batch_size=batch_size,
                         num_workers=batch_size,
                         collate_fn=collate)
    print("Example paths:", loader.dataset.paths[:3])
    probes = []
    for i,batch in enumerate(loader):
        if i % 50 == 0:
            print(i*batch_size,'/',len(dataset))
        for b in batch:
            probes.append(b)
    print(C([type(p[0]) for p in probes]))
    return probes

In [57]:
def filter_dataset(label_path, probes):
    dataset = json.load(open(basepath + label_path))
    print(list(dataset.items())[0], len(dataset))
    
    new_dataset = {}
    error_remove_ids = []
    stream_err_ids = []

    for p, k in probes:
        if p == "Error":
            error_remove_ids.append(k)
        elif len(p['streams']) != 2:
            stream_err_ids.append(k)
        else:
            dataset[k]['nb_frames'] = p['streams'][0]['nb_frames']
            dataset[k]['hw'] = (p['streams'][0]['height'], p['streams'][0]['width'])
            dataset[k]['true_duration'] = p['streams'][0]['duration']
            new_dataset[k] = dataset[k]
            
    return new_dataset, error_remove_ids, stream_err_ids

In [54]:
probess = read_dataset_details('labels/val_full.json', 50, 'val/')

('---QUuC4vJs', {'annotations': {'label': 'testifying', 'segment': [84.0, 94.0]}, 'duration': 10.0, 'subset': 'validate', 'url': 'https://www.youtube.com/watch?v=---QUuC4vJs'}) 34077
Example paths: [('---QUuC4vJs', '/data/datasets/kinetics700_2020/val/---QUuC4vJs.mp4'), ('--GkrdYZ9Tc', '/data/datasets/kinetics700_2020/val/--GkrdYZ9Tc.mp4'), ('--nQbRBEz2s', '/data/datasets/kinetics700_2020/val/--nQbRBEz2s.mp4')]
0 / 34077
2500 / 34077
5000 / 34077
7500 / 34077
10000 / 34077
12500 / 34077
15000 / 34077
17500 / 34077
20000 / 34077
22500 / 34077
25000 / 34077
27500 / 34077
30000 / 34077
32500 / 34077
Counter({<class 'dict'>: 32733, <class 'str'>: 1344})


In [58]:
new_dataset, error_remove_ids, stream_err_ids = filter_dataset('labels/val_full.json', probess)

('---QUuC4vJs', {'annotations': {'label': 'testifying', 'segment': [84.0, 94.0]}, 'duration': 10.0, 'subset': 'validate', 'url': 'https://www.youtube.com/watch?v=---QUuC4vJs'}) 34077


In [62]:
json.dump(new_dataset, open(basepath + 'labels/val.json', 'w+'), indent=2)

In [64]:
probess = read_dataset_details('labels/train_full.json', 60, 'train/')

('---0dWlqevI', {'annotations': {'label': 'clay pottery making', 'segment': [19.0, 29.0]}, 'duration': 10.0, 'subset': 'train', 'url': 'https://www.youtube.com/watch?v=---0dWlqevI'}) 541621
Example paths: [('---0dWlqevI', '/data/datasets/kinetics700_2020/train/---0dWlqevI.mp4'), ('---aQ-tA5_A', '/data/datasets/kinetics700_2020/train/---aQ-tA5_A.mp4'), ('---j12rm3WI', '/data/datasets/kinetics700_2020/train/---j12rm3WI.mp4')]
0 / 541621
3000 / 541621
6000 / 541621
9000 / 541621
12000 / 541621
15000 / 541621
18000 / 541621
21000 / 541621
24000 / 541621
27000 / 541621
30000 / 541621
33000 / 541621
36000 / 541621
39000 / 541621
42000 / 541621
45000 / 541621
48000 / 541621
51000 / 541621
54000 / 541621
57000 / 541621
60000 / 541621
63000 / 541621
66000 / 541621
69000 / 541621
72000 / 541621
75000 / 541621
78000 / 541621
81000 / 541621
84000 / 541621
87000 / 541621
90000 / 541621
93000 / 541621
96000 / 541621
99000 / 541621
102000 / 541621
105000 / 541621
108000 / 541621
111000 / 541621
11400

In [65]:
json.dump(probess, open("tmp.json", 'w+'))

In [108]:
new_dataset, error_remove_ids, stream_err_ids = filter_dataset('labels/train_full.json', probess)

('---0dWlqevI', {'annotations': {'label': 'clay pottery making', 'segment': [19.0, 29.0]}, 'duration': 10.0, 'subset': 'train', 'url': 'https://www.youtube.com/watch?v=---0dWlqevI'}) 541621


In [106]:
len(new_dataset)

523099

In [73]:
len(error_remove_ids)

18472

In [74]:
json.dump(error_remove_ids, open("corrupt_missing_train.json", 'w+'))

In [75]:
json.dump(stream_err_ids, open("stream_err_train.json", 'w+'))

In [77]:
json.dump(['xsGtbUnp9tw', '141fJ89Ed2k', 'q-S0pBZDhZU', 'wN8qYmPv5yk',
           'uz6rjbw0ZA0', 'kUVnT7Ld80M', 'XY5FDVay5_A', 'bOU2oGVBM_o',
           'fg2BS7H_dAU', 'KTCQpjUrCe8', 'erh2ngRZxs0', '7hIAtSLdAUo',
           'h_5SZwWFg1c', 'z8qEtdr1ZuU', 'VYZPozZ5Eig', '6iuD3pSgBcw',
           'NPNP-7B9P3M', 'yhA_TTKetyM', 'YjUcA9zOp5g', 'hKqCUWTQQxU'
          ],
         open("download_err_corrupt.json", 'w+')
        )

In [78]:
len(new_dataset)

523099

In [79]:
json.dump(new_dataset, open(basepath + 'labels/train.json', 'w+'), indent=2)

In [90]:
# Create mini kinetics train
import random
label_class = list(set(m['annotations']['label'] for m in new_dataset.values()))
assert len(label_class) == 700
classwise_keys = {l:[] for l in label_class}
for k, d in new_dataset.items():
    classwise_keys[d['annotations']['label']].append(k)


In [91]:
classwise_ixs['sword swallowing']

['-64SFG45MGc',
 '-IRZQEPpXm4',
 '-J6e6tmHfG0',
 '-PH-LmbbhTs',
 '-UkYLBqAIhM',
 '-Zjg_1gu_iU',
 '-eSjkYVfhkc',
 '0-jShm5vXTs',
 '0BQ1igghWA0',
 '0HZwle1ARew',
 '0Lkbe_-r6pM',
 '0NiufOLGoA0',
 '0XnSZRQzqK8',
 '0_yEEH9mR40',
 '0aS8cRgtAy4',
 '0jADsepo9YU',
 '0nK8obdz-R0',
 '0nXW_392t7U',
 '0q3gxEunti4',
 '0ttcKxfMkIY',
 '0tzXnhl9fL8',
 '1CHOhK5iVnA',
 '1YDQ9VFRuqQ',
 '1aOvJdnUd4c',
 '1c9XeAWUPPc',
 '1eO0VT5JN_Q',
 '1sVnKLmEL6A',
 '1tO_esc3Sm4',
 '1ymLC3bM9MY',
 '1zX-Z5-fgF0',
 '2FB3lFEsuIg',
 '2FVCBPcpqEM',
 '2ITyO3EZQkM',
 '2dnQQNf-5Ik',
 '2x8m37BjkzQ',
 '36VCcm5hDJs',
 '3K2aLO6QQt4',
 '3hVgN5i6S4A',
 '3lg6sjV9uf4',
 '3wR1k8zQnck',
 '45b037tFguk',
 '4AsWlBRzJV4',
 '4AubXyZXBto',
 '4YY7xYlIbOE',
 '4brLLAO3l50',
 '4y41ZJjdleA',
 '548No4eOFpY',
 '5I98U_mxpJg',
 '5_3YjiUqeRY',
 '5yirj3QZKfU',
 '5zDKDX18o9I',
 '60_xGlF735w',
 '65TtkqJFPqc',
 '6C2LasQ5ep8',
 '6M1k0XdiK7c',
 '6O0LyhSqymY',
 '6_cw1bhWdcQ',
 '6xPf0z-A_gM',
 '6zP7J6weGjc',
 '6zrClfiSoDk',
 '72gfy8QN1Ag',
 '79l5AyYTdQM',
 '7_ryhF

In [109]:
mini_train = dict(random.sample(list(new_dataset.items()), 200000))

In [110]:
dict(sorted(C(m['annotations']['label'] for m in mini_train.values()).items(),key=lambda x: x[1]))

{'putting on sari': 160,
 'sword swallowing': 174,
 'combing hair': 176,
 'flipping bottle': 178,
 'playing nose flute': 187,
 'dyeing eyebrows': 187,
 'arresting': 188,
 'shooting off fireworks': 189,
 'closing door': 190,
 'skiing mono': 191,
 'coloring in': 192,
 'cracking neck': 192,
 'treating wood': 193,
 'chasing': 193,
 'chiseling wood': 195,
 'doing jigsaw puzzle': 195,
 'grinding meat': 195,
 'saluting': 196,
 'shredding paper': 196,
 'listening with headphones': 197,
 'mushroom foraging': 197,
 'moving baby': 197,
 'cracking back': 197,
 'playing mahjong': 197,
 'playing with trains': 198,
 'smoking': 198,
 'milking goat': 198,
 'dealing cards': 199,
 'petting horse': 200,
 'carving wood with a knife': 200,
 'pinching': 200,
 'chiseling stone': 201,
 'checking watch': 201,
 'bouncing ball (not juggling)': 201,
 'slicing onion': 202,
 'swimming front crawl': 202,
 'texting': 202,
 'pouring milk': 203,
 'being in zero gravity': 203,
 'playing road hockey': 204,
 'repairing pun

In [100]:
label_to_ix = {k:i for i, k in enumerate(label_class)}

In [102]:
json.dump(label_to_ix, open(basepath + 'mini/class_to_index.json', 'w+'), indent=2)

In [113]:
for k in mini_train.keys():
    l = mini_train[k]['annotations']['label']
    mini_train[k]['annotations']['label'] = (l, label_to_ix[l])

In [115]:
list(mini_train.items())[1], len(mini_train)

(('jBacN4yLLyQ',
  {'annotations': {'label': ('mountain climber (exercise)', 213),
    'segment': [0.0, 10.0]},
   'duration': 10.0,
   'subset': 'train',
   'url': 'https://www.youtube.com/watch?v=jBacN4yLLyQ',
   'nb_frames': '300',
   'hw': (240, 426),
   'true_duration': '10.010000'}),
 200000)

In [116]:
json.dump(mini_train, open(basepath + 'mini/train.json', 'w+'), indent=2)

In [117]:
mini_val = json.load(open(basepath + 'labels/val.json'))

for k in mini_val.keys():
    l = mini_val[k]['annotations']['label']
    mini_val[k]['annotations']['label'] = (l, label_to_ix[l])

In [118]:
list(mini_val.items())[1], len(mini_val)

(('--GkrdYZ9Tc',
  {'annotations': {'label': ('washing feet', 295), 'segment': [0.0, 10.0]},
   'duration': 10.0,
   'subset': 'validate',
   'url': 'https://www.youtube.com/watch?v=--GkrdYZ9Tc',
   'nb_frames': '240',
   'hw': [1080, 1920],
   'true_duration': '10.010667'}),
 32733)

In [119]:
json.dump(mini_val, open(basepath + 'mini/val.json', 'w+'), indent=2)