In [1]:
import pandas as pd
import json
import os
import shutil

from tqdm import tqdm
from io import BytesIO
from tarfile import TarFile
from google.colab import drive

# Mount Google Drive at /content/gdrive (requesting a forced remount); the
# dataset root used by the rest of the notebook lives under this mount point.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
# Base dataset directory on the mounted Drive. The annotation/label JSON files
# and the 'ZippedFiles' archive folder are read from here, and the extracted
# videos, classInd.txt and the *list.txt files are written beneath it.
root = '/content/gdrive/MyDrive/Dataset/'

In [3]:
# Load the label table: it is iterated below as (label text -> class id) pairs.
labels_json_path = os.path.join(root, 'something-something-v2-labels.json')
with open(labels_json_path, 'r') as labels_file:
    labels_dict = json.load(labels_file)

# Dump the table to classInd.txt, one "<class id>\t<label text>" row per class.
class_index_path = os.path.join(root, 'classInd.txt')
with open(class_index_path, 'w') as class_index_file:
    for label_text, class_id in labels_dict.items():
        class_index_file.write(f'{class_id}\t{label_text}\n')

In [4]:
def load_data(json_path):
    """Build an ``{id: label}`` mapping from one annotation file.

    Args:
        json_path: File name of the annotation JSON, relative to ``root``.

    Returns:
        dict mapping each record's ``id`` (converted to ``str``) to the value
        stored in ``labels_dict`` for that record's ``template``.

    Raises:
        KeyError: If a cleaned template is not a key of ``labels_dict``.
    """
    records = pd.read_json(os.path.join(root, json_path))

    def template_to_label(template):
        # Templates carry '[' / ']' placeholder markers; the keys of
        # labels_dict are looked up with those brackets stripped.
        cleaned = template.replace('[', '').replace(']', '')
        return labels_dict[cleaned]

    video_ids = list(records.id.apply(str))
    video_labels = records.template.apply(template_to_label)

    return dict(zip(video_ids, video_labels))

# Split name -> {video id: label} mapping, as produced by load_data().
# Note that the 'test' split is built from the *validation* annotation file.
datasets = dict(
    train=load_data('something-something-v2-train.json'),
    test=load_data('something-something-v2-validation.json'),
)

def get_dsname(id):
    """Return the name of the first split in ``datasets`` containing ``id``.

    Splits are checked in the insertion order of ``datasets``; ``None`` is
    returned when no split contains the id.
    """
    matching_splits = (
        split_name
        for split_name, id_to_label in datasets.items()
        if id in id_to_label
    )
    return next(matching_splits, None)

# One initially empty list of extracted-file paths per split in `datasets`.
path_lists = dict((split_name, []) for split_name in datasets)

In [5]:
class MultipleFileReader:
    """Read-only, file-like object presenting several files as one byte stream.

    The files are read in binary mode, one after another, in the order given.
    Each file is opened lazily when its first byte is needed and closed as soon
    as it is exhausted, so at most one file handle is open at a time.

    Attributes:
        fnames: Paths still to be opened, stored in *reverse* order so the next
            one can be taken with ``pop()``.
        curr_fobj: The currently open file object, or ``None``.
    """

    def __init__(self, fnames):
        """Create a reader over ``fnames`` (an iterable of paths, in read order)."""
        self.fnames = list(reversed(fnames))
        self.curr_fobj = None

    def read(self, size=None):
        """Read up to ``size`` bytes, continuing across file boundaries.

        Args:
            size: Maximum number of bytes to return. ``None`` or a negative
                value means "read everything that is left".

        Returns:
            bytes: The data read; ``b''`` once all files are exhausted (or
            when ``size`` is 0).
        """
        if not self.ready():  # No more data left to read
            return b''

        read_all = size is None or size < 0
        # Buffered binary files accept -1 for "until EOF" but reject any other
        # negative length, so the counter must never be decremented below -1.
        remaining = -1 if read_all else size

        data = BytesIO()
        while self.ready() and remaining != 0:
            if self.curr_fobj is None:
                self.curr_fobj = open(self.fnames.pop(), 'rb')
            data_read = self.curr_fobj.read(remaining)
            data.write(data_read)

            if read_all:
                # The whole file was just consumed; move on to the next one
                # and leave `remaining` at -1.
                self.close()
                continue

            num_bytes = len(data_read)
            if num_bytes < remaining:  # Short read => current file exhausted
                self.close()
            remaining -= num_bytes
        return data.getvalue()

    def ready(self):
        """Return True while there is an open file or more files left to open."""
        return bool(self.fnames) or self.curr_fobj is not None

    def close(self):
        """Close the currently open file, if any; unopened files stay queued."""
        if self.curr_fobj is None:
            return
        self.curr_fobj.close()
        self.curr_fobj = None

In [6]:
# Collect the archive parts and expose them as a single byte stream.
src_dir = os.path.join(root, 'ZippedFiles')
fnames = sorted(
    os.path.join(src_dir, entry)
    for entry in os.listdir(src_dir)
    if entry.startswith('20bn-something-something-v2')
)
# NOTE(review): parts are concatenated in lexicographic path order; this
# presumably matches the intended part order - confirm against the file names.
fobj = MultipleFileReader(fnames)

In [7]:
# Stream the gzip-compressed tar archive (read through `fobj`) exactly once and
# extract every regular file whose id is known to one of the splits into
# <root>/<split>/<label>/<file name>, recording each destination path.
try:
    with TarFile.open(fileobj=fobj, mode='r|gz') as archive:
        # `total` is a hard-coded progress-bar length, not read from the archive.
        for member in tqdm(archive, total=220848, desc='Extracting', unit=' files'):
            if not member.isfile():
                continue

            fname = os.path.basename(member.name)
            # Named `video_id` rather than `id` so the builtin `id` is not
            # shadowed in the notebook namespace after this cell runs.
            video_id, _ = os.path.splitext(fname)
            dsname = get_dsname(video_id)
            if dsname is None:  # Not listed in any split: skip
                continue
            label = datasets[dsname][video_id]

            dst_dir = os.path.join(root, dsname, label)
            os.makedirs(dst_dir, exist_ok=True)
            dst_path = os.path.join(dst_dir, fname)

            # Drop the archive's directory prefix so the file is written
            # directly inside dst_dir.
            member.name = fname
            archive.extract(member, path=dst_dir)
            path_lists[dsname].append(dst_path + '\n')
finally:
    # Release the archive part that is still open, on success or on error.
    fobj.close()

Extracting: 100%|██████████| 220848/220848 [57:51<00:00, 63.62 files/s]


In [8]:
# Persist each split's extracted paths to <split>list.txt under `root`
# (i.e. trainlist.txt and testlist.txt); entries already end with '\n'.
for split_name, extracted_paths in path_lists.items():
    list_file_path = os.path.join(root, f'{split_name}list.txt')
    with open(list_file_path, 'w') as list_file:
        list_file.write(''.join(extracted_paths))