### Code to download the MSR-VTT dataset

In [1]:
import json
import os, os.path

# Load the MSR-VTT annotation file; the context manager closes the handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open('videodatainfo_2017.json', 'r') as info_file:
    train_val = json.load(info_file)


# Video metadata records and caption sentences from the annotation file.
videos = train_val['videos']
sentences = train_val['sentences']

# For efficiency, group the caption annotations by numeric video id.
itoa = {}
for s in sentences:
    videoid_buf = s['video_id']
    # Ids look like 'video123'; drop the 5-character 'video' prefix.
    videoid = int(videoid_buf[5:])
    itoa.setdefault(videoid, []).append(s)

# Write one CSV row per video: id, url, start time, end time, category.
# The `with` block guarantees the file is flushed and closed even if a
# record is malformed and raises part-way through.
with open('videos_list.csv', 'w') as f:
    for img in videos:
        line = ','.join([str(img["video_id"]),
                         img['url'],
                         str(img['start time']),
                         str(img['end time']),
                         str(img['category'])]) + '\n'
        f.write(line)
    

In [2]:
!head videos_list.csv

video0,https://www.youtube.com/watch?v=9lZi22qLlEo,137.72,149.44,9
video1,https://www.youtube.com/watch?v=w4JM08PDEng,184.33,206.89,16
video2,https://www.youtube.com/watch?v=QA7KVQq9vKA,31.17,41.24,9
video3,https://www.youtube.com/watch?v=QFmJZ0GU6yc,48.26,58.51,8
video4,https://www.youtube.com/watch?v=2q-dONPhzis,268.58,278.83,14
video5,https://www.youtube.com/watch?v=b-3_7iglTbg,0.0,30.0,13
video6,https://www.youtube.com/watch?v=YvF-ZTH28yI,143.93,160.97,13
video7,https://www.youtube.com/watch?v=y6sBoW139Sc,81.06,92.61,17
video8,https://www.youtube.com/watch?v=X-aaASj9-u0,0.0,30.0,3
video9,https://www.youtube.com/watch?v=oxswNqeujeY,0.0,30.0,5


In [3]:
import argparse
import fnmatch
import glob
import json
import os
import shutil
import subprocess
import uuid

from joblib import delayed
from joblib import Parallel
import pandas as pd


def create_video_folders(dataset, output_dir, tmp_dir):
    """Creates a directory for each label name in the dataset.

    arguments:
    ---------
    dataset: DataFrame
        Must contain a 'label-name' column; one sub-directory is created
        per unique value.
    output_dir: str
        Root directory under which the per-label directories are created.
    tmp_dir: str
        Scratch directory; created if missing.

    returns:
    -------
    label_to_dir: dict
        Maps each unique label value (as stored in the DataFrame) to the
        path of its directory.
    """
    # exist_ok avoids the check-then-create race when several processes
    # prepare the same directories at once.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    label_to_dir = {}
    for label_name in dataset['label-name'].unique():
        # Labels may be numeric (the CSV parser reads them as int), and
        # os.path.join only accepts strings, so convert for the path while
        # keeping the original value as the dictionary key.
        this_dir = os.path.join(output_dir, str(label_name))
        os.makedirs(this_dir, exist_ok=True)
        label_to_dir[label_name] = this_dir
    return label_to_dir


def construct_video_filename(row, label_to_dir, trim_format='%06d'):
    """Given a dataset row, this function constructs the
       output filename for a given video.

    arguments:
    ---------
    row: mapping
        Must provide 'video-id'; 'label-name' is also read when
        `label_to_dir` is a dict.
    label_to_dir: str or dict
        Either a single output directory used for every video, or a
        mapping from label value to directory (as returned by
        create_video_folders).
    trim_format: str
        Accepted for interface compatibility; not used when building
        the filename.

    returns:
    -------
    output_filename: str
        '<directory>/<video-id>.mp4'
    """
    if isinstance(label_to_dir, dict):
        # Per-label layout: pick the directory matching this row's label.
        target_dir = label_to_dir[row['label-name']]
    else:
        target_dir = label_to_dir

    base_name = row['video-id']+'.mp4'
    output_filename = os.path.join(target_dir, base_name)
    return output_filename


def download_clip(video_identifier, output_filename,
                  start_time, end_time,
                  tmp_dir='/tmp/kinetics',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video with youtube-dl and trim it to a clip with ffmpeg.

    arguments:
    ---------
    video_identifier: str
        Full video URL, or a bare YouTube video identifier
        (11 characters), in which case `url_base` is prepended.
    output_filename: str
        File path where the trimmed clip will be stored.
    start_time: float
        Indicates the beginning time in seconds from where the video
        will be trimmed.
    end_time: float
        Indicates the ending time in seconds of the trimmed video.
    tmp_dir: str
        Directory for the untrimmed download; created if missing.
    num_attempts: int
        Maximum number of youtube-dl attempts before giving up.
    url_base: str
        Prefix used to build a URL from a bare video identifier.

    returns:
    -------
    (status, log): tuple
        `status` is True when `output_filename` exists after trimming.
        `log` is 'Downloaded' on the success path, otherwise the captured
        tool output (bytes) or a short error message (str).
    """
    # Defensive argument checking.
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    status = False

    # Callers may pass either a complete URL or just the video id.
    if '://' in video_identifier:
        video_url = video_identifier
    else:
        video_url = url_base + video_identifier

    os.makedirs(tmp_dir, exist_ok=True)
    # Unique prefix so parallel workers never collide; youtube-dl fills in
    # the extension through its '%(ext)s' output-template field.
    tmp_prefix = os.path.join(tmp_dir, str(uuid.uuid4()))
    # Argument list + no shell: URLs and paths are passed verbatim, so
    # characters such as '&', quotes or spaces cannot be interpreted by a
    # shell.
    command = ['youtube-dl',
               '--quiet', '--no-warnings',
               '-f', 'mp4',
               '-o', tmp_prefix + '.%(ext)s',
               video_url]
    attempts = 0
    while True:
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            attempts += 1
            # '>=' also terminates when num_attempts is zero or negative.
            if attempts >= num_attempts:
                return status, err.output
        except OSError as err:
            # The executable is missing or cannot be started; retrying
            # would not help.
            return status, str(err)
        else:
            break

    # Locate the downloaded file by its exact unique prefix. Splitting the
    # path on '.' would truncate at any dot in tmp_dir and could match (and
    # later delete) unrelated files.
    downloaded = glob.glob(glob.escape(tmp_prefix) + '.*')
    if not downloaded:
        return status, 'Downloaded file not found'
    tmp_filename = downloaded[0]

    # Construct command to trim the videos (ffmpeg required).
    command = ['ffmpeg',
               '-i', tmp_filename,
               '-ss', str(start_time),
               '-t', str(end_time - start_time),
               '-c:v', 'libx264', '-c:a', 'copy',
               '-threads', '1',
               '-loglevel', 'panic',
               output_filename]
    try:
        subprocess.check_output(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        return status, err.output
    except OSError as err:
        return status, str(err)
    finally:
        # Always drop the untrimmed download, including when ffmpeg fails.
        for leftover in downloaded:
            if os.path.exists(leftover):
                os.remove(leftover)

    # Check if the video was successfully saved.
    status = os.path.exists(output_filename)
    return status, 'Downloaded'


def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
    """Download a single dataset row; shaped for use with joblib workers.

    Returns a tuple (clip_id, status, log). When the target file is
    already on disk nothing is downloaded and the tuple is
    (clip_id, True, 'Exists').
    """
    target_path = construct_video_filename(row, label_to_dir, trim_format)
    # The clip id is the file name with everything from '.mp4' on removed.
    clip_id = os.path.basename(target_path).split('.mp4')[0]

    if not os.path.exists(target_path):
        ok, message = download_clip(row['url'], target_path,
                                    row['start-time'], row['end-time'],
                                    tmp_dir=tmp_dir)
        return (clip_id, ok, message)

    return (clip_id, True, 'Exists')
 

def parse_kinetics_annotations(input_csv):
    """Read the header-less video list CSV into a typed DataFrame.

    arguments:
    ---------
    input_csv: str
        Path to a CSV file without a header row whose five columns are,
        in order: video id, URL, start time, end time, label.
    returns:
    -------
    dataset: DataFrame
        Pandas with the following columns:
            'video-id' (str), 'url' (str), 'start-time' (float),
            'end-time' (float), 'label-name' (int)
    """
    column_types = {
        'video-id': str,
        'url': str,
        'start-time': float,
        'end-time': float,
        'label-name': int,
    }
    # Column order in the file follows the order of the dict above.
    column_names = ['video-id', 'url', 'start-time', 'end-time', 'label-name']
    return pd.read_csv(input_csv, names=column_names, dtype=column_types)

def _report_default(obj):
    """JSON fallback: decode raw tool output (bytes); stringify anything else."""
    if isinstance(obj, bytes):
        return obj.decode('utf-8', errors='replace')
    return str(obj)


def main(input_csv, output_dir,
         trim_format='%06d', num_jobs=24, tmp_dir='temp'):
    """Download and trim every clip listed in `input_csv`.

    arguments:
    ---------
    input_csv: str
        Path to the CSV read by parse_kinetics_annotations.
    output_dir: str
        Directory that receives all trimmed clips; created if missing.
    trim_format: str
        Forwarded to download_clip_wrapper.
    num_jobs: int
        Number of joblib workers; 1 runs sequentially in this process.
    tmp_dir: str
        Scratch directory for untrimmed downloads; removed at the end.

    Writes a per-clip status list to 'download_report.json' in the
    current working directory.
    """
    # Reading and parsing the video list.
    dataset = parse_kinetics_annotations(input_csv)

    # All clips go into a single directory. It has to exist before ffmpeg
    # writes into it, and the scratch directory has to exist so the final
    # cleanup is valid even when nothing was downloaded.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    label_to_dir = output_dir

    # Download all clips.
    if num_jobs==1:
        status_lst = []
        for i, row in dataset.iterrows():
            status_lst.append(download_clip_wrapper(row, label_to_dir,
                                                    trim_format, tmp_dir))
    else:
        status_lst = Parallel(n_jobs=num_jobs)(delayed(download_clip_wrapper)(
            row, label_to_dir,
            trim_format, tmp_dir) for i, row in dataset.iterrows())

    # Clean tmp dir; tolerate it having been removed already.
    shutil.rmtree(tmp_dir, ignore_errors=True)

    # Save download report. Failed downloads carry the raw tool output as
    # bytes, which json cannot encode without the fallback.
    with open('download_report.json', 'w') as fobj:
        fobj.write(json.dumps(status_lst, default=_report_default))




In [None]:
# Run the pipeline: read videos_list.csv, store the trimmed clips under
# 'videos' using 16 parallel jobs, and write download_report.json.
main(input_csv='videos_list.csv', output_dir='videos', num_jobs=16)