In [6]:
import pickle
import json

caption_file = './data/task_of_interest_raw_caption.json'
captions = json.load(open(caption_file,  'r'))

s3d_file = './data/s3d_feature_hub.pkl'
s3d_feat = pickle.load(open(s3d_file, 'rb'), encoding="utf-32")

def list_to_sent(sent_list):
    re = ''
    for sent in sent_list:
        re += sent + ' '
    return re

In [11]:
total_ann

{'q_XDwC6lyu0_0': {'time': [0.0, 23.125],
  'cap': "lock up your house your valuables but just when you think you're safe think again yeah the bad guys are using something called bump keys to slip right past most locks i'd never heard of this before a lot of people have and many locksmiths we talked to you though didn't even want to tell us about them but the information is widely available on the internet and we thought we would ",
  'feature': array([[5.86054812e-04, 2.63510767e-04, 1.00554171e-04, ...,
          6.97470896e-05, 1.17098723e-04, 2.56437669e-03],
         [2.63944094e-04, 2.15934022e-04, 3.99022902e-05, ...,
          2.90688127e-04, 0.00000000e+00, 2.01212079e-03],
         [1.37045135e-04, 2.42693432e-06, 3.15577381e-05, ...,
          1.39207332e-04, 2.60463821e-05, 1.92868663e-03],
         ...,
         [5.33025479e-04, 3.91734473e-04, 1.16973846e-04, ...,
          3.26815236e-04, 4.86458885e-05, 2.68877810e-03],
         [2.87345843e-04, 2.49849021e-04, 3.500190

## Generate the total annotation file


In [9]:
total_ann = {}
num_feat = 0
count = 0

for num_idx in captions['id']:
    video_idx = captions['id'][num_idx]
    
    # If videos are not available in S3D Hub
    if video_idx not in s3d_feat:
        break
        
    # Extract captions, time and s3d features for this video
    cap_list = captions['raw_caption'][num_idx]
    s_list = captions['start'][num_idx]
    e_list = captions['end'][num_idx]
    total_time = e_list[-1]
    total_feat = s3d_feat[video_idx].shape[0]
    
    # Iterate the video clips
    clip_idx = 0
    length = len(cap_list)
    interval = 10
    
    for i in range(0, length, interval):
        clip_ann = {}
        start = s_list[i]
        end = e_list[min(i+interval, len(e_list)-1)]
        
        clip_ann['time'] = [start, end]
        clip_ann['cap'] = list_to_sent(cap_list[i: i+interval])
        clip_ann['feature'] = s3d_feat[video_idx][round((start/total_time)*total_feat):
                                                  round((end/total_time)*total_feat)]
        _ = video_idx + '_' + str(clip_idx)
        total_ann[_] = clip_ann
        clip_idx += 1
        num_feat += round((end/total_time)*total_feat)- round((start/total_time)*total_feat)
        count += 1

In [9]:
# Save the annotation file
pickle.dump(total_ann, open('1600captioning.pkl', 'wb'))

In [10]:
len(total_ann.keys())

4249

## Get the word_to_id and id_to_word


In [90]:
word_to_ix = {}
word_to_ix["<sep>"] = 4
word_to_ix["<eos>"] = 3
word_to_ix["<sos>"] = 2
word_to_ix["<UNK>"] = 1
word_to_ix["<PAD>"] = 0

idx = 5
for num_idx in captions['id']:
    video_idx = captions['id'][num_idx]
    
    # If videos are not available in S3D Hub
    if video_idx not in s3d_feat:
        break
        
    # Extract captions, time and s3d features for this video
    cap_list = captions['raw_caption'][num_idx]
    for sentence in cap_list:
        words = sentence.lower().split(' ')
        for word in words:
            if word not in word_to_ix:
                word_to_ix[word] = idx
                idx += 1
                
ix_to_word = {}
for word in word_to_ix:
    ix = word_to_ix[word]
    ix_to_word[str(ix)] = word

In [94]:
vocab = {}
vocab['word_to_ix'] = word_to_ix
vocab['ix_to_word'] = ix_to_word

# Save the annotation file
pickle.dump(vocab, open('vocab.pkl', 'wb'))

## Split the training and testing split

In [35]:
import pickle

_ = './data/1600captioning.pkl'
ann = pickle.load(open(_, 'rb'), encoding="utf-32")

In [36]:
import random

video_idx = list(ann.keys())
random.shuffle(video_idx)

train_idx = video_idx[0:3000]
test_idx = video_idx[3000:]

In [37]:
splits = {}
splits['train'] = train_idx
splits['test'] = test_idx

pickle.dump(splits, open('splits.pkl', 'wb'))

In [21]:
len(test_idx)

2109

In [22]:
len(train_idx)

6223

## Dataloader File

In [27]:
splits.keys()

dict_keys(['train', 'test'])

In [97]:
'''
    Author: Zhiyuan Jacob Fang
    Dataset script to load and collate batch files.
'''
import os
import cv2
import pickle
import torch
import random
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset


class How2100MDataset(Dataset):

    # Auxiliary functions
    def get_vocab_size(self):
        return len(self.get_vocab())

    def get_vocab(self):
        return self.word_to_ix

    def get_ix_to_word(self):
        return self.ix_to_word

    def __init__(self, opt, mode='train'):
        super(How2100MDataset, self).__init__()
        self.opt = opt
        self.mode = mode

        # Load the caption annotations
        self.caption_hub = pickle.load(open(os.path.join(opt['ann_path'], opt['cap_file']), 'rb'))

        # Read the training/testing split
        self.splits = pickle.load(open(os.path.join(opt['ann_path'], opt['splits']), 'rb')) [mode]
        
        # Read the vocabulary file
        vocab = pickle.load(open(os.path.join(opt['ann_path'], opt['vocab']), 'rb')) 
            
        # Load the vocabulary dictionary
        self.word_to_ix = vocab['word_to_ix']
        self.ix_to_word = vocab['ix_to_word']

    def __getitem__(self, video_id=False):

        video_id = self.splits[video_id]

        # Load the caption and time boundaries
        data = {}
        annotation = self.caption_hub[video_id]
        time = annotation['time']
        cap = annotation['cap']
        feature = annotation['feature']

        data['video_feat'] = torch.tensor(feature)
        data['time'] = time
        data['caption'] = cap
        data['video_id'] = video_id
        data['arr_length'] = data['video_feat'].shape[0]
        return data

    def __len__(self):
        length = len(self.splits)
        return length


# batch collate function
def collate_fn(batch_lst):
    '''
    :param batch_lst: Raw instance level annotation from YC2 Dataset class.
    :return: batch annotations that include: features, captions, segments, length of features and length of videos.
    '''
    batch_lens = [_['arr_length'] for _ in batch_lst]
    max_length = max(batch_lens)
    batch_feat = torch.zeros((len(batch_lst), max_length, 512))
    captions = []
    video_ids = []

    for batch_id, batch_data in enumerate(batch_lst):
        batch_feat[batch_id][: batch_data['arr_length']] = batch_data['video_feat']
        captions.append(batch_data['caption'])
        video_ids.append(batch_data['video_id'])
    return batch_feat, captions, torch.tensor(batch_lens), video_ids

In [98]:
opt = {}
opt['ann_path'] = './data'
opt['cap_file'] = '1600captioning.pkl'
opt['splits'] = 'splits.pkl'
opt['vocab'] = 'vocab.pkl'

from torch.utils.data import DataLoader
dataset = How2100MDataset(opt, mode='train')
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

In [100]:
l = []
for batch_feat, captions, video_lens, video_ids in dataloader:
    l.append(len(captions[0].split(' ')))

In [101]:
max(l)

176

In [103]:
captions

["motor buying it's a shame it will be quite easy to move the bike or worse knock it over five kinds of aggressive spider-man to the rescue of all fellow motorist and ensuring that everyone abides by the universal laws of the road like using one parking five instead of two and by using only one spot being a huge help to all of humanity thanks spider-man 4 it's the celebration "]

## Statistics about the current captioniong task

In [3]:
#C, M, R, B: 0.003142048088866361 0.05902108271759048 0.12412222552064847 [0.12006724945117442, 0.03762453494350962, 0.013472772457743728, 0.005297708520132779]
print('Cider: {:.6f}, Meteor: {:.6f}, Rouge: {:.6f}, BLEU: {:.6f}, {:.6f}, {:.6f}, {:.6f}'
      .format(0.0044643, 0.05524047, 0.12112913, 0.1187265, 0.0325400,0.0107119754, 0.0042136144795))

Cider: 0.003142, Meteor: 0.059021, Rouge: 0.124122, BLEU: 0.120067, 0.037625, 0.013473, 0.005298


In [5]:
import pandas as pd
data = [[176, 15457, 9], [28, 2000, 30]]
pd.DataFrame(data, columns=["Max Len. of Sent.", "Size of Vocab", "Avg. Len. of Feat."])

Unnamed: 0,Max Len. of Sent.,Size of Vocab,Avg. Len. of Feat.
0,176,15457,9
1,28,2000,30
