# Imports

In [21]:
import os
import sys
import pandas as pd
import numpy as np
import uuid
import json
import subprocess
from sklearn.model_selection import train_test_split
from data_processing import get_interest_frames_from_video
from download import LomotifDownloader
from tqdm import trange
from PIL import Image

# Read data
This is the data you want to use to fine-tune CoOp.

In [25]:
df = pd.read_csv('sample_data.csv',
                 nrows=50)  # for demo, let's just look at the first 20 samples

In [4]:
df.head()

Unnamed: 0,ID,VIDEO,CREATED,PRIMARY_CATEGORIES,SECONDARY_CATEGORIES,COUNTRY
0,a975f22d-8faf-4b01-9012-1187f33c95c9,https://lomotif-prod.s3.amazonaws.com/lomotifs...,2022-04-10T00:46:38.025329Z,food,recipe,RU
1,b7f2c47c-671b-4889-a9e1-8b9874e35e8f,https://lomotif-prod.s3.amazonaws.com/lomotifs...,2022-04-10T00:49:17.651142Z,animals,pets,RU
2,4a2adbc8-cc03-4f96-96a6-83a3f09b0769,https://lomotif-prod.s3.amazonaws.com/lomotifs...,2022-04-10T01:20:43.506315Z,entertainment,romantic,RU
3,19c9d156-6056-4c47-8afa-0dc37dfd389e,https://lomotif-prod.s3.amazonaws.com/lomotifs...,2022-04-10T01:33:17.773981Z,beauty-and-grooming,hair,RU
4,6163ab3a-20dc-4d0a-b8d9-cfc6c7d134b9,https://lomotif-prod.s3.amazonaws.com/lomotifs...,2022-04-10T01:43:23.840748Z,beauty-and-grooming,hair,RU


# Download key frames from the lomotifs

In [7]:
save_dir = './downloaded_lomotifs'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
ld = LomotifDownloader(save_dir)

img_dir = './key_frames_extracted'
if not os.path.exists(img_dir):
    os.makedirs(img_dir)

In [26]:
for i in trange(len(df)):
    video_url = df.loc[i, 'VIDEO']
    lid = df.loc[i, 'ID']

    # download the lomotif
    result, save_file_name = ld.download(video_url=video_url, lomotif_id=lid)

    # extract key frames
    (
        key_frames,
        fps,
        num_frames,
        selected_frame_indices,
    ) = get_interest_frames_from_video(save_file_name)

    # save all key frames as images
    for frame in key_frames:
        im = Image.fromarray(frame)
        imfilename = '{}.png'.format(str(uuid.uuid4()).split('-')[0])

        if not os.path.exists(os.path.join(img_dir, lid)):
            os.makedirs(os.path.join(img_dir, lid))
        im.save(os.path.join(img_dir, lid, imfilename))

    # delete downloaded lomotif as we do not need it anymore
    os.remove(save_file_name)

100%|██████████| 29/29 [01:22<00:00,  2.84s/it]


# Move images into folder structure required by CoOp

In [19]:
coop_data_dir = './CoOp/data/tagging'
if not os.path.exists(coop_data_dir):
    if not os.path.exists('./CoOp/data'):
        os.makedirs('./CoOp/data')
    os.makedirs(coop_data_dir)

In [15]:
# define secondary to primary category mapping
mapping = {
    'birds': 'animals',
    'domestic-animals': 'animals',
    'pets': 'animals',
    'wild-life': 'animals',
    'advertisement': 'branding-and-advertisements',
    'religious-speech': 'devotional',
    'religious-festivals': 'devotional',
    'god': 'devotional',
    'holy-places': 'devotional',
    'tutorials': 'education',
    'general-knowledge': 'education',
    'quiz': 'education',
    'performance': 'entertainment',
    'comedy': 'entertainment',
    'drama': 'entertainment',
    'meme': 'entertainment',
    'bloopers': 'entertainment',
    'funny-clips': 'entertainment',
    'funny-dubbing': 'entertainment',
    'pranks': 'entertainment',
    'romantic': 'entertainment',
    'entertainment-celebrities': 'entertainment',
    'party': 'entertainment',
    'tv-series-and-shows': 'entertainment',
    'movie': 'entertainment',
    'kids': 'family-and-friends',
    'friends': 'family-and-friends',
    'recipe': 'food',
    'street-food': 'food',
    'food-celebrities': 'food',
    'beverages': 'food',
    'desserts': 'food',
    'fitness': 'health-and-fitness',
    'shopping': 'shopping',
    'home-decor': 'home-decoration',
    'fashion': 'fashion',
    'quotes': 'inspirational',
    'attitude-motivation': 'inspirational',
    'spiritual-motivation': 'inspirational',
    'motivational-celebrities': 'inspirational',
    'car': 'automobiles',
    'motorbikes': 'automobiles',
    'trucks': 'automobiles',
    'cricket': 'sports-and-games',
    'badminton': 'sports-and-games',
    'football': 'sports-and-games',
    'wrestling': 'sports-and-games',
    'hockey': 'sports-and-games',
    'basketball': 'sports-and-games',
    'boxing': 'sports-and-games',
    'racing': 'sports-and-games',
    'swimming': 'sports-and-games',
    'sports-celebrities': 'sports-and-games',
    'bull-riding': 'sports-and-games',
    'skateboarding': 'adventure',
    'sky-diving': 'adventure',
    'bungee-jump': 'adventure',
    'wing-suit': 'adventure',
    'rope-walking': 'adventure',
    'adventure-rides': 'adventure',
    'vehicular-stunts': 'adventure',
    'surfing': 'adventure',
    'photography': 'art-and-creativity',
    'drawing-and-painting': 'art-and-creativity',
    'beach': 'travel',
    'nature': 'travel',
    'travel-vlogs': 'travel',
    'birthday': 'celebration-and-wishes',
    'other-wishes': 'celebration-and-wishes',
    'anniversary': 'celebration-and-wishes',
    'music-performance': 'music-and-singing',
    'singing': 'music-and-singing',
    'anime': 'animation',
    'gifs': 'animation',
    'cartoon': 'animation',
    'other-gaming': 'gaming',
    'dancing': 'dance',
    'dance-performance': 'dance',
    'make-up': 'beauty-and-grooming',
    'skincare': 'beauty-and-grooming',
    'hair': 'beauty-and-grooming',
    'grooming-tips': 'beauty-and-grooming',
    'nails': 'beauty-and-grooming',
    'beauty-accessories': 'beauty-and-grooming',
    'mobile-phones': 'gadgets-and-technology',
    'watches': 'gadgets-and-technology',
    'selfie': 'selfies',
    'animal-cruelty': 'animal-cruelty',
    'dead-bodies-&-corpes': 'dead-bodies-&-corpes',
    'physical-violence': 'physical-violence',
    'profanity': 'profanity',
    'visually-disturbing': 'visually-disturbing',
    'weapons-&-firearms': 'weapons-&-firearms'
}

In [27]:
for i in trange(len(df)):
    video_url = df.loc[i, 'VIDEO']
    lid = df.loc[i, 'ID']
    sec_cat = df.loc[i, 'SECONDARY_CATEGORIES'].split(', ')
    pri_cat = list(set([mapping[x] for x in sec_cat if x in mapping.keys()]))
    
    if len(pri_cat) > 0:
        # create a folder per class for the first time
        for label in pri_cat:
            if not os.path.exists(os.path.join(coop_data_dir, label)):
                os.makedirs(os.path.join(coop_data_dir, label))

        # move key frame images to respective class folders
        key_frame_files = os.listdir(os.path.join(img_dir, lid))

        for f in key_frame_files:
            for label in pri_cat:
                subprocess.call(['cp', os.path.join(img_dir, lid, f), os.path.join(coop_data_dir, label)])

100%|██████████| 50/50 [00:08<00:00,  6.03it/s]


In [34]:
for f in os.listdir(coop_data_dir):
    print('{}: {} images'.format(
        f, len(os.listdir(os.path.join(coop_data_dir, f)))))

travel: 68 images
beauty-and-grooming: 92 images
adventure: 29 images
animals: 56 images
fashion: 131 images
family-and-friends: 180 images
art-and-creativity: 15 images
food: 155 images
dance: 8 images
entertainment: 26 images


# Write json file indicating train test split 

In [49]:
src_dir = './CoOp/data/tagging'
dict_file = {}
dict_file['train'] = []
dict_file['test'] = []
dict_file['val'] = []
dict_file

{'train': [], 'test': [], 'val': []}

In [50]:
idxnew = 0
for idx, i in enumerate(os.listdir(src_dir)):
    train, test = train_test_split(os.listdir(os.path.join(src_dir, i)),
                                   test_size=0.8,
                                   random_state=123)
    test_new, val = train_test_split(test, test_size=0.5, random_state=123)

    for file_path_train in [os.path.join(i, j) for j in train]:
        dict_file['train'].append([file_path_train, idxnew, i])
    for file_path_test in [os.path.join(i, j) for j in test_new]:
        dict_file['test'].append([file_path_test, idxnew, i])
    for file_path_val in [os.path.join(i, j) for j in val]:
        dict_file['val'].append([file_path_val, idxnew, i])
    idxnew += 1

In [51]:
dict_file

{'train': [['travel/fdb74d9f.png', 0, 'travel'],
  ['travel/0c54a462.png', 0, 'travel'],
  ['travel/c359f3fa.png', 0, 'travel'],
  ['travel/6b3ef934.png', 0, 'travel'],
  ['travel/c348796c.png', 0, 'travel'],
  ['travel/7ab91a58.png', 0, 'travel'],
  ['travel/f5d7f758.png', 0, 'travel'],
  ['travel/2f9fa6de.png', 0, 'travel'],
  ['travel/f4c64c60.png', 0, 'travel'],
  ['travel/251dccbe.png', 0, 'travel'],
  ['travel/c39c70a5.png', 0, 'travel'],
  ['travel/486371d4.png', 0, 'travel'],
  ['travel/3fb23a83.png', 0, 'travel'],
  ['beauty-and-grooming/c0e07dbc.png', 1, 'beauty-and-grooming'],
  ['beauty-and-grooming/a5742969.png', 1, 'beauty-and-grooming'],
  ['beauty-and-grooming/c0ab370a.png', 1, 'beauty-and-grooming'],
  ['beauty-and-grooming/417668cb.png', 1, 'beauty-and-grooming'],
  ['beauty-and-grooming/f9016593.png', 1, 'beauty-and-grooming'],
  ['beauty-and-grooming/6001acd7.png', 1, 'beauty-and-grooming'],
  ['beauty-and-grooming/35b81949.png', 1, 'beauty-and-grooming'],
  ['beaut

In [52]:
with open('./CoOp/data/tagging/split_zhou_Tagging.json', 'w') as f:
    json.dump(dict_file, f, indent=4)