In [1]:
import os
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import shutil
import numpy as np
from multiprocessing import Pool

In [2]:
def get_zip_file(bucket, remote_path, local_path, dest_dir=None):
    """Download a zip archive from a bucket and unpack it into a directory.

    The archive is fetched to ``local_path``, extracted into ``dest_dir`` and
    the downloaded file is deleted afterwards, whether or not extraction
    succeeded. The function is best-effort: a failed download or extraction
    is reported on stdout and swallowed so that a batch of downloads can
    carry on with the next archive.

    Args:
        bucket: Object exposing ``download_file(remote_path, local_path)``
            (the notebook passes a boto3 ``Bucket`` resource).
        remote_path: Key of the archive inside the bucket.
        local_path: Local file path where the archive is stored temporarily.
        dest_dir: Directory to extract into. When omitted, the notebook-level
            ``TMP_DIR`` constant is used (looked up at call time).

    Returns:
        None.
    """
    if dest_dir is None:
        dest_dir = TMP_DIR

    try:
        bucket.download_file(remote_path, local_path)
        # Extract in-process instead of shelling out to `unzip`: no command
        # string is built from the path, nothing is written to the current
        # working directory, and re-runs cannot block on an overwrite prompt.
        # NOTE(review): assumes each archive holds a single top-level folder
        # named after the zip file, so it lands at dest_dir/<zip stem> --
        # confirm against the bucket contents.
        shutil.unpack_archive(local_path, dest_dir, format='zip')
    except Exception as exc:
        # Deliberately non-fatal, but keep the reason visible.
        print('Unable to get/unzip ' + remote_path + ' (' + repr(exc) + ')')
    finally:
        # Never leave a downloaded archive behind: a stray file in the
        # extraction directory breaks code that expects only folders there.
        if os.path.isfile(local_path):
            os.remove(local_path)

In [3]:
# Local scratch directory for downloaded archives and their extracted
# contents; it is removed again at the end of the notebook.
TMP_DIR = 'tmp'

# Name of the S3 bucket that is listed and downloaded from.
QUANTIGO_BUCKET = 'quantigo'

In [4]:
# Key prefix shared by every per-match top-level "folder" in the bucket.
MATCH_PREFIX = 'clubber_video'

s3 = boto3.resource('s3')
bucket = s3.Bucket(QUANTIGO_BUCKET)

# Let S3 filter by prefix server-side instead of listing the whole bucket and
# discarding keys client-side. The first path component of each remaining key
# identifies the match; np.unique de-duplicates and sorts them (as strings).
matches = np.unique([obj.key.split('/')[0]
                     for obj in bucket.objects.filter(Prefix=MATCH_PREFIX)])
matches

array(['clubber_video1', 'clubber_video10', 'clubber_video11',
       'clubber_video12', 'clubber_video13', 'clubber_video14',
       'clubber_video15', 'clubber_video16', 'clubber_video17',
       'clubber_video18', 'clubber_video19', 'clubber_video2',
       'clubber_video20', 'clubber_video21', 'clubber_video22',
       'clubber_video23', 'clubber_video25', 'clubber_video26',
       'clubber_video3', 'clubber_video4', 'clubber_video6',
       'clubber_video7', 'clubber_video8', 'clubber_video9'], dtype='<U15')

In [17]:
# Overall budget and the cost of a single sample, in the same unit.
TOT_BUDGET = 150
UNIT_COST = 0.05

# Split the budget evenly across matches (rounded to a whole unit per match),
# then convert each match's share into a number of samples.
match_budget = round(TOT_BUDGET / len(matches))
match_samples = round(match_budget / UNIT_COST)

match_samples

120

In [6]:
# The scratch directory does not depend on the match: create it once.
os.makedirs(TMP_DIR, exist_ok=True)

for match in matches:
    # Request only this match's keys (server-side prefix filter) rather than
    # re-listing the entire bucket on every iteration. Keys ending in '/' are
    # folder placeholders with no file to download, so they are skipped.
    remote_zip_files = [obj.key
                        for obj in bucket.objects.filter(Prefix=f'{match}/')
                        if not obj.key.endswith('/')]

    for remote_file in remote_zip_files:
        # S3 keys always use '/' as separator, independent of the local OS.
        get_zip_file(bucket, remote_file, os.path.join(TMP_DIR, remote_file.split('/')[-1]))

    print(match, 'DONE')

clubber_video1 DONE
clubber_video10 DONE
clubber_video11 DONE
clubber_video12 DONE
clubber_video13 DONE
clubber_video14 DONE
clubber_video15 DONE
clubber_video16 DONE
clubber_video17 DONE
clubber_video18 DONE
clubber_video19 DONE
clubber_video2 DONE
clubber_video20 DONE
clubber_video21 DONE
clubber_video22 DONE
clubber_video23 DONE
clubber_video25 DONE
clubber_video26 DONE
clubber_video3 DONE
clubber_video4 DONE
clubber_video6 DONE
clubber_video7 DONE
clubber_video8 DONE
clubber_video9 DONE


In [18]:
# Show full paths in rendered DataFrames instead of truncating them.
pd.set_option('display.max_colwidth', None)

# Walk TMP_DIR/<frames folder>/<image> and record each image path together
# with the match it belongs to. Folder names look like
# 'frames_<match>-<start>-<end>', so the match is taken from the folder name
# itself rather than re-parsed from the joined path; this keeps working when
# TMP_DIR has several components or the OS separator is not '/'.
images_data = {'path': [], 'match': []}
for d in os.listdir(TMP_DIR):
    frames_dir = os.path.join(TMP_DIR, d)
    if not os.path.isdir(frames_dir):
        # Stray files (e.g. a leftover archive) are not frame folders.
        continue
    match_name = d.split('-')[0].split('frames_')[1]
    for f in os.listdir(frames_dir):
        images_data['path'].append(os.path.join(frames_dir, f))
        images_data['match'].append(match_name)

# Sorting keeps the original row labels, so the index reflects listing order.
images_df = pd.DataFrame(images_data).sort_values('path')
images_df

Unnamed: 0,path,match
25757,tmp/frames_clubber_video1-000027-000070/img_00000000.jpg,clubber_video1
25789,tmp/frames_clubber_video1-000027-000070/img_00000001.jpg,clubber_video1
25755,tmp/frames_clubber_video1-000027-000070/img_00000002.jpg,clubber_video1
25770,tmp/frames_clubber_video1-000027-000070/img_00000003.jpg,clubber_video1
25777,tmp/frames_clubber_video1-000027-000070/img_00000004.jpg,clubber_video1
...,...,...
25734,tmp/frames_clubber_video9-004178-004219/img_00000035.jpg,clubber_video9
25720,tmp/frames_clubber_video9-004178-004219/img_00000036.jpg,clubber_video9
25738,tmp/frames_clubber_video9-004178-004219/img_00000037.jpg,clubber_video9
25712,tmp/frames_clubber_video9-004178-004219/img_00000038.jpg,clubber_video9


In [25]:
# Seed reused for every match so the selection is reproducible.
seed = 0

# Draw up to `match_samples` images per match. Groups are visited in order of
# first appearance (sort=False) and keep their row order. The sample size is
# capped at the group size so a match with few images contributes everything
# it has instead of raising ValueError.
per_match_samples = []
for match, df in images_df.groupby('match', sort=False):
    n_draw = min(match_samples, len(df))
    per_match_samples.append(df.sample(n_draw, random_state=seed))

# Concatenate once at the end; growing a DataFrame inside the loop is quadratic.
samples_df = pd.concat(per_match_samples, axis=0) if per_match_samples else pd.DataFrame()

samples_df

Unnamed: 0,path,match
25766,tmp/frames_clubber_video1-000027-000070/img_00000018.jpg,clubber_video1
32243,tmp/frames_clubber_video1-000773-000794/img_00000003.jpg,clubber_video1
24684,tmp/frames_clubber_video1-001137-001148/img_00000007.jpg,clubber_video1
23278,tmp/frames_clubber_video1-002439-002459/img_00000002.jpg,clubber_video1
42023,tmp/frames_clubber_video1-002870-002877/img_00000000.jpg,clubber_video1
...,...,...
1058,tmp/frames_clubber_video9-002831-002860/img_00000022.jpg,clubber_video9
35274,tmp/frames_clubber_video9-002548-002565/img_00000007.jpg,clubber_video9
7770,tmp/frames_clubber_video9-002077-002114/img_00000034.jpg,clubber_video9
13700,tmp/frames_clubber_video9-003538-003541/img_00000001.jpg,clubber_video9


In [27]:
# Replace the inherited row labels with a fresh 0..n-1 index and discard the
# old labels instead of keeping them as a column.
samples_df = samples_df.reset_index(drop=True)
samples_df

Unnamed: 0,path,match
0,tmp/frames_clubber_video1-000027-000070/img_00000018.jpg,clubber_video1
1,tmp/frames_clubber_video1-000773-000794/img_00000003.jpg,clubber_video1
2,tmp/frames_clubber_video1-001137-001148/img_00000007.jpg,clubber_video1
3,tmp/frames_clubber_video1-002439-002459/img_00000002.jpg,clubber_video1
4,tmp/frames_clubber_video1-002870-002877/img_00000000.jpg,clubber_video1
...,...,...
2875,tmp/frames_clubber_video9-002831-002860/img_00000022.jpg,clubber_video9
2876,tmp/frames_clubber_video9-002548-002565/img_00000007.jpg,clubber_video9
2877,tmp/frames_clubber_video9-002077-002114/img_00000034.jpg,clubber_video9
2878,tmp/frames_clubber_video9-003538-003541/img_00000001.jpg,clubber_video9


In [28]:
# Give every sampled image a sequential, zero-padded file name based on its
# position in the frame (img_00000000.jpg, img_00000001.jpg, ...).
samples_df['new_name'] = ['img_' + str(position).zfill(8) + '.jpg'
                          for position in range(len(samples_df))]
samples_df

Unnamed: 0,path,match,new_name
0,tmp/frames_clubber_video1-000027-000070/img_00000018.jpg,clubber_video1,img_00000000.jpg
1,tmp/frames_clubber_video1-000773-000794/img_00000003.jpg,clubber_video1,img_00000001.jpg
2,tmp/frames_clubber_video1-001137-001148/img_00000007.jpg,clubber_video1,img_00000002.jpg
3,tmp/frames_clubber_video1-002439-002459/img_00000002.jpg,clubber_video1,img_00000003.jpg
4,tmp/frames_clubber_video1-002870-002877/img_00000000.jpg,clubber_video1,img_00000004.jpg
...,...,...,...
2875,tmp/frames_clubber_video9-002831-002860/img_00000022.jpg,clubber_video9,img_00002875.jpg
2876,tmp/frames_clubber_video9-002548-002565/img_00000007.jpg,clubber_video9,img_00002876.jpg
2877,tmp/frames_clubber_video9-002077-002114/img_00000034.jpg,clubber_video9,img_00002877.jpg
2878,tmp/frames_clubber_video9-003538-003541/img_00000001.jpg,clubber_video9,img_00002878.jpg


In [29]:
OUT_DIR = 'filtered_samples'
os.makedirs(OUT_DIR, exist_ok=True)

# Copy each sampled image into OUT_DIR under its new sequential name.
for src_path, target_name in zip(samples_df['path'], samples_df['new_name']):
    shutil.copy(src_path, os.path.join(OUT_DIR, target_name))

In [32]:
# Persist the sample table (original path, match, new name) for later use.
pd.to_pickle(samples_df, 'samples.pkl')

# Remove the scratch directory. After this, the 'path' column of the saved
# table refers to files that no longer exist locally.
shutil.rmtree(TMP_DIR)