In [1]:
import os
os.chdir('../')

In [54]:
import pandas as pd
from pathlib import Path
from PIL import Image
import json
import random
import subprocess
from tqdm import tqdm

In [3]:
root_path = Path('./data/')

In [4]:
style_df = pd.read_csv(root_path / 'metadata' / 'style.csv')
style_df = style_df[~style_df['File_name'].isna()]
style_df = style_df.sort_values('File_name').reset_index(drop=True)

content_df = pd.read_csv(root_path / 'metadata' / 'content.csv')
content_df = content_df[~content_df['File_name'].isna()]
content_df = content_df.sort_values('File_name').reset_index(drop=True)

content_df.shape, style_df.shape

((126, 8), (100, 6))

In [43]:
def cartesian_product(d):
    index = pd.MultiIndex.from_product(d.values(), names=d.keys())
    return pd.DataFrame(index=index).reset_index()

weights = [5e4]

prod = cartesian_product({
    'content': content_df['File_name'],
    'style': style_df['File_name'],
    'weight': weights,
})

prod['index'] = list(prod.index+1)
prod['gifs'] = False
prod['images'] = False
prod['metadata'] = False
prod['to_review'] = False

In [44]:
import boto3
client = boto3.client('s3')

In [45]:
def iterate_bucket_items(bucket, prefix):
    """
    Generator that iterates over all objects in a given s3 bucket

    See http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Client.list_objects_v2 
    for return data format
    :param bucket: name of s3 bucket
    :return: dict of metadata for an object
    """

    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)

    for page in page_iterator:
        if page['KeyCount'] > 0:
            for item in page['Contents']:
                yield item

In [52]:
def check_component(component):
    prefix = f'output/{component}/'
    
    for item in tqdm(iterate_bucket_items(bucket='neuralism-assets', prefix=prefix)):
        if '.' not in item['Key']:
            continue
            
        print(f"Processing item {item['Key']}")
            
        key = item['Key'].split('/')[-1].split('.')[0]
        if component=='gifs':
            key = key[1:]
        
        prod.loc[prod['index']==int(key), component] = True
        
        break

In [55]:
check_component('gifs')
check_component('images')
check_component('metadata')

1it [00:00,  1.73it/s]


Processing item output/gifs/g1.gif


1it [00:00,  3.69it/s]


Processing item output/images/1.jpg


1it [00:00,  4.44it/s]

Processing item output/metadata/1.json





In [56]:
prod

Unnamed: 0,content,style,weight,index,gifs,images,metadata,to_review
0,acrobat_and_young_harlequin.jpg,alpine_pasture.jfif,50000.0,1,True,True,True,False
1,acrobat_and_young_harlequin.jpg,ameisenhaufen.jpg,50000.0,2,False,False,False,False
2,acrobat_and_young_harlequin.jpg,animals_in_a_landscape.jpg,50000.0,3,False,False,False,False
3,acrobat_and_young_harlequin.jpg,battle_of_lights.jpg,50000.0,4,False,False,False,False
4,acrobat_and_young_harlequin.jpg,bed_of_chrysanthemums.jpg,50000.0,5,False,False,False,False
...,...,...,...,...,...,...,...,...
12595,witches_sabbath.jpg,view_of_toledo.jpg,50000.0,12596,False,False,False,False
12596,witches_sabbath.jpg,wall.jpg,50000.0,12597,False,False,False,False
12597,witches_sabbath.jpg,wasserschlangen_ii.jpeg,50000.0,12598,False,False,False,False
12598,witches_sabbath.jpg,wheat_field_with_cypresses.jfif,50000.0,12599,False,False,False,False


In [57]:
prod.to_csv('final_excel.csv', index=False)