In [None]:
%%time

from adlfs import AzureBlobFileSystem

import time

import json
import pandas
import pandas as pd
from tqdm import tqdm

from common.schemas.pyarrow_schema import tagging_schema
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.captioning.azure_descriptions import AzureCaption
from common.schemas.pyarrow_schema import schema
from PIL import Image
import requests


tqdm.pandas(desc="Progress")
file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()

from common.data_frame_functions.functions import Functions

functions: Functions = Functions()

caption: AzureCaption = AzureCaption(file_system)

In [5]:
%%time

curated_df = pandas.read_parquet('data/parquet/back.parquet', filesystem=file_system, engine='pyarrow')

CPU times: total: 766 ms
Wall time: 1.88 s


In [6]:
%%time

captions = pd.read_parquet('data/parquet/image_captions.parquet', filesystem=file_system, engine='pyarrow')
tags = pd.read_parquet('data/parquet/image_tags.parquet', filesystem=file_system, engine='pyarrow')
crops = pd.read_parquet('data/parquet/image_cropping.parquet', filesystem=file_system, engine='pyarrow')

display(captions.shape)
display(tags.shape)
display(crops.shape)

(31180, 7)

(71153, 3)

(3509, 6)

CPU times: total: 391 ms
Wall time: 1.09 s


In [7]:
%%time

current_captions = file_system.ls("data/caption")
display(len(current_captions))

3509

CPU times: total: 5.27 s
Wall time: 6.74 s


In [8]:
%%time

all_data = []
for caption_file in tqdm(current_captions, total=len(current_captions), desc='Reading caption files'):
	caption_data = json.loads(file_system.read_text(caption_file, encoding='utf-8'))
	dense_caption_result = caption_data.get('denseCaptionsResult')
	metadata = caption_data.get('metadata')
	tags_result = caption_data.get('tagsResult')
	smart_crop_result = caption_data.get('smartCropsResult')
	basic_caption = caption_data.get('captionResult')
	image_id = caption_file.split('/')[-1].split('.')[0]
	filtered_data = {
		"id": image_id,
		'captions': [basic_caption],
		"dense_captions": dense_caption_result['values'],
		"meta": [metadata],
		"tags": tags_result['values'],
		"smart_crop": smart_crop_result['values']
	}
	all_data.append(filtered_data)

Reading caption files: 100%|██████████| 3509/3509 [05:38<00:00, 10.36it/s]

CPU times: total: 1min 29s
Wall time: 5min 38s





In [9]:
%%time

new_captions = pandas.json_normalize(data=all_data, record_path=['dense_captions'], meta=['id'], record_prefix='dense_captions_')
new_tags = pandas.json_normalize(data=all_data, record_path=['tags'], meta=['id'], record_prefix='tags_')
new_crops = pandas.json_normalize(data=all_data, record_path=['smart_crop'], meta=['id'], record_prefix='smart_crop_')

CPU times: total: 4.95 s
Wall time: 7.66 s


In [10]:
%%time

new_basic_captions = pandas.json_normalize(data=all_data, record_path=['captions'], meta=['id'], record_prefix='captions_')
meta = pandas.json_normalize(data=all_data, record_path=['meta'], meta=['id'], record_prefix='meta_')

CPU times: total: 219 ms
Wall time: 344 ms


In [11]:
%%time

display(new_captions)
display(new_tags)
display(new_crops)
display(new_basic_captions)
display(meta)

Unnamed: 0,dense_captions_text,dense_captions_confidence,dense_captions_boundingBox.x,dense_captions_boundingBox.y,dense_captions_boundingBox.w,dense_captions_boundingBox.h,id
0,a woman taking a selfie,0.662057,0,0,1468,2608,1002cx2
1,a woman taking a selfie,0.665344,0,100,1447,2473,1002cx2
2,a woman wearing a black dress,0.473444,185,1443,1265,1150,1002cx2
3,a close up of a key,0.593095,716,1503,82,140,1002cx2
4,a close up of an eye,0.619898,778,744,152,84,1002cx2
...,...,...,...,...,...,...,...
31175,a close-up of a green plant,0.453059,680,1066,394,280,zzxq2l
31176,a mountain range in the distance,0.334624,498,277,568,99,zzxq2l
31177,a woman smiling at the camera,0.465720,244,293,319,530,zzxq2l
31178,a woman wearing a dress,0.548892,0,758,1048,570,zzxq2l


Unnamed: 0,tags_name,tags_confidence,id
0,person,0.998358,1002cx2
1,human face,0.997763,1002cx2
2,clothing,0.990993,1002cx2
3,lady,0.981919,1002cx2
4,smile,0.964994,1002cx2
...,...,...,...
71148,woman,0.753262,zzxq2l
71149,beach,0.739799,zzxq2l
71150,standing,0.630927,zzxq2l
71151,girl,0.556561,zzxq2l


Unnamed: 0,smart_crop_aspectRatio,smart_crop_boundingBox.x,smart_crop_boundingBox.y,smart_crop_boundingBox.w,smart_crop_boundingBox.h,id
0,1.0,0,140,1462,1462,1002cx2
1,1.0,143,138,1249,1250,1003dod
2,1.0,0,355,1948,1948,1008ddt
3,1.0,0,224,1120,1120,100dcrd
4,1.0,0,0,1107,1107,100jl1b
...,...,...,...,...,...,...
3504,1.0,0,0,1025,1025,zzr9nz
3505,1.0,0,0,461,461,zzruf4
3506,1.0,0,0,2273,2273,zzu8i8
3507,1.0,135,0,1299,1299,zzvv3x


Unnamed: 0,captions_text,captions_confidence,id
0,a woman taking a selfie,0.662057,1002cx2
1,a woman in a garment standing next to a tree,0.513081,1003dod
2,a woman taking a selfie,0.658102,1008ddt
3,a group of women posing for a picture,0.556227,100dcrd
4,a group of women posing for a picture,0.614969,100jl1b
...,...,...,...
3504,two women in garments on a beach,0.529292,zzr9nz
3505,a woman with tattoos on her arm,0.501798,zzruf4
3506,a woman in a blue lingerie,0.430763,zzu8i8
3507,a woman sitting on a bed,0.584322,zzvv3x


Unnamed: 0,meta_width,meta_height,id
0,1468,2608,1002cx2
1,1523,2030,1003dod
2,1956,3075,1008ddt
3,1152,1642,100dcrd
4,1152,1152,100jl1b
...,...,...,...
3504,1080,1350,zzr9nz
3505,570,704,zzruf4
3506,2309,2560,zzu8i8
3507,1440,1800,zzvv3x


CPU times: total: 125 ms
Wall time: 218 ms


In [12]:
merge_singles = pandas.merge(new_basic_captions, meta, on='id').set_index(keys=['id'], drop=False)
merge_singles.drop_duplicates(inplace=True)
merge_singles.reset_index(drop=True, inplace=True)
display(merge_singles)

Unnamed: 0,captions_text,captions_confidence,id,meta_width,meta_height
0,a woman taking a selfie,0.662057,1002cx2,1468,2608
1,a woman in a garment standing next to a tree,0.513081,1003dod,1523,2030
2,a woman taking a selfie,0.658102,1008ddt,1956,3075
3,a group of women posing for a picture,0.556227,100dcrd,1152,1642
4,a group of women posing for a picture,0.614969,100jl1b,1152,1152
...,...,...,...,...,...
3504,two women in garments on a beach,0.529292,zzr9nz,1080,1350
3505,a woman with tattoos on her arm,0.501798,zzruf4,570,704
3506,a woman in a blue lingerie,0.430763,zzu8i8,2309,2560
3507,a woman sitting on a bed,0.584322,zzvv3x,1440,1800


In [13]:
merged_to_curate = pandas.merge(merge_singles, curated_df, on='id', how='outer').set_index(keys=['id'], drop=False)
merged_to_curate.fillna(value='', inplace=True)
display(merged_to_curate)

Unnamed: 0_level_0,captions_text,captions_confidence,id,meta_width,meta_height,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1002cx2,a woman taking a selfie,0.662057,1002cx2,1468.0,2608.0,SFWRedheads,Sayleywayley,Happy New Year to every single person near and...,blonde woman with red hair and black bra top p...,1fb785ca16b10f4f7613961f0b88f369,/r/SFWRedheads/comments/1002cx2/happy_new_year...,https://i.redd.it/el3s490lzb9a1.jpg,1002cx2.jpg,data/image/1002cx2.jpg,RedHeadDiffusion,True,True,True,[]
1003dod,a woman in a garment standing next to a tree,0.513081,1003dod,1523.0,2030.0,HotGirlNextDoor,fairytale808,pink and blonde (iktr),blonde woman in pink bikinisuit standing on a ...,0849c7aafe126ad27b58cb6e6c9c6592,/r/HotGirlNextDoor/comments/1003dod/pink_and_b...,https://i.redd.it/x56eekrd8c9a1.jpg,1003dod.jpg,data/image/1003dod.jpg,SexyDiffusion,True,True,True,[]
1008ddt,a woman taking a selfie,0.658102,1008ddt,1956.0,3075.0,SFWRedheads,Fit_Advertising4668,happy new year from Scotland 🏴󠁧󠁢󠁳󠁣󠁴󠁿,smiling woman in a green velvet dress with a g...,945b3f444ccaa79f73655fa44bd6a156,/r/SFWRedheads/comments/1008ddt/happy_new_year...,https://i.redd.it/rdywcwiyhd9a1.jpg,1008ddt.jpg,data/image/1008ddt.jpg,RedHeadDiffusion,True,True,True,[]
100dcrd,a group of women posing for a picture,0.556227,100dcrd,1152.0,1642.0,HotGirlNextDoor,angizni,College hotties (iktr),three girls in bikinis posing for a picture wi...,f07022ea6fac84354178a09e92dc3865,/r/HotGirlNextDoor/comments/100dcrd/college_ho...,https://i.redd.it/bm6jpns80f9a1.jpg,100dcrd.jpg,data/image/100dcrd.jpg,SexyDiffusion,True,True,True,[]
100jl1b,a group of women posing for a picture,0.614969,100jl1b,1152.0,1152.0,HotGirlNextDoor,angizni,Sorority hotties (iktr),three women in red dresses standing next to ea...,6cdfb6741d444b243542ae7d40f57b40,/r/HotGirlNextDoor/comments/100jl1b/sorority_h...,https://i.redd.it/wfxx3uc35h9a1.jpg,100jl1b.jpg,data/image/100jl1b.jpg,SexyDiffusion,True,True,True,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13d2hsf,,,13d2hsf,,,bathandbodyworks,alesieoksap,Ice Cream Shop Collection?,someone holding a cup of ice cream in their hand,c6788074138175c661fc652bd7630e33,/r/bathandbodyworks/comments/13d2hsf/ice_cream...,https://i.redd.it/uhwamoq99wya1.jpg,13d2hsf.jpg,data/image/13d2hsf.jpg,CandleDiffusion,True,True,True,[]
13d661b,,,13d661b,,,bathandbodyworks,xeloux,Facebook market finds!,three candles are sitting on a blanket on a bed,e868289445c195815dd67dd0df1a65cb,/r/bathandbodyworks/comments/13d661b/facebook_...,https://i.redd.it/skd10hefxwya1.jpg,13d661b.jpg,data/image/13d661b.jpg,CandleDiffusion,True,True,True,[]
13d8s0i,,,13d8s0i,,,bathandbodyworks,Alternative-Tea-9355,Black Cherry Merlot dupe,someone holding a tube of body cream in a store,965bc5a58a597192700234729758101b,/r/bathandbodyworks/comments/13d8s0i/black_che...,https://i.redd.it/xt7qew9qexya1.jpg,13d8s0i.jpg,data/image/13d8s0i.jpg,CandleDiffusion,True,True,False,[]
13dagbr,,,13dagbr,,,bathandbodyworks,Dove04,Does anyone know how I could get rid of this r...,someone is holding a white device with a hole ...,41732e4109a99c99c91c2181867d18e3,/r/bathandbodyworks/comments/13dagbr/does_anyo...,https://i.redd.it/p0j6atz6rxya1.jpg,13dagbr.jpg,data/image/13dagbr.jpg,CandleDiffusion,True,True,False,[]


In [14]:
%%time

merged_captions = pandas.concat([new_captions, captions])
merged_captions.set_index(keys=['id', 'dense_captions_text', 'dense_captions_confidence'], inplace=True, drop=False)
merged_captions.drop_duplicates(inplace=True)
merged_captions.reset_index(drop=True, inplace=True)

display(f'{merged_captions.shape[0] - captions.shape[0]} new rows added to captions')

merged_captions.to_parquet('data/parquet/image_captions.parquet', filesystem=file_system, engine='pyarrow')
display(pandas.read_parquet('data/parquet/image_captions.parquet', filesystem=file_system, engine='pyarrow'))

'0 new rows added to captions'

Unnamed: 0,dense_captions_text,dense_captions_confidence,dense_captions_boundingBox.x,dense_captions_boundingBox.y,dense_captions_boundingBox.w,dense_captions_boundingBox.h,id
0,a woman taking a selfie,0.662057,0,0,1468,2608,1002cx2
1,a woman taking a selfie,0.665344,0,100,1447,2473,1002cx2
2,a woman wearing a black dress,0.473444,185,1443,1265,1150,1002cx2
3,a close up of a key,0.593095,716,1503,82,140,1002cx2
4,a close up of an eye,0.619898,778,744,152,84,1002cx2
...,...,...,...,...,...,...,...
31175,a close-up of a green plant,0.453059,680,1066,394,280,zzxq2l
31176,a mountain range in the distance,0.334624,498,277,568,99,zzxq2l
31177,a woman smiling at the camera,0.465720,244,293,319,530,zzxq2l
31178,a woman wearing a dress,0.548892,0,758,1048,570,zzxq2l


CPU times: total: 562 ms
Wall time: 1.8 s


In [15]:
%%time

merged_tags = pandas.concat([new_tags, tags])
merged_tags.set_index(keys=['id', 'tags_name', 'tags_confidence'], inplace=True, drop=False)
merged_tags.drop_duplicates(inplace=True)
merged_tags.reset_index(drop=True, inplace=True)

display(f'{merged_tags.shape[0] - tags.shape[0]} new rows added to tags')

merged_tags.to_parquet('data/parquet/image_tags.parquet', filesystem=file_system, engine='pyarrow')
display(pandas.read_parquet('data/parquet/image_tags.parquet', filesystem=file_system, engine='pyarrow'))

'0 new rows added to tags'

Unnamed: 0,tags_name,tags_confidence,id
0,person,0.998358,1002cx2
1,human face,0.997763,1002cx2
2,clothing,0.990993,1002cx2
3,lady,0.981919,1002cx2
4,smile,0.964994,1002cx2
...,...,...,...
71148,woman,0.753262,zzxq2l
71149,beach,0.739799,zzxq2l
71150,standing,0.630927,zzxq2l
71151,girl,0.556561,zzxq2l


CPU times: total: 625 ms
Wall time: 2.15 s


In [16]:
%%time

merged_crops = pandas.concat([new_crops, crops])
merged_crops.set_index(keys=['id'], inplace=True, drop=False)
merged_crops.drop_duplicates(inplace=True)
merged_crops.reset_index(drop=True, inplace=True)

display(f'{merged_crops.shape[0] - crops.shape[0]} new rows added to crops')

merged_crops.to_parquet('data/parquet/image_cropping.parquet', filesystem=file_system, engine='pyarrow')

display(pandas.read_parquet('data/parquet/image_cropping.parquet', filesystem=file_system, engine='pyarrow'))

'0 new rows added to crops'

Unnamed: 0,smart_crop_aspectRatio,smart_crop_boundingBox.x,smart_crop_boundingBox.y,smart_crop_boundingBox.w,smart_crop_boundingBox.h,id
0,1.0,0,140,1462,1462,1002cx2
1,1.0,143,138,1249,1250,1003dod
2,1.0,0,355,1948,1948,1008ddt
3,1.0,0,224,1120,1120,100dcrd
4,1.0,0,0,1107,1107,100jl1b
...,...,...,...,...,...,...
3504,1.0,0,0,1025,1025,zzr9nz
3505,1.0,0,0,461,461,zzruf4
3506,1.0,0,0,2273,2273,zzu8i8
3507,1.0,135,0,1299,1299,zzvv3x


CPU times: total: 109 ms
Wall time: 578 ms


In [17]:
import os
def create_thumbnail(image_id_, curated_df_, crops_, extant_files_):
	try:
		record = None
		cropping_information = None
		if image_id_ in extant_files_:
			return None
		try:
			record = curated_df.loc[curated_df['id'] == image_id_]
			cropping_information = crops.loc[crops['id'] == image_id_]
		except KeyError or IndexError:
			return None
		if record is None and cropping_information is None and len(record) == 0 and len(cropping_information) == 0:
			return None
		if record is not None and cropping_information is not None:
			record = curated_df_.loc[curated_df_['id'] == image_id_]
			cropping_information = crops_.loc[crops['id'] == image_id_]
			try:
				image_url = file_system.url(record.path.values[0])
			except Exception as e:
				display(f'Error creating thumbnail for {image_id_}: {e}', clear=True)
				return None
			original_image = Image.open(requests.get(image_url, stream=True).raw)
			copied_image = original_image.copy()
			original_image.close()
			try:
				cropped = copied_image.crop((cropping_information['smart_crop_boundingBox.x'].values[0], cropping_information['smart_crop_boundingBox.y'].values[0], cropping_information['smart_crop_boundingBox.x'].values[0] + cropping_information['smart_crop_boundingBox.w'].values[0], cropping_information['smart_crop_boundingBox.y'].values[0] + cropping_information['smart_crop_boundingBox.h'].values[0]))
			except Exception as e:
				display(f'Error creating thumbnail for {image_id_}: {e}', clear=True)
				return None
			copied_image.close()
			resized = cropped.resize((512, 512), 1)
			cropped.close()
			resized.save('temp.jpg')
			file_system.upload('temp.jpg', f'data/image/thumbnail/{image_id_}.jpg', overwrite=True)
			display(f'Thumbnail created for {image_id_}', clear=True)
			return None
		else:
			return None
	except Exception as e:
		display(f'Error creating thumbnail for {image_id_}: {e}', clear=True)
		return None

In [18]:
%%time

extant = [os.path.basename(item.replace('\n', '').split('.')[0]) for item in file_system.ls('data/image/thumbnail')]
display(extant)

['1011gjo',
 '1013bdt',
 '1019kyo',
 '101w7lh',
 '1027i9a',
 '1032j8y',
 '1035nya',
 '103h4hd',
 '103nm31',
 '103zqoc',
 '104it0i',
 '105dxeb',
 '105mekt',
 '105qvgl',
 '105rpcj',
 '105styc',
 '105tw6t',
 '106673i',
 '106hchf',
 '106lsw4',
 '106mh03',
 '108e7ml',
 '1096qs1',
 '109n66u',
 '109x870',
 '10bdmd2',
 '10bpm50',
 '10ce95f',
 '10cywc7',
 '10dgnk2',
 '10f78ow',
 '10gliq7',
 '10gzlqt',
 '10ht3qh',
 '10i05he',
 '10i1flv',
 '10i1qkq',
 '10i56sz',
 '10i8he7',
 '10iuhd0',
 '10jefuk',
 '10jvfeb',
 '10lhpa1',
 '10mabxc',
 '10nyadr',
 '10o96va',
 '10odumt',
 '10ok9gc',
 '10olva7',
 '10onw06',
 '10osnnk',
 '10pl8zt',
 '10pq54f',
 '10prtpi',
 '10q17m7',
 '10q1t59',
 '10q46tx',
 '10q6wg2',
 '10q8gj9',
 '10q97um',
 '10qddse',
 '10qdfml',
 '10qizmn',
 '10rrt8z',
 '10s97lu',
 '10v8oqk',
 '10vqm17',
 '110xnuk',
 '112nsvw',
 '115h657',
 '116vz5e',
 '11gpbyp',
 '11jmesr',
 '11lk9x2',
 '11pgiwg',
 '11v9eir',
 '1213x1h',
 '124poau',
 '1271iyi',
 '1282v0e',
 '128n9ri',
 '129m4ch',
 '129z5l8',
 '12

CPU times: total: 656 ms
Wall time: 1.63 s


In [19]:
%%time

for elem in tqdm(curated_df.id.values, total=len(curated_df.id.values), desc='Creating thumbnails'):
	create_thumbnail(elem, curated_df, crops, extant)

'Thumbnail created for zsv66h'

Creating thumbnails:  28%|██▊       | 10341/37031 [1:37:25<3:21:42,  2.21it/s]