In [1]:
%%time

import json

import pandas
import requests
from PIL import Image
from adlfs import AzureBlobFileSystem
from tqdm import tqdm

from common.captioning.azure_descriptions import AzureCaption
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.schemas.pyarrow_schema import tagging_schema

from common.functions.functions import Functions

# Register tqdm's progress_* helpers on pandas objects, labelled "Progress".
tqdm.pandas(desc="Progress")

# Handles shared by the cells below: the 'data' storage file system, the project helper
# object, and the Azure captioning client that is bound to that file system.
file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()
functions: Functions = Functions()
caption: AzureCaption = AzureCaption(file_system)

CPU times: total: 3.59 s
Wall time: 3.95 s


In [68]:
%%time

# Load the curated table from storage.
curated_df = pandas.read_parquet('data/parquet/back.parquet', filesystem=file_system, engine='pyarrow')

# Keep accepted rows only, drop any row with a missing value, and renumber from 0.
# The chain yields a fresh frame, so curated_df itself is left untouched.
accepted = (
	curated_df
	.loc[lambda frame: frame["accept"] == True]
	.dropna()
	.reset_index(drop=True)
)

display(accepted)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
0,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,True,True,[]
1,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,True,True,[]
2,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,True,True,[]
3,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,True,True,[]
4,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,True,True,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19899,13m6hv4,tightdresses,katiecooper4xo,Bringing tartan back!,arafed woman in a plaid dress posing for a pic...,9501d5f34d0748f58eb3caba7ce1641c,/r/tightdresses/comments/13m6hv4/bringing_tart...,https://i.redd.it/8nx1kv7o0w0b1.jpg,13m6hv4.jpg,data/image/13m6hv4.jpg,PrettyGirlDiffusion,True,True,True,[]
19900,13mcqn4,tightdresses,GucciBagHolder,Blonde,a close up of a woman in a white dress posing ...,70b0eff895fd87d368cb1f53af9a7f3c,/r/tightdresses/comments/13mcqn4/blonde/,https://i.redd.it/6ojq4wtl9x0b1.jpg,13mcqn4.jpg,data/image/13mcqn4.jpg,PrettyGirlDiffusion,True,True,True,[]
19901,13m91m2,DLAH,qwertybird3141,College grad,arafed woman standing on steps with a red and ...,adcbd61b5e37c5e048cb6907eca21bfe,/r/DLAH/comments/13m91m2/college_grad/,https://i.redd.it/skv5m0jwiw0b1.jpg,13m91m2.jpg,data/image/13m91m2.jpg,PrettyGirlDiffusion,True,True,True,[]
19902,13m6gm7,bathandbodyworks,unicornkenz,Candle Sale,bath and body works coup coup,47a78b117b59b636fa22093ec4f29bd7,/r/bathandbodyworks/comments/13m6gm7/candle_sale/,https://i.redd.it/h321gmgf0w0b1.jpg,13m6gm7.jpg,data/image/13m6gm7.jpg,CandleDiffusion,True,True,True,[]


CPU times: total: 391 ms
Wall time: 975 ms


In [24]:
%%time


display("=== Obtaining Caption Files List ===")
current_captions = file_system.ls("data/caption")
display(f"current caption files: {len(current_captions)}")

# all_data keeps every parsed response in full; filtered_data keeps a trimmed projection of it.
# A file contributes to both lists or to neither, so the two stay aligned.
all_data, filtered_data = [], []
for caption_file in tqdm(current_captions, total=len(current_captions), desc='Reading caption files'):
	# The image id is the file name up to its first dot.
	image_id = caption_file.rsplit('/', 1)[-1].split('.')[0]
	try:
		if file_system.size(caption_file) == 0:
			# Zero-byte caption files are deleted from storage and skipped.
			display(f'Empty file -- removing {image_id}', clear=True)
			file_system.rm(caption_file)
			continue

		caption_data = json.loads(file_system.read_text(caption_file, encoding='utf-8'))
		caption_data["id"] = image_id

		# Indexing ['values'] on a section that is absent raises, which sends the whole file
		# to the handler below; captionResult and metadata are stored even when absent.
		_filtered_data = {
			"id": image_id,
			"captions": [caption_data.get('captionResult')],
			"dense_captions": caption_data.get('denseCaptionsResult')['values'],
			"meta": [caption_data.get('metadata')],
			"tags": caption_data.get('tagsResult')['values'],
			"smart_crop": caption_data.get('smartCropsResult')['values'],
		}
	except Exception as e:
		# Best effort: report the problem and move on to the next file.
		display(f" Exception {e} for {image_id}", clear=True)
	else:
		all_data.append(caption_data)
		filtered_data.append(_filtered_data)

'=== Obtaining Caption Files List ==='

'current caption files: 6615'

Reading caption files: 100%|██████████| 6615/6615 [07:19<00:00, 15.05it/s]

CPU times: total: 1min 47s
Wall time: 7min 19s





In [25]:
%%time

# One frame per list collected by the caption-reading loop.
all_data_from_captions = pandas.DataFrame(data=all_data)
filtered_data_from_captions = pandas.DataFrame(data=filtered_data)

display(all_data_from_captions, filtered_data_from_captions)

# Persist both frames to storage, full responses first.
for frame, destination in (
	(all_data_from_captions, "data/parquet/all_data_from_captions.parquet"),
	(filtered_data_from_captions, "data/parquet/filtered_data_from_captions.parquet"),
):
	frame.to_parquet(destination, engine='pyarrow', filesystem=file_system)

Unnamed: 0,captionResult,objectsResult,readResult,denseCaptionsResult,modelVersion,metadata,tagsResult,smartCropsResult,peopleResult,id
0,"{'text': 'a pier in the ocean', 'confidence': ...",{'values': []},"{'stringIndexType': 'TextElements', 'content':...","{'values': [{'text': 'a pier in the ocean', 'c...",2023-02-01-preview,"{'width': 4032, 'height': 3024}","{'values': [{'name': 'outdoor', 'confidence': ...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 1360, 'y': 1...",1002bsl
1,"{'text': 'a town in the mountains', 'confidenc...",{'values': []},"{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a group of houses in a f...,2023-02-01-preview,"{'width': 1200, 'height': 800}","{'values': [{'name': 'outdoor', 'confidence': ...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 1039, 'y': 5...",1002cgj
2,"{'text': 'a woman taking a selfie', 'confidenc...","{'values': [{'boundingBox': {'x': 50, 'y': 128...","{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a woman taking a selfie'...,2023-02-01-preview,"{'width': 1468, 'height': 2608}","{'values': [{'name': 'person', 'confidence': 0...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 0, 'y': 122,...",1002cx2
3,{'text': 'a woman in a garment standing next t...,"{'values': [{'boundingBox': {'x': 657, 'y': 75...","{'stringIndexType': 'TextElements', 'content':...","{'values': [{'text': 'a woman in a garment', '...",2023-02-01-preview,"{'width': 1523, 'height': 2030}","{'values': [{'name': 'outdoor', 'confidence': ...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 425, 'y': 42...",1003dod
4,{'text': 'a train on a bridge over a body of w...,{'values': []},"{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a group of trees in the ...,2023-02-01-preview,"{'width': 4032, 'height': 3024}","{'values': [{'name': 'outdoor', 'confidence': ...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 3347, 'y': 1...",10044wv
...,...,...,...,...,...,...,...,...,...,...
6610,"{'text': 'a woman in a blue lingerie', 'confid...","{'values': [{'boundingBox': {'x': 87, 'y': 236...","{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a woman in a blue garmen...,2023-02-01-preview,"{'width': 2309, 'height': 2560}","{'values': [{'name': 'person', 'confidence': 0...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 139, 'y': 22...",zzu8i8
6611,{'text': 'a city with many buildings and a roa...,{'values': []},"{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a roof of a building at ...,2023-02-01-preview,"{'width': 1283, 'height': 1519}","{'values': [{'name': 'sky', 'confidence': 0.97...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 1047, 'y': 1...",zzv4hz
6612,"{'text': 'a woman sitting on a bed', 'confiden...","{'values': [{'boundingBox': {'x': 352, 'y': 62...","{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a woman sitting on a bed...,2023-02-01-preview,"{'width': 1440, 'height': 1800}","{'values': [{'name': 'person', 'confidence': 0...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 351, 'y': 8,...",zzvv3x
6613,{'text': 'a woman in a dress posing for a pict...,"{'values': [{'boundingBox': {'x': 263, 'y': 28...","{'stringIndexType': 'TextElements', 'content':...",{'values': [{'text': 'a person wearing a neckl...,2023-02-01-preview,"{'width': 1075, 'height': 1351}","{'values': [{'name': 'outdoor', 'confidence': ...","{'values': [{'aspectRatio': 1.0, 'boundingBox'...","{'values': [{'boundingBox': {'x': 222, 'y': 30...",zzxq2l


Unnamed: 0,id,captions,dense_captions,meta,tags,smart_crop
0,1002bsl,"[{'text': 'a pier in the ocean', 'confidence':...","[{'text': 'a pier in the ocean', 'confidence':...","[{'width': 4032, 'height': 3024}]","[{'name': 'outdoor', 'confidence': 0.997430682...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 619..."
1,1002cgj,"[{'text': 'a town in the mountains', 'confiden...","[{'text': 'a group of houses in a forest', 'co...","[{'width': 1200, 'height': 800}]","[{'name': 'outdoor', 'confidence': 0.996889829...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 206..."
2,1002cx2,"[{'text': 'a woman taking a selfie', 'confiden...","[{'text': 'a woman taking a selfie', 'confiden...","[{'width': 1468, 'height': 2608}]","[{'name': 'person', 'confidence': 0.9983580112...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 0, ..."
3,1003dod,[{'text': 'a woman in a garment standing next ...,"[{'text': 'a woman in a garment', 'confidence'...","[{'width': 1523, 'height': 2030}]","[{'name': 'outdoor', 'confidence': 0.973955869...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 143..."
4,10044wv,[{'text': 'a train on a bridge over a body of ...,"[{'text': 'a group of trees in the fog', 'conf...","[{'width': 4032, 'height': 3024}]","[{'name': 'outdoor', 'confidence': 0.991929769...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 206..."
...,...,...,...,...,...,...
6610,zzu8i8,"[{'text': 'a woman in a blue lingerie', 'confi...","[{'text': 'a woman in a blue garment', 'confid...","[{'width': 2309, 'height': 2560}]","[{'name': 'person', 'confidence': 0.9933421611...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 0, ..."
6611,zzv4hz,[{'text': 'a city with many buildings and a ro...,"[{'text': 'a roof of a building at night', 'co...","[{'width': 1283, 'height': 1519}]","[{'name': 'sky', 'confidence': 0.9703860282897...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 30,..."
6612,zzvv3x,"[{'text': 'a woman sitting on a bed', 'confide...","[{'text': 'a woman sitting on a bed', 'confide...","[{'width': 1440, 'height': 1800}]","[{'name': 'person', 'confidence': 0.9989609718...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 135..."
6613,zzxq2l,[{'text': 'a woman in a dress posing for a pic...,"[{'text': 'a person wearing a necklace', 'conf...","[{'width': 1075, 'height': 1351}]","[{'name': 'outdoor', 'confidence': 0.993963718...","[{'aspectRatio': 1.0, 'boundingBox': {'x': 101..."


CPU times: total: 1.47 s
Wall time: 6.39 s


In [89]:
# Reduce each Azure response to the two fields merged into the curated table: the caption
# text and the list of tag names. A row whose captionResult / tagsResult is not a dict
# (section missing from the response) yields None; DataFrame.update() skips NA values, so
# such a row keeps its existing value instead of aborting the whole cell with a TypeError.
single_caption_data = pandas.DataFrame(
	{
		'id': all_data_from_captions['id'],
		'azure_caption': [
			item['text'] if isinstance(item, dict) else None
			for item in all_data_from_captions['captionResult']
		],
		'tags': [
			[tag['name'] for tag in item['values']] if isinstance(item, dict) else None
			for item in all_data_from_captions['tagsResult']
		],
	})

single_caption_data_indexed = single_caption_data.set_index("id")

# Add the new columns with their defaults in one vectorised step. This replaces a per-row
# .at[] loop and also creates the columns when `accepted` has no rows.
accepted_indexed = accepted.set_index("id").assign(
	azure_caption="",
	thumbnail_path="",
	thumbnail_exists=False,
	thumbnail_curated=False,
	thumbnail_accept=False,
)

# In-place overwrite of the shared columns (azure_caption, tags) for ids present in both
# frames; ids without a caption file keep the defaults set above.
accepted_indexed.update(single_caption_data_indexed)

accepted_final = accepted_indexed.reset_index()

display("== Updated With Basic Captions ==")
display(accepted_final)

'== Updated With Basic Captions =='

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags,azure_caption,thumbnail_path,thumbnail_exists,thumbnail_curated,thumbnail_accept
0,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,True,True,"[person, human face, clothing, lady, smile, ch...",a woman taking a selfie,,False,False,False
1,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,True,True,"[person, human face, skin, eyelash, eyebrow, b...",a woman taking a selfie,,False,False,False
2,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,True,True,"[clothing, person, human face, smile, shoulder...",a woman smiling at camera,,False,False,False
3,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,True,True,"[person, human face, indoor, clothing, woman, ...",a woman taking a selfie,,False,False,False
4,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,True,True,"[person, human face, smile, clothing, woman, i...",a woman taking a selfie,,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19899,13m6hv4,tightdresses,katiecooper4xo,Bringing tartan back!,arafed woman in a plaid dress posing for a pic...,9501d5f34d0748f58eb3caba7ce1641c,/r/tightdresses/comments/13m6hv4/bringing_tart...,https://i.redd.it/8nx1kv7o0w0b1.jpg,13m6hv4.jpg,data/image/13m6hv4.jpg,PrettyGirlDiffusion,True,True,True,[],,,False,False,False
19900,13mcqn4,tightdresses,GucciBagHolder,Blonde,a close up of a woman in a white dress posing ...,70b0eff895fd87d368cb1f53af9a7f3c,/r/tightdresses/comments/13mcqn4/blonde/,https://i.redd.it/6ojq4wtl9x0b1.jpg,13mcqn4.jpg,data/image/13mcqn4.jpg,PrettyGirlDiffusion,True,True,True,[],,,False,False,False
19901,13m91m2,DLAH,qwertybird3141,College grad,arafed woman standing on steps with a red and ...,adcbd61b5e37c5e048cb6907eca21bfe,/r/DLAH/comments/13m91m2/college_grad/,https://i.redd.it/skv5m0jwiw0b1.jpg,13m91m2.jpg,data/image/13m91m2.jpg,PrettyGirlDiffusion,True,True,True,[],,,False,False,False
19902,13m6gm7,bathandbodyworks,unicornkenz,Candle Sale,bath and body works coup coup,47a78b117b59b636fa22093ec4f29bd7,/r/bathandbodyworks/comments/13m6gm7/candle_sale/,https://i.redd.it/h321gmgf0w0b1.jpg,13m6gm7.jpg,data/image/13m6gm7.jpg,CandleDiffusion,True,True,True,[],,,False,False,False


In [92]:
# Round-trip the merged table through storage, writing and re-reading it with the same
# engine, file system and tagging_schema.
curation_path = "data/parquet/curation_2.parquet"
parquet_options = dict(engine='pyarrow', filesystem=file_system, schema=tagging_schema)

accepted_final.to_parquet(curation_path, **parquet_options)
# Drop the in-memory frame first so a failed read cannot leave the pre-write copy behind.
del accepted_final
accepted_final = pandas.read_parquet(curation_path, **parquet_options)
display(accepted_final)

In [None]:
# TODO Get dataframe for crops

In [37]:
import os

def create_thumbnail(_image_id: str, _curated_df: pandas.DataFrame, _crops: pandas.DataFrame, _extant_file_names: list) -> str:
	"""
	Crop an image to its smart-crop bounding box, resize it to 512x512 and upload it as a thumbnail.

	The image is downloaded through a URL issued by the storage file system, cropped with the
	first matching row of ``_crops``, resized with Lanczos resampling and uploaded to
	``data/image/thumbnail/<id>.jpg``. Every failure is reported with ``display`` and turned
	into an empty return value rather than an exception.

	:param _image_id: Id of the image; selects rows in both frames and names the thumbnail.
	:param _curated_df: Frame with an ``id`` column and a ``path`` column (storage path of the image).
	:param _crops: Frame with an ``id`` column and ``smart_crop_boundingBox.x`` / ``.y`` / ``.w`` / ``.h`` columns.
	:param _extant_file_names: Ids that already have a thumbnail; these are skipped.
	:return: The thumbnail's storage path when it already exists or was created, otherwise ``""``.
	"""
	# Function-scope imports keep this cell runnable on its own.
	import os
	import tempfile

	download_timeout_seconds = 30

	# Without an id there is nothing to look up, and no thumbnail path to report.
	if _image_id is None:
		display('No image id supplied, skipping', clear=True)
		return ""

	out_path = f"data/image/thumbnail/{_image_id}.jpg"
	try:
		if _image_id in _extant_file_names:
			display(f'Image {_image_id} already exists, skipping', clear=True)
			return out_path

		cropping_information = _crops.loc[_crops['id'] == _image_id]
		if len(cropping_information) == 0:
			display(f'No cropping information for {_image_id}, skipping', clear=True)
			return ""

		record = _curated_df.loc[_curated_df['id'] == _image_id]
		if len(record) == 0:
			display(f'No curated record for {_image_id}, skipping', clear=True)
			return ""
		record_path = record['path'].values[0]

		def _box_value(field: str):
			# First matching crop row wins when an image has several.
			return cropping_information[f'smart_crop_boundingBox.{field}'].values[0]

		left, top = _box_value('x'), _box_value('y')
		crop_box = (left, top, left + _box_value('w'), top + _box_value('h'))

		# One storage client serves both the download URL and the upload. It is only built once
		# the cheap skip checks above have passed.
		_file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()
		image_url = _file_system.url(record_path)

		# The context managers release the connection and the decoder even when cropping fails.
		with requests.get(image_url, stream=True, timeout=download_timeout_seconds) as response:
			response.raise_for_status()
			with Image.open(response.raw) as original_image:
				# crop() loads the pixel data and returns an independent image.
				cropped = original_image.crop(crop_box)

		resized = cropped.resize((512, 512), Image.LANCZOS)
		cropped.close()

		# The JPEG writer rejects modes such as RGBA or P; convert those before saving.
		if resized.mode not in ('L', 'RGB', 'CMYK'):
			resized = resized.convert('RGB')

		# A private scratch directory avoids clashes on a shared temp.jpg and is removed afterwards.
		with tempfile.TemporaryDirectory() as scratch_dir:
			local_path = os.path.join(scratch_dir, 'thumbnail.jpg')
			resized.save(local_path)
			resized.close()
			_file_system.upload(local_path, out_path, overwrite=True)

		display(f'Thumbnail created for {_image_id}', clear=True)
		return out_path

	except Exception as ex:
		# Best effort: one bad image must not stop a batch run.
		display(f'Error creating thumbnail for {_image_id}: {ex}', clear=True)
		return ""

In [None]:
# Flatten every smart-crop entry into one row per crop. Nested bounding-box fields become
# "smart_crop_boundingBox.<field>" columns, which is the layout create_thumbnail() reads.
crops = pandas.json_normalize(filtered_data, record_path='smart_crop', meta=['id'], record_prefix='smart_crop_')

# Ids that already have a thumbnail in storage, so create_thumbnail() can skip them.
thumbnail_dir = "data/image/thumbnail"
extant_thumbnail_ids = [
	thumbnail_file.split('/')[-1].split('.')[0]
	for thumbnail_file in (file_system.ls(thumbnail_dir) if file_system.exists(thumbnail_dir) else [])
]

# NOTE(review): this cell originally read from an undefined name `d`. accepted_final is the
# frame this notebook builds with the `id` and `path` columns create_thumbnail() needs --
# confirm it is the intended input.
records = accepted_final.to_dict(orient='records')

# total= must be passed by keyword: tqdm's second positional parameter is the description.
for elem in tqdm(records, total=len(records), desc='Creating thumbnails'):
	create_thumbnail(elem['id'], accepted_final, crops, extant_thumbnail_ids)

In [None]:
# Final cell: runs the `jupyter notebook stop` shell command, which asks a running notebook
# server to shut down. NOTE(review): presumably meant to end the session after a full run --
# confirm it targets the intended server.
!jupyter notebook stop