In [1]:
%%time

import json

import pandas
import requests
from PIL import Image
from adlfs import AzureBlobFileSystem
from tqdm import tqdm

from common.captioning.azure_descriptions import AzureCaption
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.schemas.pyarrow_schema import tagging_schema

# Register tqdm's progress-bar variants of the pandas apply methods; bars are labelled "Progress".
tqdm.pandas(desc="Progress")
# Shared blob-storage handle; every later cell reads and writes parquet/caption/image files through it.
file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()

# NOTE(review): this import sits mid-cell rather than with the other imports at the top of the cell.
from common.functions.functions import Functions

# Project helper object; not referenced by any of the cells visible in this notebook.
functions: Functions = Functions()

# Captioning client bound to the shared file system; not referenced by the cells visible here either.
caption: AzureCaption = AzureCaption(file_system)

CPU times: total: 8.7 s
Wall time: 16.1 s


In [2]:
%%time

# Pull the curated table out of blob storage.
curated_df = pandas.read_parquet('data/parquet/back.parquet', filesystem=file_system, engine='pyarrow')

# Accepted rows only, minus any row holding a missing value, renumbered from zero.
# The chain yields an independent frame, so no temporary names need to be deleted afterwards.
accepted = (
	curated_df
	.loc[curated_df["accept"].eq(True)]
	.dropna()
	.reset_index(drop=True)
	.copy()
)

display(accepted)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
0,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,True,True,[]
1,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,True,True,[]
2,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,True,True,[]
3,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,True,True,[]
4,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,True,True,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20146,13nw4ss,gentlemanboners,Kvikkleire,Natalie Portman,a woman in a red dress and red coat posing for...,85453a20bb721d013e88b37a005df958,/r/gentlemanboners/comments/13nw4ss/natalie_po...,https://i.redd.it/0jb5tssc071b1.jpg,13nw4ss.jpg,data/image/13nw4ss.jpg,PrettyGirlDiffusion,True,True,True,[]
20147,13nwtx1,gentlemanboners,kape_pandesal,Ysabel Ortega,araffe woman in a blue dress posing for a picture,09d5b6d47007707d728928008111812e,/r/gentlemanboners/comments/13nwtx1/ysabel_ort...,https://i.redd.it/jlw0wjg9n81b1.jpg,13nwtx1.jpg,data/image/13nwtx1.jpg,PrettyGirlDiffusion,True,True,True,[]
20148,13nqkt6,DLAH,gonefishin37,Money green,araffe in a green dress posing for a picture,6f4f421d99e495d4499e7a3d9870ba32,/r/DLAH/comments/13nqkt6/money_green/,https://i.redd.it/he0g249po71b1.jpg,13nqkt6.jpg,data/image/13nqkt6.jpg,PrettyGirlDiffusion,True,True,True,[]
20149,13nvy9c,DLAH,Thin_Data_2134,DLAH,three women in white dresses standing in front...,ac7202cbb42c7d49f6f81bd53a017b75,/r/DLAH/comments/13nvy9c/dlah/,https://i.redd.it/mbo1lnoig81b1.jpg,13nvy9c.jpg,data/image/13nvy9c.jpg,PrettyGirlDiffusion,True,True,True,[]


CPU times: total: 875 ms
Wall time: 14.9 s


In [None]:
%%time


display("=== Obtaining Caption Files List ===")
current_captions = file_system.ls("data/caption")
display(f"current caption files: {len(current_captions)}")

# all_data keeps each raw payload (plus its id); filtered_data keeps the flattened subset.
# Both lists only receive an entry when the whole file was parsed and flattened without error.
all_data = []
filtered_data = []
for caption_file in tqdm(current_captions, total=len(current_captions), desc='Reading caption files'):
	# The id is the file name up to its first dot.
	image_id = caption_file.split('/')[-1].split('.')[0]
	try:
		# Zero-byte caption files are deleted from storage and skipped.
		if file_system.size(caption_file) == 0:
			display(f'Empty file -- removing {image_id}', clear=True)
			file_system.rm(caption_file)
			continue

		payload = json.loads(file_system.read_text(caption_file, encoding='utf-8'))
		payload["id"] = image_id

		# Subscripting a missing (None) result section raises and sends the file to the handler below.
		flattened = {
			"id": image_id,
			"captions": [payload.get('captionResult')],
			"dense_captions": payload.get('denseCaptionsResult')['values'],
			"meta": [payload.get('metadata')],
			"tags": payload.get('tagsResult')['values'],
			"smart_crop": payload.get('smartCropsResult')['values']
		}

		all_data.append(payload)
		filtered_data.append(flattened)
	except Exception as e:
		# Best effort: report the problem (replacing the previous message) and move on to the next file.
		display(f" Exception {e} for {image_id}", clear=True)

'=== Obtaining Caption Files List ==='

'current caption files: 18746'

In [None]:
%%time

# Turn the two record lists gathered from the caption files into frames.
all_data_from_captions = pandas.DataFrame(data=all_data)
filtered_data_from_captions = pandas.DataFrame(data=filtered_data)

display(all_data_from_captions, filtered_data_from_captions)

# Persist both frames to blob storage: raw payloads first, then the flattened subset.
all_data_from_captions.to_parquet(
	"data/parquet/all_data_from_captions.parquet",
	engine='pyarrow',
	filesystem=file_system,
)

filtered_data_from_captions.to_parquet(
	"data/parquet/filtered_data_from_captions.parquet",
	engine='pyarrow',
	filesystem=file_system,
)

In [None]:
# Flatten the raw Azure payloads into one caption string and one list of tag names per image.
# A record whose captionResult / tagsResult is not a dict (e.g. the section was absent from the
# caption file) yields None instead of raising; DataFrame.update() ignores missing values, so
# such a record leaves the corresponding cell of the accepted frame untouched.
single_caption_data = pandas.DataFrame(
	{
		'id': all_data_from_captions['id'],
		'azure_caption': [
			item['text'] if isinstance(item, dict) else None
			for item in all_data_from_captions['captionResult']
		],
		'tags': [
			[tag['name'] for tag in item['values']] if isinstance(item, dict) else None
			for item in all_data_from_captions['tagsResult']
		]
	})


single_caption_data_indexed = single_caption_data.set_index("id")

# Add the new columns with their defaults in one vectorised step. Writing them cell by cell
# with .at while iterating over the same frame with iterrows() is slow, is documented by pandas
# as unsupported (never modify what you iterate over), and leaves the boolean columns with a
# non-bool dtype; assign() produces proper bool columns and also works on an empty frame.
accepted_indexed = accepted.set_index("id").assign(
	azure_caption="",
	thumbnail_path="",
	thumbnail_exists=False,
	thumbnail_curated=False,
	thumbnail_accept=False,
)

# Rows are aligned on id; both 'azure_caption' and 'tags' are overwritten wherever a caption
# record exists for that id.
accepted_indexed.update(single_caption_data_indexed)

accepted_final = accepted_indexed.reset_index()

display("== Updated With Basic Captions ==")
display(accepted_final)

In [None]:
# Write the merged frame using the project's tagging schema ...
accepted_final.to_parquet(
	"data/parquet/curation_2.parquet",
	engine='pyarrow',
	filesystem=file_system,
	schema=tagging_schema,
)
# ... then drop the in-memory copy and load the stored version back, so that what is displayed
# is exactly what was persisted.
del accepted_final
accepted_final = pandas.read_parquet(
	"data/parquet/curation_2.parquet",
	engine='pyarrow',
	filesystem=file_system,
	schema=tagging_schema,
)
display(accepted_final)

In [None]:
# TODO Get dataframe for crops

In [None]:
import os

def create_thumbnail(_image_id: str, _curated_df: pandas.DataFrame, _crops: pandas.DataFrame, _extant_file_names: list):
	"""
	Crop an image to its smart-crop bounding box, resize it to 512x512 and upload it as a JPEG thumbnail.

	:param _image_id: Id of the image; matched against the ``id`` column of both frames.
	:param _curated_df: Frame with ``id`` and ``path`` columns; ``path`` is the storage path of the source image.
	:param _crops: Frame with an ``id`` column and ``smart_crop_boundingBox.x/.y/.w/.h`` columns.
		Only the first matching row is used.
	:param _extant_file_names: Collection of entries whose thumbnail already exists.
		NOTE(review): membership is tested with the bare image id, so despite the name this must
		hold ids rather than file names or paths - confirm against the caller.
	:return: ``data/image/thumbnail/<id>.jpg`` when the thumbnail was created or already exists,
		``""`` when it was skipped or any step failed. Failures are reported via ``display`` and never raised.
	"""
	import tempfile

	# A missing id can never identify a thumbnail; report it as skipped instead of "already exists".
	if _image_id is None:
		display('No image id supplied, skipping', clear=True)
		return ""

	out_path = f"data/image/thumbnail/{_image_id}.jpg"
	temp_path = None
	try:
		if _image_id in _extant_file_names:
			display(f'Image {_image_id} already exists, skipping', clear=True)
			return out_path

		cropping_information = _crops.loc[_crops['id'] == _image_id]
		if len(cropping_information) == 0:
			display(f'No cropping information for {_image_id}, skipping', clear=True)
			return ""

		record = _curated_df.loc[_curated_df['id'] == _image_id]
		if len(record) == 0:
			display(f'No curated record for {_image_id}, skipping', clear=True)
			return ""
		record_path = record['path'].iloc[0]

		# Created only once there is real work to do, and used for every storage call below
		# (the function previously built this handle but then went through the notebook global).
		_file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()
		image_url = _file_system.url(record_path)

		# The context manager releases the connection; the timeout keeps one stalled download
		# from hanging a whole batch; raise_for_status surfaces HTTP errors with a clear message.
		with requests.get(image_url, stream=True, timeout=60) as response:
			response.raise_for_status()
			response.raw.decode_content = True
			with Image.open(response.raw) as original_image:
				# copy() loads the pixel data, so the image no longer depends on the response.
				copied_image = original_image.copy()

		left, top, width, height = (
			cropping_information[f'smart_crop_boundingBox.{key}'].values[0]
			for key in ('x', 'y', 'w', 'h')
		)
		with copied_image:
			cropped = copied_image.crop((left, top, left + width, top + height))

		# Modes such as RGBA or P cannot be written as JPEG; normalise before resizing.
		if cropped.mode not in ('RGB', 'L'):
			cropped = cropped.convert('RGB')

		with cropped:
			resized = cropped.resize((512, 512), 1)  # 1 is Pillow's LANCZOS resampling filter

		# A unique temporary file avoids clashes on a shared 'temp.jpg' and is removed afterwards.
		descriptor, temp_path = tempfile.mkstemp(suffix='.jpg')
		os.close(descriptor)
		with resized:
			resized.save(temp_path)
		_file_system.upload(temp_path, out_path, overwrite=True)
		display(f'Thumbnail created for {_image_id}', clear=True)
		return out_path

	except Exception as ex:
		display(f'Error creating thumbnail for {_image_id}: {ex}', clear=True)
		return ""

	finally:
		if temp_path is not None and os.path.exists(temp_path):
			os.remove(temp_path)

In [None]:
# records = d.to_dict(orient='records')
#
# for elem in tqdm(records, len(records)):
# 	create_thumbnail(elem['id'], d, )

In [None]:
!jupyter notebook stop