In [None]:
%%time

import os
import tempfile

import pandas
import requests
from PIL import Image
from adlfs import AzureBlobFileSystem
from tqdm import tqdm

from common.captioning.azure_descriptions import AzureCaption
from common.schemas.pyarrow_schema import tagging_schema
from common.storage.azure_file_storage import AzureFileStorageAdapter

# Register tqdm with pandas so that `progress_apply` (used by later cells) is available.
tqdm.pandas(desc="Progress")
# Storage handle shared by every parquet read/write and blob operation in this notebook.
file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()

from common.functions.functions import Functions

# NOTE(review): neither `functions` nor `caption` is referenced by any active cell
# in this notebook — confirm whether they are still needed.
functions: Functions = Functions()

caption: AzureCaption = AzureCaption(file_system)

In [None]:
# %%time
#
# curated_df = pandas.read_parquet('data/parquet/primary_caption.parquet', filesystem=file_system, engine='pyarrow')
#
# accepted = curated_df.loc[curated_df["accept"] == True]
#
# dropped = accepted.dropna()
# dropped.reset_index(inplace=True, drop=True)
#
# del accepted
#
# accepted = dropped.copy()
# del dropped
#
# display(accepted)

In [None]:
# %%time
#
#
# display("=== Obtaining Caption Files List ===")
# current_captions = file_system.ls("data/caption")
# display(f"current caption files: {len(current_captions)}")
#
# all_data = []
# filtered_data = []
# for caption_file in tqdm(current_captions, total=len(current_captions), desc='Reading caption files'):
# 	image_id = caption_file.split('/')[-1].split('.')[0]
# 	try:
# 		file_size = file_system.size(caption_file)
# 		if file_size == 0:
# 			display(f'Empty file -- removing {image_id}', clear=True)
# 			file_system.rm(caption_file)
# 			continue
# 		caption_data = json.loads(file_system.read_text(caption_file, encoding='utf-8'))
# 		caption_data["id"] = image_id
# 		dense_caption_result = caption_data.get('denseCaptionsResult')
# 		metadata = caption_data.get('metadata')
# 		tags_result = caption_data.get('tagsResult')
# 		smart_crop_result = caption_data.get('smartCropsResult')
# 		basic_caption = caption_data.get('captionResult')
# 		_filtered_data = {
# 			"id": image_id,
# 			"captions": [basic_caption],
# 			"dense_captions": dense_caption_result['values'],
# 			"meta": [metadata],
# 			"tags": tags_result['values'],
# 			"smart_crop": smart_crop_result['values']
# 		}
# 		all_data.append(caption_data)
# 		filtered_data.append(_filtered_data)
# 	except Exception as e:
# 		display(f" Exception {e} for {image_id}", clear=True)
# 		continue

In [None]:
# %%time
#
# all_data_from_captions = pandas.DataFrame(data=all_data)
# filtered_data_from_captions = pandas.DataFrame(data=filtered_data)
#
# display(all_data_from_captions)
# display(filtered_data_from_captions)
#
# all_data_from_captions.to_parquet("data/parquet/all_data_from_captions.parquet", engine='pyarrow',
# 								  filesystem=file_system)
#
# filtered_data_from_captions.to_parquet("data/parquet/filtered_data_from_captions.parquet", engine='pyarrow',
# 									   filesystem=file_system)

In [None]:
# Reload the caption tables that were previously written to parquet, so the
# (commented-out) per-file caption scan above does not have to be re-run.
all_data_from_captions = pandas.read_parquet(
	"data/parquet/all_data_from_captions.parquet",
	filesystem=file_system,
	engine='pyarrow',
)
display(all_data_from_captions)

filtered_data_from_captions = pandas.read_parquet(
	"data/parquet/filtered_data_from_captions.parquet",
	filesystem=file_system,
	engine='pyarrow',
)
display(filtered_data_from_captions)

In [None]:
%%time

# Flatten the per-image tag lists into one row per (image, tag) pair and persist
# the result. All tag records are gathered first and normalised once, instead of
# building a throwaway DataFrame for every image.
tags = pandas.DataFrame({'id': filtered_data_from_captions.id, 'tags': filtered_data_from_captions.tags})

tag_records = []  # one entry per individual tag
tag_ids = []  # image id of the tag at the same position in `tag_records`
for image_id, image_tags in tqdm(zip(tags['id'], tags['tags']), total=len(tags)):
	if image_tags is None:
		continue
	# A single mapping is treated as a one-element list, as json_normalize would.
	image_tags = [image_tags] if isinstance(image_tags, dict) else list(image_tags)
	tag_records.extend(image_tags)
	tag_ids.extend([image_id] * len(image_tags))

converted_tags = pandas.json_normalize(tag_records)
# Assigned after normalisation so `id` stays the last column.
converted_tags['id'] = tag_ids
display(converted_tags)
converted_tags.to_parquet("data/parquet/tags.parquet", engine='pyarrow', filesystem=file_system)

In [None]:
%%time

# Flatten the per-image dense-caption lists into one row per (image, caption)
# pair and persist the result. Records are gathered first and normalised in a
# single pass rather than building one DataFrame per image.
dense = pandas.DataFrame({'id': filtered_data_from_captions.id, 'dense_captions': filtered_data_from_captions.dense_captions})

dense_records = []  # one entry per individual dense caption
dense_ids = []  # image id of the caption at the same position in `dense_records`
for image_id, image_captions in tqdm(zip(dense['id'], dense['dense_captions']), total=len(dense)):
	if image_captions is None:
		continue
	# A single mapping is treated as a one-element list, as json_normalize would.
	image_captions = [image_captions] if isinstance(image_captions, dict) else list(image_captions)
	dense_records.extend(image_captions)
	dense_ids.extend([image_id] * len(image_captions))

# json_normalize flattens nested mappings into dotted column names.
converted = pandas.json_normalize(dense_records)
# Assigned after normalisation so `id` stays the last column.
converted['id'] = dense_ids
display(converted)
converted.to_parquet("data/parquet/dense_captions.parquet", engine='pyarrow', filesystem=file_system)

In [None]:
# single_caption_data = pandas.DataFrame(
# 	{
# 		'id': all_data_from_captions['id'],
# 		'azure_caption': [item['text'] for item in all_data_from_captions['captionResult']],
# 		'tags': [[foo['name'] for foo in item['values']] for item in all_data_from_captions['tagsResult']]
# 	})
#
#
# single_caption_data_indexed = single_caption_data.set_index("id")
#
# accepted_indexed = accepted.set_index("id")
#
# for index, row in accepted_indexed.iterrows():
# 	accepted_indexed.at[index, 'azure_caption'] = ""
# 	accepted_indexed.at[index, 'thumbnail_path'] = ""
# 	accepted_indexed.at[index, 'thumbnail_exists'] = False
# 	accepted_indexed.at[index, 'thumbnail_curated'] = False
# 	accepted_indexed.at[index, 'thumbnail_accept'] = False
#	accepted_indexed.at[index, 'additional_captions'] = ['']
#
# accepted_indexed.update(single_caption_data_indexed)
#
# accepted_final = accepted_indexed.reset_index()
#
# display("== Updated With Basic Captions ==")
# display(accepted_final)

In [None]:
# accepted_final.to_parquet("data/parquet/curation_2.parquet", engine='pyarrow', filesystem=file_system, schema=tagging_schema)
# del accepted_final

In [None]:
%%time

# Reload the curated records written by the (commented-out) export cell above,
# passing the shared tagging schema through to the parquet reader.
accepted_final = pandas.read_parquet(
	"data/parquet/curation_2.parquet",
	schema=tagging_schema,
	filesystem=file_system,
	engine='pyarrow',
)
display(accepted_final)

In [None]:
%%time

def get_aspect_ratio(x: object):
	"""Return the ``aspectRatio`` value of the first entry in ``x['crops']``.

	:param x: subscriptable record (the cropping cell passes a DataFrame row)
		whose ``'crops'`` entry is a non-empty sequence of mappings.
	:return: whatever is stored under ``'aspectRatio'`` in the first crop.
	"""
	first_crop = x['crops'][0]
	return first_crop['aspectRatio']
def get_bounding_box(x: object):
	"""Return the ``boundingBox`` value of the first entry in ``x['crops']``.

	:param x: subscriptable record (the cropping cell passes a DataFrame row)
		whose ``'crops'`` entry is a non-empty sequence of mappings.
	:return: whatever is stored under ``'boundingBox'`` in the first crop.
	"""
	first_crop = x['crops'][0]
	return first_crop['boundingBox']

In [None]:
# One row per image (indexed by id, with the id column kept) holding its
# smart-crop entries plus the first crop's aspect ratio and bounding box.
cropping = pandas.DataFrame(
	{
		'id': filtered_data_from_captions['id'],
		'crops': filtered_data_from_captions['smart_crop'],
	}
).set_index('id', drop=False)

cropping['aspectRatio'] = cropping.progress_apply(get_aspect_ratio, axis=1)
cropping['bounding_box'] = cropping.progress_apply(get_bounding_box, axis=1)

# Spread the bounding box into one column per field.
for box_field in ('x', 'y', 'w', 'h'):
	cropping[box_field] = cropping.progress_apply(
		lambda row, field=box_field: row['bounding_box'][field], axis=1)

display(cropping)

In [None]:
def create_thumbnail(target_image_id: str, file_names: list, crops: pandas.DataFrame, curated_data: pandas.DataFrame):
	"""Create and upload the 512x512 thumbnail for one image.

	The source image is downloaded, cropped to the box stored in ``crops``,
	resized to 512x512 and uploaded to ``data/image/thumbnail/<id>.jpg``.

	:param target_image_id: id of the image to process.
	:param file_names: collection of thumbnail paths that already exist; an
		image whose output path is in it is skipped.
	:param crops: frame with ``id``, ``x``, ``y``, ``w`` and ``h`` columns.
	:param curated_data: frame with ``id`` and ``path`` columns; ``path`` is the
		storage path of the source image.
	:return: the thumbnail path when it already exists or was created (also
		returned, unchanged from the original behaviour, when the id is
		``None``); the sentinel ``"/data/nope"`` when no crop is known or any
		step fails.
	"""
	out_path = f"data/image/thumbnail/{target_image_id}.jpg"
	try:
		if target_image_id is None or out_path in file_names:
			# print(f'Image {target_image_id} already exists, skipping')
			return out_path

		# Use the frame that was passed in, not the notebook-level `cropping`.
		cropping_information = crops.loc[crops['id'] == target_image_id]
		if cropping_information is None or len(cropping_information) == 0:
			# print(f'No cropping information for {target_image_id}, skipping')
			return "/data/nope"

		left = cropping_information['x'].values[0]
		top = cropping_information['y'].values[0]
		crop_box = (left,
					top,
					left + cropping_information['w'].values[0],
					top + cropping_information['h'].values[0])

		# Raises IndexError (handled below) when the id has no curated record.
		record_path = curated_data.loc[curated_data['id'] == target_image_id, 'path'].iloc[0]

		# Built only once real work is required, so skipped rows no longer pay
		# for a storage client they never use.
		_file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()

		image_url = _file_system.url(record_path)
		# The timeout stops a stalled download from hanging the whole run.
		with requests.get(image_url, stream=True, timeout=30) as response:
			# Fail here on an HTTP error instead of handing an error page to PIL.
			response.raise_for_status()
			response.raw.decode_content = True
			with Image.open(response.raw) as original_image:
				copied_image = original_image.copy()

		cropped = copied_image.crop(crop_box)
		copied_image.close()

		resized = cropped.resize((512, 512), 1)  # 1 is Pillow's LANCZOS filter
		cropped.close()
		if resized.mode not in ('RGB', 'L', 'CMYK'):
			# JPEG cannot store alpha/palette modes (e.g. PNG sources).
			resized = resized.convert('RGB')

		# A private scratch directory avoids clashes on a shared 'temp.jpg' and
		# is removed automatically.
		with tempfile.TemporaryDirectory() as scratch_dir:
			local_path = os.path.join(scratch_dir, 'thumbnail.jpg')
			resized.save(local_path)
			resized.close()
			_file_system.upload(local_path, out_path, overwrite=True)
		print(f'Thumbnail created for {target_image_id}')
		return out_path

	except Exception as ex:
		# Best effort: report the failure and let the caller carry on.
		print(f'Error creating thumbnail for {target_image_id}: {ex}')
		return "/data/nope"

In [None]:
# Paths of the thumbnails that already exist. Held in a set because
# create_thumbnail tests membership once per row, and a list would make that a
# linear scan every time.
file_names = set(file_system.ls('data/image/thumbnail'))

accepted_final['thumbnail_path'] = accepted_final.progress_apply(
	lambda row: create_thumbnail(row['id'], file_names, cropping, accepted_final), axis=1)

display(accepted_final)

In [None]:
%%time

def _thumbnail_is_present(row):
	"""Return whether the row's ``thumbnail_path`` exists in storage."""
	return file_system.exists(row['thumbnail_path'])


# One existence check per record.
accepted_final['thumbnail_exists'] = accepted_final.progress_apply(_thumbnail_is_present, axis=1)

display(accepted_final)

In [None]:
%%time

# Work on a copy so `accepted_final` keeps the state shown by the cells above.
accepted_copy = accepted_final.copy()

# Every row gets its own empty list. Built directly rather than through a
# row-wise apply, which also keeps this working when the frame has no rows.
accepted_copy['additional_captions'] = pandas.Series(
	[[] for _ in range(len(accepted_copy))], index=accepted_copy.index, dtype=object)

# Keep only records whose thumbnail exists, restricted to the schema columns,
# then drop incomplete rows and renumber. Chained rather than mutated in place
# so the selection is never modified through a possibly-shared slice.
accepted_slice = (
	accepted_copy
	.loc[accepted_copy['thumbnail_exists'] == True, tagging_schema.names]
	.dropna()
	.reset_index(drop=True)
)

display(accepted_slice)

In [None]:
%%time

# TODO: Only update the accepted slice when there are new captions; we don't want to overwrite the curated data.

In [None]:
%%time

# Keep a backup copy of the current curation file next to the original.
curation_path = 'data/parquet/thumbnail_curation.parquet'
curation_backup_path = 'data/parquet/thumbnail_curation.parquet.bak'
file_system.cp(curation_path, curation_backup_path)

# accepted_slice.to_parquet("data/parquet/thumbnail_curation.parquet", engine='pyarrow', filesystem=file_system, schema=tagging_schema)

In [None]:
# Final look at the slice; the cell that would write it back to storage is commented out above.
display(accepted_slice)