In [89]:
%%time

from adlfs import AzureBlobFileSystem

import time

import json
import pandas
import pandas as pd
from tqdm import tqdm

from common.schemas.pyarrow_schema import tagging_schema
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.captioning.azure_descriptions import AzureCaption
from common.schemas.pyarrow_schema import schema

# Register tqdm's pandas integration (e.g. `progress_apply`) using the label "Progress".
tqdm.pandas(desc="Progress")
# Shared filesystem handle; later cells pass it to the parquet reads/writes and use it
# to list and read the caption blobs.
# NOTE(review): 'data' is presumably the container / root name — confirm against
# AzureFileStorageAdapter.
file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()

# NOTE(review): this import sits in the middle of the cell; consider moving it up
# next to the other imports so all dependencies are visible in one place.
from common.data_frame_functions.functions import Functions

# Project helper object; not referenced by any of the cells visible in this section.
functions: Functions = Functions()

# Captioning client built on the shared filesystem; also not referenced by the cells
# visible in this section.
caption: AzureCaption = AzureCaption(file_system)

CPU times: total: 15.6 ms
Wall time: 10 ms


In [90]:
%%time

# Load the three previously persisted frames from blob storage. All reads share the
# same filesystem handle and parquet engine.
parquet_read_options = {'filesystem': file_system, 'engine': 'pyarrow'}

captions = pd.read_parquet('data/parquet/image_captions.parquet', **parquet_read_options)
tags = pd.read_parquet('data/parquet/image_tags.parquet', **parquet_read_options)
crops = pd.read_parquet('data/parquet/image_cropping.parquet', **parquet_read_options)

# Show the (rows, columns) shape of each stored frame as a baseline for the merge cells.
for stored_frame in (captions, tags, crops):
	display(stored_frame.shape)

(16720, 7)

(38085, 3)

(1872, 6)

CPU times: total: 141 ms
Wall time: 1.14 s


In [91]:
%%time

# fsspec filesystems keep directory listings in an in-memory cache, so re-running this
# cell in a long-lived kernel can return a stale listing and silently miss caption
# files uploaded since the first call. Clear the cache so the listing is fetched again.
file_system.invalidate_cache()

# `detail=False` is passed explicitly because the following cells treat every entry
# as a plain path string (they call `.split('/')` on it).
current_captions = file_system.ls("data/caption", detail=False)
display(len(current_captions))

1872

CPU times: total: 0 ns
Wall time: 3 ms


In [92]:
%%time

def _section_values(analysis: dict, section: str) -> list:
	"""Return the ``values`` list of one result section of a caption document.

	:param analysis: the parsed caption JSON document.
	:param section: top-level key of the wanted section, e.g. ``'tagsResult'``.
	:return: the section's ``values`` list, or an empty list when the section (or its
		``values`` entry) is missing or null, so one incomplete document does not
		abort the whole run with an opaque ``TypeError`` on ``None``.
	"""
	return (analysis.get(section) or {}).get('values') or []


# Build one flat record per caption file. Only the list-valued sections that the
# following cell normalises into the captions / tags / crops frames are kept.
all_data = []
for caption_file in tqdm(current_captions, total=len(current_captions), desc='Reading caption files'):
	caption_data = json.loads(file_system.read_text(caption_file, encoding='utf-8'))
	# Image id = file name without directory, cut at the FIRST dot.
	# NOTE(review): assumes ids never contain a dot — confirm against the naming scheme.
	image_id = caption_file.split('/')[-1].split('.')[0]
	all_data.append({
		"id": image_id,
		"dense_captions": _section_values(caption_data, 'denseCaptionsResult'),
		"tags": _section_values(caption_data, 'tagsResult'),
		"smart_crop": _section_values(caption_data, 'smartCropsResult'),
	})

Reading caption files: 100%|██████████| 1872/1872 [02:02<00:00, 15.26it/s]

CPU times: total: 18.7 s
Wall time: 2min 2s





In [93]:
%%time

# Flatten each list-valued section of the per-image records into its own frame:
# one row per list entry, the owning image `id` carried along as a meta column, and
# every record column prefixed with "<section>_".
new_captions, new_tags, new_crops = (
	pandas.json_normalize(
		data=all_data,
		record_path=[section],
		meta=['id'],
		record_prefix=f'{section}_',
	)
	for section in ('dense_captions', 'tags', 'smart_crop')
)

CPU times: total: 422 ms
Wall time: 421 ms


In [94]:
%%time

# Stack the freshly parsed dense captions on top of the stored ones and keep the first
# occurrence of every row. drop_duplicates() compares column values and ignores the
# index, so the temporary index (columns kept via drop=False) does not change which
# rows are removed; it is discarded again by reset_index.
# NOTE(review): de-duplication is therefore by full row, not by the key columns below —
# confirm that this is the intent.
caption_key_columns = ['id', 'dense_captions_text', 'dense_captions_confidence']
merged_captions = (
	pandas.concat([new_captions, captions])
	.set_index(keys=caption_key_columns, drop=False)
	.drop_duplicates()
	.reset_index(drop=True)
)

# Row-count difference between the merged frame and the frame loaded from storage.
added_caption_rows = merged_captions.shape[0] - captions.shape[0]
display(f'{added_caption_rows} new rows added to captions')

# Overwrite the stored parquet file, then read it back to show what was persisted.
captions_parquet_path = 'data/parquet/image_captions.parquet'
merged_captions.to_parquet(captions_parquet_path, filesystem=file_system, engine='pyarrow')
display(pandas.read_parquet(captions_parquet_path, filesystem=file_system, engine='pyarrow'))

'0 new rows added to captions'

Unnamed: 0,dense_captions_text,dense_captions_confidence,dense_captions_boundingBox.x,dense_captions_boundingBox.y,dense_captions_boundingBox.w,dense_captions_boundingBox.h,id
0,a woman taking a selfie,0.662057,0,0,1468,2608,1002cx2
1,a woman taking a selfie,0.665344,0,100,1447,2473,1002cx2
2,a woman wearing a black dress,0.473444,185,1443,1265,1150,1002cx2
3,a close up of a key,0.593095,716,1503,82,140,1002cx2
4,a close up of an eye,0.619898,778,744,152,84,1002cx2
...,...,...,...,...,...,...,...
16715,a close-up of a green plant,0.453059,680,1066,394,280,zzxq2l
16716,a mountain range in the distance,0.334624,498,277,568,99,zzxq2l
16717,a woman smiling at the camera,0.465720,244,293,319,530,zzxq2l
16718,a woman wearing a dress,0.548892,0,758,1048,570,zzxq2l


CPU times: total: 125 ms
Wall time: 653 ms


In [95]:
%%time

# Stack the freshly parsed tags on top of the stored ones and keep the first occurrence
# of every row. drop_duplicates() compares column values and ignores the index, so the
# temporary index (columns kept via drop=False) does not change which rows are removed;
# it is discarded again by reset_index.
# NOTE(review): de-duplication is therefore by full row, not by the key columns below —
# confirm that this is the intent.
tag_key_columns = ['id', 'tags_name', 'tags_confidence']
merged_tags = (
	pandas.concat([new_tags, tags])
	.set_index(keys=tag_key_columns, drop=False)
	.drop_duplicates()
	.reset_index(drop=True)
)

# Row-count difference between the merged frame and the frame loaded from storage.
added_tag_rows = merged_tags.shape[0] - tags.shape[0]
display(f'{added_tag_rows} new rows added to tags')

# Overwrite the stored parquet file, then read it back to show what was persisted.
tags_parquet_path = 'data/parquet/image_tags.parquet'
merged_tags.to_parquet(tags_parquet_path, filesystem=file_system, engine='pyarrow')
display(pandas.read_parquet(tags_parquet_path, filesystem=file_system, engine='pyarrow'))

'0 new rows added to tags'

Unnamed: 0,tags_name,tags_confidence,id
0,person,0.998358,1002cx2
1,human face,0.997763,1002cx2
2,clothing,0.990993,1002cx2
3,lady,0.981919,1002cx2
4,smile,0.964994,1002cx2
...,...,...,...
38080,woman,0.753262,zzxq2l
38081,beach,0.739799,zzxq2l
38082,standing,0.630927,zzxq2l
38083,girl,0.556561,zzxq2l


CPU times: total: 156 ms
Wall time: 828 ms


In [96]:
%%time

# Stack the freshly parsed smart crops on top of the stored ones and keep the first
# occurrence of every row. drop_duplicates() compares column values and ignores the
# index, so the temporary `id` index (column kept via drop=False) does not change which
# rows are removed; it is discarded again by reset_index.
# NOTE(review): de-duplication is therefore by full row, not by `id` alone — confirm
# that this is the intent.
crop_key_columns = ['id']
merged_crops = (
	pandas.concat([new_crops, crops])
	.set_index(keys=crop_key_columns, drop=False)
	.drop_duplicates()
	.reset_index(drop=True)
)

# Row-count difference between the merged frame and the frame loaded from storage.
added_crop_rows = merged_crops.shape[0] - crops.shape[0]
display(f'{added_crop_rows} new rows added to crops')

# Overwrite the stored parquet file, then read it back to show what was persisted.
crops_parquet_path = 'data/parquet/image_cropping.parquet'
merged_crops.to_parquet(crops_parquet_path, filesystem=file_system, engine='pyarrow')
display(pandas.read_parquet(crops_parquet_path, filesystem=file_system, engine='pyarrow'))

'0 new rows added to crops'

Unnamed: 0,smart_crop_aspectRatio,smart_crop_boundingBox.x,smart_crop_boundingBox.y,smart_crop_boundingBox.w,smart_crop_boundingBox.h,id
0,1.0,0,140,1462,1462,1002cx2
1,1.0,143,138,1249,1250,1003dod
2,1.0,0,355,1948,1948,1008ddt
3,1.0,0,224,1120,1120,100dcrd
4,1.0,0,0,1107,1107,100jl1b
...,...,...,...,...,...,...
1867,1.0,0,0,1025,1025,zzr9nz
1868,1.0,0,0,461,461,zzruf4
1869,1.0,0,0,2273,2273,zzu8i8
1870,1.0,135,0,1299,1299,zzvv3x


CPU times: total: 46.9 ms
Wall time: 433 ms
