In [None]:
%%time

# Banner shown in the cell output to mark the start of this notebook's run.
display("=== Starting 0-2 Blip Image Captioning ===")

import datetime as dt
import os

import pandas
import pandas as pd
from tqdm import tqdm

from common.captioning.caption import BlipCaption
from common.schemas.pyarrow_schema import schema
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.functions.functions import Functions

# Register tqdm's pandas integration (adds DataFrame.progress_apply, labelled "Progress").
tqdm.pandas(desc="Progress")

# File-storage handle for 'data'; later cells pass it as `filesystem=` to the
# parquet reads and writes.
_storage_adapter = AzureFileStorageAdapter('data')
file_system = _storage_adapter.get_file_storage()
del _storage_adapter

# Shared helper object; the captioning cell calls functions.apply_caption on it.
functions: Functions = Functions()

In [None]:
%%time

import torch

# Build the two captioner instances that the captioning cell hands to
# functions.apply_caption as [caption_0, caption_1].
if torch.cuda.is_available():
    display("=== Loading CUDA ===")
    # Only address "cuda:1" when a second GPU is actually visible. On a
    # single-GPU host both instances share cuda:0, mirroring the CPU branch,
    # which also places both instances on one device.
    # NOTE(review): presumably BlipCaption moves its model to the given device
    # and would fail on a non-existent ordinal -- confirm against BlipCaption.
    second_device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
    caption_0 = BlipCaption("cuda:0")
    caption_1 = BlipCaption(second_device)
else:
    display("=== Loading CPU ===")
    caption_0 = BlipCaption("cpu")
    caption_1 = BlipCaption("cpu")

In [None]:
%%time

# Rows staged for captioning. The frame is indexed by "id" and the "id"
# column is kept as well (drop=False) because later cells filter on it.
ready_to_caption = pandas.read_parquet(
    'data/temp/caption',
    engine='pyarrow',
    filesystem=file_system,
    schema=schema,
).set_index("id", drop=False)

# Current contents of the primary caption table, indexed the same way.
extant_curate = pandas.read_parquet(
    'data/parquet/primary_caption.parquet',
    engine='pyarrow',
    filesystem=file_system,
    schema=schema,
).set_index("id", drop=False)

display("=== Extant Data ===")
display(extant_curate)

# Banner text corrected: it previously read "Read To Caption".
display("=== Ready To Caption ===")
display(ready_to_caption)

In [None]:
%%time

# Keep only rows flagged `exists` whose caption is present but still the
# empty string; everything else is dropped from the working set.
_caption = ready_to_caption['caption']
_needs_caption = (
    (_caption == "")
    & (_caption.notnull())
    & (ready_to_caption['exists'] == True)
)

# Rebind to an independent copy of the filtered rows.
ready_to_caption = ready_to_caption.loc[_needs_caption].copy()
del _caption, _needs_caption

display("== Filtered On Existing Caption ==")
display(ready_to_caption.shape)
display(ready_to_caption)

In [None]:
%%time

# Remove columns that are entirely NA, discard the old index, and re-index by
# "id" while keeping the "id" column; the result is an independent copy.
# NOTE(review): on a frame with zero rows, dropna(axis=1, how='all') appears to
# remove every column, so set_index("id") would raise -- confirm whether an
# empty batch needs a guard here.
ready_to_caption = (
    ready_to_caption
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
    .set_index("id", drop=False)
    .copy()
)

display("== Dropped And Reset Data ==")
display(ready_to_caption.shape)
display(ready_to_caption)

In [None]:
%%time

# Ids already present in the extant caption table (its index is "id").
extant_ids = extant_curate.index.values.tolist()

# Keep only rows whose id is not in the extant table, then drop all-NA columns
# and rebuild the "id" index (column kept) on an independent copy.
# NOTE(review): if no new ids remain, the zero-row frame appears to lose every
# column in dropna(how='all') and set_index("id") would raise -- confirm
# whether an empty batch needs a guard here.
_is_new = ~ready_to_caption['id'].isin(extant_ids)
ready_to_caption = (
    ready_to_caption.loc[_is_new]
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
    .set_index("id", drop=False)
    .copy()
)
del _is_new

display("== Dropped And Reset Data ==")
display(ready_to_caption.shape)
display(ready_to_caption)

In [None]:
%%time

def _caption_row(row):
    """Return functions.apply_caption's result for one row, passing both captioners."""
    # A fresh two-element list is built for every row.
    return functions.apply_caption(row, [caption_0, caption_1])


# Row-wise apply with a tqdm progress bar; the result overwrites the "caption" column.
ready_to_caption['caption'] = ready_to_caption.progress_apply(_caption_row, axis=1)

display("== Data With Captions ==")
display(ready_to_caption.shape)
display(ready_to_caption)

In [None]:
%%time

# After captioning: drop columns that are entirely NA, discard the old index,
# and re-index by "id" (column kept). The frame is rebound to an independent copy.
ready_to_caption = (
    ready_to_caption
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
    .set_index("id", drop=False)
    .copy()
)

display("== Dropped And Reset Data After Captioning ==")
display(ready_to_caption.shape)
display(ready_to_caption)

In [None]:
%%time

# Stack the extant table on top of the newly captioned rows, drop columns that
# are entirely NA, and re-index the combined frame by "id" (column kept).
# `concat` is the frame that gets written back to the primary caption file.
concat = (
    pandas.concat([extant_curate, ready_to_caption])
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
    .set_index("id", drop=False)
    .copy()
)

display("== Dropped And Reset Data ==")
display(concat.shape)
display(concat)

In [None]:
%%time

# Back-up path: the primary file name suffixed with the current POSIX
# timestamp (a float) so each run writes a distinct file.
back_up_name = f"data/parquet/primary_caption_{dt.datetime.timestamp(dt.datetime.now())}.parquet"

display(f"== Writing Back-Up {back_up_name} ==")

# Re-read the primary caption file as it currently stands and write that
# snapshot to the back-up path.
current = pd.read_parquet(
    'data/parquet/primary_caption.parquet',
    engine='pyarrow',
    schema=schema,
    filesystem=file_system,
)
current.to_parquet(
    back_up_name,
    filesystem=file_system,
    schema=schema,
)

display(current.shape)
display(current)

In [None]:
%%time

# Overwrite the primary caption file with the combined frame, then read it
# straight back so the displayed result reflects what was actually stored.
_primary_path = "data/parquet/primary_caption.parquet"
concat.to_parquet(_primary_path, filesystem=file_system, schema=schema)
new = pandas.read_parquet(
    _primary_path,
    engine='pyarrow',
    filesystem=file_system,
    schema=schema,
)
del _primary_path

display("== Updating Ready To Curate With All New Data ==")
display(new.shape)
display(new)

In [None]:
# Shell escape: runs `jupyter notebook stop` as the notebook's final step.
# NOTE(review): presumably meant to shut the server down after an unattended run -- confirm.
!jupyter notebook stop