In [None]:
import os

import pandas

import dask.dataframe as dd

from shared_code.utility.spark.set_environ import set_azure_env

set_azure_env()

from shared_code.utility.storage.table import TableAdapter

In [None]:
# Register a tqdm-backed Dask callback, labelled "global", so Dask
# computations report progress.
from tqdm.dask import TqdmCallback
cb = TqdmCallback(desc="global")
cb.register()

# Hook tqdm into pandas (this is what provides the `progress_apply` family).
# NOTE(review): a later cell re-imports `tqdm` from the plain `tqdm` package,
# which rebinds the name imported here.
from tqdm.auto import tqdm
tqdm.pandas(desc="global")

In [None]:
from tqdm import tqdm

class InnerProgressBar(tqdm):
	"""tqdm bar with a known total that worker functions advance one row at a time.

	Args:
		total: Expected number of `update()` calls (rows to process).
		desc: Label shown in front of the bar.
	"""

	def __init__(self, total, desc):
		# Hand `total` to tqdm itself: assigning `self.total` after construction
		# makes the first render an unbounded "0it" bar.
		super().__init__(total=total, desc=desc)
		self.current = 0

	def update_to(self):
		"""Advance the bar by `self.current`.

		NOTE(review): nothing in this class changes `self.current` from 0, so
		unless a caller sets it this is a no-op refresh.
		"""
		self.update(self.current)

In [None]:
# --- Storage configuration -------------------------------------------------
# Name of the table the training rows are read from.
table_name = "training"
table_adapter: TableAdapter = TableAdapter()

# --- Local parquet cache ---------------------------------------------------
# `data_path` already ends with a separator, so file names are appended as-is.
data_path = "/data/parquet/"
parquet_raw_data_path = f"{data_path}raw_data.parquet"
parquet_process_data_path = f"{data_path}processed_data.parquet"

In [None]:
# Create the parquet cache directory if needed (no error when it already exists).
os.makedirs(data_path, exist_ok=True)

# Report the directory and confirm it is present.
print(f"Data Path:\t{data_path}\n\nExists:\t{os.path.exists(data_path)}")

In [None]:
# Load the raw training rows, preferring the local parquet cache.
# The cache check must test the real cache file: the previous literal "foo"
# never exists, so the table was re-downloaded on every run.
# Delete `parquet_raw_data_path` to force a refresh from the table.
pandas_df = None
if os.path.exists(parquet_raw_data_path):
	print("Loading from parquet")
	pandas_df = pandas.read_parquet(parquet_raw_data_path)
else:
	print("Loading from table")
	raw_data = table_adapter.get_all_entities(table_name)
	pandas_df = pandas.DataFrame(list(raw_data))
	# Log the path actually written (the raw cache, not the processed file).
	print(f"Saving to parquet {parquet_raw_data_path}")
	pandas_df.to_parquet(parquet_raw_data_path)

In [None]:
# Show the (rows, columns) shape of the raw frame, then the frame itself.
display(f"Initial Dataframe Shape:\t{pandas_df.shape}")

display(pandas_df)

In [None]:
print("Normalizing Dataframe For Processing")

# Columns taken straight from the raw table frame.
subreddit = pandas_df["subreddit"]
image = pandas_df["image_name"]
text = pandas_df["text"]
hash_value = pandas_df["hash"]
original_caption = pandas_df["caption"]
thumbnail_caption = pandas_df["updated_caption"]
comment_id = pandas_df["id"]

# Build each on-disk path once, then derive the existence flag from that same
# path. Previously `original_image_exists` re-checked the *thumbnail* path, so
# it never reflected whether the original image was present.
thumbnail_path = pandas_df.apply(lambda x: f"D:\\data\\images\\{x['subreddit']}\\thumbnail\\{x['image_name']}", axis=1)
original_image_path = pandas_df.apply(lambda x: f"D:\\data\\images\\{x['subreddit']}\\{x['image_name']}", axis=1)

all_normalized_frame = pandas.DataFrame(
	{
		"subreddit": subreddit,
		"file_name": image,
		"text": text,
		"original_caption": original_caption,
		"thumbnail_path": thumbnail_path,
		"thumbnail_caption": thumbnail_caption,
		"thumbnail_exists": thumbnail_path.apply(os.path.exists),
		"original_image": original_image_path,
		"original_image_exists": original_image_path.apply(os.path.exists),
		"hash": hash_value,
		"id": comment_id
	}
)
display(f"Normalized Dataframe Shape:\t{all_normalized_frame.shape}")
display(f"Saving to parquet {parquet_process_data_path}")
all_normalized_frame.to_parquet(parquet_process_data_path)

In [None]:
# Re-read the normalized frame from the processed parquet file, replacing the
# in-memory `all_normalized_frame`, and show it.
display(f"Reading from parquet {parquet_process_data_path}")
all_normalized_frame = pandas.read_parquet(parquet_process_data_path)
display(all_normalized_frame)

In [None]:
display("Filtering Subreddits with Images")
# Keep only rows where both existence flags are True.
filtered_on_exist = all_normalized_frame.loc[
	all_normalized_frame["thumbnail_exists"].eq(True)
	& all_normalized_frame["original_image_exists"].eq(True)
]

# Per-subreddit row counts, shown largest first, followed by the grand total.
total_images_display = (
	filtered_on_exist
	.groupby("subreddit")
	.size()
	.reset_index(name="count")
)
display(total_images_display.sort_values("count", ascending=False))
display(f"Total Records {total_images_display['count'].sum()}")

In [None]:
# Show the rows that passed the thumbnail/original existence filter.
display("Data With Known Images")
display(filtered_on_exist)

In [None]:
from shared_code.utility.scripts import blip_caption
# NOTE: this rebinds `blip_caption` from the imported module to a BlipCaption
# instance; later cells call methods on the instance through this name.
# NOTE(review): the meaning of the constructor argument `1` is not visible
# here — confirm against BlipCaption.
blip_caption = blip_caption.BlipCaption(1)

In [None]:
def caption_image(image_path) -> str:
	"""Caption one image through the shared `blip_caption` instance.

	Args:
		image_path: Passed unchanged to `blip_caption.caption_image`.

	Returns:
		The generated caption, or the sentinel "bruh" if captioning raised
		(the error is printed, not re-raised).
	"""
	try:
		return blip_caption.caption_image(image_path)
	except Exception as err:
		print(f":: Error in caption_image: {err}")
	return "bruh"

In [None]:
def wrapper_for_captions(row: object) -> str:
	"""Return a usable caption for one row, generating one when none is stored.

	Reads `original_caption` and `original_image` from `row`, advances the
	shared `progress` bar, and keeps the stored caption only when it is a
	string longer than 5 characters. Otherwise the image is captioned through
	`blip_caption.caption_image`.

	Args:
		row: Mapping-like row (e.g. a pandas Series) with `original_caption`
			and `original_image` entries.

	Returns:
		The stored or generated caption, or the sentinel "bruh" if anything
		raised (the error is printed, not re-raised).
	"""
	bruh = "bruh"
	try:
		caption = row["original_caption"]
		image_path = row["original_image"]
		progress.update()

		# Missing captions arrive as None *or* NaN. NaN is truthy and has no
		# len(), so a plain truthiness test made those rows fail with a
		# TypeError and come back as "bruh" instead of being captioned.
		if isinstance(caption, str) and len(caption) > 5:
			return caption
		return blip_caption.caption_image(image_path)
	except Exception as e:
		print(e)
		return bruh

In [None]:
def create_tokens(row: object):
	"""Tokenise a row's `original_caption` with `blip_caption.get_nlk_tokens`.

	The shared `progress` bar is advanced once the caption has been read.

	Args:
		row: Mapping-like row with an `original_caption` entry.

	Returns:
		Whatever `get_nlk_tokens` returns, or the sentinel "bruh" if anything
		raised (the error is printed, not re-raised).
	"""
	fallback = "bruh"
	try:
		caption_text = row["original_caption"]
		progress.update()
		return blip_caption.get_nlk_tokens(caption_text)
	except Exception as err:
		print(err)
	return fallback

In [None]:
# Work on a copy so `filtered_on_exist` is left untouched, and wrap it in a
# 12-partition Dask dataframe for the captioning step.
temp = filtered_on_exist.copy()
ddf = dd.from_pandas(temp, npartitions=12)
display(temp)

In [None]:
# Module-level bar that `wrapper_for_captions` advances once per row.
progress: tqdm = InnerProgressBar(len(temp), "Captioning-Primary-Images")
display(f"Total Images: {progress.total}")
# Apply the captioning function row-wise through Dask and store the result
# in a scratch column.
temp['new_column'] = (
	ddf
	.apply(wrapper_for_captions, axis=1, meta=('str', object))
	.compute()
)

In [None]:
display("=== Before Drop ===")
display(temp)
# Replace the stored captions with the freshly computed ones: remove the old
# column, then give the scratch column its name (it stays the last column).
temp.drop(columns=["original_caption"], inplace=True)
temp.rename(columns={"new_column": "original_caption"}, inplace=True)
display("=== After Drop ===")
display(temp)

In [None]:
# Overwrite the processed parquet file with the re-captioned frame, then drop
# the working copy; the next cell reloads it from disk.
display(f"Saving to parquet {parquet_process_data_path}")
temp.to_parquet(parquet_process_data_path)
del temp

In [None]:
# Reload the frame that was just written, now carrying the updated
# `original_caption` column.
display(f"Reading from parquet {parquet_process_data_path} with Updated Primary Captions")
processed_with_captions = pandas.read_parquet(parquet_process_data_path)
display(processed_with_captions)

In [None]:
# Fresh working copy for the token step, wrapped in a 6-partition Dask dataframe.
temp = processed_with_captions.copy()
ddf = dd.from_pandas(temp, npartitions=6)

In [None]:
# Module-level bar that `create_tokens` advances once per row.
progress = InnerProgressBar(len(temp), "Captioning-Tokens-For-Image")
display(f"Total Images: {progress.total}")
# Tokenise every row's caption through Dask and store the result in a
# scratch column.
temp['new_column'] = (
	ddf
	.apply(create_tokens, axis=1, meta=('str', object))
	.compute()
)

In [None]:
display("=== Before Drop ===")
display(temp)
# Replace `thumbnail_caption` with the token output: remove the old column,
# then give the scratch column its name (it stays the last column).
temp.drop(columns=["thumbnail_caption"], inplace=True)
temp.rename(columns={"new_column": "thumbnail_caption"}, inplace=True)
display("=== After Drop ===")
display(temp)

In [None]:
# Overwrite the processed parquet file with the tokenised frame, then drop the
# working copy; the next cell reloads it from disk.
display(f"Saving to parquet {parquet_process_data_path} with Updated Thumbnail Captions")
temp.to_parquet(parquet_process_data_path)
del temp

In [None]:
# Reload the frame that was just written, now carrying the updated
# `thumbnail_caption` column.
display(f"Reading from parquet {parquet_process_data_path} with Updated Thumbnail Captions")
processed_with_captions_more = pandas.read_parquet(parquet_process_data_path)
display(processed_with_captions_more)

In [None]:
display("Filtering Subreddits with Images By original_caption")
# Drop rows whose caption is the "bruh" failure sentinel or is missing.
# (`isna` and `isnull` are aliases, so "~isna | ~isnull" is simply `notna`.)
filtered_captions = processed_with_captions_more.loc[
	processed_with_captions_more["original_caption"].ne("bruh")
	& processed_with_captions_more["original_caption"].notna()
]

# Per-subreddit row counts, shown largest first, followed by the grand total.
filtered_captions_display = (
	filtered_captions
	.groupby("subreddit")
	.size()
	.reset_index(name="count")
)

display(filtered_captions_display.sort_values("count", ascending=False))
display(f"Total Records {filtered_captions_display['count'].sum()}")

In [None]:
display("Filtering Subreddits with Images By thumbnail_caption")
# Build the mask from the frame being filtered. The mask used to come from
# `processed_with_captions_more`, whose index is a superset of
# `filtered_captions`, so pandas had to reindex the boolean key to make the
# selection work.
filtered_captions_by_thumbnail = filtered_captions[
	(filtered_captions["thumbnail_caption"] != "bruh") &
	filtered_captions["thumbnail_caption"].notna()
	]
# Per-subreddit row counts, shown largest first, followed by the grand total.
filtered_captions_by_thumbnail_display = filtered_captions_by_thumbnail.groupby("subreddit").size().reset_index(name="count")
display(filtered_captions_by_thumbnail_display.sort_values("count", ascending=False))
display(f"Total Records {filtered_captions_by_thumbnail_display['count'].sum()}")

In [None]:
display("Updating Cloud Storage With Filtered Captioned Images")
def update_cloud(row):
	"""Write one row's captions back to its entity in the "training" table.

	Fetches the entity whose row key is `row["id"]` (the partition key is
	"training"), sets its `caption` and `updated_caption` fields from the row's
	`original_caption` and `thumbnail_caption`, and upserts it. The shared
	`progress` bar is advanced only after a successful upsert.

	Args:
		row: Mapping-like row with `id`, `original_caption` and
			`thumbnail_caption` entries.

	Returns:
		Always the string "bruh"; failures are printed, not raised.
	"""
	table = "training"
	try:
		adapter = TableAdapter()
		partition = "training"
		# Read every field before touching the service so a malformed row
		# fails without a network round-trip.
		key = row["id"]
		new_caption = row["original_caption"]
		new_updated_caption = row["thumbnail_caption"]
		record = adapter.get_entity(table, partition, key)
		record["caption"] = new_caption
		record["updated_caption"] = new_updated_caption
		adapter.upsert_entity_to_table(table, record)
		progress.update()
	except Exception as err:
		print(err)
	return "bruh"

In [71]:
import json


def _tagged_tokens_to_text(row):
	"""Flatten one row's `thumbnail_caption` token groups into a single string.

	Each group is joined with "|" and JSON-quoted; the groups are then joined
	with "," (e.g. `"cars|NNS","are|VBP"`).
	"""
	return ",".join(
		json.dumps("|".join(group.tolist()))
		for group in row['thumbnail_caption'].tolist()
	)


temp = processed_with_captions_more.copy()
# Compute the flattened text in a scratch column, then swap it in for the
# array-valued column.
temp['foo'] = temp.apply(_tagged_tokens_to_text, axis=1)
temp.drop(columns=["thumbnail_caption"], inplace=True)
temp.rename(columns={"foo": "thumbnail_caption"}, inplace=True)
display(temp)

upload to cloud:  11%|█▏        | 892/7888 [03:39<41:45,  2.79it/s][A
upload to cloud:  11%|█▏        | 893/7888 [03:39<42:56,  2.72it/s][A
upload to cloud:  11%|█▏        | 896/7888 [03:40<40:11,  2.90it/s][A
upload to cloud:  11%|█▏        | 898/7888 [03:40<53:00,  2.20it/s][A
upload to cloud:  11%|█▏        | 899/7888 [03:40<1:09:11,  1.68it/s][A
upload to cloud:  11%|█▏        | 899/7888 [03:40<1:09:11,  1.68it/s][A
upload to cloud:  11%|█▏        | 900/7888 [03:40<1:31:57,  1.27it/s][A

Unnamed: 0,subreddit,file_name,text,thumbnail_path,thumbnail_exists,original_image,original_image_exists,hash,id,original_caption,thumbnail_caption
0,CityPorn,4emw5uldib9a1.jpg,New York in the fog,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,D:\data\images\CityPorn\4emw5uldib9a1.jpg,True,7a8d96e378c15c8ab8440ac311f12c11,1000cej,cars are parked on the side of the road in the...,"""cars|NNS"",""are|VBP"",""parked|VBN"",""on|IN"",""the..."
3,AmIhotAF,4xyb1vgbjb9a1.jpg,Just looking for entertainment,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,True,e554c1ed7ffa2740436ac082068b2824,1000glf,blonde woman with blonde hair and tattoos on h...,"""blonde|NNS"",""woman|NN"",""with|IN"",""hair|NN"",""a..."
4,greentext,3mewbe0wjb9a1.jpg,Anon wants Elon cut,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,D:\data\images\greentext\3mewbe0wjb9a1.jpg,True,1dec3dabb5e46cde01855d06089c287a,1000j1n,a man with a beard and a beard sitting in fron...,"""man|NN"",""with|IN"",""beard|NN"",""and|CC"",""sittin..."
5,spaceporn,7s5aafaqkb9a1.jpg,Northern Lights above Lofoten,D:\data\images\spaceporn\thumbnail\7s5aafaqkb9...,True,D:\data\images\spaceporn\7s5aafaqkb9a1.jpg,True,2c39ce1290fba541abd0b004b09da6b2,1000mjs,a view of a view of a large green and purple a...,"""view|NN"",""of|IN"",""large|JJ"",""green|JJ"",""and|C..."
7,spaceporn,abojw7lqlb9a1.jpg,Viking Lights,D:\data\images\spaceporn\thumbnail\abojw7lqlb9...,True,D:\data\images\spaceporn\abojw7lqlb9a1.jpg,True,0f72de47c69ff50eca5fa3990215f4ac,1000qpd,a scene of a boat is sitting on the shore of a...,"""scene|NN"",""of|IN"",""boat|NN"",""is|VBZ"",""sitting..."
...,...,...,...,...,...,...,...,...,...,...,...
11724,spaceporn,abwhhq0w8b9a1.jpg,Polaris to Cassiopeia on a cloudy night.,D:\data\images\spaceporn\thumbnail\abwhhq0w8b9...,True,D:\data\images\spaceporn\abwhhq0w8b9a1.jpg,True,f5973637fc56360c15818ba0ca1f7ffa,zzz6dp,starrdust sky with a few stars and a few stars,"""starrdust|NN"",""sky|NN"",""with|IN"",""few|JJ"",""st..."
11725,spaceporn,7hzipg1bab9a1.jpg,The hunt for habitable ocean worlds beyond our...,D:\data\images\spaceporn\thumbnail\7hzipg1bab9...,True,D:\data\images\spaceporn\7hzipg1bab9a1.jpg,True,5b22bea7582229c1f9b992176a2ca2c6,zzzcn5,a picture taken from the earth's surface of th...,"""picture|NN"",""taken|VBN"",""from|IN"",""the|DT"",""e..."
11726,greentext,bgho6WK.jpg,Anon does a little trolling,D:\data\images\greentext\thumbnail\bgho6WK.jpg,True,D:\data\images\greentext\bgho6WK.jpg,True,df666b8b2ad543c77b3fdba89becda1a,zzzeoi,a screenshote of a text message from a man who...,"""screenshote|NN"",""of|IN"",""text|JJ"",""message|NN..."
11728,trippinthroughtime,arCpzQ0.jpg,He didn't shed light on the topic I guess.,D:\data\images\trippinthroughtime\thumbnail\ar...,True,D:\data\images\trippinthroughtime\arCpzQ0.jpg,True,5007b937974ae333022c0c91b795ca09,zzzlbf,a man in a red dress and a woman in a red dress,"""man|NN"",""in|IN"",""red|JJ"",""dress|NN"",""and|CC"",..."



upload to cloud:  11%|█▏        | 900/7888 [03:41<1:31:57,  1.27it/s][A


In [72]:
# Module-level bar that `update_cloud` advances after each successful upsert.
progress = InnerProgressBar(len(processed_with_captions_more), "upload to cloud")
# Push every row of `temp` to the table through 12 Dask partitions; the
# per-row return values are discarded.
ddf = dd.from_pandas(temp, npartitions=12)
ddf.apply(update_cloud, axis=1, meta=('str', object)).compute()
print("Process Complete: Items Updated")

[A
upload to cloud:  15%|█▍        | 1168/7888 [03:56<26:50,  4.17it/s][A
upload to cloud:  15%|█▍        | 1169/7888 [03:56<28:48,  3.89it/s][A
upload to cloud:  15%|█▍        | 1170/7888 [03:56<24:09,  4.63it/s][A
upload to cloud:  15%|█▍        | 1170/7888 [03:56<24:09,  4.63it/s][A
upload to cloud:  15%|█▍        | 1170/7888 [03:56<24:09,  4.63it/s][A
upload to cloud:  15%|█▍        | 1172/7888 [03:56<31:11,  3.59it/s][A
upload to cloud:  15%|█▍        | 1175/7888 [03:56<16:48,  6.66it/s][A
upload to cloud:  15%|█▍        | 1176/7888 [03:56<20:38,  5.42it/s][A
upload to cloud:  15%|█▍        | 1177/7888 [03:56<22:02,  5.07it/s][A
upload to cloud:  15%|█▍        | 1177/7888 [03:56<22:02,  5.07it/s][A
upload to cloud:  15%|█▍        | 1178/7888 [03:56<23:25,  4.77it/s][A
upload to cloud:  15%|█▍        | 1179/7888 [03:56<24:52,  4.50it/s][A
upload to cloud: 0it [00:00, ?it/s]180/7888 [03:56<28:14,  3.96it/s][A
upload to cloud:  15%|█▍        | 1181/7888 [03:56<25:10,  4

global:   0%|          | 0/12 [00:00<?, ?it/s]

upload to cloud: 14595it [11:33,  4.07it/s]                         

Process Complete: Items Updated
