In [None]:
%%time

# Announce the start of this notebook step in the cell output.
display("=== Starting 0-3 Azure Image Captioning ===")

import json

import pandas
import pandas as pd
from adlfs import AzureBlobFileSystem
from tqdm import tqdm

from common.captioning.azure_descriptions import AzureCaption
from common.functions.functions import Functions
from common.schemas.pyarrow_schema import schema
from common.storage.azure_file_storage import AzureFileStorageAdapter
import os

# Register tqdm with pandas so DataFrames gain `progress_apply` (used by the captioning cell).
tqdm.pandas(desc="Progress")

# Blob file system handle obtained from the project storage adapter for 'data'.
storage_adapter = AzureFileStorageAdapter('data')
file_system: AzureBlobFileSystem = storage_adapter.get_file_storage()

# Shared helper object; later cells call `functions.add_source` on each row.
functions: Functions = Functions()

In [None]:
%%time

# Read the curated parquet dataset through the blob file system and index it by "id"
# (the "id" column is kept alongside the index).
curated_data = pandas.read_parquet(
    "data/parquet/back.parquet",
    engine="pyarrow",
    filesystem=file_system,
)
curated_data = curated_data.set_index("id", drop=False)

# Keep only accepted rows, restricted to the schema columns, drop rows with
# missing values and renumber the remaining rows from zero.
is_accepted = curated_data["accept"].eq(True)
filtered = (
    curated_data.loc[is_accepted, schema.names]
    .dropna()
    .reset_index(drop=True)
)

display("== Filtered By Acceptance ==")
display(f"Shape: {filtered.shape}")
display(filtered)

In [None]:
%%time

# Model name -> subreddits assigned to that model.
# Dict insertion order is preserved, so the derived record list keeps this ordering.
subreddits_by_model = {
    "CityDiffusion": ["CityPorn"],
    "NatureDiffusion": ["EarthPorn"],
    "CosmicDiffusion": ["spaceporn"],
    "ITAPDiffusion": ["itookapicture"],
    "MemeDiffusion": ["memes"],
    "TTTDiffusion": ["trippinthroughtime"],
    "WallStreetDiffusion": ["wallstreetbets"],
    "SexyDiffusion": [
        "selfies",
        "Amicute",
        "amihot",
        "AmIhotAF",
        "HotGirlNextDoor",
        "sexygirls",
        "PrettyGirls",
        "gentlemanboners",
        "hotofficegirls",
        "tightdresses",
        "DLAH",
        "cougars_and_milfs_sfw",
    ],
    "FatSquirrelDiffusion": ["fatsquirrelhate"],
    "CelebrityDiffusion": ["celebrities"],
    "OldLadyDiffusion": ["oldladiesbakingpies"],
    "SWFPetite": ["sfwpetite"],
    "RedHeadDiffusion": ["SFWRedheads"],
    "NextDoorGirlsDiffusion": ["SFWNextDoorGirls"],
    "SexyAsianDiffusion": [
        "realasians",
        "KoreanHotties",
        "prettyasiangirls",
        "AsianOfficeLady",
        "AsianInvasion",
    ],
    "MildlyPenisDiffusion": ["mildlypenis"],
    "CandleDiffusion": ["bathandbodyworks"],
}

# One {"name", "data"} record per model; this list is what gets passed to `functions.add_source`.
sources = [
    {"name": model_name, "data": subreddit_names}
    for model_name, subreddit_names in subreddits_by_model.items()
]

# Tabular view of the same records, used for display.
sources_df = pd.DataFrame.from_records(sources)

# Show a heading followed by the sources table (display renders each argument in order).
display("== Loaded Sources ==", sources_df)

In [None]:
%%time

# Store the value returned by `functions.add_source` for every row in the 'model' column.
filtered['model'] = filtered.apply(
    lambda row: functions.add_source(row, sources),
    axis=1,
)

# Keep rows whose model is not the empty string, restricted to the schema columns,
# drop rows with missing values and renumber the remaining rows from zero.
has_model = filtered['model'].ne("")
filtered_model = (
    filtered.loc[has_model, schema.names]
    .dropna()
    .reset_index(drop=True)
)

display("== Filtered By Model Type ==")
display(f'Shape: {filtered_model.shape}')
display(filtered_model)

In [None]:
%%time

# Count rows per subreddit (via the "id" column) and order the busiest subreddits first.
subreddit_ids = filtered_model[["id", "subreddit"]]
group = subreddit_ids.groupby("subreddit").count()
group = group.sort_values(by="id", ascending=False)

# Bar chart of the per-subreddit counts.
plot = group.plot.bar(
    figsize=(20, 10),
    title="Subreddits with most posts",
    legend=True,
)

display(plot)

In [None]:
%%time

# Count rows per model and order the largest models first.
# Both "id" and "subreddit" are counted, so the chart carries one series for each.
model_columns = filtered_model[["id", "model", "subreddit"]]
group = model_columns.groupby(["model"]).count()
group = group.sort_values(by="id", ascending=False)

# Bar chart of the per-model counts.
plot_1 = group.plot.bar(
    figsize=(20, 10),
    title="Models with most images",
    legend=True,
)

display(plot_1)

In [None]:
def handle_captioning(x: object, _file_system: AzureBlobFileSystem, _current_captions: list) -> str:
    """
    Create (or reuse) the caption JSON file for a single image record.

    The image is analysed through ``AzureCaption.image_analysis`` and the raw JSON
    result is uploaded to ``data/caption/{id}.json``. Records whose caption path is
    already present in ``_current_captions`` are skipped without calling the service.

    :param x: The record to process; must support ``x['path']`` and ``x['id']``
    :param _file_system: The instance of an adlfs file system used to resolve the image URL and to upload the caption
    :param _current_captions: Caption paths that already exist (any container supporting ``in``; a set gives the fastest lookups)
    :return: The caption path on success or when it already exists, otherwise an empty string
    """
    temp_json_name = "temp.json"
    caption: AzureCaption = AzureCaption(_file_system)
    try:
        _path: str = x['path']
        _id = x['id']
        out_path = f"data/caption/{_id}.json"

        # Use the collection passed in by the caller, not a notebook-level global.
        if out_path in _current_captions:
            display(f"Captioning Exists For {_id}", clear=True)
            return out_path

        # Resolve the image URL only for records that actually need captioning,
        # and through the file system that was passed in.
        _remote_path: str = _file_system.url(_path)

        _output = caption.image_analysis(_remote_path)

        if _output is None:
            display(f"Error In Output is empty for {_id}", clear=True)
            return ""

        _json_result = _output.json_result

        if _json_result is None:
            display(f"Error In Json Result is empty for {_id}", clear=True)
            return ""

        if json.loads(_json_result).get('error'):
            display(f"Error In Json Resul with: {_json_result} for {_id}", clear=True)
            return ""

        display(f"Writing Captioning For {_id}", clear=True)
        with open(temp_json_name, 'w', encoding='utf-8') as _handle:
            _handle.write(_json_result)

        # Upload only after the handle is closed: while it is open the JSON may still
        # sit in the write buffer, and the uploaded file could be empty or truncated.
        _file_system.upload(temp_json_name, out_path, overwrite=True)
        return out_path
    except Exception as ex:
        # Best effort per record: report the failure and let the caller continue with the next row.
        display(f"Error in handle_captioning with exception {ex}", clear=True)
        return ""
    finally:
        # Always remove the local scratch file, whether or not the upload happened.
        if os.path.exists(temp_json_name):
            os.remove(temp_json_name)

In [None]:
# Snapshot of the caption paths currently listed under data/caption (newlines stripped).
# Kept as a set because the captioning step tests membership once per row;
# a set lookup is O(1) whereas a list would be scanned on every row.
current_captions = {item.replace('\n', '') for item in file_system.ls("data/caption")}
display(f"Total Number Of Caption Files - {len(current_captions)}")

In [None]:
%%time

# Caption every row with a tqdm progress bar; the value returned by
# `handle_captioning` for each row is stored in 'azure_success'.
caption_results = filtered_model.progress_apply(
    lambda row: handle_captioning(row, file_system, current_captions),
    axis=1,
)
filtered_model['azure_success'] = caption_results

display(filtered_model)

In [None]:
%%time

# Report how many caption files are listed under data/caption at the end of the run.
caption_file_count = len(file_system.ls('data/caption'))
display(f"Total Number Of Caption Files - {caption_file_count}")

# Completion banner uses the same step label as the start banner of this notebook
# (previously it carried the label of a different step, "0-2 Azure Image Analysis").
display("0-3 Azure Image Captioning Process Complete - Shutting Down")

In [None]:
# Shell escape: runs `jupyter notebook stop` once all previous cells have finished.
# NOTE(review): with no port argument this presumably targets the default server port — confirm for this deployment.
!jupyter notebook stop