In [None]:
%%time

import json
import time

import pandas
import pandas as pd
from adlfs import AzureBlobFileSystem
from tqdm import tqdm

from common.captioning.azure_descriptions import AzureCaption
from common.functions.functions import Functions
from common.schemas.pyarrow_schema import schema
from common.storage.azure_file_storage import AzureFileStorageAdapter

# Enable tqdm's pandas integration; progress bars are labelled "Progress".
tqdm.pandas(desc="Progress")

# Filesystem handle produced by the project's storage adapter for 'data'.
# NOTE(review): 'data' is presumably the container/share name -- confirm against the adapter.
storage_adapter = AzureFileStorageAdapter('data')
file_system: AzureBlobFileSystem = storage_adapter.get_file_storage()

# Project helper collection; later cells call functions.add_source(...).
functions: Functions = Functions()

# Captioning client bound to the same filesystem; later cells call caption.image_analysis(...).
caption: AzureCaption = AzureCaption(file_system)

In [None]:
%%time

# Read the curated table through the remote filesystem handle.
curated_data = pd.read_parquet("data/parquet/back.parquet", engine="pyarrow", filesystem=file_system)

# Index by "id" while keeping "id" available as a regular column (drop=False).
curated_data = curated_data.set_index("id", drop=False)

# Keep accepted rows, restricted to the columns named by the schema, drop rows
# with any missing value and renumber the index from 0.
# Built as a single non-mutating chain: calling dropna/reset_index with
# inplace=True on a `.loc` slice can raise SettingWithCopyWarning and makes
# later column assignment on `filtered` ambiguous.
filtered = (
    curated_data
    .loc[curated_data["accept"].eq(True), schema.names]
    .dropna()
    .reset_index(drop=True)
)

display(filtered.shape)

display(filtered)

In [None]:
%%time

# Mapping of model name -> list of subreddit names feeding that model.
# Insertion order is significant: it determines the order of `sources`.
_subreddits_by_model = {
    "CityDiffusion": ["CityPorn"],
    "NatureDiffusion": ["EarthPorn"],
    "CosmicDiffusion": ["spaceporn"],
    "ITAPDiffusion": ["itookapicture"],
    "MemeDiffusion": ["memes"],
    "TTTDiffusion": ["trippinthroughtime"],
    "WallStreetDiffusion": ["wallstreetbets"],
    "SexyDiffusion": [
        "selfies", "Amicute", "amihot", "AmIhotAF", "HotGirlNextDoor", "sexygirls", "PrettyGirls",
        "gentlemanboners", "hotofficegirls", "tightdresses", "DLAH",
    ],
    "FatSquirrelDiffusion": ["fatsquirrelhate"],
    "CelebrityDiffusion": ["celebrities"],
    "OldLadyDiffusion": ["oldladiesbakingpies"],
    # NOTE(review): "SWFPetite" looks like a transposition of "SFWPetite" -- confirm before renaming.
    "SWFPetite": ["sfwpetite"],
    "SFWMilfs": ["cougars_and_milfs_sfw"],
    "RedHeadDiffusion": ["SFWRedheads"],
    "NextDoorGirlsDiffusion": ["SFWNextDoorGirls"],
    "SexyAsianDiffusion": [
        "realasians", "KoreanHotties", "prettyasiangirls", "AsianOfficeLady", "AsianInvasion",
    ],
    "MildlyPenisDiffusion": ["mildlypenis"],
    "CandleDiffusion": ["bathandbodyworks"],
}

# Record form used by the rest of the notebook: one {"name", "data"} dict per model.
sources = [
    {"name": model_name, "data": subreddit_names}
    for model_name, subreddit_names in _subreddits_by_model.items()
]

# Tabular view of the same records (columns "name" and "data").
sources_df = pd.DataFrame.from_records(sources)

In [None]:
%%time

# Label each row with a model via the project helper, passing the row and the
# `sources` records. `assign` returns a new frame instead of writing a column
# into the frame produced by an earlier cell, so re-running this cell is safe.
# NOTE(review): add_source presumably returns the matching source "name", or ""
# when no source matches -- confirm against common.functions.
filtered = filtered.assign(
    model=filtered.apply(lambda row: functions.add_source(row, sources), axis=1)
)

# Keep rows that received a non-empty model, drop rows with any missing value
# and renumber the index. Chained (no inplace=True) so nothing is mutated on a
# `.loc` slice, which can raise SettingWithCopyWarning.
filtered_model = (
    filtered
    .loc[filtered["model"] != ""]
    .dropna()
    .reset_index(drop=True)
)
display(filtered_model)

In [None]:
%%time

# Number of non-null ids per subreddit, largest first.
group = (
    filtered_model
    .groupby("subreddit")[["id"]]
    .count()
    .sort_values(by="id", ascending=False)
)

# Bar chart of the per-subreddit counts.
plot = group.plot(kind="bar", figsize=(20, 10), title="Subreddits with most posts", legend=True)
display(plot)

In [None]:
%%time

# Non-null counts of "id" and "subreddit" per model, ordered by the id count (descending).
group = (
    filtered_model
    .groupby(["model"])[["id", "subreddit"]]
    .count()
    .sort_values(by="id", ascending=False)
)

# Bar chart with one bar group per model.
plot_1 = group.plot(kind="bar", figsize=(20, 10), title="Models with most images", legend=True)
display(plot_1)

In [None]:
# Models whose rows are sent to captioning. The names must match the "name"
# entries in `sources` exactly: the previous filter compared against
# "NextDoorGirlDiffusion" (missing "s"), which is not a name defined in
# `sources`, so that clause could never select a row.
SEXY_MODEL_NAMES = [
    "SexyDiffusion",
    "SexyAsianDiffusion",
    "NextDoorGirlsDiffusion",
    "RedHeadDiffusion",
]

# Select the rows for those models, drop rows with any missing value and
# renumber the index. Chained rather than inplace=True, which can raise
# SettingWithCopyWarning when applied to a `.loc` slice.
sexy_model = (
    filtered_model
    .loc[filtered_model["model"].isin(SEXY_MODEL_NAMES)]
    .dropna()
    .reset_index(drop=True)
)
display(sexy_model.shape)
display(sexy_model)

In [None]:
%%time

# Caption every selected record that does not yet have a caption file, and
# store each result as data/caption/<id>.json on the remote filesystem.

CAPTION_DIR = "data/caption"
LOCAL_SCRATCH_FILE = "temp.json"
REQUEST_DELAY_SECONDS = 1  # pause before each captioning request

records = sexy_model.to_dict(orient="records")
current_captions = [item.replace('\n', '') for item in file_system.ls(CAPTION_DIR)]
# Set copy for O(1) membership tests; the list above is kept under its original name.
existing_captions = set(current_captions)
total = len(records)
start = time.time()

for i, elem in enumerate(records, start=1):
    elapsed = time.time() - start
    caption_path = f'{CAPTION_DIR}/{elem["id"]}.json'

    # Already captioned (in an earlier run or earlier in this one): skip.
    if caption_path in existing_captions:
        continue

    time.sleep(REQUEST_DELAY_SECONDS)
    print(f'Processing {elem["id"]} -- {i}/{total} -- {elapsed}')

    # The URL is only needed for records that are actually processed.
    remote_path = file_system.url(elem['path'])
    output = caption.image_analysis(remote_path)

    json_result = output.json_result
    if json_result is None:
        print(f'Error with {elem["id"]} -- Empty Result -- {i}/{total} -- {elapsed}')
        continue

    try:
        # Parse once and reuse the parsed payload for the error check and message.
        error = json.loads(json_result).get('error')
        if error:
            print(f'Error with {elem["id"]} -- Error: {error} -- {i}/{total} -- {elapsed}')
            continue

        with open(LOCAL_SCRATCH_FILE, 'w', encoding='utf-8') as handle:
            handle.write(json_result)

        # Upload only after the `with` block has closed the file: writes are
        # buffered, so uploading while the handle is still open can send an
        # empty or truncated file.
        print(f'Uploading {elem["id"]} -- {i}/{total} -- {elapsed}')
        file_system.upload(LOCAL_SCRATCH_FILE, caption_path)

        # Remember the upload so a repeated id later in `records` is not captioned again.
        existing_captions.add(caption_path)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next record.
        print(f'Error with {elem["id"]} -- Error: {e} -- {i}/{total} -- {elapsed}')
        continue

# Progress noted from a previous run: 'Processing 10cjysv -- 525/17794'

In [None]:
# !jupyter notebook stop