In [2]:
%%time

import sys
sys.path.append('../../')

from adlfs import AzureBlobFileSystem


import time


import os
import json
import pandas
import pandas as pd
from tqdm import tqdm

from common.schemas.pyarrow_schema import tagging_schema
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.captioning.azure_descriptions import AzureCaption
from common.schemas.pyarrow_schema import schema
from common.functions.functions import Functions

# Register tqdm's progress hooks on pandas; progress bars are labelled "Progress".
tqdm.pandas(desc="Progress")
# File-system handle returned by the project's storage adapter (annotated as an
# adlfs AzureBlobFileSystem). It is passed to pandas.read_parquet and used for
# ls/url/upload calls in later cells.
# NOTE(review): 'data' is presumably the container / root name - confirm against
# AzureFileStorageAdapter.
file_system: AzureBlobFileSystem = AzureFileStorageAdapter('data').get_file_storage()

# Project helper object; a later cell calls functions.add_source(row, sources).
functions: Functions = Functions()

# Captioning client constructed with the file-system handle above. The captioning
# loop further down builds its own AzureCaption instance per image.
caption: AzureCaption = AzureCaption(file_system)

ModuleNotFoundError: No module named 'azure.cognitiveservices'

In [None]:
%%time

# Load the curated posts from the parquet file on the remote file system and index
# them by "id". drop=False keeps "id" available as a regular column as well.
# set_index is chained (not inplace) so re-running the cell always starts from a
# freshly loaded frame.
curated_data = pandas.read_parquet(
    "data/parquet/back.parquet", engine="pyarrow", filesystem=file_system
).set_index("id", drop=False)

# Keep only rows whose "accept" flag equals True, restricted to the columns named
# in `schema`, and drop rows containing any missing value.
# .dropna() is chained instead of calling dropna(inplace=True) on the .loc
# selection: the result is an independent frame, so adding columns to `filtered`
# in later cells does not raise SettingWithCopyWarning.
filtered = curated_data.loc[curated_data["accept"] == True, schema.names].dropna()

display(filtered.shape)

display(filtered)

In [None]:
%%time

# Each target model is fed by one or more subreddits. The table is written as
# (model name, subreddit list) pairs and expanded into the
# {"name": ..., "data": [...]} records that the rest of the notebook consumes.
sources = [
    {"name": model_name, "data": subreddits}
    for model_name, subreddits in (
        ("CityDiffusion", ["CityPorn"]),
        ("NatureDiffusion", ["EarthPorn"]),
        ("CosmicDiffusion", ["spaceporn"]),
        ("ITAPDiffusion", ["itookapicture"]),
        ("MemeDiffusion", ["memes"]),
        ("TTTDiffusion", ["trippinthroughtime"]),
        ("WallStreetDiffusion", ["wallstreetbets"]),
        ("SexyDiffusion", ["selfies", "Amicute", "amihot", "AmIhotAF", "HotGirlNextDoor"]),
        ("FatSquirrelDiffusion", ["fatsquirrelhate"]),
        ("CelebrityDiffusion", ["celebrities"]),
        ("OldLadyDiffusion", ["oldladiesbakingpies"]),
        ("SWFPetite", ["sfwpetite"]),
        ("SFWMilfs", ["cougars_and_milfs_sfw"]),
        ("RedHeadDiffusion", ["SFWRedheads"]),
        ("NextDoorGirlsDiffusion", ["SFWNextDoorGirls"]),
        ("SexyAsianDiffusion", ["realasians", "KoreanHotties", "prettyasiangirls", "AsianOfficeLady", "AsianInvasion", "AesPleasingAsianGirls"]),
        ("MildlyPenisDiffusion", ["mildlypenis"]),
        ("PrettyGirlDiffusion", ["sexygirls", "PrettyGirls", "gentlemanboners", "hotofficegirls", "tightdresses", "DLAH"]),
        ("CandleDiffusion", ["bathandbodyworks"]),
    )
]
# Tabular view of the same records: one row per model, columns "name" and "data".
sources_df = pd.DataFrame.from_records(sources)

In [None]:
# Tag every row with a "model" value computed by functions.add_source(row, sources),
# then keep only rows whose model is not the empty string and that contain no
# missing values.
#
# Changes from the previous version of this cell:
#   * The bare `foo.dropna()` statement discarded its result and so dropped
#     nothing; the dropna is now part of the chain and actually applied.
#   * The column is added with .assign() on a new frame instead of mutating the
#     frame produced by an earlier cell, so the cell can be re-run safely.
#   * The throwaway `foo` variable is gone.
filtered = (
    filtered
    .assign(model=filtered.apply(lambda row: functions.add_source(row, sources), axis=1))
    .loc[lambda frame: frame["model"] != ""]
    .dropna()
)
display(filtered)

In [None]:
%%time

# Count rows per subreddit (non-null "id" values) and order from most to least.
group = (
    filtered[["id", "subreddit"]]
    .groupby("subreddit")
    .count()
    .sort_values(by="id", ascending=False)
)
# Bar chart of those counts; the object returned by the plot call is displayed.
plot = group.plot.bar(title="Subreddits with most posts", figsize=(20, 10), legend=True)
display(plot)

In [None]:
%%time

# Count rows per model and order from most to least by the "id" count.
group = (
    filtered[["id", "model", "subreddit"]]
    .groupby(["model"])
    .count()
    .sort_values(by="id", ascending=False)
)
# Bar chart of the per-model counts; the object returned by the plot call is displayed.
plot_1 = group.plot.bar(title="Models with most images", figsize=(20, 10), legend=True)
display(plot_1)

In [None]:
# Subset of rows belonging to the models that will be captioned.
#
# Fixes:
#   * The filter previously compared against "NextDoorGirlDiffusion", but the
#     model is declared in `sources` as "NextDoorGirlsDiffusion", so those rows
#     could never match. NOTE(review): this assumes the "model" column holds the
#     "name" values from `sources` - confirm against Functions.add_source.
#   * `dropped = bar.dropna(inplace=True)` always bound None and mutated a
#     selection of `filtered` in place; dropna is now chained on the selection.
bar = filtered.loc[
    filtered["model"].isin(
        ["SexyDiffusion", "SexyAsianDiffusion", "NextDoorGirlsDiffusion", "RedHeadDiffusion"]
    )
].dropna()
display(bar.shape)
display(bar)

In [None]:
%%time

import datetime
from common.captioning.azure_descriptions import AzureCaption

# Caption every record in `bar` that does not yet have a caption JSON under
# data/caption/, and upload the result as data/caption/<id>.json.

# Pause after each image-analysis request.
# NOTE(review): presumably a rate-limit guard for the captioning service - confirm
# the quota before changing it.
ANALYSIS_PAUSE_SECONDS = 12

records = bar.to_dict(orient="records")
total = len(records)

# Existing caption paths, as a set so the per-record "already processed?" check
# is a constant-time lookup instead of a list scan.
current_captions = {item.replace('\n', '') for item in file_system.ls("data/caption")}

start_time = datetime.datetime.now()
for i, elem in enumerate(records, start=1):
    # Elapsed wall-clock time since the loop started, used in the status messages.
    time_current = f"{(datetime.datetime.now() - start_time).total_seconds()/60} minutes"
    caption_path = f'data/caption/{elem["id"]}.json'

    if caption_path in current_captions:
        display(f'Skipping {elem["id"]} -- Already Processed -- {i}/{total}...{time_current}', clear=True)
        continue

    display(f'Processing {elem["id"]} -- {i}/{total}...{time_current}', clear=True)
    # The URL is only resolved for records that are actually analysed.
    remote_path = file_system.url(elem['path'])
    # NOTE(review): a new AzureCaption is built per image, as before; it could
    # presumably be created once outside the loop - confirm it holds no
    # per-request state.
    caption: AzureCaption = AzureCaption(file_system)
    output = caption.image_analysis(remote_path)
    time.sleep(ANALYSIS_PAUSE_SECONDS)

    json_result = output.json_result
    if json_result is None:
        display(f'Error with {elem["id"]} -- Empty Result -- {i}/{total}...{time_current}', clear=True)
        continue

    try:
        # Parse once; an "error" entry in the payload means the analysis failed.
        error = json.loads(json_result).get('error')
        if error:
            display(f'Error with {elem["id"]} -- Error: {error} -- {i}/{total}...{time_current}', clear=True)
            continue

        # Stage the raw JSON in a local temp file; the context manager closes the
        # handle even if the write raises.
        with open('temp.json', 'w', encoding='utf-8') as handle:
            handle.write(json_result)

        display(f'Uploading {elem["id"]} -- {i}/{total}...{time_current}', clear=True)
        file_system.upload('temp.json', caption_path)
        # Remember the upload so a repeated id later in `records` is skipped.
        current_captions.add(caption_path)
    except Exception as e:
        # Best effort: report the failure for this record and move on to the next.
        display(f'Error with {elem["id"]} -- Error: {e} -- {i}/{total}', clear=True)
        continue

In [None]:
# !jupyter notebook stop