In [None]:
%%time


import json
import pandas
import pandas as pd
from tqdm import tqdm

from common.schemas.pyarrow_schema import tagging_schema
from common.storage.azure_file_storage import AzureFileStorageAdapter
from common.captioning.azure_descriptions import AzureCaption
from common.schemas.pyarrow_schema import schema

from common.data_frame_functions.functions import Functions

# Register tqdm's pandas integration (adds the `progress_*` methods to
# DataFrame/Series) with a shared progress-bar label.
tqdm.pandas(desc="Progress")

# Storage handle shared by every cell that reads from or writes to remote storage.
# NOTE(review): 'data' is presumably the container/share name -- confirm against
# AzureFileStorageAdapter.
storage_adapter = AzureFileStorageAdapter('data')
file_system = storage_adapter.get_file_storage()

# Helper objects used by later cells: row-level data-frame functions and the
# captioning client, which is handed the same storage handle.
functions: Functions = Functions()
caption: AzureCaption = AzureCaption(file_system)

In [None]:
%%time

# Load the curated dataset through the shared `file_system` handle and index it
# by "id" while keeping "id" available as a regular column (drop=False).
curated_data = pandas.read_parquet(
	"data/parquet/back.parquet", engine="pyarrow", filesystem=file_system
).set_index("id", drop=False)

# Keep only accepted rows, restricted to the columns named by `schema`.
# `.eq(True)` is the same comparison as `== True`: missing or non-boolean values
# in "accept" compare as False instead of breaking the mask.
accepted_mask = curated_data["accept"].eq(True)

# Drop incomplete rows without mutating a slice in place. The explicit copy
# detaches `filtered` from `curated_data`, so later column assignments on it do
# not raise SettingWithCopyWarning.
filtered = curated_data.loc[accepted_mask, schema.names].dropna().copy()

display(filtered.shape)

display(filtered)

In [None]:
%%time

# (model name, subreddits feeding that model) pairs, in display order.
_model_subreddits = (
	("CityDiffusion", ["CityPorn"]),
	("NatureDiffusion", ["EarthPorn"]),
	("CosmicDiffusion", ["spaceporn"]),
	("ITAPDiffusion", ["itookapicture"]),
	("MemeDiffusion", ["memes"]),
	("TTTDiffusion", ["trippinthroughtime"]),
	("WallStreetDiffusion", ["wallstreetbets"]),
	(
		"SexyDiffusion",
		[
			"selfies",
			"Amicute",
			"amihot",
			"AmIhotAF",
			"HotGirlNextDoor",
			"sexygirls",
			"PrettyGirls",
			"gentlemanboners",
			"hotofficegirls",
			"tightdresses",
			"DLAH",
		],
	),
	("FatSquirrelDiffusion", ["fatsquirrelhate"]),
	("CelebrityDiffusion", ["celebrities"]),
	("OldLadyDiffusion", ["oldladiesbakingpies"]),
	("SWFPetite", ["sfwpetite"]),
	("SFWMilfs", ["cougars_and_milfs_sfw"]),
	("RedHeadDiffusion", ["SFWRedheads"]),
	("NextDoorGirlsDiffusion", ["SFWNextDoorGirls"]),
	(
		"SexyAsianDiffusion",
		["realasians", "KoreanHotties", "prettyasiangirls", "AsianOfficeLady", "AsianInvasion"],
	),
	("MildlyPenisDiffusion", ["mildlypenis"]),
	("CandleDiffusion", ["bathandbodyworks"]),
)

# One record per model: {"name": <model name>, "data": <list of subreddits>}.
sources = [
	{"name": model_name, "data": subreddit_names}
	for model_name, subreddit_names in _model_subreddits
]

# Tabular view of the same records (columns: "name", "data").
sources_df = pd.DataFrame.from_records(sources)

In [None]:
# Label every row with a model via `functions.add_source`, which receives the
# row and the `sources` definitions.
# NOTE(review): presumably returns the matching source's "name", or "" when the
# row's subreddit belongs to no source -- confirm against Functions.add_source.
model_labels = filtered.apply(lambda row: functions.add_source(row, sources), axis=1)

# `.assign` builds a new frame instead of writing a column onto a possible slice.
labelled = filtered.assign(model=model_labels)

# Keep only rows that received a non-empty model label. `dropna()` returns a new
# frame, so its result has to be kept -- calling it without using the result
# leaves rows with missing values (including a missing model) in place.
filtered = labelled.loc[labelled["model"] != ""].dropna()
display(filtered)

In [None]:
%%time

# Count rows per subreddit (non-null "id" values), then order the subreddits
# from most to fewest rows.
rows_per_subreddit = filtered[["id", "subreddit"]].groupby("subreddit").count()
group = rows_per_subreddit.sort_values(by="id", ascending=False)

# Bar chart of the counts; the returned axes object is displayed explicitly.
subreddit_chart_options = dict(figsize=(20, 10), title="Subreddits with most posts", legend=True)
plot = group.plot.bar(**subreddit_chart_options)
display(plot)

In [None]:
%%time

# Count rows per model (non-null values in each remaining column), then order
# the models by their "id" count, largest first.
rows_per_model = filtered[["id", "model", "subreddit"]].groupby(["model"]).count()
group = rows_per_model.sort_values(by="id", ascending=False)

# Bar chart of the counts; the returned axes object is displayed explicitly.
model_chart_options = dict(figsize=(20, 10), title="Models with most images", legend=True)
plot_1 = group.plot.bar(**model_chart_options)
display(plot_1)

In [None]:
# Models whose rows are selected for the thumbnail step. The names must match
# the "name" values in `sources` exactly: the source is called
# "NextDoorGirlsDiffusion" (with an "s"), so comparing against
# "NextDoorGirlDiffusion" would silently select none of its rows.
selected_models = [
	"SexyDiffusion",
	"SexyAsianDiffusion",
	"NextDoorGirlsDiffusion",
	"RedHeadDiffusion",
]

# `isin` replaces the chain of `==` comparisons joined with `|`. `dropna()`
# returns a new frame; with `inplace=True` it returns None and mutates a slice
# of `filtered`, so the non-mutating form is used and its result is kept.
bar = filtered.loc[filtered["model"].isin(selected_models)].dropna()
display(bar.shape)
display(bar)

In [None]:
%%time

import os
import tempfile
from io import BytesIO

from PIL import Image

# Every thumbnail is resized to exactly this size (aspect ratio is not preserved).
THUMBNAIL_SIZE = (512, 512)

# Only "id" and "path" are needed per image, so the other columns are not
# materialised into Python dicts.
records = bar[["id", "path"]].to_dict(orient="records")

# Stage each thumbnail in a scratch directory that is removed afterwards, rather
# than leaving a "temp.jpg" behind in the working directory.
with tempfile.TemporaryDirectory() as scratch_dir:
	out_path = os.path.join(scratch_dir, "temp.jpg")

	for elem in tqdm(records, total=len(records), desc="Progress"):
		fs_bytes = file_system.read_bytes(elem["path"])

		# Context managers close the image handles even if decoding, resizing
		# or saving raises.
		with Image.open(BytesIO(fs_bytes)) as img:
			# JPEG cannot store alpha or palette data; saving such modes raises
			# OSError, so anything other than RGB / greyscale is converted first.
			rgb = img if img.mode in ("RGB", "L") else img.convert("RGB")
			# Image.LANCZOS is the named constant for resample filter value 1.
			with rgb.resize(THUMBNAIL_SIZE, Image.LANCZOS) as thumbnail:
				thumbnail.save(out_path)

		# Publish the staged file under the image's id.
		file_system.upload(out_path, f'data/image/thumbnail/{elem["id"]}.jpg')