In [None]:
import os

import pandas

import dask.dataframe as dd

from shared_code.utility.spark.set_environ import set_azure_env

set_azure_env()

from shared_code.utility.storage.table import TableAdapter

In [None]:
from tqdm import tqdm
from tqdm.dask import TqdmCallback

# Register a progress bar (labelled "global") for dask computations.
cb = TqdmCallback(desc="global")
cb.register()

# Hook tqdm into pandas, using the same "global" label.
tqdm.pandas(desc="global")

In [None]:
from tqdm import tqdm

class InnerProgressBar(tqdm):
	"""tqdm progress bar that also carries a ``current`` counter.

	The bar is constructed with only a description; ``total`` is assigned
	after the base-class initialiser has run.
	"""

	def __init__(self, total, desc):
		"""Create the bar.

		Args:
			total: Value stored on ``self.total``.
			desc: Description forwarded to ``tqdm.__init__``.
		"""
		super().__init__(desc=desc)
		self.total = total
		# Read by update_to(); nothing inside this class changes it after init.
		self.current = 0

	def update_to(self):
		"""Call ``self.update`` with the present value of ``self.current``."""
		# NOTE(review): tqdm.update(n) takes an increment, so every call adds
		# `current` to the bar's position. If `current` is meant to be an absolute
		# position this over-counts -- confirm the intended semantics with callers.
		self.update(self.current)

In [None]:
# Directory holding the parquet datasets, and the processed file inside it.
data_path = "/data/parquet/"
parquet_process_data_path = f"{data_path}processed_data.parquet"

# Creating SD Models
## SexyDiffusion
- HotGirlNextDoor
- sfwpetite
- AmIhotAF
- selfies
- amihot
- SFWNextDoorGirls
- SFWRedheads
- SFWPetite
- Amicute

## CityScapes
- CityPorn

## NatureScapes
- EarthPorn

## Memes
- greentext

The basic training line for a model is:
```json lines
{"file_name": "0001.png", "text": "A cute cat."}
{"file_name": "0002.png", "text": "A cute dog."}
{"file_name": "0003.png", "text": "A cute bird."}
```

For each image we will do the following:
- Caption the image with the caption (the actual caption from the other AI)
- Use the thumbnail version of the image
- Move all the images to a single folder along with the metadata.jsonl file

A training line will look like:
```json lines
{"file_name": "0001.png", "text": "A cute cat."}
```

Create a small GPT model for each SD model. It will be used to generate captions for the images based on what a user would say, using the following template:

`<|startoftext|><|model|>SexyDiffusion<|model|><|prompt|>A cute cat<|prompt|><|text|>Foo<|text|><|endoftext|>`

This file will be named and stored in the following format:
`training.txt`

In [None]:
# Load the processed dataset from the configured parquet file and show it.
print(f"Reading from parquet {parquet_process_data_path} with Updated Thumbnail Captions")
processed_with_captions_more = pandas.read_parquet(path=parquet_process_data_path)
display(processed_with_captions_more)

In [None]:
print("Filtering Subreddits with Images By original_caption")
# Keep only rows whose original_caption is present and is not the literal "bruh".
filtered_captions = processed_with_captions_more.loc[
	lambda frame: frame["original_caption"].notna() & frame["original_caption"].ne("bruh")
]

# Row count per subreddit, used for both the table and the total below.
filtered_captions_display = (
	filtered_captions
	.groupby("subreddit")
	.size()
	.reset_index(name="count")
)

display(filtered_captions_display.sort_values(by="count", ascending=False))
print(f"Total Records {filtered_captions_display['count'].sum()}")

In [None]:
print("Filtering Subreddits with Images By thumbnail_caption")
# Build the mask from filtered_captions itself. The previous version derived it from
# the unfiltered frame, which only worked through implicit index alignment and made
# pandas warn that the boolean Series key was being reindexed.
# isna() and isnull() are aliases, so a single notna() covers the missing-value check.
filtered_captions_by_thumbnail = filtered_captions[
	filtered_captions["thumbnail_caption"].notna()
	& (filtered_captions["thumbnail_caption"] != "bruh")
]

# Row count per subreddit after both caption filters have been applied.
filtered_captions_by_thumbnail_display = (
	filtered_captions_by_thumbnail
	.groupby("subreddit")
	.size()
	.reset_index(name="count")
)
display(filtered_captions_by_thumbnail_display.sort_values("count", ascending=False))
print(f"Total Records {filtered_captions_by_thumbnail_display['count'].sum()}")

In [None]:
# (model name, subreddits feeding that model), in the order the models are exported.
_MODEL_SUBREDDITS = (
	("CityScapes", ["CityPorn"]),
	("NatureScapes", ["EarthPorn"]),
	("memes", ["greentext"]),
	(
		"SexyDiffusion",
		[
			"HotGirlNextDoor",
			"sfwpetite",
			"AmIhotAF",
			"selfies",
			"amihot",
			"SFWNextDoorGirls",
			"SFWRedheads",
			"SFWPetite",
			"Amicute",
		],
	),
)

# One {"name": ..., "data": [...]} entry per model.
sources = [
	{"name": model_name, "data": subreddits}
	for model_name, subreddits in _MODEL_SUBREDDITS
]

In [None]:
import shutil
from PIL import Image


def _is_readable_image(path):
	"""Return True when PIL can open ``path`` and report its size, else False."""
	try:
		# The context manager closes the file handle even if reading the size fails.
		with Image.open(path) as image:
			image.size
	except Exception:
		# Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the loop.
		return False
	return True


# For every model: copy its readable thumbnails into out/<model name>/ and write a
# metadata.jsonl next to them with one {"file_name", "text"} line per copied image.
for item in sources:
	new_records = []
	out_dir = os.path.join("out", item['name'])
	os.makedirs(out_dir, exist_ok=True)

	# Select this model's rows once instead of scanning the whole frame per model.
	item_rows = filtered_captions_by_thumbnail[
		filtered_captions_by_thumbnail["subreddit"].isin(item['data'])
	]
	for record in item_rows.to_dict(orient='records'):
		valid_image = record.get("thumbnail_path")
		if not _is_readable_image(valid_image):
			print(f"Invalid Image {valid_image}")
			continue

		shutil.copy(valid_image, out_dir)
		# NOTE(review): the copied file keeps the basename of thumbnail_path, while the
		# metadata line uses the record's "file_name" -- confirm the two always match.
		out_record = {"file_name": record.get("file_name"), "text": record.get("original_caption")}
		new_records.append(out_record)

	out_records = pandas.DataFrame(new_records)
	# Write straight into out_dir: the previous write-then-shutil.move raised
	# "Destination path already exists" whenever the notebook was re-run.
	out_records.to_json(os.path.join(out_dir, "metadata.jsonl"), orient="records", lines=True)

In [None]:
# Delete any archive left over from a previous run so a fresh one is built.
if os.path.exists("out.zip"):
    print("Removing Old File")
    # os.remove works on every platform, unlike shelling out to `rm`.
    os.remove("out.zip")

In [None]:
import shutil

# Pack the "out" directory into out.zip (entries are stored under "out/...").
# Done in Python because whether `tar -a` produces a real zip file depends on
# which tar implementation is installed.
shutil.make_archive("out", "zip", root_dir=".", base_dir="out")