In [1]:
import os

import pandas
from tqdm import tqdm
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
import shutil
from PIL import Image
from tqdm.dask import TqdmCallback
from tqdm.auto import tqdm
from tqdm import tqdm
from shared_code.utility.spark.set_environ import set_azure_env

# Run the project's environment helper up front, before any data is loaded.
# NOTE(review): presumably this exports Azure-related environment settings —
# confirm in shared_code.utility.spark.set_environ.
set_azure_env()


In [2]:
# Global progress reporting for the notebook.
# `cb` is registered so that dask computations report progress under "global-dd".
cb = TqdmCallback(desc="global-dd")
cb.register()
# Enable the pandas `progress_*` methods once, labelled "global-pd".
# (The earlier bare `tqdm.pandas()` call was dropped: this call overrode it immediately.)
tqdm.pandas(desc="global-pd")

In [3]:
class InnerProgressBar(tqdm):
	"""tqdm bar driven by an absolute ``current`` counter.

	Set ``self.current`` to the absolute number of completed units, then call
	:meth:`update_to` to move the bar to that position.
	"""

	def __init__(self, total, desc):
		"""Create the bar.

		:param total: expected number of units of work.
		:param desc: label displayed in front of the bar.
		"""
		# Hand ``total`` to tqdm itself so the first render already knows it,
		# instead of patching the attribute after construction.
		super().__init__(total=total, desc=desc)
		self.current = 0

	def update_to(self):
		"""Move the bar so that its position equals ``self.current``."""
		# tqdm.update() expects an increment, not an absolute position; passing
		# the delta to the current position keeps repeated calls from overshooting.
		self.update(self.current - self.n)

In [4]:
# Directory prefix for the parquet datasets (note the trailing slash).
data_path = "/data/parquet/"
# Location of the processed dataset read by the following cells.
parquet_process_data_path = "".join((data_path, "processed_data.parquet"))

# Creating SD Models
## SexyDiffusion
- HotGirlNextDoor
- sfwpetite
- AmIhotAF
- selfies
- amihot
- SFWNextDoorGirls
- SFWRedheads
- SFWPetite
- Amicute

## CityScapes
- CityPorn

## NatureScapes
- EarthPorn

## Memes
- greentext

The basic training line for a model is:
```json lines
{"file_name": "0001.png", "text": "A cute cat."}
{"file_name": "0002.png", "text": "A cute dog."}
{"file_name": "0003.png", "text": "A cute bird."}
```

For each image we will do the following:
- Caption the image with the caption (the actual caption from the other AI)
- Use the thumbnail version of the image
- Move all the images to a single folder along with the metadata.jsonl file

A training line will look like:
```json lines
{"file_name": "0001.png", "text": "A cute cat."}
```

Create a small GPT model for each SD model. It will be used to generate the captions for the images based on what a user would say, using the following translation:

`<|startoftext|><|model|>SexyDiffusion<|model|><|prompt|>A cute cat<|prompt|><|text|>Foo<|text|><|endoftext|>`

This file will be named and stored in the following format:
`training.txt`

In [5]:
# Load the processed records (including the caption columns) and show them.
print(f"Reading from parquet {parquet_process_data_path} with Updated Thumbnail Captions")
processed_with_captions_more = pandas.read_parquet(path=parquet_process_data_path)
display(processed_with_captions_more)

Reading from parquet /data/parquet/processed_data.parquet with Updated Thumbnail Captions


Unnamed: 0,subreddit,file_name,text,thumbnail_path,thumbnail_exists,original_image,original_image_exists,hash,id,original_caption,thumbnail_caption
0,CityPorn,4emw5uldib9a1.jpg,New York in the fog,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,D:\data\images\CityPorn\4emw5uldib9a1.jpg,True,7a8d96e378c15c8ab8440ac311f12c11,1000cej,cars are parked on the side of the road in the...,"[[cars, NNS], [are, VBP], [parked, VBN], [on, ..."
3,AmIhotAF,4xyb1vgbjb9a1.jpg,Just looking for entertainment,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,True,e554c1ed7ffa2740436ac082068b2824,1000glf,blonde woman with blonde hair and tattoos on h...,"[[blonde, NNS], [woman, NN], [with, IN], [hair..."
4,greentext,3mewbe0wjb9a1.jpg,Anon wants Elon cut,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,D:\data\images\greentext\3mewbe0wjb9a1.jpg,True,1dec3dabb5e46cde01855d06089c287a,1000j1n,a man with a beard and a beard sitting in fron...,"[[man, NN], [with, IN], [beard, NN], [and, CC]..."
5,spaceporn,7s5aafaqkb9a1.jpg,Northern Lights above Lofoten,D:\data\images\spaceporn\thumbnail\7s5aafaqkb9...,True,D:\data\images\spaceporn\7s5aafaqkb9a1.jpg,True,2c39ce1290fba541abd0b004b09da6b2,1000mjs,a view of a view of a large green and purple a...,"[[view, NN], [of, IN], [large, JJ], [green, JJ..."
7,spaceporn,abojw7lqlb9a1.jpg,Viking Lights,D:\data\images\spaceporn\thumbnail\abojw7lqlb9...,True,D:\data\images\spaceporn\abojw7lqlb9a1.jpg,True,0f72de47c69ff50eca5fa3990215f4ac,1000qpd,a scene of a boat is sitting on the shore of a...,"[[scene, NN], [of, IN], [boat, NN], [is, VBZ],..."
...,...,...,...,...,...,...,...,...,...,...,...
16077,spaceporn,abwhhq0w8b9a1.jpg,Polaris to Cassiopeia on a cloudy night.,D:\data\images\spaceporn\thumbnail\abwhhq0w8b9...,True,D:\data\images\spaceporn\abwhhq0w8b9a1.jpg,True,f5973637fc56360c15818ba0ca1f7ffa,zzz6dp,starrdust sky with a few stars and a few stars,"[[starrdust, NN], [sky, NN], [with, IN], [few,..."
16078,spaceporn,7hzipg1bab9a1.jpg,The hunt for habitable ocean worlds beyond our...,D:\data\images\spaceporn\thumbnail\7hzipg1bab9...,True,D:\data\images\spaceporn\7hzipg1bab9a1.jpg,True,5b22bea7582229c1f9b992176a2ca2c6,zzzcn5,a picture taken from the earth's surface of th...,"[[picture, NN], [taken, VBN], [from, IN], [the..."
16079,greentext,bgho6WK.jpg,Anon does a little trolling,D:\data\images\greentext\thumbnail\bgho6WK.jpg,True,D:\data\images\greentext\bgho6WK.jpg,True,df666b8b2ad543c77b3fdba89becda1a,zzzeoi,a screenshote of a text message from a man who...,"[[screenshote, NN], [of, IN], [text, JJ], [mes..."
16081,trippinthroughtime,arCpzQ0.jpg,He didn't shed light on the topic I guess.,D:\data\images\trippinthroughtime\thumbnail\ar...,True,D:\data\images\trippinthroughtime\arCpzQ0.jpg,True,5007b937974ae333022c0c91b795ca09,zzzlbf,a man in a red dress and a woman in a red dress,"[[man, NN], [in, IN], [red, JJ], [dress, NN], ..."


In [6]:
print("Filtering Subreddits with Images By original_caption")
# Keep rows whose original caption is present and is not the literal "bruh"
# (presumably a failure marker written by the captioning step — confirm).
filtered_captions = processed_with_captions_more.loc[
	lambda frame: frame["original_caption"].ne("bruh") & frame["original_caption"].notna()
]

# Number of surviving rows per subreddit
filtered_captions_display = (
	filtered_captions
	.groupby("subreddit")
	.size()
	.reset_index(name="count")
)

display(filtered_captions_display.sort_values("count", ascending=False))
print(f"Total Records {filtered_captions_display['count'].sum()}")

Filtering Subreddits with Images By original_caption


Unnamed: 0,subreddit,count
3,EarthPorn,1883
2,CityPorn,1795
13,memes,1180
16,spaceporn,1101
4,Faces,1028
9,SFWRedheads,984
8,SFWNextDoorGirls,848
12,greentext,713
17,trippinthroughtime,478
15,sfwpetite,288


Total Records 11514


In [7]:
print("Filtering Subreddits with Images By thumbnail_caption")
# Second filtering pass: keep rows whose thumbnail caption is present and is
# not the literal "bruh".
# The mask is computed on `filtered_captions` itself. Building it from the
# unfiltered `processed_with_captions_more` frame (as before) yields a boolean
# Series with a different index, which pandas has to re-align to the frame
# being filtered.
filtered_captions_by_thumbnail = filtered_captions.loc[
	lambda frame: (frame["thumbnail_caption"] != "bruh") & frame["thumbnail_caption"].notna()
]

# Number of surviving rows per subreddit
filtered_captions_by_thumbnail_display = filtered_captions_by_thumbnail.groupby("subreddit").size().reset_index(
	name="count")
display(filtered_captions_by_thumbnail_display.sort_values("count", ascending=False))
print(f"Total Records {filtered_captions_by_thumbnail_display['count'].sum()}")

Filtering Subreddits with Images By thumbnail_caption


  result = libops.scalar_compare(x.ravel(), y, op)


Unnamed: 0,subreddit,count
3,EarthPorn,1883
2,CityPorn,1795
13,memes,1180
16,spaceporn,1101
4,Faces,1028
9,SFWRedheads,984
8,SFWNextDoorGirls,848
12,greentext,713
17,trippinthroughtime,478
15,sfwpetite,288


Total Records 11514


In [8]:
# (model name, subreddits feeding that model) pairs, in export order.
# NOTE(review): "MemeDiffusion" lists "memes" twice. This is harmless for the
# membership test used during export, but confirm whether a second subreddit
# was intended.
_model_subreddits = (
	("CityScapes", ["CityPorn"]),
	("NatureScapes", ["EarthPorn"]),
	("CosmicDiffusion", ["spaceporn"]),
	("memes", ["memes"]),
	("MemeDiffusion", ["memes", "memes"]),
	(
		"SexyDiffusion",
		[
			"sfwpetite",
			"selfies",
			"Amicute",
			"amihot",
			"AmIhotAF",
			"HotGirlNextDoor",
			"SFWNextDoorGirls",
			"SFWRedheads",
		],
	),
)

# One {"name": ..., "data": [...]} record per model
sources = [
	{"name": model_name, "data": subreddits}
	for model_name, subreddits in _model_subreddits
]

In [9]:
# Tabular view of the model -> subreddit mapping, shown after the raw list
sources_df = pandas.DataFrame(sources)
display(sources, sources_df)

[{'name': 'CityScapes', 'data': ['CityPorn']},
 {'name': 'NatureScapes', 'data': ['EarthPorn']},
 {'name': 'CosmicDiffusion', 'data': ['spaceporn']},
 {'name': 'memes', 'data': ['memes']},
 {'name': 'MemeDiffusion', 'data': ['memes', 'memes']},
 {'name': 'SexyDiffusion',
  'data': ['sfwpetite',
   'selfies',
   'Amicute',
   'amihot',
   'AmIhotAF',
   'HotGirlNextDoor',
   'SFWNextDoorGirls',
   'SFWRedheads']}]

Unnamed: 0,name,data
0,CityScapes,[CityPorn]
1,NatureScapes,[EarthPorn]
2,CosmicDiffusion,[spaceporn]
3,memes,[memes]
4,MemeDiffusion,"[memes, memes]"
5,SexyDiffusion,"[sfwpetite, selfies, Amicute, amihot, AmIhotAF..."


In [10]:
# NOTE: Disabled dask-based prototype of the per-model export; kept for reference only.
# from tqdm import dask
# import pandas
# import dask.dataframe as dd
# import shutil
# from PIL import Image
# import pandas
# def map_source_record(source_record: dict, filtered_records: pandas.DataFrame):
# 	model_name: str = source_record.get('name')
# 	sub_list: [str] = source_record.get('data')
# 	out_path_for_model = os.path.join("out", model_name)
# 	os.makedirs(out_path_for_model, exist_ok=True)
# 	ddf = dd.from_pandas(filtered_records, npartitions=12)
# 	with TqdmCallback(desc="map-inner-records"):
# 		ddf.apply(lambda x: map_inner_record(x, sub_list, out_path_for_model), axis=1, meta=('str', object))
# 		ddf.compute()
#
#
# def map_inner_record(inner_record: dict, sub_list: [str], out_path_for_model: str):
# 	new_out_records = []
#
# 	sub_reddit = inner_record.get('subreddit')
# 	if sub_reddit in sub_list:
# 		file_name = inner_record.get("original_image")
# 		text = inner_record.get("text")
# 		text_2 = inner_record.get("thumbnail_caption")
#
# 		valid_image_path = inner_record.get("original_image")
# 		try:
# 			temp_opened_image: Image = Image.open(valid_image_path)
# 			image_size = temp_opened_image.size
# 			temp_opened_image.close()
# 		except Exception as e:
# 			print(f"Invalid Image {valid_image_path}, {e}")
# 			return None
#
# 		shutil.copy(valid_image_path, out_path_for_model)
# 		new_out_record = {"file_name": file_name, "text": [text, text_2]}
# 		new_out_records.append(new_out_record)
#
# 	final_out_records = pandas.DataFrame(new_out_records)
# 	final_out_records.to_json("metadata.jsonl", orient="records", lines=True)
# 	shutil.copy2("metadata.jsonl", out_path_for_model)

In [11]:
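# NOTE: Disabled driver for the dask-based prototype (`map_source_record`); kept for reference only.
# sources_df = pandas.DataFrame(sources)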
# sources_ddf = dd.from_pandas(sources_df, npartitions=12)
# with TqdmCallback(desc="run-map-source-record"):
# 	sources_ddf.apply(lambda x: map_source_record(x, filtered_captions_by_thumbnail), axis=1, meta=('str', object))
# 	sources_ddf.compute()

In [12]:
import shutil

# Remove whatever a previous export left behind so the next run starts clean.
stale_export_dir, stale_archive = "out", "out.zip"

if os.path.exists(stale_export_dir):
	shutil.rmtree(stale_export_dir)

if os.path.exists(stale_archive):
	print("Removing Old File")
	os.remove(stale_archive)

Removing Old File


In [13]:
import shutil

from PIL import Image

# Export one training folder per model under out/<model name>/ containing the
# readable images of that model's subreddits plus a metadata.jsonl that pairs
# each file name with its texts.

# Not used by the loop below; kept so the name stays defined as before.
sources_df = pandas.DataFrame(sources)

# Repeat the cleanup so this cell can be re-run on its own without mixing in
# files from an earlier export.
if os.path.exists("out"):
	shutil.rmtree("out")

if os.path.exists("out.zip"):
	print("Removing Old File")
	os.remove("out.zip")

for item in sources:
	out_dir = os.path.join("out", item['name'])
	os.makedirs(out_dir, exist_ok=True)

	# Select this model's rows once, instead of testing every record of the
	# full frame against the subreddit list for every model.
	model_rows = filtered_captions_by_thumbnail[
		filtered_captions_by_thumbnail["subreddit"].isin(item['data'])
	]

	new_records = []
	for record in model_rows.to_dict(orient='records'):
		valid_image = record.get("original_image")
		try:
			# A successful open is the validity check; the context manager
			# releases the file handle even when a later step fails.
			with Image.open(valid_image):
				pass
			shutil.copy(valid_image, out_dir)
		except Exception as e:
			# Best effort: report the unreadable/missing image (with the
			# reason) and move on to the next record.
			print(f"Invalid Image {valid_image}: {e}")
		else:
			new_records.append({
				"file_name": record.get("file_name"),
				"text": [record.get("text"), record.get("original_caption")],
			})

	# Write the metadata straight into the model folder rather than creating
	# it in the working directory and moving it afterwards.
	out_records = pandas.DataFrame(new_records)
	out_records.to_json(os.path.join(out_dir, "metadata.jsonl"), orient="records", lines=True)



In [14]:
# Bundle the export directory into out.zip; entries keep their "out/" prefix.
# The previous `tar -a` shell call relied on the tar binary choosing the zip
# format from the file suffix, which not every tar implementation supports;
# shutil.make_archive does the same job portably from Python.
archive_path = shutil.make_archive("out", "zip", root_dir=".", base_dir="out")

In [15]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

# Authenticate against Google Drive. LocalWebserverAuth() is interactive: it
# opens a browser for the OAuth consent page, so this cell cannot run unattended.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()

# Drive client used by the upload cell below.
drive = GoogleDrive(gauth)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=507548422904-d8bsqsniuihb6a5uh7hesjo5s7brhs5u.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=online&response_type=code

Authentication successful.


In [16]:
# Describe the upload target: a file titled training_images.zip placed under
# the given parent folder id.
upload_metadata = {
	'title': 'training_images.zip',
	'parents': [{'id': '1-XhPy_rDB1TrStfC2NUO8q2pN0k1Njni'}],
}
gfile = drive.CreateFile(upload_metadata)

# Attach the local archive as the file content and push it to Drive
gfile.SetContentFile('out.zip')
gfile.Upload()

# Report the title/id of the uploaded file, followed by its full metadata
print('title: %s, id: %s' % (gfile['title'], gfile['id']))
print(gfile)

title: training_images.zip, id: 1i0qWGBw5NYOHopge9kka5qLAj8yD62ri
GoogleDriveFile({'title': 'training_images.zip', 'parents': [{'kind': 'drive#parentReference', 'id': '1-XhPy_rDB1TrStfC2NUO8q2pN0k1Njni', 'selfLink': 'https://www.googleapis.com/drive/v2/files/1i0qWGBw5NYOHopge9kka5qLAj8yD62ri/parents/1-XhPy_rDB1TrStfC2NUO8q2pN0k1Njni', 'parentLink': 'https://www.googleapis.com/drive/v2/files/1-XhPy_rDB1TrStfC2NUO8q2pN0k1Njni', 'isRoot': False}], 'mimeType': 'application/x-zip-compressed', 'kind': 'drive#file', 'id': '1i0qWGBw5NYOHopge9kka5qLAj8yD62ri', 'etag': '"MTY3ODUwMzIxNzYxMQ"', 'selfLink': 'https://www.googleapis.com/drive/v2/files/1i0qWGBw5NYOHopge9kka5qLAj8yD62ri', 'webContentLink': 'https://drive.google.com/uc?id=1i0qWGBw5NYOHopge9kka5qLAj8yD62ri&export=download', 'alternateLink': 'https://drive.google.com/file/d/1i0qWGBw5NYOHopge9kka5qLAj8yD62ri/view?usp=drivesdk', 'embedLink': 'https://drive.google.com/file/d/1i0qWGBw5NYOHopge9kka5qLAj8yD62ri/preview?usp=drivesdk', 'iconLink'