In [1]:
%%time

import os.path
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm import tqdm
from tqdm.dask import TqdmCallback

from shared_code.utility.spark.set_environ import set_azure_env

set_azure_env()

from shared_code.utility.storage.azure_file_storage import AzureFileStorageAdapter

fs_adapter = AzureFileStorageAdapter('data')
file_system = fs_adapter.get_file_storage()

CPU times: total: 4.44 s
Wall time: 10.2 s


In [2]:
from typing import List
from dataclasses import dataclass


@dataclass
class DataSources:
	name: str
	data: List[str]

	@staticmethod
	def from_dict(obj: dict) -> 'DataSources':
		_name = obj.get("name")
		_data = [x for x in obj.get("data")]
		return DataSources(_name, _data)

In [3]:
def add_source(x: object, source_list) -> str:
	for source in source_list:
		if x['subreddit'] in source['data']:
			return source['name']
	return ""

In [4]:
def add_compressed_data(x: object) -> str:
	try:
		data = open(x['path'], 'rb').read()
		return data
	except Exception as e:
		print(e)
		return ""

In [5]:
import shutil

def move_file(x: object) -> str:
	try:
		target_dir = x['name']
		source_path = x['path']
		image_name = x['image_name']
		os.makedirs(target_dir, exist_ok=True)
		dest_path = f"{target_dir}\\{image_name}"
		if os.path.exists(dest_path):
			return dest_path
		if source_path is None:
			return ""
		if source_path == "":
			return ""

		if os.path.exists(source_path):
			shutil.copy2(source_path, dest_path)
			return dest_path
		else:
			return ""
	except Exception as e:
		# print(e)
		return ""

In [6]:
%%time

all_data = pd.read_parquet("data/processed_raw_data.parquet", engine='pyarrow', filesystem=file_system)

display(all_data)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,128z5ns,celebrities,A-Sexy-Name,Rachel Weisz,cars are parked on the side of the road in the...,f09b1c0c5dea50c934081a04c83d4d33,/r/celebrities/comments/128z5ns/rachel_weisz/,https://i.redd.it/nv80afddacra1.jpg,f09b1c0c5dea50c934081a04c83d4d33.jpg,D:\data\images\celebrities\f09b1c0c5dea50c9340...,D:\data\images\celebrities\thumbnail\f09b1c0c5...,True,False
1,11yk8db,prettyasiangirls,MisoShiru520,Library,blonde woman with blonde hair and tattoos on h...,9d5b7331b941ecf3fdbc8ad322303825,/r/prettyasiangirls/comments/11yk8db/library/,https://i.redd.it/ht4d2wuusapa1.jpg,9d5b7331b941ecf3fdbc8ad322303825.jpg,D:\data\images\prettyasiangirls\9d5b7331b941ec...,D:\data\images\prettyasiangirls\thumbnail\9d5b...,True,False
2,11hfioi,KoreanHotties,Majestic_Painter8660,Underboob bikini,a man with a beard and a beard sitting in fron...,a154a56b1b87cf3a0bc93600c86e4585,/r/KoreanHotties/comments/11hfioi/underboob_bi...,https://i.redd.it/cimr17q84jla1.jpg,a154a56b1b87cf3a0bc93600c86e4585.jpg,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,D:\data\images\KoreanHotties\thumbnail\a154a56...,True,False
3,1000mjs,spaceporn,MorningStar_imangi,Northern Lights above Lofoten,a view of a view of a large green and purple a...,2c39ce1290fba541abd0b004b09da6b2,/r/spaceporn/comments/1000mjs/northern_lights_...,https://i.redd.it/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,/data/images/spaceporn/7s5aafaqkb9a1.jpg,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,True,False
4,1000qpd,spaceporn,MorningStar_imangi,Viking Lights,a scene of a boat is sitting on the shore of a...,0f72de47c69ff50eca5fa3990215f4ac,/r/spaceporn/comments/1000qpd/viking_lights/,https://i.redd.it/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,/data/images/spaceporn/abojw7lqlb9a1.jpg,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29597,11oz7fi,mildlypenis,Extra-Lie489,Lenis,there is a small piece of metal with two small...,7604c3e55ce8dbf238520683d6e71f89,/r/mildlypenis/comments/11oz7fi/lenis/,https://i.redd.it/pskj3q0ao8na1.jpg,7604c3e55ce8dbf238520683d6e71f89.jpg,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,D:\data\images\mildlypenis\thumbnail\7604c3e55...,True,False
29598,11oz73d,mildlypenis,DeimosLV,My friend is making a stuffed animal and this ...,a close up of a blue knitted bagueth on a chair,389f8aa0fb191bdeba08c8d2cfff8ce2,/r/mildlypenis/comments/11oz73d/my_friend_is_m...,https://i.redd.it/gemqhq67o8na1.jpg,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,D:\data\images\mildlypenis\thumbnail\389f8aa0f...,True,False
29599,11oyzm3,mildlypenis,Mymemesareswell,This weird Mickey keychain hand,someone is holding a key chain with a mickey m...,87563ac0c8ddeeb0ce47bc915b99308b,/r/mildlypenis/comments/11oyzm3/this_weird_mic...,https://i.redd.it/vrivt8rom8na1.jpg,87563ac0c8ddeeb0ce47bc915b99308b.jpg,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,D:\data\images\mildlypenis\thumbnail\87563ac0c...,True,False
29600,11oyvx1,mildlypenis,LucindaBobinda,Parking lot pp,a close up of a black and white photo of a bla...,88b596be6de959bd29eed4e8946bb476,/r/mildlypenis/comments/11oyvx1/parking_lot_pp/,https://i.redd.it/w1vebizwl8na1.jpg,88b596be6de959bd29eed4e8946bb476.jpg,D:\data\images\mildlypenis\88b596be6de959bd29e...,D:\data\images\mildlypenis\thumbnail\88b596be6...,True,False


CPU times: total: 719 ms
Wall time: 2.02 s


In [7]:
def check_if_thumb_and_image_exist(x: object):
	try:
		path = x['path']
		if path is None:
			return False
		thumb_nail_path = x['thumbnail_path']
		if thumb_nail_path is None:
			return False
		full_image_exists = os.path.exists(path)
		thumbnail_image_exists = os.path.exists(thumb_nail_path)
		return full_image_exists and thumbnail_image_exists
	except Exception as e:
		print("Error In check_if_thumb_and_image_exist: " + str(e))
		return False

In [8]:
all_data['exists'] = all_data.apply(lambda x: check_if_thumb_and_image_exist(x), axis=1)

display(all_data)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,128z5ns,celebrities,A-Sexy-Name,Rachel Weisz,cars are parked on the side of the road in the...,f09b1c0c5dea50c934081a04c83d4d33,/r/celebrities/comments/128z5ns/rachel_weisz/,https://i.redd.it/nv80afddacra1.jpg,f09b1c0c5dea50c934081a04c83d4d33.jpg,D:\data\images\celebrities\f09b1c0c5dea50c9340...,D:\data\images\celebrities\thumbnail\f09b1c0c5...,True,False
1,11yk8db,prettyasiangirls,MisoShiru520,Library,blonde woman with blonde hair and tattoos on h...,9d5b7331b941ecf3fdbc8ad322303825,/r/prettyasiangirls/comments/11yk8db/library/,https://i.redd.it/ht4d2wuusapa1.jpg,9d5b7331b941ecf3fdbc8ad322303825.jpg,D:\data\images\prettyasiangirls\9d5b7331b941ec...,D:\data\images\prettyasiangirls\thumbnail\9d5b...,True,False
2,11hfioi,KoreanHotties,Majestic_Painter8660,Underboob bikini,a man with a beard and a beard sitting in fron...,a154a56b1b87cf3a0bc93600c86e4585,/r/KoreanHotties/comments/11hfioi/underboob_bi...,https://i.redd.it/cimr17q84jla1.jpg,a154a56b1b87cf3a0bc93600c86e4585.jpg,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,D:\data\images\KoreanHotties\thumbnail\a154a56...,True,False
3,1000mjs,spaceporn,MorningStar_imangi,Northern Lights above Lofoten,a view of a view of a large green and purple a...,2c39ce1290fba541abd0b004b09da6b2,/r/spaceporn/comments/1000mjs/northern_lights_...,https://i.redd.it/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,/data/images/spaceporn/7s5aafaqkb9a1.jpg,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,True,False
4,1000qpd,spaceporn,MorningStar_imangi,Viking Lights,a scene of a boat is sitting on the shore of a...,0f72de47c69ff50eca5fa3990215f4ac,/r/spaceporn/comments/1000qpd/viking_lights/,https://i.redd.it/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,/data/images/spaceporn/abojw7lqlb9a1.jpg,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29597,11oz7fi,mildlypenis,Extra-Lie489,Lenis,there is a small piece of metal with two small...,7604c3e55ce8dbf238520683d6e71f89,/r/mildlypenis/comments/11oz7fi/lenis/,https://i.redd.it/pskj3q0ao8na1.jpg,7604c3e55ce8dbf238520683d6e71f89.jpg,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,D:\data\images\mildlypenis\thumbnail\7604c3e55...,True,False
29598,11oz73d,mildlypenis,DeimosLV,My friend is making a stuffed animal and this ...,a close up of a blue knitted bagueth on a chair,389f8aa0fb191bdeba08c8d2cfff8ce2,/r/mildlypenis/comments/11oz73d/my_friend_is_m...,https://i.redd.it/gemqhq67o8na1.jpg,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,D:\data\images\mildlypenis\thumbnail\389f8aa0f...,True,False
29599,11oyzm3,mildlypenis,Mymemesareswell,This weird Mickey keychain hand,someone is holding a key chain with a mickey m...,87563ac0c8ddeeb0ce47bc915b99308b,/r/mildlypenis/comments/11oyzm3/this_weird_mic...,https://i.redd.it/vrivt8rom8na1.jpg,87563ac0c8ddeeb0ce47bc915b99308b.jpg,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,D:\data\images\mildlypenis\thumbnail\87563ac0c...,True,False
29600,11oyvx1,mildlypenis,LucindaBobinda,Parking lot pp,a close up of a black and white photo of a bla...,88b596be6de959bd29eed4e8946bb476,/r/mildlypenis/comments/11oyvx1/parking_lot_pp/,https://i.redd.it/w1vebizwl8na1.jpg,88b596be6de959bd29eed4e8946bb476.jpg,D:\data\images\mildlypenis\88b596be6de959bd29e...,D:\data\images\mildlypenis\thumbnail\88b596be6...,True,False


In [9]:
%%time

sources = [
	{"name": "CityDiffusion", "data": ["CityPorn"]},
	{"name": "NatureDiffusion", "data": ["EarthPorn"]},
	{"name": "CosmicDiffusion", "data": ["spaceporn"]},
	{"name": "ITAPDiffusion", "data": ["itookapicture"]},
	{"name": "MemeDiffusion", "data": ["memes"]},
	{"name": "TTTDiffusion", "data": ["trippinthroughtime"]},
	{"name": "WallStreetDiffusion", "data": ["wallstreetbets"]},
	{"name": "SexyDiffusion", "data": ["selfies", "Amicute", "amihot", "AmIhotAF", "HotGirlNextDoor"]},
	{"name": "FatSquirrelDiffusion", "data": ["fatsquirrelhate"]},
	{"name": "CelebrityDiffusion", "data": ["celebrities"]},
	{"name": "OldLadyDiffusion", "data": ["oldladiesbakingpies"]},
	{"name": "SWFPetite", "data": ["sfwpetite"]},
	{"name": "RedHeadDiffusion", "data": ["SFWRedheads"]},
	{"name": "NextDoorGirlsDiffusion", "data": ["SFWNextDoorGirls"]},
	{"name": "SexyAsianDiffusion", "data": ["realasians", "KoreanHotties", "prettyasiangirls"]},
	{"name": "AsianOfficeGirlDiffusion", "data": ["AsianOfficeLady"]},
	{"name": "MildlyPenisDiffusion", "data": ["mildlypenis"]}
]
sources_df = pd.DataFrame.from_records(sources)

CPU times: total: 0 ns
Wall time: 15.6 ms


In [10]:
%%time

smaller_exportable_df = pd.DataFrame(
	data=[all_data['path'], all_data['image_name'], all_data['caption'], all_data['title'], all_data['subreddit'], all_data['exists']],
	index=['path', 'image_name', 'caption', 'title', 'subreddit', 'exists']).T

with ProgressBar():
	smaller_exportable_df['name'] = smaller_exportable_df.apply(lambda x: add_source(x, sources), axis=1)

display(smaller_exportable_df)

Unnamed: 0,path,image_name,caption,title,subreddit,exists,name
0,D:\data\images\celebrities\f09b1c0c5dea50c9340...,f09b1c0c5dea50c934081a04c83d4d33.jpg,cars are parked on the side of the road in the...,Rachel Weisz,celebrities,True,CelebrityDiffusion
1,D:\data\images\prettyasiangirls\9d5b7331b941ec...,9d5b7331b941ecf3fdbc8ad322303825.jpg,blonde woman with blonde hair and tattoos on h...,Library,prettyasiangirls,True,SexyAsianDiffusion
2,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,a154a56b1b87cf3a0bc93600c86e4585.jpg,a man with a beard and a beard sitting in fron...,Underboob bikini,KoreanHotties,True,SexyAsianDiffusion
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,True,CosmicDiffusion
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,True,CosmicDiffusion
...,...,...,...,...,...,...,...
29597,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,7604c3e55ce8dbf238520683d6e71f89.jpg,there is a small piece of metal with two small...,Lenis,mildlypenis,True,MildlyPenisDiffusion
29598,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,a close up of a blue knitted bagueth on a chair,My friend is making a stuffed animal and this ...,mildlypenis,True,MildlyPenisDiffusion
29599,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,87563ac0c8ddeeb0ce47bc915b99308b.jpg,someone is holding a key chain with a mickey m...,This weird Mickey keychain hand,mildlypenis,True,MildlyPenisDiffusion
29600,D:\data\images\mildlypenis\88b596be6de959bd29e...,88b596be6de959bd29eed4e8946bb476.jpg,a close up of a black and white photo of a bla...,Parking lot pp,mildlypenis,True,MildlyPenisDiffusion


CPU times: total: 5.3 s
Wall time: 5.39 s


In [11]:
foo = smaller_exportable_df.where(smaller_exportable_df['exists'] == True).dropna(how='all').reset_index(drop=True)
display(foo)

Unnamed: 0,path,image_name,caption,title,subreddit,exists,name
0,D:\data\images\celebrities\f09b1c0c5dea50c9340...,f09b1c0c5dea50c934081a04c83d4d33.jpg,cars are parked on the side of the road in the...,Rachel Weisz,celebrities,True,CelebrityDiffusion
1,D:\data\images\prettyasiangirls\9d5b7331b941ec...,9d5b7331b941ecf3fdbc8ad322303825.jpg,blonde woman with blonde hair and tattoos on h...,Library,prettyasiangirls,True,SexyAsianDiffusion
2,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,a154a56b1b87cf3a0bc93600c86e4585.jpg,a man with a beard and a beard sitting in fron...,Underboob bikini,KoreanHotties,True,SexyAsianDiffusion
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,True,CosmicDiffusion
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,True,CosmicDiffusion
...,...,...,...,...,...,...,...
26394,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,7604c3e55ce8dbf238520683d6e71f89.jpg,there is a small piece of metal with two small...,Lenis,mildlypenis,True,MildlyPenisDiffusion
26395,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,a close up of a blue knitted bagueth on a chair,My friend is making a stuffed animal and this ...,mildlypenis,True,MildlyPenisDiffusion
26396,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,87563ac0c8ddeeb0ce47bc915b99308b.jpg,someone is holding a key chain with a mickey m...,This weird Mickey keychain hand,mildlypenis,True,MildlyPenisDiffusion
26397,D:\data\images\mildlypenis\88b596be6de959bd29e...,88b596be6de959bd29eed4e8946bb476.jpg,a close up of a black and white photo of a bla...,Parking lot pp,mildlypenis,True,MildlyPenisDiffusion


In [12]:
display(foo.groupby(by="name").count())

Unnamed: 0_level_0,path,image_name,caption,title,subreddit,exists
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,2024,2024,2024,2024,2024,2024
AsianOfficeGirlDiffusion,991,991,991,991,991,991
CelebrityDiffusion,435,435,435,435,435,435
CityDiffusion,3172,3172,3172,3172,3172,3172
CosmicDiffusion,1940,1940,1940,1940,1940,1940
FatSquirrelDiffusion,1002,1002,1002,1002,1002,1002
ITAPDiffusion,3144,3144,3144,3144,3144,3144
MemeDiffusion,3162,3162,3162,3162,3162,3162
MildlyPenisDiffusion,823,823,823,823,823,823
NatureDiffusion,3786,3786,3786,3786,3786,3786


In [13]:
%%time

smaller_exportable_df = smaller_exportable_df.where(lambda x: x['exists'] == True).dropna(how='all').reset_index(drop=True)

# with ProgressBar():
with tqdm(total=len(smaller_exportable_df), desc="Copy"):
	smaller_exportable_df['image_data'] = smaller_exportable_df.apply(lambda x: move_file(x))
	# dask_frame = dd.from_pandas(smaller_exportable_df, npartitions=10)
	# smaller_exportable_df['image_data'] = dask_frame.apply(lambda x: move_file(x), meta=('str', object), axis=1).compute()

display(smaller_exportable_df)

Copy:   0%|          | 0/26399 [00:00<?, ?it/s]


Unnamed: 0,path,image_name,caption,title,subreddit,exists,name,image_data
0,D:\data\images\celebrities\f09b1c0c5dea50c9340...,f09b1c0c5dea50c934081a04c83d4d33.jpg,cars are parked on the side of the road in the...,Rachel Weisz,celebrities,True,CelebrityDiffusion,
1,D:\data\images\prettyasiangirls\9d5b7331b941ec...,9d5b7331b941ecf3fdbc8ad322303825.jpg,blonde woman with blonde hair and tattoos on h...,Library,prettyasiangirls,True,SexyAsianDiffusion,
2,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,a154a56b1b87cf3a0bc93600c86e4585.jpg,a man with a beard and a beard sitting in fron...,Underboob bikini,KoreanHotties,True,SexyAsianDiffusion,
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,True,CosmicDiffusion,
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,True,CosmicDiffusion,
...,...,...,...,...,...,...,...,...
26394,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,7604c3e55ce8dbf238520683d6e71f89.jpg,there is a small piece of metal with two small...,Lenis,mildlypenis,True,MildlyPenisDiffusion,
26395,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,a close up of a blue knitted bagueth on a chair,My friend is making a stuffed animal and this ...,mildlypenis,True,MildlyPenisDiffusion,
26396,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,87563ac0c8ddeeb0ce47bc915b99308b.jpg,someone is holding a key chain with a mickey m...,This weird Mickey keychain hand,mildlypenis,True,MildlyPenisDiffusion,
26397,D:\data\images\mildlypenis\88b596be6de959bd29e...,88b596be6de959bd29eed4e8946bb476.jpg,a close up of a black and white photo of a bla...,Parking lot pp,mildlypenis,True,MildlyPenisDiffusion,


CPU times: total: 188 ms
Wall time: 203 ms


In [14]:
%%time

filtered_again = smaller_exportable_df.where(lambda x: x['image_data'] != "").dropna(how='all').reset_index(drop=True)
filtered_again = filtered_again.where(lambda x: x['name'] != "").dropna(how='all').reset_index(drop=True)
display(filtered_again)

Unnamed: 0,path,image_name,caption,title,subreddit,exists,name,image_data
0,D:\data\images\celebrities\f09b1c0c5dea50c9340...,f09b1c0c5dea50c934081a04c83d4d33.jpg,cars are parked on the side of the road in the...,Rachel Weisz,celebrities,True,CelebrityDiffusion,
1,D:\data\images\prettyasiangirls\9d5b7331b941ec...,9d5b7331b941ecf3fdbc8ad322303825.jpg,blonde woman with blonde hair and tattoos on h...,Library,prettyasiangirls,True,SexyAsianDiffusion,
2,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,a154a56b1b87cf3a0bc93600c86e4585.jpg,a man with a beard and a beard sitting in fron...,Underboob bikini,KoreanHotties,True,SexyAsianDiffusion,
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,True,CosmicDiffusion,
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,True,CosmicDiffusion,
...,...,...,...,...,...,...,...,...
24370,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,7604c3e55ce8dbf238520683d6e71f89.jpg,there is a small piece of metal with two small...,Lenis,mildlypenis,True,MildlyPenisDiffusion,
24371,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,a close up of a blue knitted bagueth on a chair,My friend is making a stuffed animal and this ...,mildlypenis,True,MildlyPenisDiffusion,
24372,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,87563ac0c8ddeeb0ce47bc915b99308b.jpg,someone is holding a key chain with a mickey m...,This weird Mickey keychain hand,mildlypenis,True,MildlyPenisDiffusion,
24373,D:\data\images\mildlypenis\88b596be6de959bd29e...,88b596be6de959bd29eed4e8946bb476.jpg,a close up of a black and white photo of a bla...,Parking lot pp,mildlypenis,True,MildlyPenisDiffusion,


CPU times: total: 297 ms
Wall time: 314 ms


In [15]:
%%time

final = filtered_again
display(final)

grouped = final.groupby('name')
groupings = [grouped.get_group(x) for x in grouped.groups]

for group in groupings:
	group.to_parquet(f"data/curated/{group['name'].iloc[0]}.parquet")

Unnamed: 0,path,image_name,caption,title,subreddit,exists,name,image_data
0,D:\data\images\celebrities\f09b1c0c5dea50c9340...,f09b1c0c5dea50c934081a04c83d4d33.jpg,cars are parked on the side of the road in the...,Rachel Weisz,celebrities,True,CelebrityDiffusion,
1,D:\data\images\prettyasiangirls\9d5b7331b941ec...,9d5b7331b941ecf3fdbc8ad322303825.jpg,blonde woman with blonde hair and tattoos on h...,Library,prettyasiangirls,True,SexyAsianDiffusion,
2,D:\data\images\KoreanHotties\a154a56b1b87cf3a0...,a154a56b1b87cf3a0bc93600c86e4585.jpg,a man with a beard and a beard sitting in fron...,Underboob bikini,KoreanHotties,True,SexyAsianDiffusion,
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,True,CosmicDiffusion,
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,True,CosmicDiffusion,
...,...,...,...,...,...,...,...,...
24370,D:\data\images\mildlypenis\7604c3e55ce8dbf2385...,7604c3e55ce8dbf238520683d6e71f89.jpg,there is a small piece of metal with two small...,Lenis,mildlypenis,True,MildlyPenisDiffusion,
24371,D:\data\images\mildlypenis\389f8aa0fb191bdeba0...,389f8aa0fb191bdeba08c8d2cfff8ce2.jpg,a close up of a blue knitted bagueth on a chair,My friend is making a stuffed animal and this ...,mildlypenis,True,MildlyPenisDiffusion,
24372,D:\data\images\mildlypenis\87563ac0c8ddeeb0ce4...,87563ac0c8ddeeb0ce47bc915b99308b.jpg,someone is holding a key chain with a mickey m...,This weird Mickey keychain hand,mildlypenis,True,MildlyPenisDiffusion,
24373,D:\data\images\mildlypenis\88b596be6de959bd29e...,88b596be6de959bd29eed4e8946bb476.jpg,a close up of a black and white photo of a bla...,Parking lot pp,mildlypenis,True,MildlyPenisDiffusion,


CPU times: total: 422 ms
Wall time: 626 ms


In [16]:
lines = []
training_lines = []
for record in final.to_dict(orient='records'):
	subreddit = record['subreddit']
	name = record['name']
	prompt = record['title']
	caption = record['caption']
	line = f"<|startoftext|><|model|>{name}<|prompt|>{prompt}<|text|>{caption}<|endoftext|>" + "\n"
	lines.append(line)
with open("training.txt", "wb") as f:
	for line in lines:
		f.write(line.encode("utf-8"))

In [17]:
data_paths = os.listdir("data/curated")

dfs = []

for path in data_paths:
	foo = os.path.join("data", "curated", path)
	df = pd.read_parquet(foo)
	dfs.append(df)

final = pd.concat(dfs)
display(final)

Unnamed: 0,path,image_name,caption,title,subreddit,exists,name,image_data
21778,D:\data\images\AsianOfficeLady\8ee1550e8f3782c...,8ee1550e8f3782c6a5e50284f50d2a20.jpg,a woman in a suit case standing in front of a ...,#956,AsianOfficeLady,True,AsianOfficeGirlDiffusion,
21779,D:\data\images\AsianOfficeLady\22cef92d2735421...,22cef92d27354215a01cc5946aacc697.jpg,a woman in a white top and black skirt standin...,#955,AsianOfficeLady,True,AsianOfficeGirlDiffusion,
21780,D:\data\images\AsianOfficeLady\98655e2a68ab925...,98655e2a68ab9251d77f5e88025661a1.jpg,a woman in a black suit and a white shirt is s...,#954,AsianOfficeLady,True,AsianOfficeGirlDiffusion,
21781,D:\data\images\AsianOfficeLady\c056ef3a0a9f345...,c056ef3a0a9f34538c29d803cb3be077.jpg,a woman in a suit and glasses is posing for a ...,#953,AsianOfficeLady,True,AsianOfficeGirlDiffusion,
21782,D:\data\images\AsianOfficeLady\303198c51c0729f...,303198c51c0729f2e3ce2a6ce85e4dbb.jpg,woman in a blue dress with a red dress and a b...,#952,AsianOfficeLady,True,AsianOfficeGirlDiffusion,
...,...,...,...,...,...,...,...,...
20541,D:\data\images\wallstreetbets\d68443295aac5154...,d68443295aac515459b13ad786e9649f.jpg,a screenshote of a cell phone showing a messag...,ii illegal short selling… 🤔,wallstreetbets,True,WallStreetDiffusion,
20542,D:\data\images\wallstreetbets\959579cc440ec31a...,959579cc440ec31a1cbb228fa3e4a3af.jpg,a screenshote of a cell phone showing a screen...,II illegal short selling 🤔,wallstreetbets,True,WallStreetDiffusion,
20543,D:\data\images\wallstreetbets\07a6f67b65c0452c...,07a6f67b65c0452c98aa59c1b57fc75d.jpg,a screenshote of a dashboard with a bunch of d...,BAER expected to begin trading in Nasdaq on Ja...,wallstreetbets,True,WallStreetDiffusion,
20544,D:\data\images\wallstreetbets\03c75f7d48f92b73...,03c75f7d48f92b73c5fd14ad5ee46b83.jpg,a black and white photo of a man in a suit and...,Thought of you guys,wallstreetbets,True,WallStreetDiffusion,
