In [20]:
%%time

import os.path
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm import tqdm
from tqdm.dask import TqdmCallback

from shared_code.utility.spark.set_environ import set_azure_env

# Report progress of every dask computation through a tqdm bar labelled "global".
cb = TqdmCallback(desc="global")
cb.register()

# Patch pandas with the `progress_apply` family. One call is enough: a second
# `tqdm.pandas(...)` call would simply replace the first registration.
tqdm.pandas(desc="global")

# Also register dask's own text progress bar for all computations.
# NOTE(review): together with the TqdmCallback above, progress is reported by
# two separate bars for each computation - confirm both are wanted.
pbar = ProgressBar()
pbar.register()

# Project helper; presumably exports the Azure settings needed by the storage
# adapter used below - verify against shared_code.utility.spark.set_environ.
set_azure_env()

from shared_code.utility.storage.azure_file_storage import AzureFileStorageAdapter

# Build the project's Azure file-storage adapter for 'data' and obtain the
# filesystem object that is later handed to pandas via `filesystem=`.
# NOTE(review): 'data' is presumably the share/container name - confirm.
fs_adapter = AzureFileStorageAdapter('data')
file_system = fs_adapter.get_file_storage()

CPU times: total: 0 ns
Wall time: 13.1 ms


In [21]:
from typing import List
from dataclasses import dataclass


@dataclass
class DataSources:
	"""Pairs a ``name`` with a list of string ``data`` entries."""
	name: str
	data: List[str]

	@staticmethod
	def from_dict(obj: dict) -> 'DataSources':
		"""Create a ``DataSources`` from a mapping with ``name`` and ``data`` keys.

		``name`` is passed through unchanged (``None`` when the key is absent);
		the iterable stored under ``data`` is copied into a new list.
		"""
		return DataSources(obj.get("name"), list(obj.get("data")))

In [22]:
def add_source(x: object, source_list) -> str:
	"""Return the ``name`` of the first source whose ``data`` contains ``x['subreddit']``.

	``source_list`` is scanned in order; the empty string is returned when no
	entry matches.
	"""
	# Lazy generator: evaluation stops at the first matching entry.
	matching_names = (entry['name'] for entry in source_list if x['subreddit'] in entry['data'])
	return next(matching_names, "")

In [23]:
def add_compressed_data(x: object) -> "bytes | str":
	"""Read the file referenced by ``x['path']`` and return its raw bytes.

	Args:
		x: Row-like object supporting ``x['path']``.

	Returns:
		The file contents as ``bytes``. When the file cannot be read for any
		reason, the error is printed and the empty string ``""`` is returned
		as a sentinel so callers can filter such rows out.
	"""
	try:
		# The context manager guarantees the handle is closed, also when
		# reading fails part-way.
		with open(x['path'], 'rb') as image_file:
			return image_file.read()
	except Exception as e:
		# Deliberately best effort: report the problem and keep going.
		print(e)
		return ""

In [24]:
%%time
# Load the processed post table through the Azure-backed filesystem object.
all_data = pd.read_parquet(
	"data/processed_raw_data.parquet",
	filesystem=file_system,
	engine='pyarrow',
)
display(all_data)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,1000cej,CityPorn,OtterlyFoxy,New York in the fog,cars are parked on the side of the road in the...,7a8d96e378c15c8ab8440ac311f12c11,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,https://i.redd.it/4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,D:\data\images\CityPorn\4emw5uldib9a1.jpg,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,False
1,1000glf,AmIhotAF,toolate_sharkbait,Just looking for entertainment,blonde woman with blonde hair and tattoos on h...,e554c1ed7ffa2740436ac082068b2824,/r/AmIhotAF/comments/1000glf/just_looking_for_...,https://i.redd.it/4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,False
2,1000j1n,greentext,trent8051,Anon wants Elon cut,a man with a beard and a beard sitting in fron...,1dec3dabb5e46cde01855d06089c287a,/r/greentext/comments/1000j1n/anon_wants_elon_...,https://i.redd.it/3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,D:\data\images\greentext\3mewbe0wjb9a1.jpg,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,False
3,1000mjs,spaceporn,MorningStar_imangi,Northern Lights above Lofoten,a view of a view of a large green and purple a...,2c39ce1290fba541abd0b004b09da6b2,/r/spaceporn/comments/1000mjs/northern_lights_...,https://i.redd.it/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,/data/images/spaceporn/7s5aafaqkb9a1.jpg,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,True,False
4,1000qpd,spaceporn,MorningStar_imangi,Viking Lights,a scene of a boat is sitting on the shore of a...,0f72de47c69ff50eca5fa3990215f4ac,/r/spaceporn/comments/1000qpd/viking_lights/,https://i.redd.it/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,/data/images/spaceporn/abojw7lqlb9a1.jpg,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19763,ywjp42,itookapicture,leflore4,ITAP - Of coach teaching young men to stand st...,there is a man in a black hat and a baseball c...,f9c5e68a2e6082b0827469253807647e,/r/itookapicture/comments/ywjp42/itap_of_coach...,https://i.redd.it/8e4ku12iq80a1.jpg,f9c5e68a2e6082b0827469253807647e.jpg,D:\data\images\itookapicture\f9c5e68a2e6082b08...,D:\data\images\itookapicture\thumbnail\f9c5e68...,True,False
19764,ywjli7,itookapicture,Naj183,ITAP of a Turkey Vulture spreading its wings,a close up of a large bird with its wings open...,a37c752a3ecdd654cd402280b0a3de7f,/r/itookapicture/comments/ywjli7/itap_of_a_tur...,https://i.redd.it/35zgwn967a0a1.jpg,a37c752a3ecdd654cd402280b0a3de7f.jpg,D:\data\images\itookapicture\a37c752a3ecdd654c...,D:\data\images\itookapicture\thumbnail\a37c752...,True,False
19765,ywjew1,itookapicture,ExaminationHopeful36,ITAP of my cat looking out of the window,a close up of a black cat sitting on a window ...,53f99cb828e71b8a3172fea41d1f2259,/r/itookapicture/comments/ywjew1/itap_of_my_ca...,https://i.redd.it/orn7pcse5a0a1.jpg,53f99cb828e71b8a3172fea41d1f2259.jpg,D:\data\images\itookapicture\53f99cb828e71b8a3...,D:\data\images\itookapicture\thumbnail\53f99cb...,True,False
19766,ywjeai,itookapicture,retrospektor,ITAP of an empty beer bottle,a close up of a candle lit up in a candlelight,9d5acf1a16798238d606fc3646d16f75,/r/itookapicture/comments/ywjeai/itap_of_an_em...,https://i.redd.it/oyuscvpg5a0a1.jpg,9d5acf1a16798238d606fc3646d16f75.jpg,D:\data\images\itookapicture\9d5acf1a16798238d...,D:\data\images\itookapicture\thumbnail\9d5acf1...,True,False


CPU times: total: 1.06 s
Wall time: 2.36 s


In [25]:
%%time

# Keep only rows flagged `exists` whose caption is not the empty string.
# Boolean masks are used instead of `where(...).dropna(how='all')`, which first
# blanks whole rows to NaN and thereby upcasts the column dtypes.
image_exists = all_data['exists'].eq(True)  # a missing flag compares as False
has_caption = all_data['caption'].ne("")
filtered_on_exist = all_data.loc[image_exists & has_caption].reset_index(drop=True)
display(filtered_on_exist)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,1000cej,CityPorn,OtterlyFoxy,New York in the fog,cars are parked on the side of the road in the...,7a8d96e378c15c8ab8440ac311f12c11,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,https://i.redd.it/4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,D:\data\images\CityPorn\4emw5uldib9a1.jpg,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,False
1,1000glf,AmIhotAF,toolate_sharkbait,Just looking for entertainment,blonde woman with blonde hair and tattoos on h...,e554c1ed7ffa2740436ac082068b2824,/r/AmIhotAF/comments/1000glf/just_looking_for_...,https://i.redd.it/4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,False
2,1000j1n,greentext,trent8051,Anon wants Elon cut,a man with a beard and a beard sitting in fron...,1dec3dabb5e46cde01855d06089c287a,/r/greentext/comments/1000j1n/anon_wants_elon_...,https://i.redd.it/3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,D:\data\images\greentext\3mewbe0wjb9a1.jpg,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,False
3,1000mjs,spaceporn,MorningStar_imangi,Northern Lights above Lofoten,a view of a view of a large green and purple a...,2c39ce1290fba541abd0b004b09da6b2,/r/spaceporn/comments/1000mjs/northern_lights_...,https://i.redd.it/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,/data/images/spaceporn/7s5aafaqkb9a1.jpg,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,True,False
4,1000qpd,spaceporn,MorningStar_imangi,Viking Lights,a scene of a boat is sitting on the shore of a...,0f72de47c69ff50eca5fa3990215f4ac,/r/spaceporn/comments/1000qpd/viking_lights/,https://i.redd.it/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,/data/images/spaceporn/abojw7lqlb9a1.jpg,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19763,ywjp42,itookapicture,leflore4,ITAP - Of coach teaching young men to stand st...,there is a man in a black hat and a baseball c...,f9c5e68a2e6082b0827469253807647e,/r/itookapicture/comments/ywjp42/itap_of_coach...,https://i.redd.it/8e4ku12iq80a1.jpg,f9c5e68a2e6082b0827469253807647e.jpg,D:\data\images\itookapicture\f9c5e68a2e6082b08...,D:\data\images\itookapicture\thumbnail\f9c5e68...,True,False
19764,ywjli7,itookapicture,Naj183,ITAP of a Turkey Vulture spreading its wings,a close up of a large bird with its wings open...,a37c752a3ecdd654cd402280b0a3de7f,/r/itookapicture/comments/ywjli7/itap_of_a_tur...,https://i.redd.it/35zgwn967a0a1.jpg,a37c752a3ecdd654cd402280b0a3de7f.jpg,D:\data\images\itookapicture\a37c752a3ecdd654c...,D:\data\images\itookapicture\thumbnail\a37c752...,True,False
19765,ywjew1,itookapicture,ExaminationHopeful36,ITAP of my cat looking out of the window,a close up of a black cat sitting on a window ...,53f99cb828e71b8a3172fea41d1f2259,/r/itookapicture/comments/ywjew1/itap_of_my_ca...,https://i.redd.it/orn7pcse5a0a1.jpg,53f99cb828e71b8a3172fea41d1f2259.jpg,D:\data\images\itookapicture\53f99cb828e71b8a3...,D:\data\images\itookapicture\thumbnail\53f99cb...,True,False
19766,ywjeai,itookapicture,retrospektor,ITAP of an empty beer bottle,a close up of a candle lit up in a candlelight,9d5acf1a16798238d606fc3646d16f75,/r/itookapicture/comments/ywjeai/itap_of_an_em...,https://i.redd.it/oyuscvpg5a0a1.jpg,9d5acf1a16798238d606fc3646d16f75.jpg,D:\data\images\itookapicture\9d5acf1a16798238d...,D:\data\images\itookapicture\thumbnail\9d5acf1...,True,False


CPU times: total: 406 ms
Wall time: 471 ms


In [26]:
%%time

# Model name -> subreddits whose posts are assigned to that model.
subreddits_by_model = {
	"CityDiffusion": ["CityPorn"],
	"NatureDiffusion": ["EarthPorn"],
	"CosmicDiffusion": ["spaceporn"],
	"ITAPDiffusion": ["itookapicture"],
	"MemeDiffusion": ["memes","trippinthroughtime"],
	"SexyDiffusion": ["sfwpetite","selfies","Amicute","amihot","AmIhotAF","HotGirlNextDoor"],
	"FatSquirrelDiffusion": ["fatsquirrelhate"],
	"RedHeadDiffusion": ["SFWRedheads"],
	"NextDoorGirlsDiffusion": ["SFWNextDoorGirls"],
}
# One {"name", "data"} record per model, in declaration order.
sources = [
	{"name": model_name, "data": subreddit_names}
	for model_name, subreddit_names in subreddits_by_model.items()
]
sources_df = pd.DataFrame.from_records(sources)
display(sources_df)

Unnamed: 0,name,data
0,CityDiffusion,[CityPorn]
1,NatureDiffusion,[EarthPorn]
2,CosmicDiffusion,[spaceporn]
3,ITAPDiffusion,[itookapicture]
4,MemeDiffusion,"[memes, trippinthroughtime]"
5,SexyDiffusion,"[sfwpetite, selfies, Amicute, amihot, AmIhotAF..."
6,FatSquirrelDiffusion,[fatsquirrelhate]
7,RedHeadDiffusion,[SFWRedheads]
8,NextDoorGirlsDiffusion,[SFWNextDoorGirls]


CPU times: total: 15.6 ms
Wall time: 53 ms


In [27]:
%%time

# Select the export columns directly (the thumbnail path is exposed as 'path')
# instead of building a frame from a list of Series and transposing it, which
# pushes every value through an object-dtype intermediate.
smaller_exportable_df = (
	filtered_on_exist[['thumbnail_path', 'image_name', 'caption', 'title', 'subreddit']]
	.rename(columns={'thumbnail_path': 'path'})
)

# Tag every row with its model name ("" when the subreddit is not listed in
# `sources`). `progress_apply` renders its own tqdm bar, so no dask
# ProgressBar context is needed around this pure-pandas step.
smaller_exportable_df['name'] = smaller_exportable_df.progress_apply(lambda x: add_source(x, sources), axis=1)

display(smaller_exportable_df)

global: 100%|██████████| 19768/19768 [00:01<00:00, 10731.80it/s]


Unnamed: 0,path,image_name,caption,title,subreddit,name
0,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion
1,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion
2,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,3mewbe0wjb9a1.jpg,a man with a beard and a beard sitting in fron...,Anon wants Elon cut,greentext,
3,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion
4,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion
...,...,...,...,...,...,...
19763,D:\data\images\itookapicture\thumbnail\f9c5e68...,f9c5e68a2e6082b0827469253807647e.jpg,there is a man in a black hat and a baseball c...,ITAP - Of coach teaching young men to stand st...,itookapicture,ITAPDiffusion
19764,D:\data\images\itookapicture\thumbnail\a37c752...,a37c752a3ecdd654cd402280b0a3de7f.jpg,a close up of a large bird with its wings open...,ITAP of a Turkey Vulture spreading its wings,itookapicture,ITAPDiffusion
19765,D:\data\images\itookapicture\thumbnail\53f99cb...,53f99cb828e71b8a3172fea41d1f2259.jpg,a close up of a black cat sitting on a window ...,ITAP of my cat looking out of the window,itookapicture,ITAPDiffusion
19766,D:\data\images\itookapicture\thumbnail\9d5acf1...,9d5acf1a16798238d606fc3646d16f75.jpg,a close up of a candle lit up in a candlelight,ITAP of an empty beer bottle,itookapicture,ITAPDiffusion


CPU times: total: 6.28 s
Wall time: 12.3 s


In [28]:
%%time

# Load each row's file contents through dask: the frame is split into 10
# partitions and `add_compressed_data` is applied row-wise.
# NOTE(review): `cb` looks like the callback that was already registered
# globally during setup; leaving this `with` block presumably unregisters it
# again - confirm that is intended.
with cb:
	dask_frame = dd.from_pandas(smaller_exportable_df, npartitions=10)
	# `meta` declares the result as an object-dtype Series (named 'str'); the
	# computed values are stored in place as the new 'image_data' column.
	smaller_exportable_df['image_data'] = dask_frame.apply(lambda x: add_compressed_data(x), meta=('str', object), axis=1).compute()

display(smaller_exportable_df)

[                                        ] | 0% Completed | 147.76 ms

global:   0%|          | 0/10 [00:00<?, ?it/s]

global:   0%|          | 0/10 [00:00<?, ?it/s]

[########################################] | 100% Completed | 24.06 s
[########################################] | 100% Completed | 24.14 s
[########################################] | 100% Completed | 24.19 s


Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,3mewbe0wjb9a1.jpg,a man with a beard and a beard sitting in fron...,Anon wants Elon cut,greentext,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
19763,D:\data\images\itookapicture\thumbnail\f9c5e68...,f9c5e68a2e6082b0827469253807647e.jpg,there is a man in a black hat and a baseball c...,ITAP - Of coach teaching young men to stand st...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
19764,D:\data\images\itookapicture\thumbnail\a37c752...,a37c752a3ecdd654cd402280b0a3de7f.jpg,a close up of a large bird with its wings open...,ITAP of a Turkey Vulture spreading its wings,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
19765,D:\data\images\itookapicture\thumbnail\53f99cb...,53f99cb828e71b8a3172fea41d1f2259.jpg,a close up of a black cat sitting on a window ...,ITAP of my cat looking out of the window,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
19766,D:\data\images\itookapicture\thumbnail\9d5acf1...,9d5acf1a16798238d606fc3646d16f75.jpg,a close up of a candle lit up in a candlelight,ITAP of an empty beer bottle,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


CPU times: total: 21.9 s
Wall time: 25 s


In [29]:
%%time

# Drop rows whose image_data is the empty-string sentinel and rows that were
# not assigned a model name. Boolean masks replace the previous
# `where(...).dropna(how='all')` round trip, which blanks rows to NaN first.
has_image = smaller_exportable_df['image_data'].ne("")
has_model = smaller_exportable_df['name'].ne("")
filtered_again = smaller_exportable_df.loc[has_image & has_model].reset_index(drop=True)
display(filtered_again)

Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,D:\data\images\sfwpetite\thumbnail\v3ra9g4vrb9...,v3ra9g4vrb9a1.jpg,blonde haired woman in a bikini top and bikini...,Braids,sfwpetite,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
17736,D:\data\images\itookapicture\thumbnail\f9c5e68...,f9c5e68a2e6082b0827469253807647e.jpg,there is a man in a black hat and a baseball c...,ITAP - Of coach teaching young men to stand st...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
17737,D:\data\images\itookapicture\thumbnail\a37c752...,a37c752a3ecdd654cd402280b0a3de7f.jpg,a close up of a large bird with its wings open...,ITAP of a Turkey Vulture spreading its wings,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
17738,D:\data\images\itookapicture\thumbnail\53f99cb...,53f99cb828e71b8a3172fea41d1f2259.jpg,a close up of a black cat sitting on a window ...,ITAP of my cat looking out of the window,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
17739,D:\data\images\itookapicture\thumbnail\9d5acf1...,9d5acf1a16798238d606fc3646d16f75.jpg,a close up of a candle lit up in a candlelight,ITAP of an empty beer bottle,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


CPU times: total: 422 ms
Wall time: 439 ms


In [30]:
%%time

final = filtered_again
display(final)

# Write one parquet file per model name into data/curated.
curated_dir = os.path.join("data", "curated")
os.makedirs(curated_dir, exist_ok=True)  # the folder may not exist on a fresh run

grouped = final.groupby('name')
# Iterating the GroupBy yields (key, sub-frame) pairs one at a time, so the
# groups do not all have to be held in a list before writing.
# NOTE(review): files already present in the folder are not removed here.
for model_name, group in grouped:
	group.to_parquet(os.path.join(curated_dir, f"{model_name}.parquet"))

Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,D:\data\images\sfwpetite\thumbnail\v3ra9g4vrb9...,v3ra9g4vrb9a1.jpg,blonde haired woman in a bikini top and bikini...,Braids,sfwpetite,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
17736,D:\data\images\itookapicture\thumbnail\f9c5e68...,f9c5e68a2e6082b0827469253807647e.jpg,there is a man in a black hat and a baseball c...,ITAP - Of coach teaching young men to stand st...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
17737,D:\data\images\itookapicture\thumbnail\a37c752...,a37c752a3ecdd654cd402280b0a3de7f.jpg,a close up of a large bird with its wings open...,ITAP of a Turkey Vulture spreading its wings,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
17738,D:\data\images\itookapicture\thumbnail\53f99cb...,53f99cb828e71b8a3172fea41d1f2259.jpg,a close up of a black cat sitting on a window ...,ITAP of my cat looking out of the window,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
17739,D:\data\images\itookapicture\thumbnail\9d5acf1...,9d5acf1a16798238d606fc3646d16f75.jpg,a close up of a candle lit up in a candlelight,ITAP of an empty beer bottle,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


CPU times: total: 10.3 s
Wall time: 22.9 s


In [31]:
# Serialise every row as one line of tagged text: model name, title (used as
# the prompt) and caption. Only the three columns that are needed are
# iterated, so the image bytes column is never converted.
lines = [
	f"<|startoftext|><|model|>{name}<|prompt|>{prompt}<|text|>{caption}<|endoftext|>" + "\n"
	for name, prompt, caption in zip(final['name'], final['title'], final['caption'])
]
# newline="\n" disables newline translation, so the file holds exactly the
# UTF-8 encoding of the lines built above on every platform.
with open("training.txt", "w", encoding="utf-8", newline="\n") as f:
	f.writelines(lines)

In [42]:
# Reload every curated parquet file and stack them into one frame.
# NOTE(review): this picks up every parquet file found in the folder, including
# any left behind by earlier runs - clear the folder first if only the current
# export is wanted.
curated_dir = os.path.join("data", "curated")
# os.listdir returns entries in arbitrary order; sort for a reproducible row
# order and skip anything that is not a parquet file.
data_paths = sorted(entry for entry in os.listdir(curated_dir) if entry.endswith(".parquet"))

dfs = [pd.read_parquet(os.path.join(curated_dir, file_name)) for file_name in data_paths]

# Each file's own index labels are kept (no ignore_index).
final = pd.concat(dfs)
display(final)

Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
7,D:\data\images\CityPorn\thumbnail\slyqoecbzb9a...,slyqoecbzb9a1.jpg,arafed view of a pier with a pier and a pier,Pacific Beach California,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
8,D:\data\images\CityPorn\thumbnail\09kuna4hzb9a...,09kuna4hzb9a1.jpg,mountainside view of a town with a flagpole an...,"Not exactly a city, but Jim Thorpe PA is very ...",CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13,D:\data\images\CityPorn\thumbnail\4y6hzocyec9a...,4y6hzocyec9a1.jpg,arafed view of a train on a bridge over a river,"Wissahickon, Phila, PA",CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
14,D:\data\images\CityPorn\thumbnail\8xwnx4dlgc9a...,8xwnx4dlgc9a1.jpg,arafed image of a city street scene with peopl...,"2:00am in Nashville, Tennessee",CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
15863,D:\data\images\trippinthroughtime\thumbnail\33...,339083838d089fc8fe66c0afea5d7871.jpg,a painting of a man laying on a horse in a stable,"He told me ""Neigh!""",trippinthroughtime,TrippingThroughTimeDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
15864,D:\data\images\trippinthroughtime\thumbnail\6f...,6f0b42ae45c3c5a1e3285fe23c5d7295.jpg,a picture taken from a picture of a man with a...,"Awww, have a bad dream buddy?",trippinthroughtime,TrippingThroughTimeDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
15865,D:\data\images\trippinthroughtime\thumbnail\f0...,f040d5fc4198b4ead5a0d275b99533f7.jpg,a picture taken from a book cover of a woman w...,The worst way to start your day,trippinthroughtime,TrippingThroughTimeDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
15866,D:\data\images\trippinthroughtime\thumbnail\34...,34559f17e64ed6b7933c056b3c3dd635.jpg,a painting of a painting of a man in a gladion,When your rent is too damn high,trippinthroughtime,TrippingThroughTimeDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
