In [13]:
%%time

import os.path
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm import tqdm
from tqdm.dask import TqdmCallback

from shared_code.utility.spark.set_environ import set_azure_env

# tqdm-based progress callback for dask computations, registered globally.
cb = TqdmCallback(desc="global")
cb.register()

# Patch pandas so DataFrame/Series gain `progress_apply`.
# NOTE(review): the bare call below is immediately superseded by the call with `desc`.
tqdm.pandas()
tqdm.pandas(desc="global")

# dask's own text progress bar, also registered globally.
# NOTE(review): with `cb` registered as well, each dask computation prints two
# progress displays — confirm both are wanted.
pbar = ProgressBar()
pbar.register()

# Must run before the storage adapter import/construction below.
# NOTE(review): presumably this sets the environment variables the adapter reads — confirm.
set_azure_env()

from shared_code.utility.storage.azure_file_storage import AzureFileStorageAdapter

# Filesystem object later passed to pd.read_parquet(filesystem=...).
fs_adapter = AzureFileStorageAdapter('data')
file_system = fs_adapter.get_file_storage()

CPU times: total: 2.3 s
Wall time: 4.84 s
CPU times: total: 0 ns
Wall time: 0 ns


In [14]:
from typing import List
from dataclasses import dataclass

@dataclass
class DataSources:
	"""A named group of data entries.

	Attributes:
		name: Label of the group.
		data: Entries that belong to the group.
	"""
	name: str
	data: List[str]

	@staticmethod
	def from_dict(obj: dict) -> 'DataSources':
		"""Build a DataSources from a mapping with "name" and "data" keys.

		Args:
			obj: Mapping to read from. A missing "name" yields ``None``;
				a missing or ``None`` "data" yields an empty list.

		Returns:
			A new DataSources whose ``data`` is a fresh list (the input
			list is copied, not shared).
		"""
		_name = obj.get("name")
		# `or []` keeps a missing/None "data" from raising TypeError on iteration.
		_data = list(obj.get("data") or [])
		return DataSources(_name, _data)

In [15]:
def add_source(x: object, source_list) -> str:
    """Look up which source a row belongs to.

    Args:
        x: Row-like object indexable by 'subreddit'.
        source_list: Iterable of mappings, each with a 'name' and a 'data'
            container of subreddit names.

    Returns:
        The 'name' of the first entry whose 'data' contains x['subreddit'],
        or "" when no entry matches.
    """
    matching_names = (
        entry['name']
        for entry in source_list
        if x['subreddit'] in entry['data']
    )
    # First match wins; fall back to the empty-string sentinel.
    return next(matching_names, "")

In [16]:
def add_compressed_data(x: object) -> "bytes | str":
    """Read the file at x['path'] and return its contents as raw bytes.

    The bytes are returned exactly as stored on disk; no compression is
    applied here.

    Args:
        x: Row-like object indexable by 'path'.

    Returns:
        The file contents as ``bytes``, or the empty string "" when the file
        cannot be read (the error is printed, not raised).
    """
    try:
        # Context manager guarantees the handle is closed, even if read() fails.
        with open(x['path'], 'rb') as image_file:
            return image_file.read()
    except Exception as e:
        # Best-effort: report the failure and return the "" sentinel that
        # later filtering relies on.
        print(e)
        return ""

In [17]:
%%time

# Load the processed post metadata through the `file_system` object created
# during setup, using the pyarrow engine.
all_data = pd.read_parquet(
    path="data/processed_raw_data.parquet",
    engine='pyarrow',
    filesystem=file_system,
)
display(all_data)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,1000cej,CityPorn,OtterlyFoxy,New York in the fog,cars are parked on the side of the road in the...,7a8d96e378c15c8ab8440ac311f12c11,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,https://i.redd.it/4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,D:\data\images\CityPorn\4emw5uldib9a1.jpg,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,False
1,1000d16,SFWNextDoorGirls,princessxo699,Thoughts about my NYE party outfit?,a woman sitting on a table holding a pink flower,9951b4f82caeb8ba2bd9f79f8d422450,/r/SFWNextDoorGirls/comments/1000d16/thoughts_...,https://i.imgur.com/GgFEagO.jpg,GgFEagO.jpg,D:\data\images\SFWNextDoorGirls\GgFEagO.jpg,D:\data\images\SFWNextDoorGirls\thumbnail\GgFE...,False,False
2,1000fg0,HotGirlNextDoor,BlkBrd1312,(IKTR),a woman in a bikini with a cell phone in her hand,be8dd55e34216bec1e15e03fa296eacc,/r/HotGirlNextDoor/comments/1000fg0/iktr/,https://i.redd.it/nwa7hts2jb9a1.jpg,nwa7hts2jb9a1.jpg,D:\data\images\HotGirlNextDoor\nwa7hts2jb9a1.jpg,D:\data\images\HotGirlNextDoor\thumbnail\nwa7h...,False,False
3,1000glf,AmIhotAF,toolate_sharkbait,Just looking for entertainment,blonde woman with blonde hair and tattoos on h...,e554c1ed7ffa2740436ac082068b2824,/r/AmIhotAF/comments/1000glf/just_looking_for_...,https://i.redd.it/4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,False
4,1000j1n,greentext,trent8051,Anon wants Elon cut,a man with a beard and a beard sitting in fron...,1dec3dabb5e46cde01855d06089c287a,/r/greentext/comments/1000j1n/anon_wants_elon_...,https://i.redd.it/3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,D:\data\images\greentext\3mewbe0wjb9a1.jpg,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20219,10sb8fs,itookapicture,PsychWardFrog,ITAP. A streetlight that I tampered with in th...,a view of a street light with a green light at...,91da79fa8e6bb1149f5952e771086723,/r/itookapicture/comments/10sb8fs/itap_a_stree...,https://i.redd.it/ld3ppo0k1yfa1.jpg,91da79fa8e6bb1149f5952e771086723.jpg,D:\data\images\itookapicture\91da79fa8e6bb1149...,D:\data\images\itookapicture\thumbnail\91da79f...,True,False
20220,10sb41o,itookapicture,mollygwenandmoe,ITAP of The Pamlico Sound,a black and white photo of a tree trunk trunk ...,087efa4d453a9c4177af88403a0f089a,/r/itookapicture/comments/10sb41o/itap_of_the_...,https://i.redd.it/f1839aag0yfa1.jpg,087efa4d453a9c4177af88403a0f089a.jpg,D:\data\images\itookapicture\087efa4d453a9c417...,D:\data\images\itookapicture\thumbnail\087efa4...,True,False
20221,10sb2tx,itookapicture,Bubbly_Competition21,ITAP of a gondolier,boats are parked in a row of poles on a boat,031790a8c563771eff5b60f1e7df32e2,/r/itookapicture/comments/10sb2tx/itap_of_a_go...,https://i.redd.it/caqf7m960yfa1.jpg,031790a8c563771eff5b60f1e7df32e2.jpg,D:\data\images\itookapicture\031790a8c563771ef...,D:\data\images\itookapicture\thumbnail\031790a...,True,False
20222,10sb20f,itookapicture,Pixelphotos,ITAP of Redondo Beach Pier,a black and white photo of a boat in the water,1e8280d34d7bedcabd250e9f81e7b02f,/r/itookapicture/comments/10sb20f/itap_of_redo...,https://i.redd.it/g5wgd26yzxfa1.jpg,1e8280d34d7bedcabd250e9f81e7b02f.jpg,D:\data\images\itookapicture\1e8280d34d7bedcab...,D:\data\images\itookapicture\thumbnail\1e8280d...,True,False


CPU times: total: 281 ms
Wall time: 1.22 s


Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,1000cej,CityPorn,OtterlyFoxy,New York in the fog,cars are parked on the side of the road in the...,7a8d96e378c15c8ab8440ac311f12c11,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,https://i.redd.it/4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,D:\data\images\CityPorn\4emw5uldib9a1.jpg,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,False
1,1000d16,SFWNextDoorGirls,princessxo699,Thoughts about my NYE party outfit?,a woman sitting on a table holding a pink flower,9951b4f82caeb8ba2bd9f79f8d422450,/r/SFWNextDoorGirls/comments/1000d16/thoughts_...,https://i.imgur.com/GgFEagO.jpg,GgFEagO.jpg,D:\data\images\SFWNextDoorGirls\GgFEagO.jpg,D:\data\images\SFWNextDoorGirls\thumbnail\GgFE...,False,False
2,1000fg0,HotGirlNextDoor,BlkBrd1312,(IKTR),a woman in a bikini with a cell phone in her hand,be8dd55e34216bec1e15e03fa296eacc,/r/HotGirlNextDoor/comments/1000fg0/iktr/,https://i.redd.it/nwa7hts2jb9a1.jpg,nwa7hts2jb9a1.jpg,D:\data\images\HotGirlNextDoor\nwa7hts2jb9a1.jpg,D:\data\images\HotGirlNextDoor\thumbnail\nwa7h...,False,False
3,1000glf,AmIhotAF,toolate_sharkbait,Just looking for entertainment,blonde woman with blonde hair and tattoos on h...,e554c1ed7ffa2740436ac082068b2824,/r/AmIhotAF/comments/1000glf/just_looking_for_...,https://i.redd.it/4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,False
4,1000j1n,greentext,trent8051,Anon wants Elon cut,a man with a beard and a beard sitting in fron...,1dec3dabb5e46cde01855d06089c287a,/r/greentext/comments/1000j1n/anon_wants_elon_...,https://i.redd.it/3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,D:\data\images\greentext\3mewbe0wjb9a1.jpg,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20219,10sb8fs,itookapicture,PsychWardFrog,ITAP. A streetlight that I tampered with in th...,a view of a street light with a green light at...,91da79fa8e6bb1149f5952e771086723,/r/itookapicture/comments/10sb8fs/itap_a_stree...,https://i.redd.it/ld3ppo0k1yfa1.jpg,91da79fa8e6bb1149f5952e771086723.jpg,D:\data\images\itookapicture\91da79fa8e6bb1149...,D:\data\images\itookapicture\thumbnail\91da79f...,True,False
20220,10sb41o,itookapicture,mollygwenandmoe,ITAP of The Pamlico Sound,a black and white photo of a tree trunk trunk ...,087efa4d453a9c4177af88403a0f089a,/r/itookapicture/comments/10sb41o/itap_of_the_...,https://i.redd.it/f1839aag0yfa1.jpg,087efa4d453a9c4177af88403a0f089a.jpg,D:\data\images\itookapicture\087efa4d453a9c417...,D:\data\images\itookapicture\thumbnail\087efa4...,True,False
20221,10sb2tx,itookapicture,Bubbly_Competition21,ITAP of a gondolier,boats are parked in a row of poles on a boat,031790a8c563771eff5b60f1e7df32e2,/r/itookapicture/comments/10sb2tx/itap_of_a_go...,https://i.redd.it/caqf7m960yfa1.jpg,031790a8c563771eff5b60f1e7df32e2.jpg,D:\data\images\itookapicture\031790a8c563771ef...,D:\data\images\itookapicture\thumbnail\031790a...,True,False
20222,10sb20f,itookapicture,Pixelphotos,ITAP of Redondo Beach Pier,a black and white photo of a boat in the water,1e8280d34d7bedcabd250e9f81e7b02f,/r/itookapicture/comments/10sb20f/itap_of_redo...,https://i.redd.it/g5wgd26yzxfa1.jpg,1e8280d34d7bedcabd250e9f81e7b02f.jpg,D:\data\images\itookapicture\1e8280d34d7bedcab...,D:\data\images\itookapicture\thumbnail\1e8280d...,True,False


CPU times: total: 93.8 ms
Wall time: 665 ms


In [18]:
%%time

# Keep rows whose image exists and whose caption is not the empty string.
# A boolean mask selects rows directly, instead of NaN-filling rejected rows
# with `where` and dropping them afterwards (which also upcasts bool columns).
# `.eq(True)` treats a missing `exists` value as "does not exist";
# `.ne("")` keeps missing captions, matching the `!= ""` comparison.
rows_to_keep = all_data['exists'].eq(True) & all_data['caption'].ne("")
filtered_on_exist = all_data.loc[rows_to_keep].reset_index(drop=True)
display(filtered_on_exist)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,1000cej,CityPorn,OtterlyFoxy,New York in the fog,cars are parked on the side of the road in the...,7a8d96e378c15c8ab8440ac311f12c11,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,https://i.redd.it/4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,D:\data\images\CityPorn\4emw5uldib9a1.jpg,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,False
1,1000glf,AmIhotAF,toolate_sharkbait,Just looking for entertainment,blonde woman with blonde hair and tattoos on h...,e554c1ed7ffa2740436ac082068b2824,/r/AmIhotAF/comments/1000glf/just_looking_for_...,https://i.redd.it/4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,False
2,1000j1n,greentext,trent8051,Anon wants Elon cut,a man with a beard and a beard sitting in fron...,1dec3dabb5e46cde01855d06089c287a,/r/greentext/comments/1000j1n/anon_wants_elon_...,https://i.redd.it/3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,D:\data\images\greentext\3mewbe0wjb9a1.jpg,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,False
3,1000mjs,spaceporn,MorningStar_imangi,Northern Lights above Lofoten,a view of a view of a large green and purple a...,2c39ce1290fba541abd0b004b09da6b2,/r/spaceporn/comments/1000mjs/northern_lights_...,https://i.redd.it/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,/data/images/spaceporn/7s5aafaqkb9a1.jpg,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,True,False
4,1000qpd,spaceporn,MorningStar_imangi,Viking Lights,a scene of a boat is sitting on the shore of a...,0f72de47c69ff50eca5fa3990215f4ac,/r/spaceporn/comments/1000qpd/viking_lights/,https://i.redd.it/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,/data/images/spaceporn/abojw7lqlb9a1.jpg,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13577,10sb8fs,itookapicture,PsychWardFrog,ITAP. A streetlight that I tampered with in th...,a view of a street light with a green light at...,91da79fa8e6bb1149f5952e771086723,/r/itookapicture/comments/10sb8fs/itap_a_stree...,https://i.redd.it/ld3ppo0k1yfa1.jpg,91da79fa8e6bb1149f5952e771086723.jpg,D:\data\images\itookapicture\91da79fa8e6bb1149...,D:\data\images\itookapicture\thumbnail\91da79f...,True,False
13578,10sb41o,itookapicture,mollygwenandmoe,ITAP of The Pamlico Sound,a black and white photo of a tree trunk trunk ...,087efa4d453a9c4177af88403a0f089a,/r/itookapicture/comments/10sb41o/itap_of_the_...,https://i.redd.it/f1839aag0yfa1.jpg,087efa4d453a9c4177af88403a0f089a.jpg,D:\data\images\itookapicture\087efa4d453a9c417...,D:\data\images\itookapicture\thumbnail\087efa4...,True,False
13579,10sb2tx,itookapicture,Bubbly_Competition21,ITAP of a gondolier,boats are parked in a row of poles on a boat,031790a8c563771eff5b60f1e7df32e2,/r/itookapicture/comments/10sb2tx/itap_of_a_go...,https://i.redd.it/caqf7m960yfa1.jpg,031790a8c563771eff5b60f1e7df32e2.jpg,D:\data\images\itookapicture\031790a8c563771ef...,D:\data\images\itookapicture\thumbnail\031790a...,True,False
13580,10sb20f,itookapicture,Pixelphotos,ITAP of Redondo Beach Pier,a black and white photo of a boat in the water,1e8280d34d7bedcabd250e9f81e7b02f,/r/itookapicture/comments/10sb20f/itap_of_redo...,https://i.redd.it/g5wgd26yzxfa1.jpg,1e8280d34d7bedcabd250e9f81e7b02f.jpg,D:\data\images\itookapicture\1e8280d34d7bedcab...,D:\data\images\itookapicture\thumbnail\1e8280d...,True,False


CPU times: total: 78.1 ms
Wall time: 78.2 ms


Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,thumbnail_path,exists,curated
0,1000cej,CityPorn,OtterlyFoxy,New York in the fog,cars are parked on the side of the road in the...,7a8d96e378c15c8ab8440ac311f12c11,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,https://i.redd.it/4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,D:\data\images\CityPorn\4emw5uldib9a1.jpg,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,True,False
1,1000glf,AmIhotAF,toolate_sharkbait,Just looking for entertainment,blonde woman with blonde hair and tattoos on h...,e554c1ed7ffa2740436ac082068b2824,/r/AmIhotAF/comments/1000glf/just_looking_for_...,https://i.redd.it/4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,True,False
2,1000j1n,greentext,trent8051,Anon wants Elon cut,a man with a beard and a beard sitting in fron...,1dec3dabb5e46cde01855d06089c287a,/r/greentext/comments/1000j1n/anon_wants_elon_...,https://i.redd.it/3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,D:\data\images\greentext\3mewbe0wjb9a1.jpg,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,True,False
3,1000mjs,spaceporn,MorningStar_imangi,Northern Lights above Lofoten,a view of a view of a large green and purple a...,2c39ce1290fba541abd0b004b09da6b2,/r/spaceporn/comments/1000mjs/northern_lights_...,https://i.redd.it/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,/data/images/spaceporn/7s5aafaqkb9a1.jpg,/data/images/spaceporn/thumbnail/7s5aafaqkb9a1...,True,False
4,1000qpd,spaceporn,MorningStar_imangi,Viking Lights,a scene of a boat is sitting on the shore of a...,0f72de47c69ff50eca5fa3990215f4ac,/r/spaceporn/comments/1000qpd/viking_lights/,https://i.redd.it/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,/data/images/spaceporn/abojw7lqlb9a1.jpg,/data/images/spaceporn/thumbnail/abojw7lqlb9a1...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13577,10sb8fs,itookapicture,PsychWardFrog,ITAP. A streetlight that I tampered with in th...,a view of a street light with a green light at...,91da79fa8e6bb1149f5952e771086723,/r/itookapicture/comments/10sb8fs/itap_a_stree...,https://i.redd.it/ld3ppo0k1yfa1.jpg,91da79fa8e6bb1149f5952e771086723.jpg,D:\data\images\itookapicture\91da79fa8e6bb1149...,D:\data\images\itookapicture\thumbnail\91da79f...,True,False
13578,10sb41o,itookapicture,mollygwenandmoe,ITAP of The Pamlico Sound,a black and white photo of a tree trunk trunk ...,087efa4d453a9c4177af88403a0f089a,/r/itookapicture/comments/10sb41o/itap_of_the_...,https://i.redd.it/f1839aag0yfa1.jpg,087efa4d453a9c4177af88403a0f089a.jpg,D:\data\images\itookapicture\087efa4d453a9c417...,D:\data\images\itookapicture\thumbnail\087efa4...,True,False
13579,10sb2tx,itookapicture,Bubbly_Competition21,ITAP of a gondolier,boats are parked in a row of poles on a boat,031790a8c563771eff5b60f1e7df32e2,/r/itookapicture/comments/10sb2tx/itap_of_a_go...,https://i.redd.it/caqf7m960yfa1.jpg,031790a8c563771eff5b60f1e7df32e2.jpg,D:\data\images\itookapicture\031790a8c563771ef...,D:\data\images\itookapicture\thumbnail\031790a...,True,False
13580,10sb20f,itookapicture,Pixelphotos,ITAP of Redondo Beach Pier,a black and white photo of a boat in the water,1e8280d34d7bedcabd250e9f81e7b02f,/r/itookapicture/comments/10sb20f/itap_of_redo...,https://i.redd.it/g5wgd26yzxfa1.jpg,1e8280d34d7bedcabd250e9f81e7b02f.jpg,D:\data\images\itookapicture\1e8280d34d7bedcab...,D:\data\images\itookapicture\thumbnail\1e8280d...,True,False


CPU times: total: 78.1 ms
Wall time: 78.6 ms


In [19]:
%%time

# Each entry pairs a label ("name") with the subreddits ("data") that
# add_source maps to that label.
sources = [
	{"name": label, "data": subreddits}
	for label, subreddits in {
		"CityDiffusion": ["CityPorn"],
		"NatureDiffusion": ["EarthPorn"],
		"CosmicDiffusion": ["spaceporn"],
		"ITAPDiffusion": ["itookapicture"],
		"MemeDiffusion": ["memes"],
		"SexyDiffusion": [
			"sfwpetite",
			"selfies",
			"Amicute",
			"amihot",
			"AmIhotAF",
			"HotGirlNextDoor",
			"SFWNextDoorGirls",
			"SFWRedheads",
		],
	}.items()
]
# Tabular view of the same records, for display only.
sources_df = pd.DataFrame.from_records(sources)
display(sources_df)

Unnamed: 0,name,data
0,CityDiffusion,[CityPorn]
1,NatureDiffusion,[EarthPorn]
2,CosmicDiffusion,[spaceporn]
3,ITAPDiffusion,[itookapicture]
4,MemeDiffusion,[memes]
5,SexyDiffusion,"[sfwpetite, selfies, Amicute, amihot, AmIhotAF..."


CPU times: total: 0 ns
Wall time: 5.51 ms


Unnamed: 0,name,data
0,CityDiffusion,[CityPorn]
1,NatureDiffusion,[EarthPorn]
2,CosmicDiffusion,[spaceporn]
3,ITAPDiffusion,[itookapicture]
4,MemeDiffusion,[memes]
5,SexyDiffusion,"[sfwpetite, selfies, Amicute, amihot, AmIhotAF..."


CPU times: total: 15.6 ms
Wall time: 10.6 ms


In [20]:
%%time

# Keep only the columns needed for export. Selecting columns (and copying, so
# the assignment below never touches `filtered_on_exist`) replaces building a
# frame from a list of Series and transposing it.
smaller_exportable_df = filtered_on_exist[['path', 'image_name', 'caption', 'title', 'subreddit']].copy()

# Label each row with the source whose subreddit list contains its subreddit
# ("" when none matches). `progress_apply` is tqdm's pandas integration and
# draws its own bar; no dask ProgressBar context is needed because nothing
# here runs through dask.
smaller_exportable_df['name'] = smaller_exportable_df.progress_apply(lambda x: add_source(x, sources), axis=1)

display(smaller_exportable_df)

global: 100%|██████████| 13582/13582 [00:00<00:00, 86425.39it/s]


Unnamed: 0,path,image_name,caption,title,subreddit,name
0,D:\data\images\CityPorn\4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion
1,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion
2,D:\data\images\greentext\3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,a man with a beard and a beard sitting in fron...,Anon wants Elon cut,greentext,
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion
...,...,...,...,...,...,...
13577,D:\data\images\itookapicture\91da79fa8e6bb1149...,91da79fa8e6bb1149f5952e771086723.jpg,a view of a street light with a green light at...,ITAP. A streetlight that I tampered with in th...,itookapicture,ITAPDiffusion
13578,D:\data\images\itookapicture\087efa4d453a9c417...,087efa4d453a9c4177af88403a0f089a.jpg,a black and white photo of a tree trunk trunk ...,ITAP of The Pamlico Sound,itookapicture,ITAPDiffusion
13579,D:\data\images\itookapicture\031790a8c563771ef...,031790a8c563771eff5b60f1e7df32e2.jpg,boats are parked in a row of poles on a boat,ITAP of a gondolier,itookapicture,ITAPDiffusion
13580,D:\data\images\itookapicture\1e8280d34d7bedcab...,1e8280d34d7bedcabd250e9f81e7b02f.jpg,a black and white photo of a boat in the water,ITAP of Redondo Beach Pier,itookapicture,ITAPDiffusion


CPU times: total: 438 ms
Wall time: 412 ms


global: 100%|██████████| 13582/13582 [00:00<00:00, 80957.73it/s]


Unnamed: 0,path,image_name,caption,title,subreddit,name
0,D:\data\images\CityPorn\4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion
1,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion
2,D:\data\images\greentext\3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,a man with a beard and a beard sitting in fron...,Anon wants Elon cut,greentext,
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion
...,...,...,...,...,...,...
13577,D:\data\images\itookapicture\91da79fa8e6bb1149...,91da79fa8e6bb1149f5952e771086723.jpg,a view of a street light with a green light at...,ITAP. A streetlight that I tampered with in th...,itookapicture,ITAPDiffusion
13578,D:\data\images\itookapicture\087efa4d453a9c417...,087efa4d453a9c4177af88403a0f089a.jpg,a black and white photo of a tree trunk trunk ...,ITAP of The Pamlico Sound,itookapicture,ITAPDiffusion
13579,D:\data\images\itookapicture\031790a8c563771ef...,031790a8c563771eff5b60f1e7df32e2.jpg,boats are parked in a row of poles on a boat,ITAP of a gondolier,itookapicture,ITAPDiffusion
13580,D:\data\images\itookapicture\1e8280d34d7bedcab...,1e8280d34d7bedcabd250e9f81e7b02f.jpg,a black and white photo of a boat in the water,ITAP of Redondo Beach Pier,itookapicture,ITAPDiffusion


CPU times: total: 500 ms
Wall time: 487 ms


In [21]:
%%time

with cb:
    # add_compressed_data only reads x['path'], so partition just that column;
    # this also keeps an existing 'image_data' column (from a previous run of
    # this cell) from being pushed through dask again.
    dask_frame = dd.from_pandas(smaller_exportable_df[['path']], npartitions=10)
    # Row-wise read of each file's bytes; the result is an object Series
    # aligned on the original index.
    smaller_exportable_df['image_data'] = dask_frame.apply(add_compressed_data, axis=1, meta=('image_data', object)).compute()

display(smaller_exportable_df)

[                                        ] | 0% Completed | 485.80 us

global:   0%|          | 0/10 [00:00<?, ?it/s]

[                                        ] | 0% Completed | 1.75 s ms[Errno 2] No such file or directory: '/data/images/OldLadiesBakingPies/l4nlorgi200a1.jpg'
[                                        ] | 0% Completed | 32.93 s[Errno 2] No such file or directory: '/data/images/oldladiesbakingpies/HB81Wjt.jpg'
[Errno 2] No such file or directory: '/data/images/oldladiesbakingpies/oIEsM3v.jpg'
[########################################] | 100% Completed | 52.18 s


Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,D:\data\images\greentext\3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,a man with a beard and a beard sitting in fron...,Anon wants Elon cut,greentext,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
13577,D:\data\images\itookapicture\91da79fa8e6bb1149...,91da79fa8e6bb1149f5952e771086723.jpg,a view of a street light with a green light at...,ITAP. A streetlight that I tampered with in th...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13578,D:\data\images\itookapicture\087efa4d453a9c417...,087efa4d453a9c4177af88403a0f089a.jpg,a black and white photo of a tree trunk trunk ...,ITAP of The Pamlico Sound,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13579,D:\data\images\itookapicture\031790a8c563771ef...,031790a8c563771eff5b60f1e7df32e2.jpg,boats are parked in a row of poles on a boat,ITAP of a gondolier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13580,D:\data\images\itookapicture\1e8280d34d7bedcab...,1e8280d34d7bedcabd250e9f81e7b02f.jpg,a black and white photo of a boat in the water,ITAP of Redondo Beach Pier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


CPU times: total: 13.6 s
Wall time: 52.8 s
[                                        ] | 0% Completed | 461.30 us

global:   0%|          | 0/10 [00:00<?, ?it/s]

[                                        ] | 0% Completed | 119.65 ms[Errno 2] No such file or directory: '/data/images/OldLadiesBakingPies/l4nlorgi200a1.jpg'
[####                                    ] | 10% Completed | 25.09 ss[Errno 2] No such file or directory: '/data/images/oldladiesbakingpies/HB81Wjt.jpg'
[Errno 2] No such file or directory: '/data/images/oldladiesbakingpies/oIEsM3v.jpg'
[########################################] | 100% Completed | 42.98 s
[########################################] | 100% Completed | 43.05 s


Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,D:\data\images\greentext\3mewbe0wjb9a1.jpg,3mewbe0wjb9a1.jpg,a man with a beard and a beard sitting in fron...,Anon wants Elon cut,greentext,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
13577,D:\data\images\itookapicture\91da79fa8e6bb1149...,91da79fa8e6bb1149f5952e771086723.jpg,a view of a street light with a green light at...,ITAP. A streetlight that I tampered with in th...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13578,D:\data\images\itookapicture\087efa4d453a9c417...,087efa4d453a9c4177af88403a0f089a.jpg,a black and white photo of a tree trunk trunk ...,ITAP of The Pamlico Sound,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13579,D:\data\images\itookapicture\031790a8c563771ef...,031790a8c563771eff5b60f1e7df32e2.jpg,boats are parked in a row of poles on a boat,ITAP of a gondolier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13580,D:\data\images\itookapicture\1e8280d34d7bedcab...,1e8280d34d7bedcabd250e9f81e7b02f.jpg,a black and white photo of a boat in the water,ITAP of Redondo Beach Pier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


CPU times: total: 11.6 s
Wall time: 43.3 s


In [46]:
%%time

# Keep rows whose image bytes were read ("" marks a failed read) and that were
# assigned a source name ("" marks no matching source). A boolean mask selects
# the rows directly instead of NaN-filling rejected rows with `where` and
# dropping them, which copies the large image column.
has_image = smaller_exportable_df['image_data'].ne("")
has_source = smaller_exportable_df['name'].ne("")
filtered_again = smaller_exportable_df.loc[has_image & has_source].reset_index(drop=True)
display(filtered_again)

Unnamed: 0,path,image_name,caption,title,subreddit,name,image_data
0,D:\data\images\CityPorn\4emw5uldib9a1.jpg,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,D:\data\images\AmIhotAF\4xyb1vgbjb9a1.jpg,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,/data/images/spaceporn/7s5aafaqkb9a1.jpg,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,/data/images/spaceporn/abojw7lqlb9a1.jpg,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,D:\data\images\sfwpetite\v3ra9g4vrb9a1.jpg,v3ra9g4vrb9a1.jpg,blonde haired woman in a bikini top and bikini...,Braids,sfwpetite,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...,...
10848,D:\data\images\itookapicture\91da79fa8e6bb1149...,91da79fa8e6bb1149f5952e771086723.jpg,a view of a street light with a green light at...,ITAP. A streetlight that I tampered with in th...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
10849,D:\data\images\itookapicture\087efa4d453a9c417...,087efa4d453a9c4177af88403a0f089a.jpg,a black and white photo of a tree trunk trunk ...,ITAP of The Pamlico Sound,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
10850,D:\data\images\itookapicture\031790a8c563771ef...,031790a8c563771eff5b60f1e7df32e2.jpg,boats are parked in a row of poles on a boat,ITAP of a gondolier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
10851,D:\data\images\itookapicture\1e8280d34d7bedcab...,1e8280d34d7bedcabd250e9f81e7b02f.jpg,a black and white photo of a boat in the water,ITAP of Redondo Beach Pier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


CPU times: total: 859 ms
Wall time: 2.47 s


In [None]:
%%time

# The local file path is not needed in the exported data.
final = filtered_again.drop(columns=['path'])
display(final)

# One sub-frame per source name.
grouped = final.groupby('name')
groupings = [grouped.get_group(x) for x in grouped.groups]

# Write one parquet file per source name. A tqdm bar is used because the
# module-level `pbar` is a dask.diagnostics.ProgressBar, which has no
# `set_description` method.
# NOTE(review): files are written to the local "data/curated/" directory, not
# through `file_system` — confirm that is the intended destination.
with tqdm(groupings, desc="Writing") as progress:
	for group in progress:
		group_name = group['name'].iloc[0]
		progress.set_description(f"Writing {group_name}")
		group.to_parquet(f"data/curated/{group_name}.parquet")

Unnamed: 0,image_name,caption,title,subreddit,name,image_data
0,4emw5uldib9a1.jpg,cars are parked on the side of the road in the...,New York in the fog,CityPorn,CityDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,4xyb1vgbjb9a1.jpg,blonde woman with blonde hair and tattoos on h...,Just looking for entertainment,AmIhotAF,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,7s5aafaqkb9a1.jpg,a view of a view of a large green and purple a...,Northern Lights above Lofoten,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,abojw7lqlb9a1.jpg,a scene of a boat is sitting on the shore of a...,Viking Lights,spaceporn,CosmicDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,v3ra9g4vrb9a1.jpg,blonde haired woman in a bikini top and bikini...,Braids,sfwpetite,SexyDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
...,...,...,...,...,...,...
10848,91da79fa8e6bb1149f5952e771086723.jpg,a view of a street light with a green light at...,ITAP. A streetlight that I tampered with in th...,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
10849,087efa4d453a9c4177af88403a0f089a.jpg,a black and white photo of a tree trunk trunk ...,ITAP of The Pamlico Sound,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
10850,031790a8c563771eff5b60f1e7df32e2.jpg,boats are parked in a row of poles on a boat,ITAP of a gondolier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
10851,1e8280d34d7bedcabd250e9f81e7b02f.jpg,a black and white photo of a boat in the water,ITAP of Redondo Beach Pier,itookapicture,ITAPDiffusion,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


Writing NatureDiffusion:  67%|██████▋   | 4/6 [01:26<00:34, 17.18s/it]

In [52]:
# One training line per record: source name, post title (used as the prompt),
# and caption, wrapped in the special tokens. Only the three needed columns
# are iterated, so the image bytes are never copied into per-row dicts.
# NOTE(review): a title or caption containing a newline would span several
# lines of the output file — confirm whether that needs sanitising.
lines = [
	f"<|startoftext|><|model|>{name}<|prompt|>{prompt}<|text|>{caption}<|endoftext|>" + "\n"
	for name, prompt, caption in zip(final['name'], final['title'], final['caption'])
]
# newline="\n" disables platform newline translation, so the bytes written
# match encoding each line as UTF-8 and writing it in binary mode.
with open("training.txt", "w", encoding="utf-8", newline="\n") as f:
	f.writelines(lines)