In [1]:
# Star import of the project's Spark environment helpers; `set_azure_env` used
# below has to come from here because it is called before any other project import.
from shared_code.utility.spark.set_environ import *
import os
import sys  # explicit: `sys.executable` below must not rely on a star import re-exporting `sys`

# Point both the PySpark workers and the driver at the interpreter running this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] =  sys.executable

set_azure_env()

from pyspark.sql.functions import udf
from shared_code.utility.storage.table import TableAdapter
# NOTE(review): StringType, BooleanType and image_table_schema used in later cells
# are presumably provided by this star import -- confirm and import them by name.
from shared_code.utility.schemas.spark_table_schema import *

# Session shared by every later cell.
spark = get_session('n')


In [2]:
# Notebook-level table client; the `update` helper and the upsert loop in later
# cells reference it by this name.
table_broker: TableAdapter = TableAdapter()

@udf(returnType=StringType())
def get_missing_name(path):
	"""Return the final component (file name) of ``path``.

	Args:
		path: Path value taken from the source column; may be null.

	Returns:
		The last element of ``os.path.split(path)``, or ``""`` when ``path`` is
		null or is not something ``os.path.split`` can handle.
	"""
	if path is None:
		return ""
	try:
		return os.path.split(path)[-1]
	except (TypeError, ValueError):
		# Non path-like values keep the original best-effort contract (empty
		# name) without hiding unrelated errors behind a bare `except`.
		return ""

@udf(returnType=StringType())
def get_correct_path(path, sub, name):
	"""Build the normalized on-disk location for an image.

	Args:
		path: Currently stored image path. It is accepted for interface
			compatibility but does not influence the result.
		sub: Subreddit name, used as the folder under the images root.
		name: Image file name.

	Returns:
		``os.path.join("D:", os.sep, "data", "images", sub, name)``.

	Raises:
		TypeError: If ``sub`` or ``name`` is null (raised by ``os.path.join``).
	"""
	# The previous implementation branched on whether `path` contained
	# "reddit_images" or "raw_reddit_images", but every branch built this exact
	# path, and the second test was unreachable because "reddit_images" is a
	# substring of "raw_reddit_images". One return states the real behavior.
	# NOTE(review): if raw images were meant to map to a different root, that
	# mapping was never implemented -- confirm the intended layout.
	return os.path.join("D:", os.sep, "data", "images", sub, name)

@udf(returnType=BooleanType())
def check_if_exists(path):
	"""Report whether ``path`` exists on the local filesystem.

	Args:
		path: Filesystem path from the source column; may be null.

	Returns:
		``True`` when ``os.path.exists(path)`` is true, ``False`` otherwise,
		including when ``path`` is null.
	"""
	if path is None:
		# os.path.exists raises TypeError for None, which would fail the whole
		# Spark job; a null path simply does not exist.
		return False
	return os.path.exists(path)

@udf(returnType=BooleanType())
def check_if_curated(curated):
	"""Normalize the stored ``curated`` flag to a real boolean.

	Args:
		curated: The stored flag. It may be a boolean, the string ``"True"``
			(or any other string), or null.

	Returns:
		``True`` for boolean ``True`` or the exact string ``"True"``; ``False``
		for everything else, including null.
	"""
	if isinstance(curated, bool):
		# Previously a genuine boolean True fell through the string comparison
		# below and was silently turned into False.
		return curated
	return curated == "True"

@udf(returnType=StringType())
def get_thumbnail_path(path:str, sub:str, name:str):
	"""Resolve the thumbnail path for a record, keeping only paths that exist.

	Args:
		path: Stored thumbnail path; may be null, empty or a placeholder.
		sub: Subreddit name, used as the folder under the images root.
		name: Image file name.

	Returns:
		``path`` itself when it is at least five characters long and exists on
		disk. When ``path`` is null or shorter than that, the first existing
		candidate under the subreddit's thumbnail folder. ``""`` when nothing
		is found or the candidate cannot be built.
	"""
	# "", "None" and "NaN" are all shorter than five characters, so the length
	# test covers every placeholder the original condition listed.
	if path is not None and len(path) >= 5:
		return path if os.path.exists(path) else ""

	if sub is None or name is None:
		# os.path.join would raise TypeError; there is nothing to look up.
		return ""

	# "thumbnails" is the folder originally probed and stays first so existing
	# results do not change. "thumbnail" is added because the small_image values
	# displayed for this table live under a "thumbnail" folder.
	for folder in ("thumbnails", "thumbnail"):
		candidate = os.path.join("D:", os.sep, "data", "images", sub, folder, name)
		if os.path.exists(candidate):
			return candidate
	return ""

In [None]:
def update(thing):
	"""Print a banner for ``thing`` and upsert it into the "training" table through ``table_broker``."""
	banner = f"=== Updating record: {thing} ==="
	print(banner)
	table_broker.upsert_entity_to_table("training", thing)

In [3]:
# Fetch every entity of the table and load it into Spark with the shared schema.
table_name = "training"
table_adapter: TableAdapter = TableAdapter()
raw_data = table_adapter.get_all_entities(table_name)
spark_df = spark.createDataFrame(raw_data, schema=image_table_schema)

# Preview frame only: "image" is replaced by its bare file name.
named_frame = (
	spark_df
	.withColumn("image_name", get_missing_name("image"))
	.drop("image")
	.withColumnRenamed("image_name", "image")
)

cloud_record_count = named_frame.count()
print(f"=== Total records From Cloud: {cloud_record_count} ===")
preview_rows = named_frame.limit(10).toPandas()
display(preview_rows)

=== Total records From Cloud: 8331 ===


Unnamed: 0,PartitionKey,RowKey,text,id,author,url,flair,permalink,hash,subreddit,caption,exists,image,updated_caption,small_image,curated
0,training,1000cej,New York in the fog,1000cej,OtterlyFoxy,https://i.redd.it/4emw5uldib9a1.jpg,,/r/CityPorn/comments/1000cej/new_york_in_the_fog/,7a8d96e378c15c8ab8440ac311f12c11,CityPorn,A person is riding a bicycle on a city street.,True,4emw5uldib9a1.jpg,a city street filled with lots of tall buildings,D:\data\images\CityPorn\thumbnail\4emw5uldib9a...,False
1,training,1000d16,Thoughts about my NYE party outfit?,1000d16,princessxo699,https://i.imgur.com/GgFEagO.jpg,Outfit of the Day,/r/SFWNextDoorGirls/comments/1000d16/thoughts_...,9951b4f82caeb8ba2bd9f79f8d422450,SFWNextDoorGirls,A little girl in a pink dress is holding a pin...,True,GgFEagO.jpg,a woman sitting on a table holding a pink flower,D:\data\images\SFWNextDoorGirls\thumbnail\GgFE...,False
2,training,1000fg0,(IKTR),1000fg0,BlkBrd1312,https://i.redd.it/nwa7hts2jb9a1.jpg,HotGirlNextDoor,/r/HotGirlNextDoor/comments/1000fg0/iktr/,be8dd55e34216bec1e15e03fa296eacc,HotGirlNextDoor,A woman in a pink bathing suit holds a baby in...,True,nwa7hts2jb9a1.jpg,a woman in a bathing suit with a cell phone in...,D:\data\images\HotGirlNextDoor\thumbnail\nwa7h...,False
3,training,1000glf,Just looking for entertainment,1000glf,toolate_sharkbait,https://i.redd.it/4xyb1vgbjb9a1.jpg,Female (18+),/r/AmIhotAF/comments/1000glf/just_looking_for_...,e554c1ed7ffa2740436ac082068b2824,AmIhotAF,A woman in a pink dress is smiling at the camera.,True,4xyb1vgbjb9a1.jpg,a beautiful young woman in a black dress posin...,D:\data\images\AmIhotAF\thumbnail\4xyb1vgbjb9a...,False
4,training,1000j1n,Anon wants Elon cut,1000j1n,trent8051,https://i.redd.it/3mewbe0wjb9a1.jpg,,/r/greentext/comments/1000j1n/anon_wants_elon_...,1dec3dabb5e46cde01855d06089c287a,greentext,A man and a woman are sitting at a table with ...,True,3mewbe0wjb9a1.jpg,a collage of photos showing a man in a suit an...,D:\data\images\greentext\thumbnail\3mewbe0wjb9...,False
5,training,1000qiq,Who’s going to be my new years kiss tonight? 🤧,1000qiq,fireybubbulegum,https://i.redd.it/0mgrt95plb9a1.jpg,,/r/SFWRedheads/comments/1000qiq/whos_going_to_...,7b0be38a08e3316051c6e5a3753bc791,SFWRedheads,A woman in a red dress is talking to a man in ...,False,0mgrt95plb9a1.jpg,,,False
6,training,1001fon,honk if you like my sweater 😁💕,1001fon,glam_pire,https://i.redd.it/gp4fhsdorb9a1.jpg,,/r/SFWRedheads/comments/1001fon/honk_if_you_li...,feed678c3b74ed1d0b7011f28e8f283f,SFWRedheads,A woman in a red dress is talking to a man in ...,False,gp4fhsdorb9a1.jpg,,,False
7,training,1001ghp,Braids,1001ghp,eromorphic,https://i.redd.it/v3ra9g4vrb9a1.jpg,,/r/sfwpetite/comments/1001ghp/braids/,16ca1536c4f6936d3e994df7cb40ab8e,sfwpetite,A girl in a pink bathing suit walks down the b...,True,v3ra9g4vrb9a1.jpg,a beautiful young woman in a bikini posing for...,D:\data\images\sfwpetite\thumbnail\v3ra9g4vrb9...,False
8,training,1001ih5,New Year's Elf,1001ih5,ms_twine_grace,https://i.redd.it/bhigr5rbsb9a1.jpg,,/r/SFWNextDoorGirls/comments/1001ih5/new_years...,9ff18e9afb7b566c0d50add45cbfb989,SFWNextDoorGirls,A little girl in a pink dress is dancing on a ...,False,bhigr5rbsb9a1.jpg,,,False
9,training,1001kp4,Happy New Years Eve,1001kp4,Hotblondie69420,https://i.redd.it/dn7lcclvsb9a1.jpg,Female (18+),/r/AmIhotAF/comments/1001kp4/happy_new_years_eve/,7137ef4fe19a97c757553bfa62b23a33,AmIhotAF,A woman in a white dress is talking to a man i...,False,dn7lcclvsb9a1.jpg,,,False


In [4]:
print("=== Starting Path Normalization ===")

# Each UDF result is written to a temporary "v"-prefixed column so the source
# columns stay readable while the remaining UDFs run.
# NOTE(review): check_if_exists is given the original "image" column, not the
# normalized "vimage" -- confirm which path the "exists" flag should describe.
# NOTE(review): this reads spark_df (not named_frame), so it relies on the
# schema already providing an "image_name" column -- confirm.
out_data = (
	spark_df
	.withColumn("vimage", get_correct_path("image", "subreddit", "image_name"))
	.withColumn("vexists", check_if_exists("image"))
	.withColumn("vcurated", check_if_curated("curated"))
	.withColumn("vsmall_image", get_thumbnail_path("small_image", "subreddit", "image_name"))
)

# Run the UDFs and bring every row back to the driver.
rows = out_data.rdd.collect()

print("=== After Normalization ===")

=== Starting Path Normalization ===
=== After Normalization ===


In [5]:
# Column order of the frame produced by this cell.
ordered_columns = [
	"PartitionKey", "RowKey", "image", "text", "id", "author", "url", "flair",
	"permalink", "hash", "subreddit", "caption", "exists", "image_name",
	"updated_caption", "small_image", "curated",
]

# Rebuild a frame from the collected rows, discard the pre-normalization columns
# and let the "v"-prefixed results take over their names.
out_data_1 = (
	spark.createDataFrame(rows)
	.drop("image", "exists", "curated", "small_image")
	.withColumnRenamed("vimage", "image")
	.withColumnRenamed("vexists", "exists")
	.withColumnRenamed("vcurated", "curated")
	.withColumnRenamed("vsmall_image", "small_image")
	.select(*ordered_columns)
)

In [6]:
# Re-apply the shared table schema to the normalized rows.
final_df = spark.createDataFrame(out_data_1.collect(), schema=image_table_schema)

# Count once: every count() is a separate Spark job, and the loop previously
# triggered one per record.
total_records = final_df.count()
print(f"=== Total records to update: {total_records} ===")
for i, record in enumerate(final_df.toPandas().to_dict(orient='records')):
	# Progress line every 100 records. The former test `i % 100 == count` could
	# never be true for a frame of 100+ rows, so nothing was reported.
	if i % 100 == 0:
		print(f"Updating record {i} of {total_records} ")
	try:
		table_broker.upsert_entity_to_table(table_name, record)
	except Exception as error:
		# `Exception` rather than a bare except, so KeyboardInterrupt can still
		# stop a long run; the error is shown so failures can be diagnosed.
		print(f"=== Failed to update record: {record} ===")
		print(f"=== Reason: {error!r} ===")
		# Start the next record with a fresh client, as before.
		table_broker = TableAdapter()

=== After Normalization ===


Unnamed: 0,PartitionKey,RowKey,image,text,id,author,url,flair,permalink,hash,subreddit,caption,exists,image_name,updated_caption,small_image,curated
0,training,10hbca4,D:\data\images\SFWRedheads\aq6c0qxfjbda1.jpg,Sticking her tongue out,10hbca4,HonestRunner,https://i.redd.it/aq6c0qxfjbda1.jpg,,/r/SFWRedheads/comments/10hbca4/sticking_her_t...,22373eb7333862b14bdd1d5e8ee40881,SFWRedheads,A girl in a pink dress is smiling.,True,aq6c0qxfjbda1.jpg,a woman in a black dress is looking at the camera,D:\data\images\SFWRedheads\thumbnail\aq6c0qxfj...,False
1,training,10hcdl6,D:\data\images\SFWRedheads\9e6udfft7ada1.jpg,Ready for the weekend :),10hcdl6,lynxicat,https://i.redd.it/9e6udfft7ada1.jpg,,/r/SFWRedheads/comments/10hcdl6/ready_for_the_...,b7d58c29b39053d29cf6d9bbca90bec2,SFWRedheads,A girl in a pink shirt and blue jeans is posin...,True,9e6udfft7ada1.jpg,a woman in a red shirt with a red bow tie,D:\data\images\SFWRedheads\thumbnail\9e6udfft7...,False
2,training,10hcg26,D:\data\images\SFWRedheads\zf3ewwy1qbda1.jpg,The temp just realized it’s January 🙄,10hcg26,ivymiaexplicit,https://i.redd.it/zf3ewwy1qbda1.jpg,,/r/SFWRedheads/comments/10hcg26/the_temp_just_...,da2aa8e3bf15e09384b376cb330137c2,SFWRedheads,A girl in a pink shirt and blue jeans is stand...,False,zf3ewwy1qbda1.jpg,,,False
3,training,10hcnni,D:\data\images\SFWRedheads\97bq87rarbda1.jpg,Do you like my red hair?,10hcnni,Jolenebbyxo,https://i.redd.it/97bq87rarbda1.jpg,,/r/SFWRedheads/comments/10hcnni/do_you_like_my...,c22d3984139de3e6b1c962f7076b91db,SFWRedheads,A girl in a pink dress is talking to a girl in...,False,97bq87rarbda1.jpg,,,False
4,training,10hdbuk,D:\data\images\SFWRedheads\lsyz6i9fwbda1.jpg,I'm back follow me❤️ #red #redhead #redheads #...,10hdbuk,Ashely6ginger,https://i.redd.it/lsyz6i9fwbda1.jpg,,/r/SFWRedheads/comments/10hdbuk/im_back_follow...,6c33f940b210e48183e5e9d49a6ce0b4,SFWRedheads,A woman in a pink shirt and blue jeans is talk...,False,lsyz6i9fwbda1.jpg,,,False
5,training,10hdj0i,D:\data\images\greentext\2h10z58igada1.jpg,/tv/anon watches a 2023 show,10hdj0i,AlgerianBeerEnjoyer,https://i.redd.it/2h10z58igada1.jpg,,/r/greentext/comments/10hdj0i/tvanon_watches_a...,371293dcc94c65310ad6024c8d905eb1,greentext,,True,2h10z58igada1.jpg,,D:\data\images\greentext\thumbnail\2h10z58igad...,False
6,training,10hfrpn,D:\data\images\SFWRedheads\s0zuspu5hcda1.jpg,me and my birthday cake! i made it and I'm so ...,10hfrpn,miremi137,https://i.redd.it/s0zuspu5hcda1.jpg,,/r/SFWRedheads/comments/10hfrpn/me_and_my_birt...,fbbe1dec20768ebd858df1b3f3701174,SFWRedheads,A girl in a pink dress is standing in front of...,True,s0zuspu5hcda1.jpg,a woman in a pink dress holding a pink cake,D:\data\images\SFWRedheads\thumbnail\s0zuspu5h...,False
7,training,10hg2vh,D:\data\images\greentext\opa93o8d2bda1.jpg,Anon is from the future,10hg2vh,Ironzombie39,https://i.redd.it/opa93o8d2bda1.jpg,,/r/greentext/comments/10hg2vh/anon_is_from_the...,fc2196dcc4e698afbec39912918bfae4,greentext,,True,opa93o8d2bda1.jpg,,D:\data\images\greentext\thumbnail\opa93o8d2bd...,False
8,training,10hgca6,D:\data\images\SFWRedheads\884zlhw9mcda1.jpg,Laid back,10hgca6,HonestRunner,https://i.redd.it/884zlhw9mcda1.jpg,,/r/SFWRedheads/comments/10hgca6/laid_back/,113ee7ff496236268c01507712513ec1,SFWRedheads,A girl in a pink dress is smiling.,True,884zlhw9mcda1.jpg,a beautiful young woman sitting on a bed,D:\data\images\SFWRedheads\thumbnail\884zlhw9m...,False
9,training,10hgnol,D:\data\images\AmIhotAF\da9WDxth.jpg,Finally 18y and Daddy threw a party😇🍭,10hgnol,rosehamil,https://i.imgur.com/da9WDxth.jpg,Female (18+),/r/AmIhotAF/comments/10hgnol/finally_18y_and_d...,8883026a4acc5f77078c7e6e005c06f3,AmIhotAF,A woman in a blue bathing suit holds a baby in...,False,da9WDxth.jpg,,,False


=== Total records to update: 8331 ===
Updating record 0 of 8331
Updating record 100 of 8331
Updating record 200 of 8331
=== Failed to update record: {'PartitionKey': 'training', 'RowKey': 'yx88j6', 'image': 'D:\\data\\images\\AmIhotAF\\6yr4enoojf0a1.jpg', 'text': 'F24 I’m down for hook up I do charge add snap stephnie_clarke', 'id': 'yx88j6', 'author': 'Stephanieclarke-555', 'url': 'https://i.redd.it/6yr4enoojf0a1.jpg', 'flair': 'Female (18+)', 'permalink': '/r/AmIhotAF/comments/yx88j6/f24_im_down_for_hook_up_i_do_charge_add_snap/', 'hash': '9e40cc65f8029ccba411c1f77bcdaa5f', 'subreddit': 'AmIhotAF', 'caption': 'A woman in a red dress is posing for a picture with her hair in a ponytail.', 'exists': True, 'image_name': '6yr4enoojf0a1.jpg', 'updated_caption': 'a woman in a pink dress with a tattoo on her arm', 'small_image': 'D:\\data\\images\\AmIhotAF\\thumbnail\\6yr4enoojf0a1.jpg', 'curated': False} ===
Updating record 300 of 8331
Updating record 400 of 8331
Updating record 500 of 8331

In [9]:
# Final status line for the run.
print("=== Normalization Of Paths Complete ===")
# Parquet export is disabled; `writable_df` is not defined in any cell shown here.
# print(f"Total records to write: {writable_df.count()}")
# display(writable_df.limit(10).toPandas())
# writable_df.write.parquet("D:\\data\\processed\\reddit_images.parquet", mode="overwrite")

Total records to write: 8331


Unnamed: 0,PartitionKey,RowKey,image,text,id,author,url,flair,permalink,hash,subreddit,caption,exists,image_name,updated_caption,small_image,curated
0,training,10hbca4,D:\data\images\SFWRedheads\aq6c0qxfjbda1.jpg,Sticking her tongue out,10hbca4,HonestRunner,https://i.redd.it/aq6c0qxfjbda1.jpg,,/r/SFWRedheads/comments/10hbca4/sticking_her_t...,22373eb7333862b14bdd1d5e8ee40881,SFWRedheads,A girl in a pink dress is smiling.,True,aq6c0qxfjbda1.jpg,a woman in a black dress is looking at the camera,D:\data\images\SFWRedheads\thumbnail\aq6c0qxfj...,False
1,training,10hcdl6,D:\data\images\SFWRedheads\9e6udfft7ada1.jpg,Ready for the weekend :),10hcdl6,lynxicat,https://i.redd.it/9e6udfft7ada1.jpg,,/r/SFWRedheads/comments/10hcdl6/ready_for_the_...,b7d58c29b39053d29cf6d9bbca90bec2,SFWRedheads,A girl in a pink shirt and blue jeans is posin...,True,9e6udfft7ada1.jpg,a woman in a red shirt with a red bow tie,D:\data\images\SFWRedheads\thumbnail\9e6udfft7...,False
2,training,10hcg26,D:\data\images\SFWRedheads\zf3ewwy1qbda1.jpg,The temp just realized it’s January 🙄,10hcg26,ivymiaexplicit,https://i.redd.it/zf3ewwy1qbda1.jpg,,/r/SFWRedheads/comments/10hcg26/the_temp_just_...,da2aa8e3bf15e09384b376cb330137c2,SFWRedheads,A girl in a pink shirt and blue jeans is stand...,False,zf3ewwy1qbda1.jpg,,,False
3,training,10hcnni,D:\data\images\SFWRedheads\97bq87rarbda1.jpg,Do you like my red hair?,10hcnni,Jolenebbyxo,https://i.redd.it/97bq87rarbda1.jpg,,/r/SFWRedheads/comments/10hcnni/do_you_like_my...,c22d3984139de3e6b1c962f7076b91db,SFWRedheads,A girl in a pink dress is talking to a girl in...,False,97bq87rarbda1.jpg,,,False
4,training,10hdbuk,D:\data\images\SFWRedheads\lsyz6i9fwbda1.jpg,I'm back follow me❤️ #red #redhead #redheads #...,10hdbuk,Ashely6ginger,https://i.redd.it/lsyz6i9fwbda1.jpg,,/r/SFWRedheads/comments/10hdbuk/im_back_follow...,6c33f940b210e48183e5e9d49a6ce0b4,SFWRedheads,A woman in a pink shirt and blue jeans is talk...,False,lsyz6i9fwbda1.jpg,,,False
5,training,10hdj0i,D:\data\images\greentext\2h10z58igada1.jpg,/tv/anon watches a 2023 show,10hdj0i,AlgerianBeerEnjoyer,https://i.redd.it/2h10z58igada1.jpg,,/r/greentext/comments/10hdj0i/tvanon_watches_a...,371293dcc94c65310ad6024c8d905eb1,greentext,,True,2h10z58igada1.jpg,,D:\data\images\greentext\thumbnail\2h10z58igad...,False
6,training,10hfrpn,D:\data\images\SFWRedheads\s0zuspu5hcda1.jpg,me and my birthday cake! i made it and I'm so ...,10hfrpn,miremi137,https://i.redd.it/s0zuspu5hcda1.jpg,,/r/SFWRedheads/comments/10hfrpn/me_and_my_birt...,fbbe1dec20768ebd858df1b3f3701174,SFWRedheads,A girl in a pink dress is standing in front of...,True,s0zuspu5hcda1.jpg,a woman in a pink dress holding a pink cake,D:\data\images\SFWRedheads\thumbnail\s0zuspu5h...,False
7,training,10hg2vh,D:\data\images\greentext\opa93o8d2bda1.jpg,Anon is from the future,10hg2vh,Ironzombie39,https://i.redd.it/opa93o8d2bda1.jpg,,/r/greentext/comments/10hg2vh/anon_is_from_the...,fc2196dcc4e698afbec39912918bfae4,greentext,,True,opa93o8d2bda1.jpg,,D:\data\images\greentext\thumbnail\opa93o8d2bd...,False
8,training,10hgca6,D:\data\images\SFWRedheads\884zlhw9mcda1.jpg,Laid back,10hgca6,HonestRunner,https://i.redd.it/884zlhw9mcda1.jpg,,/r/SFWRedheads/comments/10hgca6/laid_back/,113ee7ff496236268c01507712513ec1,SFWRedheads,A girl in a pink dress is smiling.,True,884zlhw9mcda1.jpg,a beautiful young woman sitting on a bed,D:\data\images\SFWRedheads\thumbnail\884zlhw9m...,False
9,training,10hgnol,D:\data\images\AmIhotAF\da9WDxth.jpg,Finally 18y and Daddy threw a party😇🍭,10hgnol,rosehamil,https://i.imgur.com/da9WDxth.jpg,Female (18+),/r/AmIhotAF/comments/10hgnol/finally_18y_and_d...,8883026a4acc5f77078c7e6e005c06f3,AmIhotAF,A woman in a blue bathing suit holds a baby in...,False,da9WDxth.jpg,,,False


In [11]:
# processed = spark.read.parquet("D:\\data\\processed\\reddit_images.parquet")
# print(f"=== Total written to Parquet: {processed.count()} for D:\\data\\processed\\reddit_images.parquet ===")

=== Total written to Parquet: 8331 for D:\data\processed\reddit_images.parquet ===
