In [1]:
from pyspark.sql import SparkSession
from vars import *
from datetime import date
from functions import flatten_json, loadConfigs
from pyspark.sql.functions import lit
from pyspark.sql.functions import col,explode

# Settings applied to the session builder, in this order:
#   * s3a credentials are resolved by SimpleAWSCredentialsProvider
#   * the PostgreSQL JDBC driver jar is shipped with the session
_spark_settings = {
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    "spark.jars": "/jars/postgresql-42.2.5.jar",
}

_session_builder = SparkSession.builder
for _setting_key, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_key, _setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = _session_builder.getOrCreate()

# Project helper from functions.py that receives the SparkContext.
# NOTE(review): presumably sets the s3a endpoint/keys on the Hadoop conf — confirm in functions.py.
loadConfigs(spark.sparkContext)

# Inject a CSS rule so <pre> output keeps its whitespace and is not wrapped;
# wide text tables printed later in the notebook then stay column-aligned.
# `HTML` and `display` are imported from the public `IPython.display` module
# (rather than the internal `IPython.core.display`), and `display` is imported
# explicitly instead of relying on the name being injected by the kernel.
from IPython.display import HTML, display
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# Snapshot date (YYYYMMDD) that selects which raw dump is read and how the
# output is labelled. The notebook is currently pinned to a fixed snapshot;
# set `run_date_override = None` to process the dump for the current day.
# Previously `today` was computed from the clock and then unconditionally
# overwritten on the next line, so the first assignment was dead code.
run_date_override = 20230326

# Both branches yield an int so the type of the `dateid` column written later
# does not flip between string and integer depending on which branch is used.
# NOTE(review): confirm downstream consumers expect an integer dateid.
today = (
    run_date_override
    if run_date_override is not None
    else int(date.today().strftime('%Y%m%d'))
)

# Base name used for the processed output folder and file prefix.
output_file = "author_flair_richtext"

In [3]:


# Read the raw JSON dump for the selected date from the bucket named by
# `minio_bucket` (not defined in this notebook; presumably from `vars`).
# The former `.option("header", "true")` was dropped: "header" is a CSV reader
# option and has no effect on the JSON reader, so it only misled readers.
df_raw = spark.read.json(f"s3a://{minio_bucket}/raw/popular_{today}.json")

In [4]:
# The payload nests its records under data -> children[] -> data.
# Explode that array into one row per record, then promote the record's
# struct fields to top-level columns.
# NOTE: this rebinds `df_raw`, so the cell cannot be re-run without first
# re-running the load cell above.
df_raw = (
    df_raw
    .select(explode(df_raw["data"]["children"]["data"]).alias("data"))
    .select("data.*")
)

In [5]:
# Keep only the columns needed for the flair table; the record `id` is
# exposed as `post_id`.
df_author_flair_richtext = df_raw.select(
    col("id").alias("post_id"),
    "author",
    "author_flair_richtext",
)

In [6]:
# One output row per element of the `author_flair_richtext` array.
# `explode` emits nothing for null or empty arrays, so records without flair
# elements do not appear in the result.
df_author_flair_exploded = df_author_flair_richtext.select(
    "post_id",
    "author",
    explode(col("author_flair_richtext")).alias("author_flair_richtext"),
)

In [7]:
# Expand the flair element struct into individual columns next to the
# existing ones, then discard the original struct column and its `u` field.
df_author_flair_cleaned = (
    df_author_flair_exploded
    .select("*", "author_flair_richtext.*")
    .drop("author_flair_richtext", "u")
)

In [8]:
# Replace the single-letter struct field names with descriptive column names.
_flair_column_names = {
    "a": "additional_attributes",
    "e": "type",
    "t": "text",
}

df_author_flair_renamed = df_author_flair_cleaned
for _short_name, _descriptive_name in _flair_column_names.items():
    df_author_flair_renamed = df_author_flair_renamed.withColumnRenamed(_short_name, _descriptive_name)

In [9]:
# Remove rows that are identical across every column.
df_final = df_author_flair_renamed.distinct()

In [10]:
# Stamp every row with the snapshot date as a constant `dateid` column.
_dateid_literal = lit(today)
df_final = df_final.withColumn("dateid", _dateid_literal)

In [11]:
# Preview the result: first 20 rows, long cell values truncated.
df_final.show(n=20, truncate=True)

+-------+------------------+---------------------+-----+--------------------+--------+
|post_id|            author|additional_attributes| type|                text|  dateid|
+-------+------------------+---------------------+-----+--------------------+--------+
|11isfra|            elch3w|                 null| text|       MAYMAYMAKERS |20230326|
|1225o1s|      gandalf45435|              :natsm:|emoji|                null|20230326|
|12251en|            cbbBot|               :rcbb:|emoji|                null|20230326|
|12251en|            cbbBot|                 null| text| /r/CollegeBasket...|20230326|
|1223h92|MarvelsGrantMan136|                 null| text|r/Movies contributor|20230326|
|12251en|            cbbBot|               :ncaa:|emoji|                null|20230326|
|1228spi|      DaFunkJunkie|           :verified:|emoji|                null|20230326|
|1225o1s|      gandalf45435|                 null| text| Dyrus Microwave ...|20230326|
|12251en|            cbbBot|               

In [12]:
# Persist the table as parquet under processed/<output_file>/<output_file>_<date>.
# "overwrite" replaces any existing output at that path.
_processed_path = f"s3a://{minio_bucket}/processed/{output_file}/{output_file}_{today}"
df_final.write.mode("overwrite").parquet(_processed_path)