In [1]:
from pyspark.sql import SparkSession
from vars import *
from datetime import date
from functions import flatten_json, loadConfigs
from pyspark.sql.functions import lit
from pyspark.sql.functions import col,explode

# Create (or reuse) the Spark session. The S3A connector is pointed at the
# simple access-key/secret-key credentials provider, and the PostgreSQL JDBC
# driver jar is put on the classpath.
spark = (
    SparkSession.builder
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.jars", "/jars/postgresql-42.2.5.jar")
    .getOrCreate()
)

# Project helper applied to the SparkContext.
# NOTE(review): presumably sets the S3/MinIO endpoint and keys - confirm in functions.py.
loadConfigs(spark.sparkContext)

# Inject a CSS rule so <pre> blocks do not wrap; wide text tables printed by
# later cells then stay on one line per row instead of folding.
# `HTML` and `display` come from the public `IPython.display` module, so the
# cell does not rely on the internal `IPython.core.display` path or on the
# `display` global that the kernel injects.
from IPython.display import HTML, display
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# Processing date in YYYYMMDD form. It is interpolated into the input and
# output paths and stored as a literal `dateid` column further down.
#
# RUN_DATE_OVERRIDE pins the run to one specific day; set it to None to process
# the current date instead. `today` is an int on both paths so the type of the
# `dateid` column does not depend on whether the override is active.
RUN_DATE_OVERRIDE = 20230326
today = RUN_DATE_OVERRIDE if RUN_DATE_OVERRIDE is not None else int(date.today().strftime('%Y%m%d'))

# Name used for both the output directory and the file-name prefix.
output_file = "gildings"

In [3]:
# Load the raw dump for the processing date from the `raw/` prefix of the bucket.
# The CSV-only `header` option is omitted: it has no effect on the JSON reader.
df_raw = spark.read.json(f"s3a://{minio_bucket}/raw/popular_{today}.json")

In [4]:
# Turn the nested listing into one row per child entry: explode the array at
# data.children.data, then promote every field of the resulting struct to a
# top-level column.
# This cell rebinds `df_raw`, so re-run the load cell first before running it
# again (the exploded frame no longer has a `data.children` field).
df_raw = (
    df_raw
    .select(explode(col("data.children.data")).alias("data"))
    .select("data.*")
)

In [5]:
# Keep only the gilding-related fields; `id` is exposed as `post_id`.
df_gildings = df_raw.select(
    col("id").alias("post_id"),
    "author",
    "gilded",
    "gildings",
)

In [6]:
# Inspect the schema to see which fields the `gildings` struct carries before flattening it.
df_gildings.printSchema()

root
 |-- post_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: struct (nullable = true)
 |    |-- gid_1: long (nullable = true)
 |    |-- gid_2: long (nullable = true)
 |    |-- gid_3: long (nullable = true)



In [7]:
# Flatten the `gildings` struct into top-level gid_* columns while keeping the
# original struct column next to them.
# The base columns are listed explicitly instead of using "*": this cell rebinds
# `df_gildings`, and with "*" every re-run would append another copy of the
# gid_* columns. With the explicit list the cell yields the same seven columns
# no matter how many times it is executed.
df_gildings = df_gildings.select("post_id", "author", "gilded", "gildings", "gildings.*")

In [8]:
# Replace the opaque gid_N column names with descriptive ones.
gild_column_names = {
    "gid_1": "gild_silver",
    "gid_2": "gild_gold",
    "gid_3": "gild_platinum",
}

df_renamed = df_gildings
for gid_name, descriptive_name in gild_column_names.items():
    # withColumnRenamed is a no-op when the source column is absent.
    df_renamed = df_renamed.withColumnRenamed(gid_name, descriptive_name)

In [9]:
# Collapse rows that are identical across every column.
df_final = df_renamed.distinct()

In [10]:
# Stamp every row with the processing date as a constant `dateid` column.
# withColumn replaces a same-named column, so re-running this cell is safe.
df_final = df_final.withColumn("dateid", lit(today))

In [11]:
# Print a sample of the final frame as a visual check before it is written out.
df_final.show()

+-------+------------------+------+------------------+-----------+---------+-------------+--------+
|post_id|            author|gilded|          gildings|gild_silver|gild_gold|gild_platinum|  dateid|
+-------+------------------+------+------------------+-----------+---------+-------------+--------+
|1228spi|      DaFunkJunkie|     1|   {null, 1, null}|       null|        1|         null|20230326|
|122kctz|    Armpit_Penguin|     0|{null, null, null}|       null|     null|         null|20230326|
|11isfra|            elch3w|     0|{null, null, null}|       null|     null|         null|20230326|
|122gu9d|           newnemo|     0|{null, null, null}|       null|     null|         null|20230326|
|1225ct4|waitingforthesun92|     0|{null, null, null}|       null|     null|         null|20230326|
|122a253|      EatMoarPussy|     0|{null, null, null}|       null|     null|         null|20230326|
|122keu9|      King_DeandDe|     0|{null, null, null}|       null|     null|         null|20230326|


In [12]:
# Persist the result as Parquet under processed/<output_file>/<output_file>_<date>.
# Any existing output at that path is replaced.
processed_path = f"s3a://{minio_bucket}/processed/{output_file}/{output_file}_{today}"
df_final.write.parquet(processed_path, mode="overwrite")