In [1]:
from pyspark.sql import SparkSession
from vars import *
from datetime import date
from functions import flatten_json, loadConfigs
from pyspark.sql.functions import lit
from pyspark.sql.functions import col,explode

# Build (or reuse) the Spark session: S3A is pointed at SimpleAWSCredentialsProvider
# and the PostgreSQL JDBC driver jar is added via spark.jars.
spark = (
    SparkSession.builder
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.jars", "/jars/postgresql-42.2.5.jar")
    .getOrCreate()
)

# Project helper from `functions`; presumably applies the remaining storage
# settings to the Spark context -- TODO confirm against its definition.
loadConfigs(spark.sparkContext)

# Import from the public `IPython.display` API and bring `display` in explicitly,
# so this cell does not depend on IPython injecting `display` into builtins.
from IPython.display import HTML, display

# Stop <pre> blocks from wrapping in the rendered notebook
# (presumably so wide tabular output stays aligned).
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [26]:
# Date stamp (YYYYMMDD) that selects which raw dump is read and names the output.
# The run is currently pinned to a fixed day; the pin is kept as an explicit,
# string-typed override so `today` has one type and no assignment is dead.
# NOTE(review): set RUN_DATE_OVERRIDE to None to process the current day's dump.
RUN_DATE_OVERRIDE = "20230326"
today = RUN_DATE_OVERRIDE or date.today().strftime('%Y%m%d')

# Read the raw JSON dump for that day. The CSV-only "header" option is not
# passed because the JSON reader does not use it.
df_raw = spark.read.json(f"s3a://{minio_bucket}/raw/popular_{today}.json")

In [27]:
# One row per element of data.children, then promote the fields of each
# element's `data` struct to top-level columns.
children_data = df_raw["data"]["children"]["data"]
df_raw = df_raw.select(explode(children_data).alias("data")).select("data.*")

In [28]:
# Keep only the columns needed for the awardings table; the post's `id` is
# exposed as `post_id` (presumably to avoid clashing with the award's own
# fields once the award struct is flattened -- verify against the schema).
df_awardings = df_raw.select(
    col("id").alias("post_id"),
    "author",
    "all_awardings",
)

In [29]:
# One output row per element of `all_awardings`; post_id and author are
# repeated alongside each element.
df_awardings_exploded = df_awardings.select(
    "post_id",
    "author",
    explode(col("all_awardings")).alias("all_awardings"),
)

In [30]:
# Flatten the award struct: its fields become top-level columns next to the
# existing ones, and the original struct column is then removed.
df_awardings_cleaned = (
    df_awardings_exploded
    .select("*", "all_awardings.*")
    .drop("all_awardings")
)

In [31]:
# Award columns removed before the table is written.
columns_to_drop = [
    "resized_icons",
    "resized_static_icons",
    "icon_format",
    "icon_height",
    "icon_url",
    "icon_width",
    "static_icon_height",
    "static_icon_url",
    "static_icon_width",
    "sticky_duration_seconds",
]

In [33]:
# Drop the listed columns by exact name. DataFrame.drop ignores names that are
# not present in the schema, so no pre-filtering is needed. Exact matching avoids
# removing unrelated columns whose names merely contain one of the listed strings,
# and avoids a loop variable that shadows the imported `col` function.
df_final = df_awardings_cleaned.drop(*columns_to_drop)

In [34]:
# Remove rows that are identical across every column.
df_final = df_final.distinct()

In [35]:
# Persist the result as Parquet under a date-stamped path, replacing any
# existing output at that location.
output_path = f"s3a://{minio_bucket}/processed/all_awardings/all_awardings_{today}.parquet"
df_final.write.parquet(output_path, mode="overwrite")