In [0]:
# 1. Resolve the storage URL of each Unity Catalog external location.
# NOTE(review): the location names carry a fixed "dev-" prefix; the `env`
# widget is only read in a later cell, so it is not applied here.
def _external_location_url(location_name):
    """Return the `url` column of DESCRIBE EXTERNAL LOCATION for `location_name`."""
    described = spark.sql(f"describe external location `{location_name}`")
    # First row, first selected column -> the location's URL.
    return described.select("url").collect()[0][0]


checkpoint = _external_location_url("dev-checkpoints")
landing = _external_location_url("dev-landing")
bronze = _external_location_url("dev-bronze")
silver = _external_location_url("dev-silver")
gold = _external_location_url("dev-gold")

In [0]:
# Read the target environment from a notebook widget. The value is used to
# build the Unity Catalog catalog name (`<env>_catalog`) in the cells below.
dbutils.widgets.text(name="env", defaultValue='', label='Enter the environment in lower case')

# Normalise the input so stray whitespace or upper-case letters still resolve
# to the intended catalog (the widget label asks for lower case).
env = dbutils.widgets.get("env").strip().lower()

# The widget defaults to an empty string, which would silently yield the
# catalog name "_catalog"; fail here with an explicit message instead.
if not env:
    raise ValueError(
        "Widget 'env' is empty; enter the environment (e.g. 'dev') before running this notebook."
    )



In [0]:
# Unity Catalog naming for this notebook: the catalog is derived from the
# `env` widget value, the schema and table names are fixed.
catalog_name = "{}_catalog".format(env)
silver_schema, gold_schema = "silver", "gold"
gold_table_name = "reddit_gold"

In [0]:
from pyspark.sql.functions import avg, count

# Batch reader for tables in the silver schema
def read_silver_table(environment, table_name):
    """Load a table from the silver schema as a batch DataFrame.

    Args:
        environment: Environment prefix; the catalog read is
            ``<environment>_catalog``.
        table_name: Name of the table inside the ``silver`` schema.

    Returns:
        The DataFrame returned by ``spark.read.table`` for the
        backtick-quoted three-part table name.
    """
    quoted_parts = (f"{environment}_catalog", "silver", table_name)
    qualified_name = ".".join(f"`{part}`" for part in quoted_parts)

    print(f' Reading {qualified_name} (Batch Mode)...')
    silver_frame = spark.read.table(qualified_name)
    print(f' Read {qualified_name} Success!')

    return silver_frame

# Load the cleaned Reddit posts from the silver schema for the selected environment.
silver_posts_table = "cleaned_redditposts"
df_silver = read_silver_table(env, silver_posts_table)

# Remove rows that are duplicated across all columns, then substitute
# placeholders for null values in the title and author columns.
null_placeholders = {"title": "Unknown", "author": "Anonymous"}
df_gold = df_silver.dropDuplicates().fillna(null_placeholders)
print(" Data Cleaning Done")





In [0]:
# Aggregate Sentiment

# Per subreddit: mean of each sentiment score and the count of non-null ids.
subreddit_metrics = [
    avg("positive_score").alias("avg_positive"),
    avg("neutral_score").alias("avg_neutral"),
    avg("negative_score").alias("avg_negative"),
    count("id").alias("post_count"),
]
df_subreddit_sentiment = df_gold.groupBy("subreddit").agg(*subreddit_metrics)
print(" Aggregated Sentiment by Subreddit")

# Per author: mean compound score and the count of non-null ids.
author_metrics = [
    avg("compound_score").alias("avg_compound_score"),
    count("id").alias("total_posts"),
]
df_author_sentiment = df_gold.groupBy("author").agg(*author_metrics)
print(" Aggregated Sentiment by Author")

In [0]:
# Persist the cleaned DataFrame as a Delta table in the gold schema,
# replacing any existing contents of that table.
gold_name_parts = (f"{env}_catalog", "gold", gold_table_name)
gold_table_path = ".".join(f"`{part}`" for part in gold_name_parts)

gold_writer = df_gold.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(gold_table_path)

print(f'Gold Table Saved: {gold_table_path}')
print("Gold Processing Completed!")


In [0]:
# Preview the gold table written above. The query uses gold_table_path so it
# follows the `env` widget instead of always reading from dev_catalog.
display(spark.sql(f"select * from {gold_table_path}"))

In [0]:
# Row count of the gold table for the selected environment. The query uses
# gold_table_path rather than a hard-coded dev_catalog so it matches the
# table that the write cell actually produced.
display(spark.sql(f"select count(*) from {gold_table_path}"))