In [0]:
from pyspark.sql import functions as F

In [0]:
# Source data for every aggregate in this notebook: the classified
# conversations table. The later cells read its `top_category` and `ISID` columns.
df_classified = spark.read.table("gold.conversations_classified")

In [0]:
# Number of rows per top-level category, most frequent category first.
# GroupedData.count() names its output column "count".
rows_per_category = df_classified.groupBy("top_category").count()
df_category_counts = rows_per_category.orderBy(F.col("count").desc())

display(df_category_counts)

In [0]:
# Total unique users.
# countDistinct ignores NULL ISIDs, so this denominator is counted the same way
# as the per-category numerator below. (select("ISID").distinct().count() would
# count a NULL ISID as one extra "user" and understate every percentage.)
# A global agg always yields exactly one row, so first() is never None here.
total_users = df_classified.agg(F.countDistinct("ISID")).first()[0]

# Share of all users, in percent. When there are no identified users at all the
# value is left NULL instead of dividing by zero (which errors under ANSI mode).
if total_users:
    penetration_ratio = F.col("user_count") / total_users * 100
else:
    penetration_ratio = F.lit(None)

# Users per category with penetration percentage, highest penetration first.
df_penetration = (
    df_classified
        .groupBy("top_category")
        .agg(
            F.countDistinct("ISID").alias("user_count")
        )
        .withColumn(
            "penetration_pct",
            penetration_ratio.cast("decimal(5,2)")
        )
        .orderBy(F.desc("penetration_pct"))
)

display(df_penetration)

In [0]:
# Headline metrics for the dashboard, one (Metric, Value) row each.
total_categories = df_category_counts.count()

# df_category_counts is sorted by descending count, so its first row is the most
# frequent category. first() returns None on an empty frame; guard against that
# instead of failing with "NoneType is not subscriptable".
top_category_row = df_category_counts.first()
most_popular_category = (
    top_category_row["top_category"] if top_category_row is not None else None
)

# The Value column holds both counts and a category name. Spark columns are
# single-typed, so the counts are rendered as strings and the schema is given
# explicitly rather than relying on inference over mixed int/str values.
summary_data = [
    ("Total Users", str(total_users)),
    ("Total Unique Categories", str(total_categories)),
    ("Most Popular Category", most_popular_category)
]

df_summary = spark.createDataFrame(summary_data, "Metric string, Value string")
display(df_summary)

In [0]:
# Manual reset, intentionally left disabled (presumably for when a table has to
# be rebuilt from scratch - confirm before enabling):
# spark.sql("DROP TABLE gold.dashboard_category_counts")
# spark.sql("DROP TABLE gold.dashboard_category_penetration")

# Persist the aggregates for the dashboard; each run overwrites the target table.
dashboard_tables = {
    "gold.dashboard_category_counts": df_category_counts,
    "gold.dashboard_category_penetration": df_penetration,
}
for table_name, aggregated_df in dashboard_tables.items():
    aggregated_df.write.mode("overwrite").saveAsTable(table_name)