In [1]:
import os
from pyspark.sql import SparkSession
from dotenv import load_dotenv
from pyspark.sql.functions import col

In [2]:
# Load environment variables
# The SNOWFLAKE_* settings read with os.getenv() later in this notebook are
# presumably supplied here — TODO confirm which .env file is picked up.
load_dotenv()

True

In [3]:
# Set up Snowflake options
# Every setting is read from the environment. Fail fast with a readable message
# rather than the TypeError that `None + ".snowflakecomputing.com"` would raise
# (or a None silently passed to the connector) when a variable is missing.
_required_sf_env = (
    "SNOWFLAKE_ACCOUNT",
    "SNOWFLAKE_USER",
    "SNOWFLAKE_PASSWORD",
    "SNOWFLAKE_DATABASE",
    "SNOWFLAKE_SCHEMA",
)
_missing_sf_env = [name for name in _required_sf_env if not os.getenv(name)]
if _missing_sf_env:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_sf_env)
    )

sf_options = {
    "sfURL": os.environ["SNOWFLAKE_ACCOUNT"] + ".snowflakecomputing.com",
    "sfUser": os.environ["SNOWFLAKE_USER"],
    "sfPassword": os.environ["SNOWFLAKE_PASSWORD"],
    "sfDatabase": os.environ["SNOWFLAKE_DATABASE"],
    "sfSchema": os.environ["SNOWFLAKE_SCHEMA"],
    # Warehouse is passed through as-is (may be None if unset), as before.
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    # The role can be overridden with SNOWFLAKE_ROLE. ACCOUNTADMIN remains the
    # default so existing setups behave as before; prefer a least-privileged
    # role for routine loads.
    "sfRole": os.getenv("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
}

In [4]:
# Spark session
# Build (or reuse) a session configured to fetch the Snowflake JDBC driver and
# the Spark-Snowflake connector through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("LoadFactCommentToSnowflake")
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.24,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2",
    )
    .getOrCreate()
)

In [5]:
# Read Parquet
# The input location can be overridden with the FACT_COMMENT_PARQUET_PATH
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used.
fact_comment_path = os.getenv(
    "FACT_COMMENT_PARQUET_PATH",
    "D:/Portfolio/reddit-analytics-pipeline/data/DW_cache/fact_comment",
)
fact_comment_df = spark.read.parquet(fact_comment_path)

In [14]:
# Preview the first 5 rows, print the schema, and print the total row count.
# NOTE(review): in the recorded output, comment_id, ups and downs contain free
# text and several other columns are null — the data looks column-misaligned
# (possibly an upstream parsing problem); confirm before loading to Snowflake.
fact_comment_df.show(5)
fact_comment_df.printSchema()
print(fact_comment_df.count())

+--------------------+-------+-------------+-------------------+--------------------+--------+----------------+----------+--------------------+--------------------+
|          comment_id|user_id|subreddit_key|   created_datetime|                body|   score|controversiality|    gilded|                 ups|               downs|
+--------------------+-------+-------------+-------------------+--------------------+--------+----------------+----------+--------------------+--------------------+
|          Blood Bath| 155033|           15|               null|                null|    null|            null|      null|       Borrowed Time|                   0|
| and I believe th...| 155031|          152|               null|                null|    null|            null|      null| so it was a diff...|          1430453592|
| and filled the w...|     23|          174|               null| where is the goo...| and see|              no| and death| even if the Morm...| and will yet bri...|
|         

In [13]:
from pyspark.sql.functions import count

# Count rows per (comment_id, created_datetime, user_id) key and keep only the
# keys that occur more than once.
duplicates = (
    fact_comment_df
    .groupBy("comment_id", "created_datetime", "user_id")
    .agg(count("*").alias("count"))
    .filter("count > 1")
)

# Display the duplicated keys without truncating column values.
duplicates.show(truncate=False)

+----------+----------------+-------+-----+
|comment_id|created_datetime|user_id|count|
+----------+----------------+-------+-----+
+----------+----------------+-------+-----+



In [12]:
# Drop duplicate rows only when the duplicate check found any.
# The subset must match the key that `duplicates` was grouped on
# (comment_id, created_datetime, user_id). Deduplicating on subreddit_key
# instead would discard distinct comments that a user posted to the same
# subreddit at the same timestamp, and would not target the rows the check
# actually flagged.
# take(1) is enough to learn whether at least one duplicated key exists.
if duplicates.take(1):
    fact_comment_df = fact_comment_df.dropDuplicates(["comment_id", "created_datetime", "user_id"])

In [15]:
from pyspark.sql.functions import to_date

# Add a created_date column derived from created_datetime via to_date().
fact_comment_df = fact_comment_df.withColumn(
    "created_date", to_date("created_datetime")
)

# Collect the distinct created_date values to the driver and print them.
unique_dates = (
    fact_comment_df
    .select("created_date")
    .distinct()
    .collect()
)
print(unique_dates)

[Row(created_date=datetime.date(2015, 5, 19)), Row(created_date=datetime.date(2015, 5, 10)), Row(created_date=datetime.date(2015, 5, 16)), Row(created_date=datetime.date(2015, 5, 25)), Row(created_date=datetime.date(2015, 5, 6)), Row(created_date=datetime.date(2015, 5, 3)), Row(created_date=datetime.date(2015, 5, 18)), Row(created_date=datetime.date(2015, 5, 23)), Row(created_date=datetime.date(2015, 5, 21)), Row(created_date=datetime.date(2015, 5, 9)), Row(created_date=datetime.date(2015, 5, 29)), Row(created_date=datetime.date(2015, 5, 11)), Row(created_date=datetime.date(2015, 5, 5)), Row(created_date=datetime.date(2015, 5, 14)), Row(created_date=datetime.date(2015, 5, 22)), Row(created_date=datetime.date(2015, 5, 13)), Row(created_date=datetime.date(2015, 5, 12)), Row(created_date=datetime.date(2015, 5, 2)), Row(created_date=datetime.date(2015, 5, 24)), Row(created_date=datetime.date(2015, 5, 28)), Row(created_date=datetime.date(2015, 5, 31)), Row(created_date=datetime.date(2015, 5

In [16]:
# Write the DataFrame to the FACT_COMMENTS table through the Snowflake
# connector, using the connection settings in sf_options. The save mode is
# "overwrite".
(
    fact_comment_df.write
    .format("snowflake")
    .options(**sf_options)
    .option("dbtable", "FACT_COMMENTS")
    .mode("overwrite")
    .save()
)

In [17]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()