In [1]:
import os
from pyspark.sql import SparkSession
from dotenv import load_dotenv


In [2]:

# Load environment variables
# load_dotenv() is called with no arguments; the SNOWFLAKE_* settings that a
# later cell reads through os.getenv are expected to be made available here
# (presumably from a local .env file — confirm where it lives).
load_dotenv()

True

In [3]:
# Set up Snowflake options
# Every connection setting comes from the environment. All required variables
# are checked together before the dict is built, so a missing or empty setting
# is reported by name instead of surfacing as
# "unsupported operand type(s) for +: 'NoneType' and 'str'" on the account
# concatenation, or as a None value handed to the connector at write time.
_sf_required_vars = (
    "SNOWFLAKE_ACCOUNT",
    "SNOWFLAKE_USER",
    "SNOWFLAKE_PASSWORD",
    "SNOWFLAKE_DATABASE",
    "SNOWFLAKE_SCHEMA",
    "SNOWFLAKE_WAREHOUSE",
)
_sf_missing_vars = [name for name in _sf_required_vars if not os.getenv(name)]
if _sf_missing_vars:
    raise RuntimeError(
        "Missing required Snowflake environment variable(s): "
        + ", ".join(_sf_missing_vars)
    )

sf_options = {
    # The account identifier is expanded into the full Snowflake host name.
    "sfURL": os.environ["SNOWFLAKE_ACCOUNT"] + ".snowflakecomputing.com",
    "sfUser": os.environ["SNOWFLAKE_USER"],
    "sfPassword": os.environ["SNOWFLAKE_PASSWORD"],
    "sfDatabase": os.environ["SNOWFLAKE_DATABASE"],
    "sfSchema": os.environ["SNOWFLAKE_SCHEMA"],
    "sfWarehouse": os.environ["SNOWFLAKE_WAREHOUSE"],
    # Optional: set SNOWFLAKE_ROLE to override; otherwise SYSADMIN is used,
    # exactly as before.
    "sfRole": os.getenv("SNOWFLAKE_ROLE", "SYSADMIN"),
}

In [4]:
# Spark session
# Maven coordinates handed to spark.jars.packages: the Snowflake JDBC driver
# and the Spark-Snowflake connector (a Scala 2.12 / Spark 3.2 build, going by
# the artifact name).
_snowflake_packages = "net.snowflake:snowflake-jdbc:3.13.24,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2"

spark = (
    SparkSession.builder
    .appName("LoadDimUserToSnowflake")
    .config("spark.jars.packages", _snowflake_packages)
    .getOrCreate()
)


In [5]:
# Clear anything cached in this Spark session before loading the data.
spark.catalog.clearCache()

# Read Parquet
# NOTE(review): this is an absolute path on a local D: drive, so the notebook
# only runs as-is on a machine with the same layout.
dim_user_path = "D:/Portfolio/reddit-analytics-pipeline/data/DW_cache/dim_user"
dim_user_df = spark.read.parquet(dim_user_path)


In [6]:

# Preview the first five rows, then print the column names and types.
dim_user_df.show(n=5)
dim_user_df.printSchema()

+--------------------+-------+
|              author|user_id|
+--------------------+-------+
|  although entire...|      1|
|      ""What next?""|      2|
| 'This contemplat...|      3|
| **""I will not s...|      4|
|           *eyebite*|      5|
+--------------------+-------+
only showing top 5 rows

root
 |-- author: string (nullable = true)
 |-- user_id: integer (nullable = true)



In [7]:
from pyspark.sql.functions import count

# Count how many rows share each user_id ...
user_id_counts = dim_user_df.groupBy("user_id").agg(count("*").alias("count"))

# ... and keep only the user_ids that occur more than once.
duplicates = user_id_counts.filter(user_id_counts["count"] > 1)

# Show duplicated user_ids (an empty table means every user_id is unique).
duplicates.show(truncate=False)

+-------+-----+
|user_id|count|
+-------+-----+
+-------+-----+



In [8]:
# Write to Snowflake
# Append the rows of dim_user_df to the DIM_USER table, using the connection
# settings in sf_options. The save mode is "append", so running this cell a
# second time adds the same rows again.
# NOTE(review): confirm that "internal_staging_file_format" is an option the
# pinned connector version actually recognises.
(
    dim_user_df.write
    .format("snowflake")
    .mode("append")
    .options(**sf_options)
    .option("dbtable", "DIM_USER")
    .option("internal_staging_file_format", "parquet")
    .save()
)


In [9]:
# Shut down the Spark session now that the write step has run.
spark.stop()