In [1]:
import os
from pyspark.sql import SparkSession
from dotenv import load_dotenv

In [2]:

# Load environment variables before the cells below read the SNOWFLAKE_*
# settings with os.getenv.
# NOTE(review): presumably the values come from a local .env file via
# python-dotenv — confirm that file exists and is kept out of version control.
load_dotenv()

True

In [3]:
# Set up Snowflake options
# Every connection setting is read from the environment. Check them all up
# front so that a missing variable is reported by name, instead of as a
# TypeError from `None + ".snowflakecomputing.com"` or as a None value handed
# to the writer later on.
_sf_env_names = (
    "SNOWFLAKE_ACCOUNT",
    "SNOWFLAKE_USER",
    "SNOWFLAKE_PASSWORD",
    "SNOWFLAKE_DATABASE",
    "SNOWFLAKE_SCHEMA",
    "SNOWFLAKE_WAREHOUSE",
)
_sf_env = {name: os.getenv(name) for name in _sf_env_names}
_sf_missing = [name for name, value in _sf_env.items() if not value]
if _sf_missing:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_sf_missing)
        + ". Define them in the environment or in the .env file."
    )

sf_options = {
    # The account identifier is expanded to the full Snowflake host name.
    "sfURL": _sf_env["SNOWFLAKE_ACCOUNT"] + ".snowflakecomputing.com",
    "sfUser": _sf_env["SNOWFLAKE_USER"],
    "sfPassword": _sf_env["SNOWFLAKE_PASSWORD"],
    "sfDatabase": _sf_env["SNOWFLAKE_DATABASE"],
    "sfSchema": _sf_env["SNOWFLAKE_SCHEMA"],
    "sfWarehouse": _sf_env["SNOWFLAKE_WAREHOUSE"],
    "sfRole": "SYSADMIN",  # optional if you have role management
}

In [4]:
# Build (or reuse) the Spark session for this load job. The Snowflake JDBC
# driver and the Spark-Snowflake connector are requested through
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("LoadDimDateToSnowflake")
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.24,net.snowflake:spark-snowflake_2.12:2.10.0-spark_3.2",
    )
    .getOrCreate()
)


In [5]:
# Read Parquet
# The dataset location can be overridden with the DIM_DATE_PATH environment
# variable so the notebook is not tied to one machine's drive layout. When the
# variable is unset or empty, the original local path is used.
dim_date_path = (
    os.getenv("DIM_DATE_PATH")
    or "D:/Portfolio/reddit-analytics-pipeline/data/DW_cache/dim_date"
)
dim_date_df = spark.read.parquet(dim_date_path)

In [6]:
# Sanity-check the data before it is written out: preview the first 5 rows,
# then print the column names and types.
# NOTE(review): the recorded preview contains an all-null row and a
# 1969-12-31 row — confirm these are meant to be part of the date dimension.
dim_date_df.show(5)
dim_date_df.printSchema()

+-------------------+----------+----+-----+----+----+------+------+-----------+------------+
|   created_datetime|      date|year|month| day|hour|minute|second|day_of_week|week_of_year|
+-------------------+----------+----+-----+----+----+------+------+-----------+------------+
|               null|      null|null| null|null|null|  null|  null|       null|        null|
|1969-12-31 19:00:03|1969-12-31|1969|   12|  31|  19|     0|     3|        Wed|           1|
|2015-05-01 00:00:00|2015-05-01|2015|    5|   1|   0|     0|     0|        Fri|          18|
|2015-05-01 00:00:01|2015-05-01|2015|    5|   1|   0|     0|     1|        Fri|          18|
|2015-05-01 00:00:02|2015-05-01|2015|    5|   1|   0|     0|     2|        Fri|          18|
+-------------------+----------+----+-----+----+----+------+------+-----------+------------+
only showing top 5 rows

root
 |-- created_datetime: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month:

In [7]:
# Write the DataFrame to the Snowflake table DIM_DATE using the connection
# settings held in sf_options. "overwrite" mode replaces the table's current
# contents; rows are written as-is, with no filtering in this cell.
(
    dim_date_df.write
    .format("snowflake")
    .options(**sf_options)
    .option("dbtable", "DIM_DATE")
    .mode("overwrite")
    .save()
)

In [8]:
# Shut down the Spark session now that the write cell has run.
spark.stop()