In [0]:
from pyspark.sql.functions import col, to_timestamp, date_format, hour, dayofweek, expr, monotonically_increasing_id, explode
from pyspark.sql.types import StringType

In [0]:
# Pipeline-wide locations: every cell below reads from the bronze schema and
# writes to the silver schema of this catalog.
CATALOG_NAME = "spotify_etl"
BRONZE_SCHEMA, SILVER_SCHEMA = "bronze", "silver"

# Create the target schema up front so the later saveAsTable calls can write to it.
create_silver_schema_sql = f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SILVER_SCHEMA}"
spark.sql(create_silver_schema_sql)

# Echo source and destination so the run output shows which schemas were used.
print(
    f"Sursa (Bronze): {CATALOG_NAME}.{BRONZE_SCHEMA}",
    f"Destinația (Silver): {CATALOG_NAME}.{SILVER_SCHEMA}",
    sep="\n",
)

In [0]:
print("Creating silver.dim_time...")

# Collect every distinct play timestamp found in the bronze play history.
# Rows whose played_at is missing (or parses to NULL) are dropped: a NULL key
# can never match a fact row in an equality join, so keeping it would only add
# a meaningless all-NULL row to the dimension.
time_df = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_play_history")
    .select(to_timestamp(col("played_at")).alias("play_timestamp"))
    .where(col("play_timestamp").isNotNull())
    .distinct()
)

# Derive the calendar attributes from the timestamp itself.
#   hour_of_day     - hour component of the timestamp
#   day_of_week_num - Spark's dayofweek numbering (1 = Sunday ... 7 = Saturday)
#   weekday_name    - abbreviated day name (pattern "E")
#   month_name      - abbreviated month name (pattern "MMM")
#   quarter / year  - formatted as text, then cast to int
dim_time = (
    time_df
    .withColumn("hour_of_day", hour(col("play_timestamp")))
    .withColumn("day_of_week_num", dayofweek(col("play_timestamp")))
    .withColumn("weekday_name", date_format(col("play_timestamp"), "E"))
    .withColumn("month_name", date_format(col("play_timestamp"), "MMM"))
    .withColumn("quarter", date_format(col("play_timestamp"), "Q").cast("int"))
    .withColumn("year", date_format(col("play_timestamp"), "yyyy").cast("int"))
)

# Replace the silver table (data and schema) on every run.
(
    dim_time.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_time")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_time has been created.")

In [0]:
print("Creating silver.dim_tracks...")

# Read the bronze track table as-is; no other table is joined in this cell.
tracks_df = spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_tracks")

# Keep a single row per track_id (which duplicate survives is up to Spark).
dim_tracks = tracks_df.dropDuplicates(["track_id"])

# Replace the silver table (data and schema) on every run.
(
    dim_tracks.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_tracks")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_tracks has been created.")

In [0]:
print("Creating silver.dim_artists...")

# Artist dimension: the bronze artist table reduced to one row per artist_id.
dim_artists = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_artists")
    .dropDuplicates(["artist_id"])
)

# Replace the silver table (data and schema) on every run.
(
    dim_artists.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_artists")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_artists has been created.")

In [0]:
print("Creating silver.dim_playlists...")

# Playlist dimension: the bronze playlist table reduced to one row per playlist_id.
dim_playlists = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_playlists")
    .dropDuplicates(["playlist_id"])
)

# Replace the silver table (data and schema) on every run.
(
    dim_playlists.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_playlists")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_playlists has been created.")

In [0]:
print("Creating silver.fct_plays...")

# Surrogate key: SHA-256 over played_at immediately followed by track_id.
# concat() returns NULL as soon as one argument is NULL, which would give every
# such row a NULL play_id and let dropDuplicates collapse them all into a single
# arbitrary row. Coalescing each part to '' keeps the key non-NULL while
# producing the same hash as before for rows where both parts are present.
# NOTE(review): the two parts are joined without a separator; this is kept so
# existing play_id values stay stable - confirm whether a delimiter is wanted.
play_id_sql = (
    "sha2(concat("
    "coalesce(cast(played_at as string), ''), "
    "coalesce(cast(track_id as string), '')"
    "), 256)"
)

# Build the fact table from the raw play history: parse the timestamp, attach
# the surrogate key, keep only the fact columns, then deduplicate on the key.
fct_plays = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_play_history")
    .withColumn("play_timestamp", to_timestamp(col("played_at")))
    .withColumn("play_id", expr(play_id_sql))
    .select(
        "play_id",
        "play_timestamp",
        "track_id",
        "artist_id",
        "album_id",
        "duration_ms",
        "context_type",
    )
    .dropDuplicates(["play_id"])
)

# Replace the silver table (data and schema) on every run.
(
    fct_plays.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.fct_plays")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.fct_plays has been created.")
print("\n--- SILVER PIPELINE FINISHED ---")

In [0]:
# List the tables in the bronze (source) schema, using the configured catalog
# and schema names instead of repeating them as literals.
# NOTE(review): this runs after the silver tables are written - confirm whether
# the silver schema was meant to be listed here instead.
spark.sql(f"SHOW TABLES IN {CATALOG_NAME}.{BRONZE_SCHEMA}").show()