In [0]:
from pyspark.sql.functions import col, to_timestamp, date_format, hour, dayofweek, expr, monotonically_increasing_id, explode
from pyspark.sql.types import StringType

In [0]:
# Pipeline configuration: Unity Catalog name plus the source (Bronze) and
# target (Silver) schema names used by every cell below.
CATALOG_NAME = "spotify_etl"
BRONZE_SCHEMA = "bronze"
SILVER_SCHEMA = "silver"

# Make sure the target schema exists before any table is written into it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SILVER_SCHEMA}")

# Report the source and destination schemas, one per line.
print(
    f"Sursa (Bronze): {CATALOG_NAME}.{BRONZE_SCHEMA}",
    f"Destina»õia (Silver): {CATALOG_NAME}.{SILVER_SCHEMA}",
    sep="\n",
)

In [0]:
print("Creating silver.dim_time...")

# Collect every distinct play timestamp present in the Bronze play history.
time_df = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_play_history")
    .select(to_timestamp(col("played_at")).alias("play_timestamp"))
    .distinct()
)

# Derive all calendar attributes in a single projection. Column order matches
# the dimension layout: timestamp first, then the derived attributes.
dim_time = time_df.select(
    "play_timestamp",
    hour("play_timestamp").alias("hour_of_day"),
    # Spark's dayofweek numbering: 1 = Sunday ... 7 = Saturday.
    dayofweek("play_timestamp").alias("day_of_week_num"),
    # "E" / "MMM" produce abbreviated day and month names.
    date_format("play_timestamp", "E").alias("weekday_name"),
    date_format("play_timestamp", "MMM").alias("month_name"),
    # Quarter and year are formatted as text and then cast to integers.
    date_format("play_timestamp", "Q").cast("int").alias("quarter"),
    date_format("play_timestamp", "yyyy").cast("int").alias("year"),
)

# Persist the dimension as a Delta table, replacing both data and schema.
(
    dim_time.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_time")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_time has been created.")

In [0]:
print("Creating silver.dim_tracks...")

from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType
from pyspark.sql.functions import col, lit

# 1. Fixed target schema (kept stable for Power BI).
#    Both the column ORDER and the column TYPES written to Silver come from
#    this definition, so the table layout does not drift with Bronze.
silver_dim_tracks_schema = StructType([
    StructField("track_id", StringType()),
    StructField("track_name", StringType()),
    StructField("album_id", StringType()),
    StructField("album_name", StringType()),
    StructField("artist_id", StringType()),
    StructField("artist_name", StringType()),
    StructField("duration_ms", LongType()),
    StructField("popularity", LongType()),
    StructField("explicit", BooleanType()),
    StructField("release_date", StringType()),
    StructField("preview_url", StringType())  # kept even when absent from the source
])

# 2. Read the data from Bronze.
tracks_df = spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_tracks")

# 3. If preview_url is missing in Bronze, add it as a typed NULL column.
if "preview_url" not in tracks_df.columns:
    tracks_df = tracks_df.withColumn("preview_url", lit(None).cast(StringType()))

# 4. Drop duplicate tracks, then project the columns in schema order AND cast
#    each one to its declared type. Previously only the names were used, so
#    the declared LongType/BooleanType/StringType were never enforced and the
#    Silver column types simply mirrored whatever Bronze happened to contain.
#    NOTE(review): with ANSI mode enabled an un-castable value raises instead
#    of becoming NULL — confirm the desired behavior for dirty Bronze data.
dim_tracks = tracks_df.dropDuplicates(["track_id"]).select(
    [col(f.name).cast(f.dataType).alias(f.name) for f in silver_dim_tracks_schema]
)

# 5. Write to Silver with the stabilized schema.
dim_tracks.write.format("delta").mode("overwrite").option("overwriteSchema", "true") \
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_tracks")

print(f"‚úÖ Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_tracks created with stable schema.")

In [0]:
print("Creating silver.dim_artists...")

# Build the artist dimension: one row per artist_id from the Bronze table.
# Which of several duplicate rows survives is decided by Spark.
dim_artists = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_artists")
    .dropDuplicates(["artist_id"])
)

# Persist as a Delta table, replacing both data and schema on every run.
(
    dim_artists.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_artists")
)

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_artists has been created.")

In [0]:
print("Creating silver.dim_playlists...")

from pyspark.sql.types import StructType, StructField, StringType, LongType
from pyspark.sql.functions import col, lit

# 1. Fixed target schema: the essential columns are always present, even if
#    their values are NULL. Column order and types written to Silver both come
#    from this definition.
silver_dim_playlists_schema = StructType([
    StructField("playlist_id", StringType()),
    StructField("playlist_name", StringType()),
    StructField("owner_name", StringType()),
    StructField("followers", LongType()),      # kept for Power BI compatibility
    StructField("total_tracks", LongType()),
    StructField("description", StringType()),
    StructField("snapshot_id", StringType())
])

# 2. Read the data from Bronze and make sure every schema column exists.
bronze_playlists_df = spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_playlists")

# If followers is missing in Bronze, add it as a typed NULL column.
if "followers" not in bronze_playlists_df.columns:
    bronze_playlists_df = bronze_playlists_df.withColumn("followers", lit(None).cast(LongType()))

# 3. Drop duplicate playlists, then project the columns in schema order AND
#    cast each one to its declared type. Previously only the names were used,
#    so the declared types were never enforced and the Silver column types
#    mirrored whatever Bronze happened to contain.
#    NOTE(review): with ANSI mode enabled an un-castable value raises instead
#    of becoming NULL — confirm the desired behavior for dirty Bronze data.
dim_playlists = bronze_playlists_df.dropDuplicates(["playlist_id"]).select(
    [col(f.name).cast(f.dataType).alias(f.name) for f in silver_dim_playlists_schema]
)

# 4. Stable write to Delta.
dim_playlists.write.format("delta").mode("overwrite").option("overwriteSchema", "true") \
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.dim_playlists")

print(f"‚úÖ Table {CATALOG_NAME}.{SILVER_SCHEMA}.dim_playlists created with stable schema.")

In [0]:
print("Creating silver.fct_plays...")

# Build the play fact table from the raw Bronze play history.
fct_plays = (
    spark.table(f"{CATALOG_NAME}.{BRONZE_SCHEMA}.bronze_play_history")
    # The surrogate key is sha2(concat(played_at, track_id)). concat() returns
    # NULL when either input is NULL, which would give every such row a NULL
    # play_id and let dropDuplicates collapse them all into one arbitrary row
    # with a NULL key. Rows missing a key component are excluded up front so
    # play_id is never NULL.
    .filter(col("played_at").isNotNull() & col("track_id").isNotNull())
    .withColumn("play_timestamp", to_timestamp(col("played_at")))
    # Hash formula intentionally unchanged so existing play_id values are stable.
    .withColumn("play_id", expr("sha2(concat(played_at, track_id), 256)"))
    .select(
        "play_id",
        "play_timestamp",
        "track_id",
        "artist_id",
        "album_id",
        "duration_ms",
        "context_type",
    )
    # Deduplicate on the surrogate key: one row per (played_at, track_id).
    .dropDuplicates(["play_id"])
)

# Persist as a Delta table, replacing both data and schema on every run.
fct_plays.write.format("delta").mode("overwrite").option("overwriteSchema", "true") \
    .saveAsTable(f"{CATALOG_NAME}.{SILVER_SCHEMA}.fct_plays")

print(f"Table {CATALOG_NAME}.{SILVER_SCHEMA}.fct_plays has been created.")
print("\n--- SILVER PIPELINE FINISHED ---")