In [7]:
from pyspark.sql.functions import col, current_timestamp, abs, hash
from pyspark.sql.types import LongType, StringType, DateType, TimestampType

StatementMeta(, f5137119-5574-4d50-af0c-85fc0fe0094b, 9, Finished, Available, Finished)

In [1]:
catalog_name = "Silver"
dst_catalog_name = "Gold"

StatementMeta(, 9be8d756-2674-4c04-ab80-73f0bad0b5c5, 3, Finished, Available, Finished)

In [6]:
df = (
    spark.table(f"{catalog_name}.clientes")
    .withColumn("updated_at", current_timestamp())
    .withColumn(
        "sk_cliente",
        abs(hash(col("id_cliente")))
    )
    .select(
        col("sk_cliente").cast(StringType()),
        col("id_cliente").cast(LongType()),
        col("nombre_completo").cast(StringType()),
        col("email").cast(StringType()),
        col("ciudad").cast(StringType()),
        col("pais").cast(StringType()),
        col("updated_at").cast(TimestampType())
    )
)

df.write.format("delta").mode("overwrite").saveAsTable(f"{dst_catalog_name}.dim_clientes")


StatementMeta(, f5137119-5574-4d50-af0c-85fc0fe0094b, 8, Finished, Available, Finished)

TypeError: unhashable type: 'Column'

In [10]:
df = (
    spark.table(f"{catalog_name}.productos")
    .withColumn(
        "sk_producto",
        abs(hash(col("id_producto")))
    )
    .withColumn("updated_at", current_timestamp())
    .select(
        "sk_producto",
        "id_producto",
        "nombre",
        "marca",
        "precio",
        "updated_at"
    )
)

df.write.format("delta").mode("overwrite").saveAsTable(f"{dst_catalog_name}.dim_productos")

StatementMeta(, 07b03ea2-7246-4581-8a01-36818b6ad898, 12, Finished, Available, Finished)

In [11]:
df = spark.table("Silver.tiendas")
print(df.columns)

StatementMeta(, 07b03ea2-7246-4581-8a01-36818b6ad898, 13, Finished, Available, Finished)

['id_tienda', 'nombre', 'ciudad', 'pais', 'tipo', 'updated_at']


In [13]:
df = (
    spark.table(f"{catalog_name}.tiendas")
    .withColumn(
        "sk_tienda",
        abs(hash(col("id_tienda")))
    )
    .withColumn("updated_at", current_timestamp())
    .select(
        "sk_tienda",
        "id_tienda",
        "nombre",
        "ciudad",
        "pais",
        "tipo",
        "updated_at"
    )
)

df.write.format("delta").mode("overwrite").saveAsTable(f"{dst_catalog_name}.dim_tiendas")

StatementMeta(, 07b03ea2-7246-4581-8a01-36818b6ad898, 15, Finished, Available, Finished)

In [14]:
from pyspark.sql.functions import expr, sequence, explode, col, date_format, year, month, dayofmonth, current_timestamp

StatementMeta(, 07b03ea2-7246-4581-8a01-36818b6ad898, 16, Finished, Available, Finished)

In [15]:
start_date = "2023-12-31"
end_date   = "2026-12-31"

df = (
    spark.range(1)
    .withColumn("fecha", explode(sequence(expr(f"to_date('{start_date}')"),
                                            expr(f"to_date('{end_date}')"),
                                            expr("interval 1 day"))))
    .select("fecha")
    .withColumn("anio", year(col("fecha")))
    .withColumn("mes", month(col("fecha")))
    .withColumn("dia", dayofmonth(col("fecha")))
    .withColumn("anio_mes", date_format(col("fecha"), "yyyyMM"))
    .withColumn("sk_tiempo", date_format(col("fecha"), "yyyyMMdd").cast("int"))
    .withColumn("updated_at", current_timestamp())
    .select("sk_tiempo", "fecha", "anio", "mes", "dia", "anio_mes", "updated_at")
)

df.write.format("delta").mode("overwrite").saveAsTable(f"{dst_catalog_name}.tiempo")

StatementMeta(, 07b03ea2-7246-4581-8a01-36818b6ad898, 17, Finished, Available, Finished)

In [4]:
from pyspark.sql.functions import when, coalesce, to_date, substring, initcap, trim, col, lit, lower, current_timestamp, broadcast, expr
from pyspark.sql.types import LongType, StringType, DateType, TimestampType, DecimalType, IntegerType
from delta.tables import DeltaTable

StatementMeta(, 9be8d756-2674-4c04-ab80-73f0bad0b5c5, 6, Finished, Available, Finished)

In [5]:
df_ventas = (
    spark.table(f"{catalog_name}.ventas")
)

dim_cliente = spark.table(f"{dst_catalog_name}.clientes").select("sk_cliente", "id_cliente")
dim_tienda = spark.table(f"{dst_catalog_name}.tiendas").select("sk_tienda", "id_tienda")
dim_producto = spark.table(f"{dst_catalog_name}.productos").select("sk_producto", "id_producto")
dim_tiempo = spark.table(f"{dst_catalog_name}.tiempo").select("sk_tiempo", "fecha")


df_enriched = (
    df_ventas
    .join(broadcast(dim_cliente), "id_cliente", "left")
    .join(broadcast(dim_tienda), "id_tienda", "left")
    .join(broadcast(dim_producto), "id_producto", "left")
    .join(broadcast(dim_tiempo), to_date(df_ventas.fecha_venta) == dim_tiempo.fecha, "left")
)


df_enriched = df_enriched.select(
    col("id_venta").cast(LongType()),
    col("sk_cliente").cast(StringType()),
    col("sk_tienda").cast(StringType()),
    col("sk_producto").cast(StringType()),
    col("sk_tiempo").cast(StringType()),
    col("cantidad").cast(IntegerType()),
    col("monto").cast(DecimalType(18, 2)),
    col("fecha_venta").cast(TimestampType()),
    current_timestamp().alias("seq_ts"),  
    current_timestamp().alias("updated_at"),  
    lit("ETL").alias("source_system"),    
    when(col("monto").isNull(), "DELETE") 
        .otherwise("UPSERT").alias("operation")
)

df_enriched.write.format("delta").mode("overwrite").saveAsTable(f"{dst_catalog_name}.fact_ventas")


StatementMeta(, 9be8d756-2674-4c04-ab80-73f0bad0b5c5, 7, Finished, Available, Finished)

In [9]:
df = spark.table("Silver.clientes")
df.createOrReplaceTempView("vw_clientes") 
spark.sql("""
CREATE OR REPLACE VIEW vw_clientes AS
SELECT
    id_cliente,
    nombre_completo,
    ciudad,
    pais
FROM Silver.clientes
""")

StatementMeta(, 6568f61c-422d-4579-9a8f-f8ab047f8a45, 11, Finished, Available, Finished)

DataFrame[]

In [6]:
df = spark.table("Gold.clientes")
print(df.columns)
df = spark.table("Gold.productos")
print(df.columns)
df = spark.table("Gold.fact_ventas")
print(df.columns)

StatementMeta(, 9be8d756-2674-4c04-ab80-73f0bad0b5c5, 8, Finished, Available, Finished)

['sk_cliente', 'id_cliente', 'nombre_completo', 'email', 'ciudad', 'pais', 'updated_at']
['sk_producto', 'id_producto', 'nombre', 'marca', 'precio', 'updated_at']
['id_venta', 'sk_cliente', 'sk_tienda', 'sk_producto', 'sk_tiempo', 'cantidad', 'monto', 'fecha_venta', 'seq_ts', 'updated_at', 'source_system', 'operation']


In [1]:
df = spark.table("Gold.tiempo")
print(df.columns)

StatementMeta(, f45117ef-5f70-4d72-8e33-9483e5445f45, 3, Finished, Available, Finished)

['sk_tiempo', 'fecha', 'anio', 'mes', 'dia', 'anio_mes', 'updated_at']
