In [4]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Describe a local Spark session (all available cores, Hive support on) with
# the Delta Lake SQL extension and the Delta catalog registered.
builder = (
    SparkSession.builder
    .appName("Delta Spark 4.0.0")
    .master("local[*]")
    .enableHiveSupport()
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through delta's pip helper, then create the session
# (or reuse one that is already running in this kernel).
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

In [5]:
# Load the silver-layer Delta tables into DataFrames.
def _read_delta(path):
    """Return the Delta table stored at `path` as a Spark DataFrame."""
    return spark.read.format("delta").load(path)


dim_ventas = _read_delta("../sql/dw_silver/dim_ventas")
dim_productos = _read_delta("../sql/dw_silver/dim_productos")
dim_clientes = _read_delta("../sql/dw_silver/dim_clientes")
dim_territorio = _read_delta("../sql/dw_silver/dim_territorio")
dim_fecha = _read_delta("../sql/dw_silver/dim_fecha")

In [None]:
# Build the fact table by projecting the selected columns out of dim_ventas.
columnas_hechos = [
    "ProductoID",
    "PedidoID",
    "ClienteID",
    "TerritorioID",
    "Cantidad_pedido",
    "Precio_Unitario",
    "Linea_total",
]
df_hechos = dim_ventas.select(*columnas_hechos)

# Preview the resulting rows.
df_hechos.show()

In [22]:
# Persist the fact table in Delta format under the gold layer; "overwrite"
# replaces whatever is already stored at that path.
(
    df_hechos.write
    .format("delta")
    .mode("overwrite")
    .save("../sql/dw_gold/df_hechos")
)

In [20]:
# Total sales per continent.
# NOTE: this import rebinds the name `round` to Spark's column function (and
# defines `_sum`) for every cell executed after this one, so a later
# `round(...)` call no longer reaches Python's builtin.
from pyspark.sql.functions import sum as _sum, round

# Attach territory attributes to each fact row; the inner join keeps only rows
# whose TerritorioID is present in both DataFrames.
ventas_con_territorio = df_hechos.join(dim_territorio, on="TerritorioID", how="inner")

# Sum the line totals within each continent, rounded to 2 decimal places.
suma_redondeada = round(_sum("Linea_total"), 2).alias("Suma_Total")
df_ventas_continente = ventas_con_territorio.groupBy("Continente").agg(suma_redondeada)

df_ventas_continente.show()

+-------------+-------------+
|   Continente|   Suma_Total|
+-------------+-------------+
|       Europe|   9180532.57|
|North America|1.482072734E7|
|      Pacific|   8979109.04|
+-------------+-------------+

