### Comienzo capa Oro (Gold)

In [0]:
# Gold layer input: start from the Delta table produced by the Silver ("plata") layer.
silver_delta_path = "/mnt/datalake/globalsalesmarketmedallon/global_sales_plata/delta_tables/"
df_oro = spark.read.format("delta").load(silver_delta_path)

# Preview the loaded rows in the notebook.
display(df_oro)

In [0]:

from pyspark.sql.functions import count, col

# Column combination used to detect and remove duplicated rows.
dedup_keys = ["Order_ID", "Product_ID", "Customer_ID"]

# How many rows share each key combination.
df_counts = df_oro.groupBy(*dedup_keys).agg(count("*").alias("count"))

# Key combinations that appear more than once.
df_duplicates = df_counts.where(col("count") > 1)

# Every original row behind a duplicated key (computed before de-duplicating,
# so all copies are visible), plus the "count" column from df_duplicates.
df_duplicates_details = df_oro.join(df_duplicates, on=dedup_keys, how="inner")

# Keep one row per key combination.
# NOTE(review): presumably the surviving copy is arbitrary when duplicates differ
# in their non-key columns -- confirm this is acceptable for the Gold layer.
df_oro = df_oro.dropDuplicates(dedup_keys)

display(df_duplicates_details)
display(df_oro)

In [0]:
# Persist the de-duplicated Gold dataset as a Delta table partitioned by order year.
gold_delta_path = "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/delta_tables"
df_oro.write.format("delta").mode("overwrite").partitionBy("Order_Year").save(gold_delta_path)

# Re-bind df_oro to the table that was just written.
# Until this point df_oro is a lazy plan ending in dropDuplicates() on a subset of
# columns, which is re-executed by every action and may keep a different copy of a
# duplicated row each time. Reading the persisted snapshot back guarantees that the
# CSV export below, and anything computed from df_oro afterwards, sees exactly the
# rows stored in the Gold Delta table, and avoids recomputing the de-duplication.
df_oro = spark.read.format("delta").load(gold_delta_path)

# CSV export of the same data, also partitioned by order year.
# NOTE(review): no "header" option is set, so the files are written without a
# header row (Spark's CSV default) -- confirm consumers expect that.
df_oro.write.format("csv").mode("overwrite").partitionBy("Order_Year").save("/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/csv")


In [0]:

# Load the Gold Delta table once and derive one filtered DataFrame per order year.
gold_delta = spark.read.format("delta").load("/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/delta_tables/")

df_oro_2011 = gold_delta.where("Order_Year = 2011")
df_oro_2012 = gold_delta.where("Order_Year = 2012")
df_oro_2013 = gold_delta.where("Order_Year = 2013")
df_oro_2014 = gold_delta.where("Order_Year = 2014")

# Preview the 2011 and 2014 slices.
display(df_oro_2011)
display(df_oro_2014)


### KPIs, agregaciones, promedios, etc.

In [0]:
from pyspark.sql.functions import sum

# KPI: total sales per order year.
# `sum` is pyspark.sql.functions.sum (imported above; it shadows the builtin).
total_sales = sum("Sales").alias("Total_Sales")
df_ventas_totales_por_anio = df_oro.groupBy("Order_Year").agg(total_sales)

display(df_ventas_totales_por_anio)

# Persist the KPI in both formats, replacing any previous output.
# NOTE(review): the CSV is written without a header row (no "header" option set).
df_ventas_totales_por_anio.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/df_ventas_totales_por_anio/delta_tables",
    format="delta",
    mode="overwrite",
)
df_ventas_totales_por_anio.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/df_ventas_totales_por_anio/csv",
    format="csv",
    mode="overwrite",
)

In [0]:
# KPI: total profit per order year.
# Relies on `sum` from pyspark.sql.functions having been imported by an earlier cell.
total_profit = sum("Profit").alias("Total_Profit")
df_kpi_profit = df_oro.groupBy("Order_Year").agg(total_profit)

display(df_kpi_profit)

# Persist the KPI as Delta and as CSV, replacing any previous output.
df_kpi_profit.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/df_kpi_profit/delta_tables",
    format="delta",
    mode="overwrite",
)
df_kpi_profit.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/df_kpi_profit/csv",
    format="csv",
    mode="overwrite",
)

In [0]:
from pyspark.sql.functions import avg

# KPI: average of the Profit_Margin column per order year.
avg_margin = avg("Profit_Margin").alias("Avg_Profit_Margin")
df_margen_promedio_ganancia = df_oro.groupBy("Order_Year").agg(avg_margin)

display(df_margen_promedio_ganancia)

# Persist the KPI as Delta and as CSV, replacing any previous output.
df_margen_promedio_ganancia.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/margen_promedio_ganancia/delta_tables",
    format="delta",
    mode="overwrite",
)
df_margen_promedio_ganancia.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/margen_promedio_ganancia/csv",
    format="csv",
    mode="overwrite",
)

In [0]:
from pyspark.sql.functions import countDistinct

# KPI: number of distinct Customer_ID values per order year.
unique_customers = countDistinct("Customer_ID").alias("Unique_Customers")
df_clientes_unicos = df_oro.groupBy("Order_Year").agg(unique_customers)

display(df_clientes_unicos)

# Persist the KPI as Delta and as CSV, replacing any previous output.
df_clientes_unicos.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/clientes_unicos/delta_tables",
    format="delta",
    mode="overwrite",
)
df_clientes_unicos.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/clientes_unicos/csv",
    format="csv",
    mode="overwrite",
)

In [0]:
from pyspark.sql.functions import desc

# KPI: the 10 products with the most units sold (sum of Quantity per Product_Name).
#
# Product_Name is added as a secondary sort key so the ordering is total: without
# it, products tied on Total_Units_Sold at the cut-off could be picked differently
# by limit(10) in each of the actions below (display, Delta write, CSV write),
# leaving the two exports out of sync.
df_top_products = (
    df_oro.groupBy("Product_Name")
    .agg(sum("Quantity").alias("Total_Units_Sold"))
    .orderBy(desc("Total_Units_Sold"), "Product_Name")
    .limit(10)
)

display(df_top_products)

# Optional cross-check of the same KPI using SQL syntax (kept disabled):
#
# df_oro.createOrReplaceTempView("df_oro")
#
# spark.sql(
#     """
#         SELECT Product_Name, SUM(Quantity) AS Total_Units_Sold
#         FROM df_oro
#         GROUP BY Product_Name
#         ORDER BY Total_Units_Sold DESC, Product_Name
#         LIMIT 10
#     """
# ).display()

# Persist the KPI as Delta and as CSV, replacing any previous output.
df_top_products.write.format("delta").mode("overwrite").save("/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/top_products/delta_tables")
df_top_products.write.format("csv").mode("overwrite").save("/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/top_products/csv")


In [0]:
# KPI: total profit and average profit margin per country.
# Relies on `sum` and `avg` from pyspark.sql.functions imported by earlier cells.
country_metrics = [
    sum("Profit").alias("Total_Profit"),
    avg("Profit_Margin").alias("Avg_Profit_Margin"),
]
df_profit_por_pais = df_oro.groupBy("Country").agg(*country_metrics)

display(df_profit_por_pais)

# Persist the KPI as Delta and as CSV, replacing any previous output.
df_profit_por_pais.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/profit_por_pais/delta_tables",
    format="delta",
    mode="overwrite",
)
df_profit_por_pais.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/profit_por_pais/csv",
    format="csv",
    mode="overwrite",
)


### KPIs generales por año (ventas, ganancia, margen promedio y clientes únicos)

In [0]:
from pyspark.sql.functions import sum, avg, countDistinct

# Combined yearly KPI table: sales, profit, average margin and distinct customers
# per order year, sorted by total profit in descending order.
yearly_metrics = [
    sum("Sales").alias("Total_Sales"),
    sum("Profit").alias("Total_Profit"),
    avg("Profit_Margin").alias("Avg_Profit_Margin"),
    countDistinct("Customer_ID").alias("Unique_Customers"),
]
df_kpis_general = (
    df_oro.groupBy("Order_Year")
    .agg(*yearly_metrics)
    .sort("Total_Profit", ascending=False)
)

display(df_kpis_general)

# Persist the KPI table as Delta and as CSV, replacing any previous output.
df_kpis_general.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/kpis_general/delta_tables",
    format="delta",
    mode="overwrite",
)
df_kpis_general.write.save(
    "/mnt/datalake/globalsalesmarketmedallon/global_sales_oro/kpis_general/csv",
    format="csv",
    mode="overwrite",
)


### Fin capa oro (Gold y KPIs)