In [16]:
# Daily aggregates from the Fabric gold-layer sales fact table, one row per
# movement_date. Returned net sales, cost of goods sold and returned quantity
# are multiplied by -1 inside the query; every output column carries a
# "_fabric" suffix.
daily_sales_query = """
    SELECT count(distinct store_id_from) as store_qty_fabric, sum(gross_sales) as gross_sales_fabric,
    sum(net_sales) as net_sales_fabric, sum(returned_net_sale) * -1 as returned_net_sale_fabric,
    sum(cost_of_goods_sold) * -1 as cost_of_goods_sold_fabric,sum(units_sold) as units_sold_fabric,
    sum(returned_qty) * -1 as returned_qty_fabric, movement_date as movement_date_fabric
    FROM DEV_GLD_LH.fact_sales 
    --where movement_date = '2022-01-01'
    group by movement_date
    order by movement_date
"""
date = spark.sql(daily_sales_query)

# Persist the aggregates as comma-delimited CSV with a header row, replacing
# whatever a previous run left at the same path.
(
    date.write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", ",")
    .csv('Files/csv/fact_sales.csv')
)

## display(date)

StatementMeta(, 8ad9ca73-0195-4f1f-bf24-f30e0180dcaa, 18, Finished, Available, Finished)

---

In [26]:
# Read back the per-day Fabric aggregates from "Files/csv/fact_sales.csv".
# No schema is supplied or inferred, so every column arrives as a string.
fabric_csv = spark.read.format("csv").option("header", "true").load("Files/csv/fact_sales.csv")

# Cast explicitly instead of relying on implicit string coercion downstream:
#   * the date column becomes a real DATE, so it can be joined against a
#     date-typed key without depending on string/date comparison rules;
#   * the measures become DOUBLE, so arithmetic on them is numeric by
#     construction rather than by implicit promotion.
# The original column order is kept; only the date column is renamed to
# "movement_date".
fabric = fabric_csv.selectExpr(
    "CAST(store_qty_fabric AS DOUBLE) AS store_qty_fabric",
    "CAST(gross_sales_fabric AS DOUBLE) AS gross_sales_fabric",
    "CAST(net_sales_fabric AS DOUBLE) AS net_sales_fabric",
    "CAST(returned_net_sale_fabric AS DOUBLE) AS returned_net_sale_fabric",
    "CAST(cost_of_goods_sold_fabric AS DOUBLE) AS cost_of_goods_sold_fabric",
    "CAST(units_sold_fabric AS DOUBLE) AS units_sold_fabric",
    "CAST(returned_qty_fabric AS DOUBLE) AS returned_qty_fabric",
    "CAST(movement_date_fabric AS DATE) AS movement_date",
)
display(fabric)

StatementMeta(, 8ad9ca73-0195-4f1f-bf24-f30e0180dcaa, 28, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, db95cd0d-5f47-4905-88a5-250fe040d7d6)

In [9]:
import pandas as pd

# Import the functions module under an alias so that Spark's abs() does not
# replace Python's builtin abs() for the rest of the notebook session.
from pyspark.sql import functions as F
from pyspark.sql.functions import to_date, col
from pyspark.sql.types import NumericType

# Load the reference workbook bundled with the notebook resources and hand it
# to Spark. `df` is kept as the Spark DataFrame name for backward
# compatibility; the pandas frame gets its own name instead of sharing `df`.
isra_pdf = pd.read_excel(f"{notebookutils.nbResPath}/builtin/Sales_Isra.xlsx")
df = spark.createDataFrame(isra_pdf)
isra = df.drop('year', 'month', 'day_of_month')

# 1. Normalise the date column to a Spark date.
isra = isra.withColumn("movement_date_isra", to_date("movement_date_isra"))

# 2. Collect the numeric columns (the date column is excluded explicitly).
numeric_cols = [
    field.name
    for field in isra.schema.fields
    if isinstance(field.dataType, NumericType) and field.name != "movement_date_isra"
]

# 3. Take the absolute value of every numeric column in ONE projection.
#    Calling withColumn() in a loop adds a projection per call and can produce
#    very large query plans; a single select() keeps the plan flat and
#    preserves the original column order.
numeric_names = set(numeric_cols)
isra = isra.select(
    *[
        F.abs(col(name)).alias(name) if name in numeric_names else col(name)
        for name in isra.columns
    ]
)

isra = isra.withColumnRenamed('movement_date_isra', 'movement_date')
display(isra)

StatementMeta(, 83c932a7-acf9-4d91-836b-4d1095996891, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e761ce54-b62d-4065-8bc0-60af8d7f8927)

In [10]:
# Print the column names, data types and nullability of the `isra` DataFrame.
isra.printSchema()

StatementMeta(, 83c932a7-acf9-4d91-836b-4d1095996891, 12, Finished, Available, Finished)

root
 |-- movement_date: date (nullable = true)
 |-- store_qty_isra: long (nullable = true)
 |-- gross_sales_isra: double (nullable = true)
 |-- net_sales_isra: double (nullable = true)
 |-- returned_net_sale_isra: double (nullable = true)
 |-- cost_of_goods_sold_isra: double (nullable = true)
 |-- units_sold_isra: long (nullable = true)
 |-- returned_qty_isra: long (nullable = true)



In [29]:
from pyspark.sql.functions import col

# Explicit projections of both sources ahead of the join. No column is
# actually renamed here: the "_isra" / "_fabric" suffixes already keep the
# measure names distinct, and "movement_date" is the shared key.
isra_columns = (
    "movement_date",
    "store_qty_isra",
    "gross_sales_isra",
    "net_sales_isra",
    "returned_net_sale_isra",
    "cost_of_goods_sold_isra",
    "units_sold_isra",
    "returned_qty_isra",
)
fabric_columns = (
    "movement_date",
    "store_qty_fabric",
    "gross_sales_fabric",
    "net_sales_fabric",
    "returned_net_sale_fabric",
    "cost_of_goods_sold_fabric",
    "units_sold_fabric",
    "returned_qty_fabric",
)

isra_renamed = isra.select(*[col(name) for name in isra_columns])
fabric_renamed = fabric.select(*[col(name) for name in fabric_columns])


StatementMeta(, 8ad9ca73-0195-4f1f-bf24-f30e0180dcaa, 31, Finished, Available, Finished)

In [30]:
from pyspark.sql.functions import expr

# Pair up both sources on the shared date key.
# NOTE(review): this is an inner join, so a movement_date present in only one
# of the two sources does not appear in the result at all - confirm that is
# the intended behaviour for this comparison.
joined_df = fabric_renamed.join(isra_renamed, how="inner", on="movement_date")

# (fabric column, isra column, output alias) for every compared measure, in
# the order the difference columns should appear.
metric_pairs = [
    ("gross_sales_fabric", "gross_sales_isra", "diff_gross_sales"),
    ("net_sales_fabric", "net_sales_isra", "diff_net_sales"),
    ("returned_net_sale_fabric", "returned_net_sale_isra", "diff_returned_net_sales"),
    ("cost_of_goods_sold_fabric", "cost_of_goods_sold_isra", "diff_cost_of_goods_sold"),
    ("units_sold_fabric", "units_sold_isra", "diff_units_sold"),
    ("returned_qty_fabric", "returned_qty_isra", "diff_returned_qty"),
    ("store_qty_fabric", "store_qty_isra", "diff_store_qty"),
]

# Each difference is computed as fabric minus isra.
diff_columns = [
    (col(fabric_name) - col(isra_name)).alias(alias)
    for fabric_name, isra_name, alias in metric_pairs
]

diff_df = joined_df.select("movement_date", *diff_columns)


StatementMeta(, 8ad9ca73-0195-4f1f-bf24-f30e0180dcaa, 32, Finished, Available, Finished)

In [31]:
# Render the per-date differences (fabric minus isra) for every joined date.
display(diff_df)

StatementMeta(, 8ad9ca73-0195-4f1f-bf24-f30e0180dcaa, 33, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f9b11630-70b6-4045-b316-2ba48261c2f7)

In [33]:
from pyspark.sql.functions import col

# Difference columns to inspect, in the order they are OR-ed together.
diff_names = [
    "diff_gross_sales",
    "diff_net_sales",
    "diff_returned_net_sales",
    "diff_cost_of_goods_sold",
    "diff_units_sold",
    "diff_returned_qty",
    "diff_store_qty",
]

# Fold the per-column "!= 0" checks into a single OR predicate, so a row is
# kept as soon as any one measure differs between the two sources.
any_mismatch = None
for name in diff_names:
    is_nonzero = col(name) != 0
    any_mismatch = is_nonzero if any_mismatch is None else (any_mismatch | is_nonzero)

filtered_diff_df = diff_df.where(any_mismatch)

display(filtered_diff_df)

StatementMeta(, 8ad9ca73-0195-4f1f-bf24-f30e0180dcaa, 35, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8ded8f1a-d60e-417a-8fef-2bd8e22d3361)

---

---