In [1]:
from gcpspark import *
import pyspark.sql.functions as F
from datetime import datetime, timedelta
import requests
from pyspark import StorageLevel
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import *
import os

Dotenv installed
Loaded variables .env True
Pyspark installed
JAVA:  /usr/lib/jvm/temurin-8-jdk-amd64
DATAPROC:  True
Current GCP Project Name: cencosudx
Current ENVIRONMENT: staging
<class 'Exception'>


In [2]:
# Fetch the latest USD-based exchange rates and keep the COP rate.
url = "https://api.exchangerate-api.com/v4/latest/USD"
# A timeout keeps the notebook from hanging indefinitely if the API stalls.
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of trying to parse an error payload.
response.raise_for_status()
data = response.json()
# COP rate against the USD base; later cells divide COP amounts by this value.
USD = float(data["rates"]["COP"])
USD

4192.63

In [3]:
# Create the Spark session for this job.
# NOTE(review): `create_pyspark` presumably comes from the `gcpspark` star import - confirm.
spark = create_pyspark(name="co-contribucion5", connection="gcp")

File already exists at: /jars/spark-bigquery-with-dependencies_2.12-0.26.0.jar
File already exists at: /jars/gcs-connector-hadoop3-2.2.19.jar
Process: co-contribucion5_1746117729475


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/01 16:42:13 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
25/05/01 16:42:13 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
25/05/01 16:42:13 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
25/05/01 16:42:13 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [4]:
# Load the CO sales and alerts parquet datasets from the staging contribution bucket.
final_sales = spark.read.parquet("gs://staging-da-contribution/CO/final_sales")
final_alerts = spark.read.parquet("gs://staging-da-contribution/CO/final_alerts")

                                                                                

In [5]:
# Print both schemas to check column names and types before joining.
final_sales.printSchema()
final_alerts.printSchema()

root
 |-- EAN: string (nullable = true)
 |-- Item_Id: string (nullable = true)
 |-- Location_Id: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Hour_Sales: integer (nullable = true)
 |-- Sales_Channel: string (nullable = true)
 |-- Quantity_Sales: double (nullable = true)
 |-- NetAmount: double (nullable = true)
 |-- NetCost: double (nullable = true)
 |-- Contribution: double (nullable = true)
 |-- Margen: double (nullable = true)

root
 |-- EAN: string (nullable = true)
 |-- Item_Id: string (nullable = true)
 |-- Hour_Alert: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Location_Id: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Sector: string (nullable = true)



In [6]:
def f_type(df, name):
    """Return the rows of ``df`` whose ``Type`` column equals ``name``."""
    return df.filter(F.col("Type") == name)
def f_type_array(df, name):
    """Keep alerts whose ``Type`` is in ``name`` and collapse them per key.

    The result has one row per (EAN, Item_Id, Date, Location_Id) with:
      - Type: the set of distinct alert types in the group,
      - Hour_Alert: the smallest alert hour in the group,
      - Sector: the first Sector value seen for the group.
    """
    selected = df.filter(F.col("Type").isin(name))
    per_key = selected.groupBy('EAN', 'Item_Id', 'Date', 'Location_Id')
    return per_key.agg(
        F.collect_set("Type").alias("Type"),
        F.min("Hour_Alert").alias("Hour_Alert"),
        F.first("Sector").alias("Sector"),
    )
def f_type_exclusive(df, name=None):
    """Collapse alerts to one row per (EAN, Item_Id, Date, Location_Id).

    Args:
        df: alerts DataFrame with EAN, Item_Id, Date, Location_Id, Type,
            Hour_Alert and Sector columns.
        name: optional collection of alert types to keep. When omitted
            (``None``) no type filter is applied and every alert is grouped.

    Returns:
        A DataFrame with the grouping keys plus Type (set of distinct types),
        Hour_Alert (minimum hour) and Sector (first value in the group).
    """
    # `name` is an explicit optional argument; the body previously read an
    # undefined `name`, so every call raised NameError.
    if name is not None:
        df = df.filter(F.col("Type").isin(name))
    return df.groupBy('EAN', 'Item_Id', 'Date', 'Location_Id').agg(
        F.collect_set("Type").alias("Type"),
        F.min("Hour_Alert").alias("Hour_Alert"),
        F.first("Sector").alias("Sector"),
    )

In [7]:

# Rank the alerts of each (EAN, Item_Id, Date, Location_Id) group by ascending Hour_Alert.
window_spec = (
    Window
    .partitionBy('EAN', 'Item_Id', 'Date', 'Location_Id')
    .orderBy(F.col('Hour_Alert'))
)
ranked_df = final_alerts.withColumn('rank', F.row_number().over(window_spec))

# Keep only the earliest alert of each group (rank 1) and drop the helper column.
# NOTE(review): alerts tied on Hour_Alert have no further ordering, so which of
# them gets rank 1 (and therefore which Type/Sector survives) may vary between runs.
final_alerts_t = ranked_df.where(F.col('rank') == 1).drop('rank')

In [8]:
# Remove exact duplicate rows from both inputs before joining.
# Both names are rebound in place, so later cells see the de-duplicated frames.
final_sales = final_sales.distinct()
final_alerts_t = final_alerts_t.distinct()

In [9]:

# Give both inputs named aliases before joining them.
final_alerts_t = final_alerts_t.alias("final_alerts_t")
final_sales = final_sales.alias("final_sales")

# Same-day matches: pair every alert with the sales of the same EAN, Date and
# Location_Id. Item_Id is not a join key; the output keeps the sales-side Item_Id.
# Hour_End is 1 when the sale hour is strictly later than the alert hour, else 0.
df_contribution_0_day = (
    final_alerts_t
    .join(final_sales, ['EAN', 'Date', 'Location_Id'], 'inner')
    .select(
        final_alerts_t["EAN"].alias("EAN"),
        final_sales["Item_Id"].alias("Item_Id"),
        final_alerts_t["Date"].alias("Date"),
        final_alerts_t["Hour_Alert"].alias("Hour_Alert"),
        final_alerts_t["Location_Id"].alias("Location_Id"),
        final_alerts_t["Type"].alias("Type"),
        final_alerts_t["Sector"].alias("Sector"),
        final_sales["Hour_Sales"].alias("Hour_Sales"),
        final_sales["Quantity_Sales"].alias("Quantity_Sales"),
        final_sales["NetAmount"].alias("NetAmount"),
        final_sales["Contribution"].alias("Contribution"),
        final_sales["NetCost"].alias("NetCost"),
    )
    .withColumn(
        "Hour_End",
        F.when(F.col("Hour_Alert") < F.col("Hour_Sales"), 1).otherwise(0),
    )
    .withColumn("Modalidad", F.lit('Cierre local'))
)

    

In [10]:
# Check the schema of the same-day result (Date comes from the alerts side, as a string).
df_contribution_0_day.printSchema()


root
 |-- EAN: string (nullable = true)
 |-- Item_Id: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Hour_Alert: integer (nullable = true)
 |-- Location_Id: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- Hour_Sales: integer (nullable = true)
 |-- Quantity_Sales: double (nullable = true)
 |-- NetAmount: double (nullable = true)
 |-- Contribution: double (nullable = true)
 |-- NetCost: double (nullable = true)
 |-- Hour_End: integer (nullable = false)
 |-- Modalidad: string (nullable = false)



In [11]:
# Second part: sales whose transaction date is one day after the alert date.
from pyspark.sql.functions import col, expr, date_add
from pyspark.sql.types import DateType

# On the alerts side, Date_End is simply a copy of the alert Date.
final_alerts_t = final_alerts_t.withColumn("Date_End", F.col("Date"))

# On the sales side, Date_End is the sale date minus one day, stored as a string,
# so a join on Date_End pairs each alert with the sales of the following day.
final_sales = (
    final_sales
    .withColumn("Date", expr("CAST(Date AS DATE)"))
    .withColumn("Date_End", date_add(col("Date"), -1))
    .withColumn("Date_End", col("Date_End").cast("string"))
)

In [12]:
# Second part: sales whose transaction date is one day after the alert date.
# The join uses Date_End (alert date on the alerts side, sale date minus one day
# on the sales side) together with EAN and Location_Id; Item_Id is not a join key
# and the output keeps the sales-side Item_Id.
df_contribution_1_day = (
    final_alerts_t
    .join(final_sales, ['EAN', 'Date_End', 'Location_Id'], 'inner')
    .select(
        final_alerts_t["EAN"].alias("EAN"),
        final_sales["Item_Id"].alias("Item_Id"),
        final_alerts_t["Date"].alias("Date"),
        final_alerts_t["Hour_Alert"].alias("Hour_Alert"),
        final_alerts_t["Location_Id"].alias("Location_Id"),
        final_alerts_t["Type"].alias("Type"),
        final_alerts_t["Sector"].alias("Sector"),
        final_sales["Hour_Sales"].alias("Hour_Sales"),
        final_sales["Quantity_Sales"].alias("Quantity_Sales"),
        final_sales["NetAmount"].alias("NetAmount"),
        final_sales["Contribution"].alias("Contribution"),
        final_sales["NetCost"].alias("NetCost"),
    )
)

In [13]:
# Second part: transactions during the day ("todo el dia" modality).
from pyspark.sql.functions import lit
# NOTE(review): when the cells run in order, Date_End on the alerts side was set
# to Date in an earlier cell, so this assignment leaves Date unchanged - confirm
# whether a different date shift was intended here.
final_alerts_t = final_alerts_t.withColumn("Date",F.col("Date_End"))
#df_contribution_all_day = final_alerts_t.join(final_sales, ['EAN','Item_Id','Date_End','Location_Id'], 'inner')
# NOTE(review): this join uses the same inputs and keys as the one that builds
# df_contribution_1_day, so both frames appear to contain the same rows.
df_contribution_all_day = final_alerts_t.join(final_sales, ['EAN','Date_End','Location_Id'], 'inner')
df_contribution_all_day = df_contribution_all_day.select(final_alerts_t.EAN.alias("EAN"),
                                    final_sales.Item_Id.alias("Item_Id"),
                                    final_alerts_t.Date.alias("Date"),
                                    final_alerts_t.Hour_Alert.alias("Hour_Alert"),
                                    final_alerts_t.Location_Id.alias("Location_Id"),
                                    final_alerts_t.Type.alias("Type"),
                                    final_alerts_t.Sector.alias("Sector"),
                                    final_sales.Hour_Sales.alias("Hour_Sales"),
                                    final_sales.Quantity_Sales.alias("Quantity_Sales"),
                                    final_sales.NetAmount.alias("NetAmount"),
                                    final_sales.Contribution.alias("Contribution"),
                                    final_sales.NetCost.alias("NetCost"),
                                          )
# Combine the two DataFrames into a single one.
# NOTE(review): because df_contribution_all_day appears to hold the same rows as
# df_contribution_1_day, this union looks like it doubles the '24 horas' rows; the
# displayed results show 'premerma' / '24 horas' at twice 'premerma' / 'todo el dia'.
# Confirm which frame was meant to be combined here.
# The union happens before Hour_End/Modalidad are added, so both sides still have
# the same 12 columns in the same order.
df_contribution_1_day = df_contribution_1_day.union(df_contribution_all_day)
# Every row of these two modalities is flagged Hour_End = 1 (no hour comparison).
df_contribution_1_day = df_contribution_1_day.withColumn("Hour_End", lit(1))
df_contribution_1_day = df_contribution_1_day.withColumn("Modalidad", F.lit('24 horas'))

df_contribution_all_day = df_contribution_all_day.withColumn("Hour_End", lit(1))
df_contribution_all_day = df_contribution_all_day.withColumn("Modalidad", F.lit('todo el dia'))

In [14]:
# Stack the three modalities into one frame (union matches columns by position)
# and add USD versions of the monetary columns by dividing by the USD rate.
df_contribution = (
    df_contribution_0_day
    .union(df_contribution_1_day)
    .union(df_contribution_all_day)
    .withColumn("Contribution_USD", F.col("Contribution") / USD)
    .withColumn("NetAmount_USD", F.col("NetAmount") / USD)
    .withColumn("NetCost_USD", F.col("NetCost") / USD)
)


In [15]:
# Monthly totals per alert Type and Modalidad, restricted to rows flagged
# Hour_End == 1, newest month first, collected to the driver as a pandas DataFrame.
df_contribution_result = (
    df_contribution
    .where(F.col("Hour_End") == 1)
    .groupBy(F.year("Date"), F.month("Date"), F.col("Type"), F.col("Modalidad"))
    .agg(
        F.sum("NetAmount").alias("NetAmount"),
        F.sum("NetAmount_USD").alias("NetAmount_USD"),
        F.sum("Contribution").alias("Contribution"),
        F.sum("Contribution_USD").alias("Contribution_USD"),
    )
    .orderBy(F.year("Date").desc(), F.month("Date").desc())
    .toPandas()
)


                                                                                

In [16]:
# Show the aggregated result.
# NOTE(review): `display` may be IPython's or one provided by the `gcpspark` star
# import; the meaning of the second argument (50) is not visible here - confirm.
display(df_contribution_result,50)

Unnamed: 0,year(Date),month(Date),Type,Modalidad,NetAmount,NetAmount_USD,Contribution,Contribution_USD
0,2025,4,umv,todo el dia,709698000.0,169272.758333,74368982.0,17738.026489
1,2025,4,umq,Cierre local,54117510.0,12907.770827,-3373857.0,-804.711363
2,2025,4,umv,Cierre local,278351800.0,66390.749183,34306921.0,8182.673167
3,2025,4,premerma,24 horas,479820500.0,114443.801086,-44130238.0,-10525.669568
4,2025,4,apr,24 horas,33218820.0,7923.145138,2881582.0,687.296995
5,2025,4,recepmer,Cierre local,207339400.0,49453.293517,9761878.0,2328.342353
6,2025,4,premerma,todo el dia,239910300.0,57221.900543,-22065119.0,-5262.834784
7,2025,4,umq,24 horas,196840800.0,46949.235905,-19992198.0,-4768.414575
8,2025,4,recepmer,todo el dia,824478800.0,196649.54618,43919272.0,10475.351271
9,2025,4,apr,Cierre local,3167186.0,755.417483,322546.0,76.931663


# NOTE(review): `df_contribution_result3` is not defined anywhere in the visible
# notebook, so this call would raise NameError on a fresh run - confirm whether
# it should reference `df_contribution_result` or be removed.
display(df_contribution_result3)

In [17]:

# Export the result as a semicolon-separated CSV with a decimal comma.
# The pandas index is written too (no index=False), as an unnamed first column.
df_contribution_result.to_csv('tmp_alertas_co.csv' , sep=';', decimal=',', header=True)

In [18]:
# Completion marker for the run.
print("done")

done


In [19]:
# Shut down the Spark session now that the results are collected and exported.
spark.stop()