In [0]:
%run ../../config/variables

In [0]:
# Name of the silver-layer sales table.
ORIGIN_TABLE = "slv_sales"

# Names of the gold-layer views created by the cells below
# (each is used as the target of a CREATE OR REPLACE VIEW statement).
BASE_DF_TARGET_TABLE = "gld_sales"
SALES_PER_DISTRICT_TARGET_TABLE = "gld_sales_districts"
SALES_PER_EMPLOYEE_TARGET_TABLE = "gld_sales_employees"
CUSTOMERS_PER_DISTRICT_TARGET_TABLE = "gld_customers_districts"
BEST_CUSTOMERS_TARGET_TABLE = "gld_best_customers"
BEST_EMPLOYEES_TARGET_TABLE = "gld_best_employees"

In [0]:
#silver_df = spark.readStream.format('delta').option("maxFilesPerTrigger", 10).table(f"{catalog_name}.{silver_schema_name}.{ORIGIN_TABLE}")
#silver_df=spark.read.table(f"{catalog_name}.{silver_schema_name}.{ORIGIN_TABLE}")
#silver_df.createOrReplaceTempView("sales")

# Read Streaming

## Base query

In [0]:
# Base gold view: one row per sales order, enriched with the customer and
# employee names. Every other gold view in this notebook selects from it.
#
# - Both joins are inner joins, so a sale whose customer_id or employee_id has
#   no match in slv_customers / slv_employees is dropped from the view.
# - event_date is cast to DATE and explicitly aliased. The downstream views
#   reference b.event_date, so the column name must not depend on the name
#   Spark auto-generates for an unaliased expression.
# - The sales table name comes from ORIGIN_TABLE (constants cell) instead of a
#   second hard-coded copy of the same string.
# - catalog_name, gold_schema_name and silver_schema_name are not defined in
#   this notebook's visible cells; presumably they come from the %run of
#   ../../config/variables -- confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE} AS
SELECT
    s.order_id
    ,e.name AS employee
    ,c.name AS customer
    ,s.latitude
    ,s.longitude
    ,s.district
    ,s.quantity_products
    ,s.partition_date
    ,s.event_day
    ,s.event_month
    ,s.event_year
    ,date(s.event_date) AS event_date
FROM
    {catalog_name}.{silver_schema_name}.{ORIGIN_TABLE} AS s
    JOIN
        {catalog_name}.{silver_schema_name}.slv_customers AS c
    ON
        s.customer_id = c.customer_id
    JOIN
        {catalog_name}.{silver_schema_name}.slv_employees AS e
    ON
        s.employee_id = e.employee_id
"""
)
#base_df.createOrReplaceTempView("base_df")

## Total date's sales per district

In [0]:
# Daily sales per district: creates (or replaces) the view named by
# SALES_PER_DISTRICT_TARGET_TABLE in the gold schema, reading from the base
# gold view (BASE_DF_TARGET_TABLE). One row per (event_date, district).
#
# Columns:
#   total_sales_per_day           - number of order_ids for the district/date
#   sales_rank                    - dense rank of that count within the date
#                                   (1 = most orders; ties share a rank)
#   percent_total_sales_per_day   - district's share of the date's order count,
#                                   in percent, rounded to 2 decimals
#   total_products_sold_per_day   - sum of quantity_products
#   percent_products_sold_per_day - district's share of the date's product
#                                   quantity, in percent, rounded to 2 decimals
#
# The window functions are evaluated over the grouped rows, which is why the
# aggregates appear nested (e.g. SUM(COUNT(...)) OVER ...).
# catalog_name / gold_schema_name are not defined in the visible cells;
# presumably they come from the %run of ../../config/variables -- confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.{SALES_PER_DISTRICT_TARGET_TABLE} AS
SELECT 
    b.event_date
    ,b.district
    ,COUNT(b.order_id) as total_sales_per_day
    ,DENSE_RANK() OVER (PARTITION BY b.event_date ORDER BY COUNT(b.order_id) DESC) as sales_rank
    ,ROUND(
        COUNT(b.order_id) * 100.0 / 
        SUM(COUNT(b.order_id)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_total_sales_per_day
    ,SUM(b.quantity_products) as  total_products_sold_per_day
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_products_sold_per_day
FROM 
    {catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE} b
GROUP BY
    b.event_date, b.district
"""
)

## Total date's sales per employee

In [0]:
# Daily sales per employee: creates (or replaces) the view named by
# SALES_PER_EMPLOYEE_TARGET_TABLE in the gold schema, reading from the base
# gold view (BASE_DF_TARGET_TABLE). One row per (employee, event_date).
#
# Columns:
#   employee_effectiveness        - SUM(quantity_products) * 100.0 /
#                                   COUNT(order_id), rounded to 2 decimals,
#                                   i.e. products per order scaled by 100
#   effectiveness_rank            - dense rank within the date by products per
#                                   order, descending; it uses a 1.0 factor
#                                   instead of 100.0, which does not change
#                                   the ordering
#   total_sales_per_day           - number of order_ids for the employee/date
#   sales_rank                    - dense rank of that count within the date
#   percent_total_sales_per_day   - employee's share of the date's order count,
#                                   in percent, rounded to 2 decimals
#   total_products_sold_per_day   - sum of quantity_products
#   percent_products_sold_per_day - employee's share of the date's product
#                                   quantity, in percent, rounded to 2 decimals
#
# NOTE(review): employee_effectiveness is a products-per-order average
# multiplied by 100, not a share of a total -- confirm the x100 scaling is
# intended.
# catalog_name / gold_schema_name are not defined in the visible cells;
# presumably they come from the %run of ../../config/variables -- confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.{SALES_PER_EMPLOYEE_TARGET_TABLE} AS
SELECT 
    b.event_date
    ,b.employee
    ,ROUND(SUM(b.quantity_products)*100.0/COUNT(b.order_id),2) as employee_effectiveness
    ,DENSE_RANK() OVER (PARTITION BY b.event_date ORDER BY SUM(b.quantity_products) * 1.0 / COUNT(b.order_id) DESC) as effectiveness_rank
    ,COUNT(b.order_id) as  total_sales_per_day
    ,DENSE_RANK() OVER (PARTITION BY b.event_date ORDER BY COUNT(b.order_id) DESC) as sales_rank
    ,ROUND(
        COUNT(b.order_id) * 100.0 / 
        SUM(COUNT(b.order_id)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_total_sales_per_day
    ,SUM(b.quantity_products) as  total_products_sold_per_day
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_products_sold_per_day
FROM 
    {catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE} b
GROUP BY
    b.employee, b.event_date
"""
)

## Unique customers per year/month and district

In [0]:
# Monthly customer/employee/product statistics per district: creates (or
# replaces) the view named by CUSTOMERS_PER_DISTRICT_TARGET_TABLE in the gold
# schema, reading from the base gold view (BASE_DF_TARGET_TABLE).
# One row per (event_year, event_month, district).
#
# For each of customers, employees and products the view exposes:
#   - the total for the district/month,
#   - the district's share of the month's total in percent (2 decimals),
#   - the difference against the district's previous row ordered by
#     (event_year, event_month). LAG's default is the current value, so the
#     first row of every district reports a difference of 0.
#     NOTE(review): "previous row" is the previous month in which the district
#     has data, which is not necessarily the preceding calendar month.
#
# Ratio columns:
#   employees_per_customer_ratio - LEGACY NAME. The expression is
#       distinct customers / distinct employees, i.e. customers per employee;
#       the name is inverted. Kept unchanged so existing consumers do not
#       break.
#   customers_per_employee_ratio - same value under a name that matches the
#       computation. Appended as the last column so positional consumers of
#       the existing columns are unaffected. New consumers should use this.
#
# catalog_name / gold_schema_name are not defined in the visible cells;
# presumably they come from the %run of ../../config/variables -- confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.{CUSTOMERS_PER_DISTRICT_TARGET_TABLE} AS
SELECT
  b.event_year
  ,b.event_month
  ,b.district
  ,COUNT(DISTINCT b.customer) AS total_customers_per_district
  ,ROUND(
        COUNT(DISTINCT b.customer) * 100.0 /
        SUM(COUNT(DISTINCT b.customer)) OVER (PARTITION BY b.event_year,b.event_month),
        2
    ) AS percent_total_customers_per_district
  ,COUNT(DISTINCT b.customer) -
  LAG(COUNT(DISTINCT b.customer),1, COUNT(DISTINCT b.customer)) OVER (PARTITION BY b.district ORDER BY b.event_year,b.event_month) AS difference_in_customers_from_last_month
  ,COUNT(DISTINCT b.employee) AS total_employees_per_district
  ,ROUND(
        COUNT(DISTINCT b.employee) * 100.0 /
        SUM(COUNT(DISTINCT b.employee)) OVER (PARTITION BY b.event_year,b.event_month),
        2
    ) AS percent_total_employees_per_district
  ,COUNT(DISTINCT b.employee) -
  LAG(COUNT(DISTINCT b.employee),1, COUNT(DISTINCT b.employee)) OVER (PARTITION BY b.district ORDER BY b.event_year,b.event_month) AS difference_in_employees_from_last_month
  ,ROUND(COUNT(DISTINCT b.customer)/COUNT(DISTINCT b.employee),0) AS employees_per_customer_ratio
  ,SUM(b.quantity_products) AS total_products_sold
  ,ROUND(
        SUM(b.quantity_products) * 100.0 /
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_year,b.event_month),
        2
    ) AS percent_total_products_sold
  ,SUM(b.quantity_products) -
  LAG(SUM(b.quantity_products),1, SUM(b.quantity_products)) OVER (PARTITION BY b.district ORDER BY b.event_year,b.event_month) AS difference_in_products_sold_from_last_month
  ,ROUND(COUNT(DISTINCT b.customer)/COUNT(DISTINCT b.employee),0) AS customers_per_employee_ratio
FROM
  {catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE} b
GROUP BY
  b.event_year,b.event_month,b.district
"""
)

## Best customers per year-month

In [0]:
# Best customers per year-month: creates (or replaces) the view named by
# BEST_CUSTOMERS_TARGET_TABLE in the gold schema, reading from the base gold
# view (BASE_DF_TARGET_TABLE). One row per (event_year, event_month, customer).
#
# Columns:
#   total_products_bought         - sum of quantity_products
#   best_customer_rank            - dense rank of that sum within the
#                                   year/month (1 = most products; ties share
#                                   a rank)
#   percent_total_products_bought - customer's share of the month's product
#                                   quantity, in percent, rounded to 2 decimals
#   difference_in_products_bought_from_last_month
#                                 - current sum minus the customer's previous
#                                   row ordered by (event_year, event_month);
#                                   LAG defaults to the current value, so the
#                                   customer's first row reports 0
#
# NOTE(review): the LAG looks at the customer's previous *existing* row, which
# is not necessarily the preceding calendar month if the customer skipped one.
# NOTE(review): rows are grouped by the customer column of the base view; if
# that holds a display name rather than a unique key, distinct customers
# sharing a name would be merged -- confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.{BEST_CUSTOMERS_TARGET_TABLE} AS 
SELECT 
    b.event_year
    ,b.event_month
    ,b.customer
    ,sum(b.quantity_products) as total_products_bought
    ,DENSE_RANK() OVER (PARTITION BY b.event_year,b.event_month ORDER BY sum(b.quantity_products) DESC) as best_customer_rank
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_products_bought
    ,
    SUM(b.quantity_products) - LAG(SUM(b.quantity_products),1,SUM(b.quantity_products)) OVER (PARTITION BY b.customer ORDER BY b.event_year,b.event_month) AS difference_in_products_bought_from_last_month
FROM
    {catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE} b
GROUP BY
    b.event_year,b.event_month,b.customer
"""
)

## Best employee per year-month

In [0]:
# Best employees per year-month: creates (or replaces) the view named by
# BEST_EMPLOYEES_TARGET_TABLE in the gold schema, reading from the base gold
# view (BASE_DF_TARGET_TABLE). One row per (event_year, event_month, employee).
#
# Columns:
#   total_products_sold         - sum of quantity_products
#   best_employee_rank          - dense rank of that sum within the year/month
#                                 (1 = most products; ties share a rank)
#   percent_total_products_sold - employee's share of the month's product
#                                 quantity, in percent, rounded to 2 decimals
#   difference_in_products_sold_from_last_month
#                               - current sum minus the employee's previous row
#                                 ordered by (event_year, event_month); LAG
#                                 defaults to the current value, so the
#                                 employee's first row reports 0
#
# NOTE(review): the LAG looks at the employee's previous *existing* row, which
# is not necessarily the preceding calendar month if the employee had no sales
# in between.
# NOTE(review): rows are grouped by the employee column of the base view; if
# that holds a display name rather than a unique key, distinct employees
# sharing a name would be merged -- confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.{BEST_EMPLOYEES_TARGET_TABLE} AS
SELECT 
    b.event_year
    ,b.event_month
    ,b.employee
    ,sum(b.quantity_products) as total_products_sold
    ,DENSE_RANK() OVER (PARTITION BY b.event_year,b.event_month ORDER BY sum(b.quantity_products) DESC) as best_employee_rank
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_products_sold
    ,
    SUM(b.quantity_products) - LAG(SUM(b.quantity_products),1,SUM(b.quantity_products)) OVER (PARTITION BY b.employee ORDER BY b.event_year,b.event_month) AS difference_in_products_sold_from_last_month
FROM
    {catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE} b
GROUP BY
    b.event_year,b.event_month,b.employee
"""
)

# Write streaming

## Base query

In [0]:
# (base_df.writeStream
#     .format("delta")
#     .outputMode("Complete")
#     .partitionBy("event_date")
#     .option("checkpointLocation", gold_checkpoint_path)
#     .trigger(availableNow=True)
#     .table(f"{catalog_name}.{gold_schema_name}.gld_sales")
#     .awaitTermination()
# )

#base_df.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{gold_schema_name}.{BASE_DF_TARGET_TABLE}")




## Total date's sales per district

In [0]:
# (df_sales_per_district.writeStream
#     .format("delta")
#     .outputMode("complete")
#     .partitionBy("event_date","district")
#     .option("checkpointLocation", gold_checkpoint_path)
#     .trigger(availableNow=True)
#     .table(f"{catalog_name}.{gold_schema_name}.{SALES_PER_DISTRICT_TARGET_TABLE}")
#     .awaitTermination()
# )

#df_sales_per_district.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{gold_schema_name}.{SALES_PER_DISTRICT_TARGET_TABLE}")

## Total date's sales per employee

In [0]:
# (
# df_sales_per_employee.writeStream
#     .format("delta")
#     .outputMode("Complete")
#     .partitionBy("employee","event_date")
#     .option("checkpointLocation", gold_checkpoint_path)
#     .trigger(availableNow=True)
#     .table(f"{catalog_name}.{gold_schema_name}.gld_sales_employees")
#     .awaitTermination()
# )

#df_sales_per_employee.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{gold_schema_name}.{SALES_PER_EMPLOYEE_TARGET_TABLE}")

## Unique customers per year/month and district

In [0]:
# (df_customers_per_district.writeStream
#     .format("delta")
#     .outputMode("append")
#     .partitionBy("event_year", "event_month", "district")
#     .option("checkpointLocation", gold_checkpoint_path)
#     .trigger(availableNow=True)
#     .table(f"{catalog_name}.{gold_schema_name}.{CUSTOMERS_PER_DISTRICT_TARGET_TABLE}")
#     .awaitTermination()
# )


#df_customers_per_district.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{gold_schema_name}.{CUSTOMERS_PER_DISTRICT_TARGET_TABLE}")

## Best customers per year-month

In [0]:
# (df_best_customer.writeStream
#     .format("delta")
#     .outputMode("append")
#     .partitionBy("event_year","event_month","customer")
#     .option("checkpointLocation", gold_checkpoint_path)
#     .trigger(availableNow=True)
#     .table(f"{catalog_name}.{gold_schema_name}.{BEST_CUSTOMERS_TARGET_TABLE}")
#     .awaitTermination()
# )

#df_best_customer.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{gold_schema_name}.{BEST_CUSTOMERS_TARGET_TABLE}")

## Best employee per year-month

In [0]:
# (df_best_employee.writeStream
#     .format("delta")
#     .outputMode("append")
#     .partitionBy("event_year","event_month","employee")
#     .option("checkpointLocation", gold_checkpoint_path)
#     .trigger(availableNow=True)
#     .table(f"{catalog_name}.{gold_schema_name}.{BEST_EMPLOYEES_TARGET_TABLE}")
#     .awaitTermination()
# )

#df_best_employee.write.format("delta").mode("overwrite").saveAsTable(f"{catalog_name}.{gold_schema_name}.{BEST_EMPLOYEES_TARGET_TABLE}")