In [0]:
%run ./variables

## Staging layer

In [0]:
# Ensure the bronze schema exists in the target catalog. IF NOT EXISTS makes
# the statement a no-op when the schema is already present, so the cell can be
# re-run safely.
# NOTE(review): catalog_name / bronze_schema_name are not defined in this
# notebook; presumably they are injected by `%run ./variables` - confirm.
spark.sql(f"""
    CREATE SCHEMA IF NOT EXISTS {catalog_name}.{bronze_schema_name}
""")

In [0]:
# Create the volume named by staging_volume_name inside the bronze schema
# (no-op if it already exists). The bronze schema must exist before this runs.
# NOTE(review): the *_volume_path directories used by this notebook presumably
# live under this volume - confirm against ./variables.
spark.sql(f"""
  CREATE VOLUME IF NOT EXISTS {catalog_name}.{bronze_schema_name}.{staging_volume_name}
""")

In [0]:
# Create one staging directory per source dataset, in the same order as before:
# customers, employees, municipalities, neighborhoods, sales.
# NOTE(review): the *_volume_path names are not defined in this notebook;
# presumably they come from `%run ./variables` - confirm.
for _staging_dir in (
    customers_volume_path,
    employees_volume_path,
    municipalities_volume_path,
    neighborhoods_volume_path,
    sales_volume_path,
):
    dbutils.fs.mkdirs(_staging_dir)

## Bronze layer

In [0]:
# Bronze-layer customers table, created only if it does not exist yet.
# Columns: a BIGINT customer_id plus name / phone / email / address as STRING.
# No partitioning or table properties are declared.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{bronze_schema_name}.brz_customers (
        customer_id BIGINT,
        name STRING,
        phone STRING,
        email STRING,
        address STRING
    )
""")

In [0]:
# Bronze-layer employees table, created only if it does not exist yet.
# Same contact columns as brz_customers plus a DOUBLE `comission` column.
# NOTE(review): `comission` is spelled with a single "m" (sic). slv_employees
# uses the same spelling, so any rename must be coordinated across layers.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{bronze_schema_name}.brz_employees (
        employee_id BIGINT,
        name STRING,
        phone STRING,
        email STRING,
        address STRING,
        comission DOUBLE
    )
""")

In [0]:
# Bronze-layer municipalities table, created only if it does not exist yet.
# Column names are upper-case codes (DPTOMPIO, DPTO_CCDGO, ...), presumably
# kept exactly as they appear in the source dataset - confirm with ingestion.
# NOTE(review): geometry is stored as BINARY; the encoding (e.g. WKB) is not
# visible here - confirm against the code that loads this table.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{bronze_schema_name}.brz_municipalities (
        DPTOMPIO STRING,
        DPTO_CCDGO STRING,
        MPIO_CCDGO STRING,
        MPIO_CNMBR STRING,
        MPIO_CCNCT STRING,
        geometry BINARY
    )
""")

In [0]:
# Bronze-layer neighborhoods table, created only if it does not exist yet.
# Column names are upper-case (OBJECTID, CODIGO, NOMBRE, ...), presumably kept
# exactly as delivered by the source dataset - confirm with ingestion.
# NOTE(review): geometry is stored as BINARY; the encoding (e.g. WKB) is not
# visible here - confirm against the code that loads this table.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{bronze_schema_name}.brz_neighborhoods (
        OBJECTID BIGINT,
        CODIGO STRING,
        NOMBRE STRING,
        IDENTIFICACION STRING,
        LIMITEMUNICIPIOID STRING,
        SUBTIPO_COMUNACORREGIMIENTO BIGINT,
        LINK_DOCUMENTO INT,
        SHAPEAREA DOUBLE,
        SHAPELEN DOUBLE,
        geometry BINARY
    )
""")

In [0]:
# Bronze-layer sales table, created only if it does not exist yet.
# `date` is kept as STRING at this layer and the table is partitioned by
# employee_id.
# NOTE(review): customer_id / employee_id are INT here but BIGINT in
# brz_customers / brz_employees - confirm the narrower type is intentional.
# NOTE(review): partitioning by employee_id creates one partition per employee;
# confirm this matches the expected data volume and query pattern.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{bronze_schema_name}.brz_sales (
        latitude DOUBLE,
        longitude DOUBLE,
        date STRING,
        customer_id INT,
        employee_id INT,
        quantity_products INT,
        order_id STRING
    )
    PARTITIONED BY (employee_id)
""")

## Silver layer

In [0]:
# Ensure the silver schema exists in the target catalog (no-op if it already
# does). Every slv_* table and the silver volume are created inside it.
spark.sql(f"""
    CREATE SCHEMA IF NOT EXISTS {catalog_name}.{silver_schema_name}
""")

In [0]:
# Create the volume named by silver_volume_name inside the silver schema
# (no-op if it already exists).
# NOTE(review): nothing in this notebook writes to this volume; its consumer
# is presumably another notebook - confirm.
spark.sql(f"""
  CREATE VOLUME IF NOT EXISTS {catalog_name}.{silver_schema_name}.{silver_volume_name}
""")

In [0]:
# Silver-layer customers table, created only if it does not exist yet.
# The column list and types are identical to brz_customers.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{silver_schema_name}.slv_customers (
        customer_id BIGINT,
        name STRING,
        phone STRING,
        email STRING,
        address STRING
    )
""")

In [0]:
# Silver-layer employees table, created only if it does not exist yet.
# The column list and types are identical to brz_employees, including the
# `comission` (sic) spelling.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{silver_schema_name}.slv_employees (
        employee_id BIGINT,
        name STRING,
        phone STRING,
        email STRING,
        address STRING,
        comission DOUBLE
    )
""")

In [0]:
# Silver-layer municipalities table, created only if it does not exist yet.
# Uses descriptive snake_case column names and stores the shape as a STRING
# `geojson` column instead of the BINARY `geometry` used in bronze.
# NOTE(review): the columns presumably map positionally from brz_municipalities
# (DPTOMPIO -> code, DPTO_CCDGO -> department_code, MPIO_CCDGO ->
# municipality_code, MPIO_CNMBR -> municipality, MPIO_CCNCT -> ccnct_code);
# the transformation itself is not in this notebook - confirm.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{silver_schema_name}.slv_municipalities (
        code STRING,
        department_code STRING,
        municipality_code STRING,
        municipality STRING,
        ccnct_code STRING,
        geojson STRING
    )
""")

In [0]:
# Silver-layer neighborhoods table, created only if it does not exist yet.
# Same number of columns and same types as brz_neighborhoods, with snake_case
# English names and a STRING `geojson` column in place of the BINARY geometry.
# NOTE(review): the bronze -> silver column mapping is presumably positional
# (OBJECTID -> object_id, CODIGO -> code, NOMBRE -> name, ...); the
# transformation itself is not in this notebook - confirm.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{silver_schema_name}.slv_neighborhoods (
        object_id BIGINT,
        code STRING,
        name STRING,
        identification STRING,
        limit_municipality_id STRING,
        subtype_community_district BIGINT,
        link_document INT,
        shape_area DOUBLE,
        shape_len DOUBLE,
        geojson STRING
    )
""")

In [0]:
# Silver-layer sales table, created only if it does not exist yet.
# Compared with brz_sales it adds district / neighborhood, a TIMESTAMP
# event_date, a STRING partition_date and the individual date/time parts
# (event_day, event_hour, ...) as INT columns.
# The table is partitioned by partition_date.
# NOTE(review): district / neighborhood are presumably derived from
# latitude / longitude against the geometry tables, and partition_date from
# event_date; neither derivation is visible here - confirm.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{silver_schema_name}.slv_sales (
        order_id STRING,
        customer_id INT,
        employee_id INT,
        latitude DOUBLE,
        longitude DOUBLE,
        quantity_products INT,
        district STRING,
        neighborhood STRING,
        event_date TIMESTAMP,
        partition_date STRING,
        event_day INT,
        event_hour INT,
        event_minute INT,
        event_month INT,
        event_second INT,
        event_year INT
    )
    PARTITIONED BY (partition_date)
""")

## Gold layer

### Tables

In [0]:
# Ensure the gold schema exists before the first gold object is created.
# The bronze and silver schemas each get an explicit CREATE SCHEMA in this
# notebook, but the visible cells never create the gold one, so on a fresh
# catalog the CREATE TABLE below would fail with "schema not found".
spark.sql(f"""
    CREATE SCHEMA IF NOT EXISTS {catalog_name}.{gold_schema_name}
""")

# Gold-layer sales table, created only if it does not exist yet.
# Its columns match the select list of the vw_gld_sales view (employee and
# customer are names rather than ids, event_date is a DATE).
# The table is partitioned by event_date.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.gld_sales (
        order_id STRING,
        employee STRING,
        customer STRING,
        latitude DOUBLE,
        longitude DOUBLE,
        district STRING,
        quantity_products INT,
        partition_date STRING,
        event_day INT,
        event_month INT,
        event_year INT,
        event_date DATE
    )
    PARTITIONED BY (event_date)
""")

In [0]:
# Gold-layer daily sales-per-district table, created only if it does not exist.
# Its columns match the select list of the vw_gld_sales_districts view.
# total_sales_per_day and sales_rank are declared NOT NULL.
# The table is partitioned by (event_date, district).
# NOTE(review): this yields one partition per date/district pair, which can
# mean many very small partitions - confirm this is intended.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.gld_sales_districts (
        event_date DATE,
        district STRING,
        total_sales_per_day LONG NOT NULL,
        sales_rank INT NOT NULL,
        percent_total_sales_per_day DECIMAL(27,2),
        total_products_sold_per_day LONG,
        percent_products_sold_per_day DECIMAL(27,2)
    )
    PARTITIONED BY (event_date,district)
""")

In [0]:
# Gold-layer daily sales-per-employee table, created only if it does not exist.
# Its columns match the select list of the vw_gld_sales_employees view.
# The two rank columns and total_sales_per_day are declared NOT NULL.
# The table is partitioned by (employee, event_date).
# NOTE(review): `employee` holds the employee name (see vw_gld_sales), so this
# creates one partition per employee per day - confirm this is intended.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.gld_sales_employees (
        event_date DATE,
        employee STRING,
        employee_effectiveness DECIMAL(27,2),
        effectiveness_rank INT NOT NULL,
        total_sales_per_day LONG NOT NULL,
        sales_rank INT NOT NULL,
        percent_total_sales_per_day DECIMAL(27,2),
        total_products_sold_per_day LONG,
        percent_products_sold_per_day DECIMAL(27,2)
    )
    PARTITIONED BY (employee,event_date)
""")

In [0]:
# Gold-layer monthly customers/employees-per-district table, created only if
# it does not exist. Its columns match the select list of the
# vw_gld_customers_districts view.
# The table is partitioned by (event_year, event_month, district).
# NOTE(review): in vw_gld_customers_districts the value feeding
# employees_per_customer_ratio is computed as customers / employees, which
# reads as "customers per employee" - confirm the column name or the formula.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.gld_customers_districts (
        event_year INT,
        event_month INT,
        district STRING,
        total_customers_per_district LONG NOT NULL,
        percent_total_customers_per_district DECIMAL(27,2),
        difference_in_customers_from_last_month LONG NOT NULL,
        total_employees_per_district LONG NOT NULL,
        percent_total_employees_per_district DECIMAL(27,2),
        difference_in_employees_from_last_month LONG NOT NULL,
        employees_per_customer_ratio DOUBLE,
        total_products_sold LONG,
        percent_total_products_sold DECIMAL(27,2),
        difference_in_products_sold_from_last_month LONG
    )
    PARTITIONED BY (event_year, event_month, district)
""")

In [0]:
# Gold-layer monthly best-customers table, created only if it does not exist.
# Its columns match the select list of the vw_gld_best_customers view.
# The table is partitioned by (event_year, event_month, customer).
# NOTE(review): `customer` holds the customer name (see vw_gld_sales), so this
# creates one partition per customer per month - a high-cardinality partition
# key; confirm this is intended.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.gld_best_customers (
        event_year INT,
        event_month INT,
        customer STRING,
        total_products_bought LONG,
        best_customer_rank INT NOT NULL,
        percent_total_products_bought DECIMAL(27,2),
        difference_in_products_bought_from_last_month LONG
    )
    PARTITIONED BY (event_year, event_month, customer)
""")

In [0]:
# Gold-layer monthly best-employees table, created only if it does not exist.
# Its columns match the select list of the vw_gld_best_employees view.
# The table is partitioned by (event_year, event_month, employee).
# NOTE(review): `employee` holds the employee name (see vw_gld_sales), so this
# creates one partition per employee per month - confirm this is intended.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.gld_best_employees (
        event_year INT,
        event_month INT,
        employee STRING,
        total_products_sold LONG,
        best_employee_rank INT NOT NULL,
        percent_total_products_sold DECIMAL(27,2),
        difference_in_products_sold_from_last_month LONG
    )
    PARTITIONED BY (event_year, event_month, employee)
""")

In [0]:
# Gold-layer demand forecast table, created only if it does not exist yet:
# one DOUBLE sum_quantity_products value per district and event_datetime.
# Unlike the other gold tables it has no `gld_` prefix, no partitioning, and
# no matching view in this notebook.
# NOTE(review): presumably populated by a separate forecasting job - confirm.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema_name}.demand_forecast (
        district STRING,
        event_datetime TIMESTAMP,
        sum_quantity_products DOUBLE
    )
""")

### Views

In [0]:
# Base gold view: one row per silver sale, with the customer and employee ids
# replaced by their names and the event timestamp truncated to a DATE.
#
# - Both joins are inner joins, so sales whose customer_id or employee_id has
#   no match in slv_customers / slv_employees are dropped from the view.
# - The DATE column is aliased explicitly as event_date. The other gold views
#   in this notebook read `b.event_date` from this view, so the column name
#   must not depend on how the engine auto-names an unaliased date(...) call.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.vw_gld_sales AS
SELECT
    s.order_id
    ,e.name as employee
    ,c.name as customer
    ,s.latitude
    ,s.longitude
    ,s.district
    ,s.quantity_products
    ,s.partition_date
    ,s.event_day
    ,s.event_month
    ,s.event_year
    ,date(s.event_date) as event_date
FROM
    {catalog_name}.{silver_schema_name}.slv_sales as s
    JOIN
        {catalog_name}.{silver_schema_name}.slv_customers as c
    ON
        s.customer_id = c.customer_id
    JOIN
        {catalog_name}.{silver_schema_name}.slv_employees as e
    ON
        s.employee_id = e.employee_id
"""
)

In [0]:
# Daily sales per district, aggregated from vw_gld_sales by
# (event_date, district):
# - total_sales_per_day: number of orders (COUNT of order_id).
# - sales_rank: dense rank of the district within its day, most orders first.
# - percent_total_sales_per_day: the district's share of that day's orders,
#   in percent, rounded to 2 decimals.
# - total_products_sold_per_day / percent_products_sold_per_day: the same two
#   measures based on SUM(quantity_products) instead of the order count.
# The window functions run over the grouped rows, hence SUM(COUNT(...)) and
# SUM(SUM(...)) to obtain the per-day totals.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.vw_gld_sales_districts AS
SELECT 
    b.event_date
    ,b.district
    ,COUNT(b.order_id) as total_sales_per_day
    ,DENSE_RANK() OVER (PARTITION BY b.event_date ORDER BY COUNT(b.order_id) DESC) as sales_rank
    ,ROUND(
        COUNT(b.order_id) * 100.0 / 
        SUM(COUNT(b.order_id)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_total_sales_per_day
    ,SUM(b.quantity_products) as  total_products_sold_per_day
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_products_sold_per_day
FROM 
    {catalog_name}.{gold_schema_name}.vw_gld_sales b
GROUP BY
    b.event_date, b.district
"""
)

In [0]:
# Daily sales per employee, aggregated from vw_gld_sales by
# (employee, event_date):
# - employee_effectiveness: SUM(quantity_products) * 100 / COUNT(order_id),
#   i.e. the average products per order scaled by 100, rounded to 2 decimals.
# - effectiveness_rank: dense rank within the day by products per order,
#   highest first. It uses the unscaled ratio (* 1.0), which orders rows the
#   same way as the scaled value.
# - total_sales_per_day / sales_rank / percent_total_sales_per_day: order
#   count, its dense rank within the day, and its share of the day's orders.
# - total_products_sold_per_day / percent_products_sold_per_day: the same
#   based on SUM(quantity_products).
# NOTE(review): the * 100.0 in employee_effectiveness makes it look like a
# percentage although it is a products-per-order average - confirm intent.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.vw_gld_sales_employees AS
SELECT 
    b.event_date
    ,b.employee
    ,ROUND(SUM(b.quantity_products)*100.0/COUNT(b.order_id),2) as employee_effectiveness
    ,DENSE_RANK() OVER (PARTITION BY b.event_date ORDER BY SUM(b.quantity_products) * 1.0 / COUNT(b.order_id) DESC) as effectiveness_rank
    ,COUNT(b.order_id) as  total_sales_per_day
    ,DENSE_RANK() OVER (PARTITION BY b.event_date ORDER BY COUNT(b.order_id) DESC) as sales_rank
    ,ROUND(
        COUNT(b.order_id) * 100.0 / 
        SUM(COUNT(b.order_id)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_total_sales_per_day
    ,SUM(b.quantity_products) as  total_products_sold_per_day
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_date), 
        2
    ) AS percent_products_sold_per_day
FROM 
    {catalog_name}.{gold_schema_name}.vw_gld_sales b
GROUP BY
    b.employee, b.event_date
"""
)

In [0]:
# Monthly customer / employee / product metrics per district, aggregated from
# vw_gld_sales by (event_year, event_month, district):
# - total_*_per_district: distinct customers / employees seen in the district.
# - percent_total_*: the district's share of the month's summed per-district
#   counts, in percent, rounded to 2 decimals. A customer or employee active
#   in several districts is counted once per district in that denominator.
# - difference_in_*_from_last_month: current value minus the previous row for
#   the same district ordered by (event_year, event_month). LAG defaults to
#   the current value, so the first row of a district yields 0. The previous
#   row is the previous month *with data*, not necessarily the calendar month
#   before.
# - total_products_sold and its percent / difference: the same measures on
#   SUM(quantity_products).
# NOTE(review): employees_per_customer_ratio is computed as distinct customers
# divided by distinct employees, rounded to 0 decimals - i.e. customers per
# employee. Confirm whether the column name or the formula is intended.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.vw_gld_customers_districts AS
SELECT 
  b.event_year
  ,b.event_month
  ,b.district
  ,COUNT(DISTINCT b.customer) as total_customers_per_district
  ,ROUND(
        COUNT(DISTINCT b.customer) * 100.0 / 
        SUM(COUNT(DISTINCT b.customer)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_customers_per_district
  ,COUNT(DISTINCT b.customer) - 
  LAG(COUNT(DISTINCT b.customer),1, COUNT(DISTINCT b.customer)) OVER (PARTITION BY b.district ORDER BY b.event_year,b.event_month) AS difference_in_customers_from_last_month
  ,COUNT(DISTINCT b.employee) as total_employees_per_district
  ,ROUND(
        COUNT(DISTINCT b.employee) * 100.0 / 
        SUM(COUNT(DISTINCT b.employee)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_employees_per_district
  ,COUNT(DISTINCT b.employee) - 
  LAG(COUNT(DISTINCT b.employee),1, COUNT(DISTINCT b.employee)) OVER (PARTITION BY b.district ORDER BY b.event_year,b.event_month) AS difference_in_employees_from_last_month
  ,ROUND(COUNT(DISTINCT b.customer)/COUNT(DISTINCT b.employee),0) AS employees_per_customer_ratio
  ,SUM(b.quantity_products) AS total_products_sold
  ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_products_sold
  ,SUM(b.quantity_products) - 
  LAG(SUM(b.quantity_products),1, SUM(b.quantity_products)) OVER (PARTITION BY b.district ORDER BY b.event_year,b.event_month) AS difference_in_products_sold_from_last_month

FROM  
  {catalog_name}.{gold_schema_name}.vw_gld_sales b
GROUP BY
  b.event_year,b.event_month,b.district
"""
)

In [0]:
# Monthly customer ranking, aggregated from vw_gld_sales by
# (event_year, event_month, customer):
# - total_products_bought: SUM(quantity_products) for the customer that month.
# - best_customer_rank: dense rank within the month, most products first.
# - percent_total_products_bought: share of the month's products, in percent,
#   rounded to 2 decimals.
# - difference_in_products_bought_from_last_month: current total minus the
#   customer's previous row ordered by (event_year, event_month). LAG defaults
#   to the current total, so a customer's first row yields 0.
# NOTE(review): rows are grouped by customer *name* (vw_gld_sales exposes
# c.name as customer), so customers sharing a name are merged - confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.vw_gld_best_customers AS 
SELECT 
    b.event_year
    ,b.event_month
    ,b.customer
    ,sum(b.quantity_products) as total_products_bought
    ,DENSE_RANK() OVER (PARTITION BY b.event_year,b.event_month ORDER BY sum(b.quantity_products) DESC) as best_customer_rank
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_products_bought
    ,
    SUM(b.quantity_products) - LAG(SUM(b.quantity_products),1,SUM(b.quantity_products)) OVER (PARTITION BY b.customer ORDER BY b.event_year,b.event_month) AS difference_in_products_bought_from_last_month
FROM
    {catalog_name}.{gold_schema_name}.vw_gld_sales b
GROUP BY
    b.event_year,b.event_month,b.customer
"""
)

In [0]:
# Monthly employee ranking, aggregated from vw_gld_sales by
# (event_year, event_month, employee):
# - total_products_sold: SUM(quantity_products) for the employee that month.
# - best_employee_rank: dense rank within the month, most products first.
# - percent_total_products_sold: share of the month's products, in percent,
#   rounded to 2 decimals.
# - difference_in_products_sold_from_last_month: current total minus the
#   employee's previous row ordered by (event_year, event_month). LAG defaults
#   to the current total, so an employee's first row yields 0.
# NOTE(review): rows are grouped by employee *name* (vw_gld_sales exposes
# e.name as employee), so employees sharing a name are merged - confirm.
spark.sql(f"""
CREATE OR REPLACE VIEW {catalog_name}.{gold_schema_name}.vw_gld_best_employees AS
SELECT 
    b.event_year
    ,b.event_month
    ,b.employee
    ,sum(b.quantity_products) as total_products_sold
    ,DENSE_RANK() OVER (PARTITION BY b.event_year,b.event_month ORDER BY sum(b.quantity_products) DESC) as best_employee_rank
    ,ROUND(
        SUM(b.quantity_products) * 100.0 / 
        SUM(SUM(b.quantity_products)) OVER (PARTITION BY b.event_year,b.event_month), 
        2
    ) AS percent_total_products_sold
    ,
    SUM(b.quantity_products) - LAG(SUM(b.quantity_products),1,SUM(b.quantity_products)) OVER (PARTITION BY b.employee ORDER BY b.event_year,b.event_month) AS difference_in_products_sold_from_last_month
FROM
    {catalog_name}.{gold_schema_name}.vw_gld_sales b
GROUP BY
    b.event_year,b.event_month,b.employee
"""
)