In [0]:
from pyspark.sql.functions import col, avg, sum, expr, lpad, coalesce, lit, log, exp, dayofweek, when
from databricks.feature_engineering import FeatureEngineeringClient

In [0]:
%run ../config/variables

## Load base data

In [0]:
# Fully qualified name of the sales source table. catalog_name and
# silver_schema_name are presumably defined by the config notebook run above.
sales_table_name = f'{catalog_name}.{silver_schema_name}.slv_sales'
df = spark.table(sales_table_name)

## Engineering features

In [0]:
# Step 1: cast event_date to a DATE and derive event_datetime by parsing
# "<event_date> <event_hour zero-padded to 2 chars>" with pattern 'yyyy-MM-dd HH'.
df_sales_typed = (
    df
    .withColumn('event_date', col('event_date').cast('date'))
    .withColumn(
        'event_datetime',
        expr("to_timestamp(concat(event_date, ' ', lpad(event_hour, 2, '0')), 'yyyy-MM-dd HH')"),
    )
)

# Step 2: collapse to one row per district and hour, keeping the mean
# coordinates and the total quantity of products for that hour.
hourly_keys = ['district', 'event_date', 'event_hour', 'event_datetime']
df_base = df_sales_typed.groupBy(*hourly_keys).agg(
    avg('latitude').alias('avg_latitude'),
    avg('longitude').alias('avg_longitude'),
    sum('quantity_products').alias('sum_quantity_products'),
)

In [0]:
# Per-district mean of the hourly average coordinates. Note this is an
# unweighted mean over the hourly rows of df_base, not over individual sales.
district_coordinate_means = [
    avg('avg_latitude').alias('avg_latitude_district'),
    avg('avg_longitude').alias('avg_longitude_district'),
]
df_avg_coordinates = df_base.groupBy('district').agg(*district_coordinate_means)

In [0]:
# Axes of the dense grid: every district and every date seen in df_base,
# combined with all 24 hours of the day.
districts = df_base.select('district').distinct()
dates = df_base.select('event_date').distinct()
hours = spark.range(0,24).withColumnRenamed('id', 'event_hour')

# Lag offset (in rows, ordered by event_datetime within a district) -> feature column name.
lag_columns = {
    1: 'prev_quantity_products',
    2: 'prev_quantity_products_2',
    3: 'prev_quantity_products_3',
    4: 'prev_quantity_products_4',
    5: 'prev_quantity_products_5',
    6: 'prev_quantity_products_6',
}

# One row per (district, date, hour), with the same event_datetime derivation
# used for df_base so the join keys line up.
df_grid = (
    districts.crossJoin(dates).crossJoin(hours)
    .withColumn(
        'event_datetime',
        expr("to_timestamp(concat(event_date, ' ', lpad(event_hour, 2, '0')), 'yyyy-MM-dd HH')"),
    )
)

# Attach the observed hourly aggregates. Grid cells with no matching row in
# df_base get a quantity of 0 and fall back to the district's mean coordinates.
df_filled = (
    df_grid
    .join(df_base, on=['district', 'event_date', 'event_hour', 'event_datetime'], how='left')
    .withColumn("sum_quantity_products", coalesce(col("sum_quantity_products"), lit(0)))
    .join(df_avg_coordinates, on='district', how='left')
    .withColumn('avg_latitude', coalesce(col('avg_latitude'), col('avg_latitude_district')))
    .withColumn('avg_longitude', coalesce(col('avg_longitude'), col('avg_longitude_district')))
    .drop('avg_latitude_district', 'avg_longitude_district')
)

# Add one lag feature per configured offset, then the weekday of the timestamp
# (Spark's dayofweek: 1 = Sunday ... 7 = Saturday).
df_lagged = df_filled
for lag_offset, lag_name in lag_columns.items():
    df_lagged = df_lagged.withColumn(
        lag_name,
        expr(f"lag(sum_quantity_products, {lag_offset}) over(partition by district order by event_datetime)"),
    )
df_lagged = df_lagged.withColumn('event_weekday', dayofweek(col('event_datetime')))

# Keep only rows where every lag feature is populated.
has_all_lags = None
for lag_name in lag_columns.values():
    lag_present = col(lag_name).isNotNull()
    has_all_lags = lag_present if has_all_lags is None else (has_all_lags & lag_present)

df_dataset = df_lagged.filter(has_all_lags)

## Save to feature store

In [0]:
# Feature Engineering client; used below to write df_dataset to the feature table.
fe = FeatureEngineeringClient()

In [0]:
# NOTE(review): disabled call that would create the feature table
# (primary key: district + event_datetime, partitioned by event_date) from df_dataset.
# Presumably kept for reference as the one-time bootstrap of the table — confirm.
# fe.create_table(
#     name = f'{catalog_name}.{gold_schema_name}.features_demand_forecast',
#     primary_keys=['district','event_datetime'],
#     df = df_dataset,
#     partition_columns=['event_date'],
#     description = 'Sales Features Aggregated by District for Demand forecasting',
# )

In [0]:
# Publish the engineered features to the feature table.
# Merging requires the table to exist already, so a first run against a fresh
# catalog/schema creates it (and loads df_dataset) instead of failing.
feature_table_name = f'{catalog_name}.{gold_schema_name}.features_demand_forecast'

if spark.catalog.tableExists(feature_table_name):
    # Routine run: merge the recomputed rows into the existing table.
    fe.write_table(
        name = feature_table_name,
        df = df_dataset,
        mode = 'merge'
    )
else:
    # First run: register the table with its keys and partitioning and
    # populate it from df_dataset in the same call.
    fe.create_table(
        name = feature_table_name,
        primary_keys=['district','event_datetime'],
        df = df_dataset,
        partition_columns=['event_date'],
        description = 'Sales Features Aggregated by District for Demand forecasting',
    )