In [0]:
# Raw input: the sales table from the `silver` schema, addressed by its fully qualified name.
silver_sales_table = 'adb_big_data_unal_water.silver.slv_sales'
df = spark.table(silver_sales_table)

In [0]:
# Preview the sales table loaded in the previous cell.
display(df)

In [0]:
from pyspark.sql.functions import col, avg, sum, expr, lpad, coalesce, lit, log, exp, dayofweek, when

In [0]:
# Disabled one-off step: would overwrite the table `model_base_data` with `df` in Delta format.
# The next cell reads that table, so it must already exist while this line stays commented out.
# df.write.format('delta').mode('overwrite').saveAsTable('adb_big_data_unal_water.default.model_base_data')

In [0]:
# Reload the base data and derive typed time columns:
#   event_date     -> cast to DATE
#   event_datetime -> hour-resolution timestamp built from the date and the zero-padded hour
hourly_timestamp_sql = "to_timestamp(concat(event_date, ' ', lpad(event_hour, 2, '0')), 'yyyy-MM-dd HH')"

df = (
    spark.table('adb_big_data_unal_water.default.model_base_data')
    .withColumn('event_date', col('event_date').cast('date'))
    .withColumn('event_datetime', expr(hourly_timestamp_sql))
)

In [0]:
# Inspect the base data together with the derived event_date / event_datetime columns.
display(df)

In [0]:
# Collapse the sales rows into one row per district and hour:
# mean coordinates of the sales plus the total quantity sold.
# (`avg` and `sum` are the pyspark.sql.functions aggregates imported above, not Python built-ins.)
hourly_keys = ['district', 'event_date', 'event_hour', 'event_datetime']
hourly_metrics = [
    avg('latitude').alias('avg_latitude'),
    avg('longitude').alias('avg_longitude'),
    sum('quantity_products').alias('sum_quantity_products'),
]

df_ds = df.groupBy(*hourly_keys).agg(*hourly_metrics)

In [0]:
# Inspect the per-district, per-hour aggregates.
display(df_ds)

Databricks visualization. Run in Databricks to view.

In [0]:
# Per-district fallback coordinates: the mean of the hourly average positions of each district.
district_coordinate_aggs = [
    avg('avg_latitude').alias('avg_latitude_district'),
    avg('avg_longitude').alias('avg_longitude_district'),
]

df_avg_coordinates = df_ds.groupBy('district').agg(*district_coordinate_aggs)

In [0]:
# Build a dense hourly grid so that hours without sales appear as explicit zero rows.

# Every (district, date) pair that occurs in the hourly aggregates ...
keys = df_ds.select('district', 'event_date').distinct()
# ... combined with every hour of the day, 0 through 23.
hours = spark.range(0, 24).withColumnRenamed('id', 'event_hour')

# Full grid: 24 rows per (district, date), with the matching hour-resolution timestamp.
hour_grid = keys.crossJoin(hours).withColumn(
    'event_datetime',
    expr("to_timestamp(concat(event_date, ' ', lpad(event_hour, 2, '0')), 'yyyy-MM-dd HH')"),
)

# Attach the observed aggregates; hours with no matching row get a quantity of 0.
grid_with_sales = hour_grid.join(
    df_ds,
    on=['district', 'event_date', 'event_hour', 'event_datetime'],
    how='left',
).withColumn("sum_quantity_products", coalesce(col("sum_quantity_products"), lit(0)))

# Attach the district-level coordinates, used only where the hourly coordinates are missing.
grid_with_fallback = grid_with_sales.join(df_avg_coordinates, on='district', how='left')

df_complete = (
    grid_with_fallback
    .withColumn('avg_latitude', coalesce(col('avg_latitude'), col('avg_latitude_district')))
    .withColumn('avg_longitude', coalesce(col('avg_longitude'), col('avg_longitude_district')))
    .drop('avg_latitude_district', 'avg_longitude_district')
)

In [0]:
# Inspect the dense hourly grid (zero-filled quantities, back-filled coordinates).
display(df_complete)

Databricks visualization. Run in Databricks to view.

In [0]:
# Modelling dataset: for every district-hour, add the quantities of the six preceding rows
# (ordered by event_datetime within the district) and the day of week.

# Disabled experiments with transformed coordinates, kept for reference:
#   .withColumn('log_avg_latitude', log(col('avg_latitude')))
#   .withColumn('log_avg_longitude', log(-col('avg_longitude')))
#   .withColumn('exp_avg_latitude', exp(col('avg_latitude')))
#   .withColumn('exp_avg_longitude', exp(-col('avg_longitude')))

# Lag 1 keeps the unsuffixed name; lags 2..6 carry their step as a suffix.
max_lag = 6
lag_names = ['prev_quantity_products'] + [f'prev_quantity_products_{step}' for step in range(2, max_lag + 1)]

df_lagged = df_complete
for lag_step, lag_name in enumerate(lag_names, start=1):
    df_lagged = df_lagged.withColumn(
        lag_name,
        expr(f"lag(sum_quantity_products, {lag_step}) over(partition by district order by event_datetime)"),
    )

df_lagged = df_lagged.withColumn('event_weekday', dayofweek(col('event_datetime')))

# Keep only rows where all six lag values exist (drops the first rows of each district).
has_full_history = col(lag_names[0]).isNotNull()
for lag_name in lag_names[1:]:
    has_full_history = has_full_history & col(lag_name).isNotNull()

df_dataset = df_lagged.filter(has_full_history)

In [0]:
# Inspect the modelling dataset in chronological order within each district.
display(df_dataset.orderBy('district', 'event_datetime', 'event_hour'))

In [0]:
# Disabled one-off step: would overwrite the table `model_dataset` with `df_dataset` in Delta format.
# NOTE(review): a later cell reloads `df_dataset` from this table, so while this line stays commented out
# the models train on whatever was last persisted, not on the DataFrame built above — confirm that is intended.
# df_dataset.write.format('delta').mode('overwrite').saveAsTable('adb_big_data_unal_water.default.model_dataset')

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, avg, sum, expr, lpad, coalesce, lit, log, exp, dayofweek, when

In [0]:
# Load the modelling dataset from its persisted table; this rebinds `df_dataset` to the stored copy.
model_dataset_table = 'adb_big_data_unal_water.default.model_dataset'
df_dataset = spark.table(model_dataset_table)

In [0]:
# Chronological hold-out split: rows dated before the cutoff train the models, the rest evaluate them.
test_start_date = '2025-07-19'
event_date = col('event_date')

df_train = df_dataset.filter(event_date < test_start_date)
df_test = df_dataset.filter(event_date >= test_start_date)

In [0]:
# Row counts of the train and test splits (each count triggers a Spark job).
display(df_train.count(), df_test.count())

In [0]:
# Preprocessing stages and evaluator shared by every model below.

# The six lagged-quantity columns used by both feature sets.
lag_features = ['prev_quantity_products'] + [f'prev_quantity_products_{step}' for step in range(2, 7)]

# district (string) -> numeric index, the input format the one-hot encoder expects.
# NOTE(review): handleInvalid is left at its default; a district present only in the test split
# would presumably fail at transform time — confirm this cannot happen with the chosen split.
indexer = StringIndexer(inputCol='district', outputCol='district_index')

# One-hot encode district, weekday and hour in a single stage.
encoder = OneHotEncoder(
    inputCols=['district_index', 'event_weekday', 'event_hour'],
    outputCols=['district_vec', 'event_weekday_vec', 'event_hour_vec'],
)

# Feature set 1: weekday and hour enter as one-hot vectors.
assembler_1 = VectorAssembler(
    inputCols=['district_vec', 'event_weekday_vec', 'avg_latitude', 'avg_longitude', 'event_hour_vec'] + lag_features,
    outputCol='features',
)

# Feature set 2: weekday and hour enter as their raw, un-encoded columns.
assembler_2 = VectorAssembler(
    inputCols=['district_vec', 'event_weekday', 'avg_latitude', 'avg_longitude', 'event_hour'] + lag_features,
    outputCol='features2',
)

# RMSE of `prediction` against the hourly quantity label.
evaluator = RegressionEvaluator(
    labelCol='sum_quantity_products',
    predictionCol='prediction',
    metricName='rmse',
)

In [0]:
# Two linear regressions on the same label, one per feature set.
target_col = 'sum_quantity_products'
linear_regression_1 = LinearRegression(featuresCol='features', labelCol=target_col)
linear_regression_2 = LinearRegression(featuresCol='features2', labelCol=target_col)

# Both pipelines share the indexing/encoding stages; they differ in assembler and estimator.
preprocessing_stages = [indexer, encoder]
pipeline_1 = Pipeline(stages=[*preprocessing_stages, assembler_1, linear_regression_1])
pipeline_2 = Pipeline(stages=[*preprocessing_stages, assembler_2, linear_regression_2])

## Linear regression features 1

In [0]:
# Fit pipeline 1 (linear regression on the `features` vector) on the training split.
model_1 = pipeline_1.fit(df_train)


In [0]:
# Score both splits with the fitted model 1.
df_train_predict_1, df_test_predict_1 = (
    model_1.transform(split) for split in (df_train, df_test)
)

Databricks visualization. Run in Databricks to view.

In [0]:
# Stack train and test predictions of model 1 to view actual vs. predicted quantities together.
predictions_1 = df_train_predict_1.union(df_test_predict_1)
display(predictions_1.select('event_datetime', 'sum_quantity_products', 'district', 'prediction'))

Databricks visualization. Run in Databricks to view.

In [0]:
# Evaluator metric for model 1 as a (train, test) pair.
evaluator.evaluate(df_train_predict_1), evaluator.evaluate(df_test_predict_1)

## Linear regression features 2

In [0]:
# Fit pipeline 2 (linear regression on the `features2` vector) on the training split.
model_2 = pipeline_2.fit(df_train)

In [0]:
# Score both splits with the fitted model 2.
df_train_predict_2, df_test_predict_2 = (
    model_2.transform(split) for split in (df_train, df_test)
)

Databricks visualization. Run in Databricks to view.

In [0]:
# Stack train and test predictions of model 2 to view actual vs. predicted quantities together.
display(df_train_predict_2.union(df_test_predict_2).select('event_datetime', 'sum_quantity_products','district','prediction'))

Databricks visualization. Run in Databricks to view.

In [0]:
# Evaluator metric for model 2 as a (train, test) pair.
evaluator.evaluate(df_train_predict_2), evaluator.evaluate(df_test_predict_2)

## Random forest

In [0]:
# Random forest regressors on the same label, one per feature set.
# An explicit seed pins the forest's random sampling so the metrics reported below
# are repeatable from one run of the notebook to the next.
RANDOM_FOREST_SEED = 42

random_forest_1 = RandomForestRegressor(
    featuresCol='features',
    labelCol='sum_quantity_products',
    seed=RANDOM_FOREST_SEED,
)
random_forest_2 = RandomForestRegressor(
    featuresCol='features2',
    labelCol='sum_quantity_products',
    seed=RANDOM_FOREST_SEED,
)

# Same preprocessing stages as the linear models; only the final estimator changes.
pipeline_3 = Pipeline(stages=[indexer, encoder, assembler_1, random_forest_1])
pipeline_4 = Pipeline(stages=[indexer, encoder, assembler_2, random_forest_2])

In [0]:
# Random forest on `features`: fit on the training split, score both splits,
# then show actual vs. predicted quantities for train and test together.
model_3 = pipeline_3.fit(df_train)
df_train_predict_3, df_test_predict_3 = (
    model_3.transform(split) for split in (df_train, df_test)
)
predictions_3 = df_train_predict_3.union(df_test_predict_3)
display(predictions_3.select('event_datetime', 'sum_quantity_products', 'district', 'prediction'))

Databricks visualization. Run in Databricks to view.

In [0]:
# Evaluator metric for model 3 as a (train, test) pair.
evaluator.evaluate(df_train_predict_3), evaluator.evaluate(df_test_predict_3)

In [0]:
# Random forest on `features2`: fit on the training split, score both splits,
# then show actual vs. predicted quantities for train and test together.
model_4 = pipeline_4.fit(df_train)
df_train_predict_4, df_test_predict_4 = (
    model_4.transform(split) for split in (df_train, df_test)
)
predictions_4 = df_train_predict_4.union(df_test_predict_4)
display(predictions_4.select('event_datetime', 'sum_quantity_products', 'district', 'prediction'))

Databricks visualization. Run in Databricks to view.

In [0]:
# Evaluator metric for model 4 as a (train, test) pair.
evaluator.evaluate(df_train_predict_4), evaluator.evaluate(df_test_predict_4)