In [0]:
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, avg, sum, expr, lpad, coalesce, lit, log, exp, dayofweek, when
import mlflow
from databricks.feature_engineering import FeatureEngineeringClient

# Feature Engineering client; used further down to read the feature table via fe.read_table.
fe = FeatureEngineeringClient()
# Turn MLflow autologging off: the training cell logs its parameters, metrics and
# model explicitly inside its own mlflow.start_run() block.
mlflow.autolog(disable=True)

In [0]:
%run ../config/variables

## Load data

In [0]:
# Fully qualified name of the feature table.
# catalog_name / gold_schema_name are presumably defined by the ../config/variables
# notebook executed above — TODO confirm.
features_table_name = f'{catalog_name}.{gold_schema_name}.features_demand_forecast'

# Feature table as a DataFrame; it is split into train and test sets below.
df_dataset = fe.read_table(name=features_table_name)

# MLflow dataset handle for the same Delta table; it is attached to the training run
# with mlflow.log_input.
mlflow_dataset = mlflow.data.load_delta(
    table_name=features_table_name,
    name='features_demand_forecast',
)

In [0]:
# Display data (demo only). TODO: remove when deploying.
display(df_dataset)

Databricks visualization. Run in Databricks to view.

## Train - test split

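The data is split at 2025-07-19: rows with `event_date` before that date form the training set, and rows on or after it form the test set.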

In [0]:
# Hold-out split on event_date: everything before the cut-off is used for training,
# the cut-off day and everything after it is used for testing.
cut_off_date = '2025-07-19'
event_date_col = col('event_date')

df_train = df_dataset.where(event_date_col < cut_off_date)
df_test = df_dataset.where(event_date_col >= cut_off_date)

## Preprocessing

In [0]:
# Columns shared by every candidate feature set.
# NOTE(review): 'event_weekday' and 'event_hour' are used here in their raw form; the
# one-hot columns 'event_weekday_vec' / 'event_hour_vec' produced by the encoder below
# are not referenced by any feature set — confirm whether that is intentional.
base_features = ['district_vec', 'event_weekday', 'avg_latitude', 'avg_longitude', 'event_hour']

# Lagged-quantity columns, in the order in which they are added to the feature sets.
lag_features = ['prev_quantity_products'] + [
    f'prev_quantity_products_{lag}' for lag in range(2, 7)
]

# Candidate feature sets: 'featuresN' = base columns + the first N lag columns.
# Generated instead of hand-copied so the six lists cannot drift apart.
features_dict = {
    f'features{n_lags}': base_features + lag_features[:n_lags]
    for n_lags in range(1, len(lag_features) + 1)
}

# 'district' -> numeric index -> one-hot vector. handleInvalid='keep' is set on both
# stages so invalid values are kept rather than raising.
indexer = StringIndexer(inputCol='district', outputCol='district_index', handleInvalid='keep')
encoder = OneHotEncoder(
    inputCols=['district_index', 'event_weekday', 'event_hour'],
    outputCols=['district_vec', 'event_weekday_vec', 'event_hour_vec'],
    handleInvalid='keep',
)

# One assembler per candidate feature set; each writes a vector column named after
# its feature set ('features1' ... 'features6').
assemblers = [
    VectorAssembler(inputCols=feature_cols, outputCol=feature_set)
    for feature_set, feature_cols in features_dict.items()
]
# Individual names are kept because the pipeline definition refers to them.
assembler_1, assembler_2, assembler_3, assembler_4, assembler_5, assembler_6 = assemblers

## Model building and fitting with Cross validation

In [0]:
# Random forest regressor on the summed product quantity. featuresCol is only the
# starting value: the parameter grid below overrides it per candidate.
# A fixed seed makes the forest reproducible across runs.
random_forest = RandomForestRegressor(
    featuresCol='features1',
    labelCol='sum_quantity_products',
    seed=42,
)

# RMSE between the label and the 'prediction' column; used both for model selection
# and for the reported train/test metrics.
evaluator = RegressionEvaluator(
    labelCol='sum_quantity_products',
    predictionCol='prediction',
    metricName='rmse',
)

# Every assembler is part of the pipeline so that each candidate feature vector
# exists as a column; the grid then chooses which one the forest reads.
pipeline = Pipeline(stages=[
    indexer,
    encoder,
    assembler_1, assembler_2, assembler_3, assembler_4, assembler_5, assembler_6,
    random_forest,
])

# Search grid: 6 feature sets x 3 depths x 3 forest sizes = 54 candidates.
# The feature-set names are taken from features_dict so the grid stays in sync
# with the assemblers.
params = (
    ParamGridBuilder()
    .addGrid(random_forest.featuresCol, list(features_dict))
    .addGrid(random_forest.maxDepth, [10, 20, 30])
    .addGrid(random_forest.numTrees, [10, 20, 30])
    .build()
)

In [0]:
from mlflow.models.signature import infer_signature

experiment_name = '/Workspace/Shared/experiments/demand_forecast'

mlflow.set_registry_uri('databricks-uc')
mlflow.set_tracking_uri('databricks')

# set_experiment creates the experiment when it does not exist yet, so no separate
# existence check / create_experiment call is needed.
mlflow.set_experiment(experiment_name)

# Raw columns the fitted pipeline consumes: 'district' (indexed and encoded inside the
# pipeline) plus every assembler input except 'district_vec', which the pipeline
# produces itself. All assemblers run regardless of the selected feature set, so the
# union over all feature sets is required. dict.fromkeys de-duplicates, keeping order.
raw_input_cols = list(dict.fromkeys(
    ['district']
    + [
        column
        for feature_cols in features_dict.values()
        for column in feature_cols
        if column != 'district_vec'
    ]
))

with mlflow.start_run() as run:
    mlflow.log_input(mlflow_dataset, context="source")
    mlflow.log_param('cut_off_date', cut_off_date)

    # 3-fold cross-validation over the parameter grid; the seed fixes the fold
    # assignment so repeated runs select from the same splits.
    cv = CrossValidator(
        estimator=pipeline,
        evaluator=evaluator,
        estimatorParamMaps=params,
        numFolds=3,
        seed=42,
    )
    cv_model = cv.fit(df_train)
    predictions = cv_model.transform(df_test)

    rmse_train = evaluator.evaluate(cv_model.transform(df_train))
    rmse_test = evaluator.evaluate(predictions)

    # The last pipeline stage is the fitted random forest; its params describe the
    # winning grid candidate.
    best_forest = cv_model.bestModel.stages[-1]
    features_col = best_forest.getOrDefault('featuresCol')

    # The logged model is the whole pipeline, so the signature must describe the raw
    # input columns rather than the internally assembled feature vector.
    # NOTE(review): nulls in the sampled rows can make pandas widen integer columns to
    # float, which would show up in the inferred signature — verify the logged signature.
    sample = predictions.select(*raw_input_cols, 'prediction').limit(10).toPandas()
    signature = infer_signature(sample[raw_input_cols], sample[['prediction']])

    mlflow.log_metric('rmse_train', rmse_train)
    mlflow.log_metric('rmse_test', rmse_test)
    mlflow.log_param('max_depth', best_forest.getOrDefault('maxDepth'))
    mlflow.log_param('num_trees', best_forest.getOrDefault('numTrees'))
    mlflow.log_param('features', features_dict[features_col])
    mlflow.spark.log_model(
        cv_model.bestModel,
        'model',
        signature=signature,
        registered_model_name=f'{catalog_name}.{gold_schema_name}.demand_forecast_model',
    )


In [0]:
# Display predictions (demo only). TODO: remove when deploying.
# Scores both splits with the selected model and shows label and prediction together.
preview_cols = ['district', 'event_datetime', 'sum_quantity_products', 'prediction']

predictions_train = cv_model.transform(df_train)
predictions_test = cv_model.transform(df_test)

all_predictions = predictions_train.union(predictions_test)
display(all_predictions.select(*preview_cols))

Databricks visualization. Run in Databricks to view.