From f456c0b976ee9221851b409b54ca005d8544fe73 Mon Sep 17 00:00:00 2001 From: Rahul Kumar Date: Fri, 9 Dec 2022 18:40:47 +0530 Subject: [PATCH 1/3] avoid warning --- .../auto-ml-forecasting-backtest-many-models.ipynb | 1 - .../auto-ml-forecasting-many-models.ipynb | 5 +++-- .../scripts/data_preprocessing_tabular.py | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb index 11ff8e6a2a..2c2296ede6 100644 --- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb @@ -368,7 +368,6 @@ "\n", "forecasting_parameters = ForecastingParameters(\n", " time_column_name=TIME_COLNAME,\n", - " drop_column_names=\"Revenue\",\n", " forecast_horizon=6,\n", " time_series_id_column_names=partition_column_names,\n", " cv_step_size=\"auto\",\n", diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb index 2f07e3b159..8029296ff6 100644 --- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb @@ -433,7 +433,6 @@ "\n", "forecasting_parameters = ForecastingParameters(\n", " time_column_name=\"WeekStarting\",\n", - " drop_column_names=\"Revenue\",\n", " forecast_horizon=6,\n", " time_series_id_column_names=partition_column_names,\n", " cv_step_size=\"auto\",\n", @@ -469,7 +468,9 @@ "\n", "Reuse of 
previous results (``allow_reuse``) is key when using pipelines in a collaborative environment since eliminating unnecessary reruns offers agility. Reuse is the default behavior when the ``script_name``, ``inputs``, and the parameters of a step remain the same. When reuse is allowed, results from the previous run are immediately sent to the next step. If ``allow_reuse`` is set to False, a new run will always be generated for this step during pipeline execution.\n", "\n", - "> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input." + "> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input.\n", + "\n", + "> Note that we **drop column** \"Revenue\" from the dataset in this step as this is not relevant for forecasting with the dataset used in this example. **Please modify the logic based on your data**." ] }, { diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py index f2e11453d4..050df79267 100644 --- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py @@ -11,6 +11,12 @@ def main(args): dataset = run_context.input_datasets["train_10_models"] df = dataset.to_pandas_dataframe() + # Drop the column "Revenue" from the dataset + # Please remove if this is not required + drop_column_name = "Revenue" + if drop_column_name in df.columns: + df.drop(drop_column_name, axis=1, inplace=True) + # Apply any data pre-processing techniques here df.to_parquet(output / "data_prepared_result.parquet", compression=None) From 191eb16342a0b68c37b290ef50e9b34919d6e64b Mon Sep 17 00:00:00 2001 From: Rahul Kumar Date: Fri, 9 Dec 
2022 23:13:00 +0530 Subject: [PATCH 2/3] update reason for dropping column --- .../auto-ml-forecasting-many-models.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb index 8029296ff6..ef122603a7 100644 --- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb @@ -470,7 +470,7 @@ "\n", "> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input.\n", "\n", - "> Note that we **drop column** \"Revenue\" from the dataset in this step as this is not relevant for forecasting with the dataset used in this example. **Please modify the logic based on your data**." + "> Note that we **drop the column** \"Revenue\" from the dataset in this step to avoid information leakage, because \"Quantity\" = \"Revenue\" / \"Price\". **Please modify the logic based on your data**."
] }, { From 25a910316a68cb85be56068e21cd3b904293e141 Mon Sep 17 00:00:00 2001 From: Rahul Kumar Date: Sat, 10 Dec 2022 00:00:30 +0530 Subject: [PATCH 3/3] update data_preprocessing_tabular script --- .../scripts/data_preprocessing_tabular.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py index 050df79267..99ad74d7b0 100644 --- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py +++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py @@ -11,8 +11,8 @@ def main(args): dataset = run_context.input_datasets["train_10_models"] df = dataset.to_pandas_dataframe() - # Drop the column "Revenue" from the dataset - # Please remove if this is not required + # Drop the column "Revenue" from the dataset to avoid information leakage, + # because "Quantity" = "Revenue" / "Price". Adapt this logic to your own data. drop_column_name = "Revenue" if drop_column_name in df.columns: df.drop(drop_column_name, axis=1, inplace=True)