From 753c4ca27bfacbc38246bea50234706ea956c83b Mon Sep 17 00:00:00 2001
From: Rahul Kumar <74648335+iamrk04@users.noreply.github.com>
Date: Sat, 10 Dec 2022 14:25:06 +0530
Subject: [PATCH] Avoid warning in Many-Models Notebook (#1971)

* avoid warning

* update reason for dropping column

* update data_preprocessing_tabular script

Co-authored-by: Rahul Kumar <rahulkuma@microsoft.com>
---
 .../auto-ml-forecasting-backtest-many-models.ipynb          | 1 -
 .../auto-ml-forecasting-many-models.ipynb                   | 5 +++--
 .../scripts/data_preprocessing_tabular.py                   | 6 ++++++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb
index 11ff8e6a2a..2c2296ede6 100644
--- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb
+++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb
@@ -368,7 +368,6 @@
     "\n",
     "forecasting_parameters = ForecastingParameters(\n",
     "    time_column_name=TIME_COLNAME,\n",
-    "    drop_column_names=\"Revenue\",\n",
     "    forecast_horizon=6,\n",
     "    time_series_id_column_names=partition_column_names,\n",
     "    cv_step_size=\"auto\",\n",
diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb
index 2f07e3b159..ef122603a7 100644
--- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb
+++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/auto-ml-forecasting-many-models.ipynb
@@ -433,7 +433,6 @@
     "\n",
     "forecasting_parameters = ForecastingParameters(\n",
     "    time_column_name=\"WeekStarting\",\n",
-    "    drop_column_names=\"Revenue\",\n",
     "    forecast_horizon=6,\n",
     "    time_series_id_column_names=partition_column_names,\n",
     "    cv_step_size=\"auto\",\n",
@@ -469,7 +468,9 @@
     "\n",
     "Reuse of previous results (``allow_reuse``) is key when using pipelines in a collaborative environment since eliminating unnecessary reruns offers agility. Reuse is the default behavior when the ``script_name``, ``inputs``, and the parameters of a step remain the same. When reuse is allowed, results from the previous run are immediately sent to the next step. If ``allow_reuse`` is set to False, a new run will always be generated for this step during pipeline execution.\n",
     "\n",
-    "> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input."
+    "> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input.\n",
+    "\n",
+    "> Note that we **drop the column** \"Revenue\" from the dataset in this step to avoid information leakage, since \"Quantity\" = \"Revenue\" / \"Price\". **Please modify this logic based on your data**."
    ]
   },
   {
diff --git a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py
index f2e11453d4..99ad74d7b0 100644
--- a/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py
+++ b/v1/python-sdk/tutorials/automl-with-azureml/forecasting-many-models/scripts/data_preprocessing_tabular.py
@@ -11,6 +11,12 @@ def main(args):
     dataset = run_context.input_datasets["train_10_models"]
     df = dataset.to_pandas_dataframe()
 
+    # Drop the "Revenue" column from the dataset to avoid information leakage,
+    # since "Quantity" = "Revenue" / "Price". Modify this logic to suit your data.
+    drop_column_name = "Revenue"
+    if drop_column_name in df.columns:
+        df.drop(drop_column_name, axis=1, inplace=True)
+
     # Apply any data pre-processing techniques here
 
     df.to_parquet(output / "data_prepared_result.parquet", compression=None)