
Commit

remove news summary dataset from repo, download using script in sample (#2250)

* remove news summary dataset from repo, download using script in sample

* fix to cli sample to remove dataset from repo

* typo fix

* formatting
ManojBableshwar committed May 10, 2023
1 parent 0abe0b5 commit 0b96822
Showing 12 changed files with 134 additions and 11,249 deletions.
@@ -0,0 +1,87 @@
# import library to parse command line arguments
import argparse
import os

import pandas as pd

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    # load the split of the dataset
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# hold out half of the validation data as test data for this sample
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine tuning
train_df.drop(columns=["id"], inplace=True)
validation_df.drop(columns=["id"], inplace=True)
test_df.drop(columns=["id"], inplace=True)


# save 20% of the rows from the dataframes into files with small_ prefix in the ./news-summary-dataset folder
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)


# generate sample scoring data
# read small_test.jsonl from the download directory into a pandas dataframe
import json

test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
# take 1 random sample
test_df = test_df.sample(n=1)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the json object to a file named sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
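
For reference, a minimal sanity check of the files this script produces (a sketch only, assuming the default --download_dir of ./news-summary-dataset; the helper below is illustrative and not part of the sample):

# sanity-check the artifacts written by download-dataset.py (illustrative helper)
import json
import os

import pandas as pd

download_dir = "./news-summary-dataset"  # assumes the default --download_dir
for name in ["small_train.jsonl", "small_validation.jsonl", "small_test.jsonl"]:
    df = pd.read_json(os.path.join(download_dir, name), orient="records", lines=True)
    # after the id column is dropped, each record should keep the article and highlights fields
    print(name, len(df), list(df.columns))

with open(os.path.join(download_dir, "sample_score.json")) as f:
    payload = json.load(f)
# the scoring request wraps the article text in {"inputs": {"input_string": [...]}}
print(len(payload["inputs"]["input_string"]), "article(s) in sample_score.json")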

@@ -3,9 +3,9 @@ set -x
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
subscription_id="21d8f407-c4c4-452e-87a4-e609bfb86248" #"<SUBSCRIPTION_ID>"
resource_group_name="rg-contoso-819prod" #"<RESOURCE_GROUP>",
workspace_name="mlw-contoso-819prod" #"WORKSPACE_NAME>",
registry_name="azureml"

compute_cluster="gpu-cluster-big"
@@ -27,15 +27,15 @@ deployment_sku="Standard_DS3_v2"


# training data
train_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_train.jsonl"
train_data="./news-summary-dataset/small_train.jsonl"
# validation data
validation_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_validation.jsonl"
validation_data="./news-summary-dataset/small_validation.jsonl"
# test data
test_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_test.jsonl"
test_data="./news-summary-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
evaluation_config="./summarization-config.json"
# scoring_file
scoring_file="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json"
scoring_file="./news-summary-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="summarization_pipeline"
@@ -49,6 +49,8 @@ number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs avail
num_train_epochs=3
learning_rate=2e-5



# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
@@ -73,6 +75,13 @@ else
}
fi

+# download the dataset

+python ./download-dataset.py || {
+    echo "Failed to download dataset"
+    exit 1
+}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name

@@ -0,0 +1,5 @@
{
"metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
"aggregator": true,
"stemmer": true
}
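
These keys mirror the options of the Hugging Face evaluate ROUGE metric; as an illustration only (the evaluation pipeline may wire this config up differently), the equivalent standalone computation would be:

# illustrative only: compute the ROUGE variants this config enables
import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["the court rejected the appeal on tuesday"],  # model summaries
    references=["on tuesday the court rejected the appeal"],   # ground-truth summaries
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_aggregator=True,  # corresponds to "aggregator": true
    use_stemmer=True,     # corresponds to "stemmer": true
)
print(scores)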

@@ -3,10 +3,10 @@

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
"--config_name", type=str, default="plain_text", help="config name of the dataset"
"--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
Expand All @@ -16,7 +16,7 @@
parser.add_argument(
    "--download_dir",
    type=str,
-    default="data",
+    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

This file was deleted.
