Update notebooks (#2315)
* Update notebooks

* remove datasets

* fixing formatting issues

---------

Co-authored-by: Pavan Manoj Jonnalagadda <pavanmanojj@microsoft.com>
aggarwal-k and jpmann committed May 19, 2023
1 parent 214d603 commit e4da127
Showing 35 changed files with 4,956 additions and 2 deletions.
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
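
This deployment spec (deploy.yml) is consumed by the shell script later in this commit; as a rough standalone sketch, assuming an existing endpoint and registered model (all angle-bracket names are placeholders):

az ml online-deployment create --file deploy.yml --all-traffic \
    --set endpoint_name=<ENDPOINT_NAME> model=azureml:<MODEL_NAME>:<VERSION> \
    --resource-group <RESOURCE_GROUP> --workspace-name <WORKSPACE_NAME>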
@@ -0,0 +1,77 @@
# import library to parse command line arguments
import argparse, os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./squad-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as a JSON Lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
# load the train.jsonl and validation.jsonl files from the download directory
import pandas as pd

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)

# save 5% of the rows from the train dataframe into files with small_ prefix in the ./squad-dataset folder
train_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
# the original dataset does not have a test split, so split the validation dataframe into validation and test dataframes equally
validation_df, test_df = (
    validation_df[: len(validation_df) // 2],
    validation_df[len(validation_df) // 2 :],
)
# save 5% of the rows from the validation and test dataframes into files with small_ prefix in the ./squad-dataset folder
validation_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)

# read ./squad-dataset/small_test.jsonl into a pandas dataframe
import json

test_df = pd.read_json("./squad-dataset/small_test.jsonl", orient="records", lines=True)
# take 10 random samples
test_df = test_df.sample(n=10)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# flatten the nested json object in the "answers" column (keys "answer_start" and "text") into separate columns
json_struct = json.loads(test_df.to_json(orient="records"))
test_df = pd.json_normalize(json_struct)
# drop id and title columns
test_df = test_df.drop(columns=["id", "title"])

# create a json object with "input_data" as key, holding the "question" and "context" columns in pandas "split" orientation
test_df_copy = test_df[["question", "context"]]
test_json = {"input_data": test_df_copy.to_dict("split")}

# write the json object to a file named sample_score.json in the ./squad-dataset folder
with open("./squad-dataset/sample_score.json", "w") as f:
    json.dump(test_json, f)
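
For reference, pandas' to_dict("split") emits parallel index/columns/data lists, so the script above can be run and its output inspected roughly as follows (a sketch; the JSON values are elided):

python ./download-dataset.py --dataset squad --download_dir ./squad-dataset
# sample_score.json then has the pandas "split" layout, e.g.:
# {"input_data": {"index": [0, 1], "columns": ["question", "context"],
#                 "data": [["<question>", "<context>"], ["<question>", "<context>"]]}}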
@@ -0,0 +1,89 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

experiment_name: question-answering-extractive-qna

inputs:
  compute_model_import: gpu-cluster-big
  compute_preprocess: gpu-cluster-big
  compute_finetune: gpu-cluster-big
  compute_model_evaluation: gpu-cluster-big

  # specify the foundation model available in the azureml system registry
  mlflow_model_path:
    path: azureml://registries/azureml/models/bert-base-uncased/versions/3
  # huggingface_id: 'bert-base-uncased' # if you want to use a huggingface model, uncomment this line and comment out the path above

  # map the dataset files to parameters
  train_file_path:
    type: uri_file
    path: "squad-dataset/small_train.jsonl"
  validation_file_path:
    type: uri_file
    path: "squad-dataset/small_validation.jsonl"
  test_file_path:
    type: uri_file
    path: "squad-dataset/small_test.jsonl"
  evaluation_config_path:
    type: uri_file
    path: "../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"

  # The following parameters map to the dataset fields
  # the question whose answer needs to be extracted from the provided context
  # question_key parameter maps to the "question" field in the SQuAD dataset
  question_key: "question"
  # the context that contains the answer to the question
  # context_key parameter maps to the "context" field in the SQuAD dataset
  context_key: "context"
  # The value of this field is JSON with two nested keys, answer_start_key and answer_text_key, and their corresponding values
  # answers_key parameter maps to the "answers" field in the SQuAD dataset
  answers_key: "answers"
  # Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
  # in the SQuAD dataset, answer_start_key maps to "answer_start" under "answers"
  answer_start_key: "answer_start"
  # Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter.
  # in the SQuAD dataset, answer_text_key maps to "text" under "answers"
  answer_text_key: "text"
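  # For reference (not part of the pipeline spec): a single SQuAD record in the JSONL files
  # produced by download-dataset.py nests the two keys above under "answers", roughly:
  # {"id": "...", "title": "...", "context": "...", "question": "...",
  #  "answers": {"text": ["Denver Broncos"], "answer_start": [177]}}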

  # training settings
  number_of_gpu_to_use_finetuning: 2
  num_train_epochs: 3
  learning_rate: 2e-5

outputs:
  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
  # registering the model is required to deploy the model to an online or batch endpoint
  trained_model:
    type: mlflow_model

settings:
  force_rerun: true

jobs:
  question_answering_pipeline:
    type: pipeline
    component: azureml://registries/azureml/components/question_answering_pipeline/labels/latest
    inputs:
      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}

      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_preprocess: ${{parent.inputs.compute_preprocess}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}

      train_file_path: ${{parent.inputs.train_file_path}}
      validation_file_path: ${{parent.inputs.validation_file_path}}
      test_file_path: ${{parent.inputs.test_file_path}}
      evaluation_config: ${{parent.inputs.evaluation_config_path}}

      question_key: ${{parent.inputs.question_key}}
      context_key: ${{parent.inputs.context_key}}
      answers_key: ${{parent.inputs.answers_key}}
      answer_start_key: ${{parent.inputs.answer_start_key}}
      answer_text_key: ${{parent.inputs.answer_text_key}}

      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      learning_rate: ${{parent.inputs.learning_rate}}
    outputs:
      mlflow_model_folder: ${{parent.outputs.trained_model}}
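
The shell script below overrides these inputs at submission time; as a minimal sketch, the pipeline could also be submitted as-is (assuming the data paths resolve relative to the working directory; angle-bracket names are placeholders):

az ml job create --file ./extractive-qa-pipeline.yml \
    --resource-group <RESOURCE_GROUP> --workspace-name <WORKSPACE_NAME>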
@@ -0,0 +1,193 @@
set -x
# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
registry_name="azureml"

compute_cluster="gpu-cluster-big"
# if above compute cluster does not exist, create it with the following vm size
compute_sku="Standard_ND40rs_v2"
# This is the number of GPUs in a single node of the selected '$compute_sku' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node=2
# This is the foundation model for finetuning
model_name="bert-base-uncased"
# pinned model version - resolving the latest version automatically is not working yet (see note before job submission)
model_version=3

version=$(date +%s)
finetuned_model_name="${model_name}-extractive-qna"
endpoint_name="ext-qna-$version"
deployment_sku="Standard_DS3_v2"


# training data
train_data="squad-dataset/small_train.jsonl"
# validation data
validation_data="squad-dataset/small_validation.jsonl"
# test data
test_data="squad-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"
# scoring_file
scoring_file="squad-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="question_answering_pipeline"
# The following parameters map to the dataset fields
# the question whose answer needs to be extracted from the provided context 
# question_key parameter maps to the "question" field in the SQuAD dataset
question_key="question"
# the context that contains the answer to the question
# context_key parameter maps to the "context" field in the SQuAD dataset
context_key="context"
# The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key with their corresponding values
# answers_key parameter maps to the "answers" field in the SQuAD dataset
answers_key="answers"
# Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
# in the SQuAD dataset, answer_start_key maps to "answer_start" under "answers"
answer_start_key="answer_start"
# Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter.
# in the SQuAD dataset, answer_text_key maps to "text" under "answers"
answer_text_key="text"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
num_train_epochs=3
learning_rate=2e-5

# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
[ "$resource_group_name" = "<RESOURCE_GROUP>" ] || \
[ "$workspace_name" = "<WORKSPACE_NAME>" ]; then
echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
exit 1
fi

az account set -s $subscription_id
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster exists, else create it
if az ml compute show --name $compute_cluster $workspace_info
then
    echo "Compute cluster $compute_cluster already exists"
else
    echo "Creating compute cluster $compute_cluster"
    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster"
        exit 1
    }
fi

# download the dataset
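# note: download-dataset.py defaults to --dataset squad and --download_dir ./squad-dataset,
# which matches the small_*.jsonl and sample_score.json paths used in this script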

python ./download-dataset.py || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
then
    echo "Model $model_name:$model_version does not exist in registry $registry_name"
    exit 1
fi

# 3. Check if training data, validation data and test data exist
if [ ! -f $train_data ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -f $validation_data ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi
if [ ! -f $test_data ]; then
    echo "Test data $test_data does not exist"
    exit 1
fi

# 4. Submit finetuning job using pipeline.yml

# check if the finetuning pipeline component exists
if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# need to switch to using latest version for model, currently blocked with a bug.
# submit finetuning job
parent_job_name=$( az ml job create --file ./extractive-qa-pipeline.yml $workspace_info --query name -o tsv --set \
    jobs.question_answering_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
    inputs.compute_model_import=$compute_cluster \
    inputs.compute_preprocess=$compute_cluster \
    inputs.compute_finetune=$compute_cluster \
    inputs.compute_model_evaluation=$compute_cluster \
    inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
    inputs.train_file_path.path=$train_data \
    inputs.validation_file_path.path=$validation_data \
    inputs.test_file_path.path=$test_data \
    inputs.evaluation_config.path=$evaluation_config \
    inputs.question_key=$question_key \
    inputs.context_key=$context_key \
    inputs.answers_key=$answers_key \
    inputs.answer_start_key=$answer_start_key \
    inputs.answer_text_key=$answer_text_key \
    inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
    inputs.num_train_epochs=$num_train_epochs \
    inputs.learning_rate=$learning_rate ) || {
    echo "Failed to submit finetuning job"
    exit 1
}

az ml job stream --name $parent_job_name $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 5. Create model in workspace from train job output
az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
    --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 6. Deploy the model to an endpoint
# create online endpoint
az ml online-endpoint create --name $endpoint_name $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy model from registry to endpoint in workspace
# You can find here the list of SKU's supported for deployment - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
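# --all-traffic routes 100% of the endpoint's traffic to this new deployment once it is ready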
az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
    endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
    instance_type=$deployment_sku || {
    echo "deployment create failed"; exit 1;
}

# 7. Try a sample scoring request

# Check if scoring data file exists
if [ -f $scoring_file ]; then
    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
    cat $scoring_file
    echo -e "\n\n"
else
    echo "Scoring file $scoring_file does not exist"
    exit 1
fi

az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 8. Delete the endpoint
az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
Original file line number Diff line number Diff line change
@@ -81,7 +81,8 @@
 # rename the highlights column to ground_truth_summary
 test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
 # create a json object with "input_data" as key, holding the "article" column in pandas "split" orientation
-test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
+test_df_copy = test_df[["article"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
 # save the json object to a file named sample_score.json in the download directory
 with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
     json.dump(test_json, f)