diff --git a/cli/foundation-models/system/finetune/question-answering/deploy.yml b/cli/foundation-models/system/finetune/question-answering/deploy.yml
new file mode 100644
index 0000000000..b5884aa6cb
--- /dev/null
+++ b/cli/foundation-models/system/finetune/question-answering/deploy.yml
@@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_DS3_v2
+instance_count: 1
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/question-answering/download-dataset.py b/cli/foundation-models/system/finetune/question-answering/download-dataset.py
new file mode 100644
index 0000000000..8fac4db4ba
--- /dev/null
+++ b/cli/foundation-models/system/finetune/question-answering/download-dataset.py
@@ -0,0 +1,77 @@
+# import library to parse command line arguments
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="./squad-dataset",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, split=split)
+    # save the split of the dataset to the download directory as a json lines file
+    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+
+# load the train.jsonl and validation.jsonl files from the download directory
+import pandas as pd
+
+train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
+validation_df = pd.read_json(
+    os.path.join(args.download_dir, "validation.jsonl"), lines=True
+)
+
+# save 5% of the rows from the train dataframe into a file with the small_ prefix in the ./squad-dataset folder
+train_df.sample(frac=0.05).to_json(
+    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
+)
+# the original dataset does not have a test split, so split the validation dataframe into validation and test dataframes equally
+validation_df, test_df = (
+    validation_df[: len(validation_df) // 2],
+    validation_df[len(validation_df) // 2 :],
+)
+# save 5% of the rows from the validation and test dataframes into files with the small_ prefix in the ./squad-dataset folder
+validation_df.sample(frac=0.05).to_json(
+    os.path.join(args.download_dir, "small_validation.jsonl"),
+    orient="records",
+    lines=True,
+)
+test_df.sample(frac=0.05).to_json(
+    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+
+# read small_test.jsonl back into a pandas dataframe
+import json
+
+test_df = pd.read_json(
+    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+# take 10 random samples
+test_df = test_df.sample(n=10)
+# rebuild index
+test_df.reset_index(drop=True, inplace=True)
+# flatten the json object in the "answers" column with the keys "answer_start" and "text"
+json_struct = json.loads(test_df.to_json(orient="records"))
+test_df = pd.json_normalize(json_struct)
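+# (for reference, json_normalize expands the nested "answers" dict into dotted
+# columns such as "answers.text" and "answers.answer_start")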
+# drop the id and title columns
+test_df = test_df.drop(columns=["id", "title"])
+
+# create a json object with "input_data" as key, holding the question and context columns in pandas "split" orientation
+test_df_copy = test_df[["question", "context"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
+
+# write the json object to a file named sample_score.json in the download directory
+with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
+    json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/question-answering/extractive-qa-pipeline.yml b/cli/foundation-models/system/finetune/question-answering/extractive-qa-pipeline.yml
new file mode 100644
index 0000000000..31cf74919e
--- /dev/null
+++ b/cli/foundation-models/system/finetune/question-answering/extractive-qa-pipeline.yml
@@ -0,0 +1,89 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: question-answering-extractive-qna
+
+inputs:
+  compute_model_import: gpu-cluster-big
+  compute_preprocess: gpu-cluster-big
+  compute_finetune: gpu-cluster-big
+  compute_model_evaluation: gpu-cluster-big
+
+  # specify the foundation model available in the azureml system registry
+  mlflow_model_path:
+    path: azureml://registries/azureml/models/bert-based-uncased/versions/3
+    # huggingface_id: 'bert-base-uncased' # if you want to use a huggingface model, uncomment this line and comment the above line
+
+  # map the dataset files to parameters
+  train_file_path:
+    type: uri_file
+    path: "squad-dataset/small_train.jsonl"
+  validation_file_path:
+    type: uri_file
+    path: "squad-dataset/small_validation.jsonl"
+  test_file_path:
+    type: uri_file
+    path: "squad-dataset/small_test.jsonl"
+  evaluation_config_path:
+    type: uri_file
+    path: "../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"
+
+  # The following parameters map to the dataset fields
+  # the question whose answer needs to be extracted from the provided context
+  # question_key parameter maps to the "question" field in the SQuAD dataset
+  question_key: "question"
+  # the context that contains the answer to the question
+  # context_key parameter maps to the "context" field in the SQuAD dataset
+  context_key: "context"
+  # The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key, with their corresponding values
+  # answers_key parameter maps to the "answers" field in the SQuAD dataset
+  answers_key: "answers"
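+  # e.g. a SQuAD "answers" value looks like {"text": ["Denver Broncos"], "answer_start": [177]}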
+  # Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
+  # in the SQuAD dataset, the answer_start_key maps to "answer_start" under "answers"
+  answer_start_key: "answer_start"
+  # Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter
+  # in the SQuAD dataset, the answer_text_key maps to "text" under "answers"
+  answer_text_key: "text"
+
+  # training settings
+  number_of_gpu_to_use_finetuning: 2
+  num_train_epochs: 3
+  learning_rate: 2e-5
+
+outputs:
+  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
+  # registering the model is required to deploy the model to an online or batch endpoint
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+
+jobs:
+  question_answering_pipeline:
+    type: pipeline
+    component: azureml://registries/azureml/components/question_answering_pipeline/labels/latest
+    inputs:
+      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}
+
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_preprocess: ${{parent.inputs.compute_preprocess}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+      train_file_path: ${{parent.inputs.train_file_path}}
+      validation_file_path: ${{parent.inputs.validation_file_path}}
+      test_file_path: ${{parent.inputs.test_file_path}}
+      evaluation_config: ${{parent.inputs.evaluation_config_path}}
+
+      question_key: ${{parent.inputs.question_key}}
+      context_key: ${{parent.inputs.context_key}}
+      answers_key: ${{parent.inputs.answers_key}}
+      answer_start_key: ${{parent.inputs.answer_start_key}}
+      answer_text_key: ${{parent.inputs.answer_text_key}}
+
+      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+      num_train_epochs: ${{parent.inputs.num_train_epochs}}
+      learning_rate: ${{parent.inputs.learning_rate}}
+    outputs:
+      mlflow_model_folder: ${{parent.outputs.trained_model}}
diff --git a/cli/foundation-models/system/finetune/question-answering/extractive-qa.sh b/cli/foundation-models/system/finetune/question-answering/extractive-qa.sh
new file mode 100644
index 0000000000..878368bf1f
--- /dev/null
+++ b/cli/foundation-models/system/finetune/question-answering/extractive-qa.sh
@@ -0,0 +1,193 @@
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
+# the data files are available in the same folder as the above notebook
+
+# script inputs
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+registry_name="azureml"
+
+compute_cluster="gpu-cluster-big"
+# if above compute cluster does not exist, create it with the following vm size
+compute_sku="Standard_ND40rs_v2"
+# This is the number of GPUs in a single node of the selected 'vm_size' compute.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
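+# For reference, a Standard_ND40rs_v2 node has 8 V100 GPUs, so gpus_per_node below can be raised to 8 on that SKU.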
+gpus_per_node=2
+# This is the foundation model for finetuning
+model_name="bert-base-uncased"
+# using the latest version of the model - not working yet
+model_version=3
+
+version=$(date +%s)
+finetuned_model_name=$model_name"-extractive-qna"
+endpoint_name="ext-qna-$version"
+deployment_sku="Standard_DS3_v2"
+
+
+# training data
+train_data="squad-dataset/small_train.jsonl"
+# validation data
+validation_data="squad-dataset/small_validation.jsonl"
+# test data
+test_data="squad-dataset/small_test.jsonl"
+# evaluation config
+evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"
+# scoring_file
+scoring_file="squad-dataset/sample_score.json"
+
+# finetuning job parameters
+finetuning_pipeline_component="question_answering_pipeline"
+# The following parameters map to the dataset fields
+# the question whose answer needs to be extracted from the provided context
+# question_key parameter maps to the "question" field in the SQuAD dataset
+question_key="question"
+# the context that contains the answer to the question
+# context_key parameter maps to the "context" field in the SQuAD dataset
+context_key="context"
+# The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key, with their corresponding values
+# answers_key parameter maps to the "answers" field in the SQuAD dataset
+answers_key="answers"
+# Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
+# in the SQuAD dataset, the answer_start_key maps to "answer_start" under "answers"
+answer_start_key="answer_start"
+# Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter
+# in the SQuAD dataset, the answer_text_key maps to "text" under "answers"
+answer_text_key="text"
+# Training settings
+number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
+num_train_epochs=3
+learning_rate=2e-5
+
+# 1. Setup pre-requisites
+
+if [ "$subscription_id" = "" ] || \
+   [ "$resource_group_name" = "" ] || \
+   [ "$workspace_name" = "" ]; then
+    echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
+    exit 1
+fi
+
+az account set -s $subscription_id
+workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
+
+# check if $compute_cluster exists, else create it
+if az ml compute show --name $compute_cluster $workspace_info
+then
+    echo "Compute cluster $compute_cluster already exists"
+else
+    echo "Creating compute cluster $compute_cluster"
+    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
+        echo "Failed to create compute cluster $compute_cluster"
+        exit 1
+    }
+fi
+
+# download the dataset
+
+python ./download-dataset.py || {
+    echo "Failed to download dataset"
+    exit 1
+}
+
+# 2. Check if the model exists in the registry
+# need to confirm model show command works for registries outside the tenant (aka system registry)
+if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
+then
+    echo "Model $model_name:$model_version does not exist in registry $registry_name"
+    exit 1
+fi
+
+# 3. Check if training data, validation data and test data exist
+if [ ! -f $train_data ]; then
+    echo "Training data $train_data does not exist"
+    exit 1
+fi
+if [ ! -f $validation_data ]; then
+    echo "Validation data $validation_data does not exist"
+    exit 1
+fi
+if [ ! -f $test_data ]; then
+    echo "Test data $test_data does not exist"
+    exit 1
+fi
+
+# 4. Submit finetuning job using pipeline.yml
+
+# check if the finetuning pipeline component exists
+if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
+then
+    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
+    exit 1
+fi
+
+# need to switch to using latest version for model, currently blocked with a bug.
+# submit finetuning job
+parent_job_name=$( az ml job create --file ./extractive-qa-pipeline.yml $workspace_info --query name -o tsv --set \
+  jobs.question_answering_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+  inputs.compute_model_import=$compute_cluster \
+  inputs.compute_preprocess=$compute_cluster \
+  inputs.compute_finetune=$compute_cluster \
+  inputs.compute_model_evaluation=$compute_cluster \
+  inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
+  inputs.train_file_path.path=$train_data \
+  inputs.validation_file_path.path=$validation_data \
+  inputs.test_file_path.path=$test_data \
+  inputs.evaluation_config.path=$evaluation_config \
+  inputs.question_key=$question_key \
+  inputs.context_key=$context_key \
+  inputs.answers_key=$answers_key \
+  inputs.answer_start_key=$answer_start_key \
+  inputs.answer_text_key=$answer_text_key \
+  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
+  inputs.num_train_epochs=$num_train_epochs \
+  inputs.learning_rate=$learning_rate ) || {
+    echo "Failed to submit finetuning job"
+    exit 1
+  }
+
+az ml job stream --name $parent_job_name $workspace_info || {
+    echo "job stream failed"; exit 1;
+}
+
+# 5. Create model in workspace from train job output
+az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
+ --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
+    echo "model create in workspace failed"; exit 1;
+}
+
+# 6. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy model from registry to endpoint in workspace
+# You can find the list of SKUs supported for deployment here - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $scoring_file ]; then
+    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+    cat $scoring_file
+    echo -e "\n\n"
+else
+    echo "Scoring file $scoring_file does not exist"
+    exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+# 8. Delete the endpoint
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
diff --git a/cli/foundation-models/system/finetune/summarization/deploy.yml b/cli/foundation-models/system/finetune/summarization/deploy.yml
new file mode 100644
index 0000000000..b5884aa6cb
--- /dev/null
+++ b/cli/foundation-models/system/finetune/summarization/deploy.yml
@@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_DS3_v2
+instance_count: 1
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/summarization/download-dataset.py b/cli/foundation-models/system/finetune/summarization/download-dataset.py
index e2b98eac10..e93db5801e 100644
--- a/cli/foundation-models/system/finetune/summarization/download-dataset.py
+++ b/cli/foundation-models/system/finetune/summarization/download-dataset.py
@@ -81,7 +81,8 @@
 # rename the highlights column to ground_truth_summary
 test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
-# create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
-test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
-# save the json object to a file named sample_score.json in the ./emotion-dataset folder
+# create a json object with "input_data" as key, holding the article column of the test dataframe in pandas "split" orientation
+test_df_copy = test_df[["article"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
+# save the json object to a file named sample_score.json in the download directory
 with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
     json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/summarization/news-summary-pipeline.yml b/cli/foundation-models/system/finetune/summarization/news-summary-pipeline.yml
new file mode 100644
index 0000000000..a2c33045f8
--- /dev/null
+++ b/cli/foundation-models/system/finetune/summarization/news-summary-pipeline.yml
@@ -0,0 +1,76 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: summarization-news-summary
+
+inputs:
+  compute_model_import: gpu-cluster-big
+  compute_preprocess: gpu-cluster-big
+  compute_finetune: gpu-cluster-big
+  compute_model_evaluation: gpu-cluster-big
+
+  # specify the foundation model available in the azureml system registry
+  mlflow_model_path:
+    path: azureml://registries/azureml/models/t5-small/versions/3
+    # huggingface_id: 't5-small' # if you want to use a huggingface model, uncomment this line and comment the above line
+
+  # map the dataset files to parameters
+  train_file_path:
+    type: uri_file
+    path: "news-summary-dataset/small_train.jsonl"
+  validation_file_path:
+    type: uri_file
+    path: "news-summary-dataset/small_validation.jsonl"
+  test_file_path:
+    type: uri_file
+    path: "news-summary-dataset/small_test.jsonl"
+  evaluation_config_path:
+    type: uri_file
+    path: "../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
+
+
+  # The following parameters map to the dataset fields
+  # document_key parameter maps to the "article" field in the news summary dataset
+  document_key: "article"
+  # summary_key parameter maps to the "highlights" field in the news summary dataset
+  summary_key: "highlights"
+
+  # training settings
+  number_of_gpu_to_use_finetuning: 2
+  num_train_epochs: 3
+  learning_rate: 2e-5
+
+outputs:
+  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
+  # registering the model is required to deploy the model to an online or batch endpoint
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+
+jobs:
+  summarization_pipeline:
+    type: pipeline
+    component: azureml://registries/azureml/components/summarization_pipeline/labels/latest
+    inputs:
+      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}
+
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_preprocess: ${{parent.inputs.compute_preprocess}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+      train_file_path: ${{parent.inputs.train_file_path}}
+      validation_file_path: ${{parent.inputs.validation_file_path}}
+      test_file_path: ${{parent.inputs.test_file_path}}
+      evaluation_config: ${{parent.inputs.evaluation_config_path}}
+
+      document_key: ${{parent.inputs.document_key}}
+      summary_key: ${{parent.inputs.summary_key}}
+
+      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+      num_train_epochs: ${{parent.inputs.num_train_epochs}}
+      learning_rate: ${{parent.inputs.learning_rate}}
+    outputs:
+      mlflow_model_folder: ${{parent.outputs.trained_model}}
diff --git a/cli/foundation-models/system/finetune/summarization/news-summary.sh b/cli/foundation-models/system/finetune/summarization/news-summary.sh
new file mode 100644
index 0000000000..05b0d193a5
--- /dev/null
+++ b/cli/foundation-models/system/finetune/summarization/news-summary.sh
@@ -0,0 +1,181 @@
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
+# the data files are available in the same folder as the above notebook
+
+# script inputs
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+registry_name="azureml"
+
+compute_cluster="gpu-cluster-big"
+# if above compute cluster does not exist, create it with the following vm size
+compute_sku="Standard_ND40rs_v2"
+# This is the number of GPUs in a single node of the selected 'vm_size' compute.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
+gpus_per_node=2
+# This is the foundation model for finetuning
+model_name="t5-small"
+# using the latest version of the model - not working yet
+model_version=3
+
+version=$(date +%s)
+finetuned_model_name=$model_name"-news-summary"
+endpoint_name="news-summary-$version"
+deployment_sku="Standard_DS3_v2"
+
+
+# training data
+train_data="./news-summary-dataset/small_train.jsonl"
+# validation data
+validation_data="./news-summary-dataset/small_validation.jsonl"
+# test data
+test_data="./news-summary-dataset/small_test.jsonl"
+# evaluation config
+evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
+# scoring_file
+scoring_file="./news-summary-dataset/sample_score.json"
+
+# finetuning job parameters
+finetuning_pipeline_component="summarization_pipeline"
+# The following parameters map to the dataset fields
+# document_key parameter maps to the "article" field in the news summary dataset
+document_key="article"
+# summary_key parameter maps to the "highlights" field in the news summary dataset
+summary_key="highlights"
+# Training settings
+number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
+num_train_epochs=3
+learning_rate=2e-5
+
+
+
+# 1.
Setup pre-requisites + +if [ "$subscription_id" = "" ] || \ + [ "$resource_group_name" = "" ] || \ + [ "$workspace_name" = "" ]; then + echo "Please update the script with the subscription_id, resource_group_name and workspace_name" + exit 1 +fi + +az account set -s $subscription_id +workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name" + +# check if $compute_cluster exists, else create it +if az ml compute show --name $compute_cluster $workspace_info +then + echo "Compute cluster $compute_cluster already exists" +else + echo "Creating compute cluster $compute_cluster" + az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster" + exit 1 + } +fi + +# download the dataset + +python ./download-dataset.py || { + echo "Failed to download dataset" + exit 1 +} + +# 2. Check if the model exists in the registry +# need to confirm model show command works for registries outside the tenant (aka system registry) +if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name +then + echo "Model $model_name:$model_version does not exist in registry $registry_name" + exit 1 +fi + +# 3. Check if training data, validation data and test data exist +if [ ! -f $train_data ]; then + echo "Training data $train_data does not exist" + exit 1 +fi +if [ ! -f $validation_data ]; then + echo "Validation data $validation_data does not exist" + exit 1 +fi +if [ ! -f $test_data ]; then + echo "Test data $test_data does not exist" + exit 1 +fi + +# 4. Submit finetuning job using pipeline.yml + +# check if the finetuning pipeline component exists +if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name +then + echo "Finetuning pipeline component $finetuning_pipeline_component does not exist" + exit 1 +fi + +# need to switch to using latest version for model, currently blocked with a bug. +# submit finetuning job +parent_job_name=$( az ml job create --file ./news-summary-pipeline.yml $workspace_info --query name -o tsv --set \ + jobs.summarization_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \ + inputs.compute_model_import=$compute_cluster \ + inputs.compute_preprocess=$compute_cluster \ + inputs.compute_finetune=$compute_cluster \ + inputs.compute_model_evaluation=$compute_cluster \ + inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \ + inputs.train_file_path.path=$train_data \ + inputs.validation_file_path.path=$validation_data \ + inputs.test_file_path.path=$test_data \ + inputs.evaluation_config.path=$evaluation_config \ + inputs.document_key=$document_key \ + inputs.summary_key=$summary_key \ + inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \ + inputs.num_train_epochs=$num_train_epochs \ + inputs.learning_rate=$learning_rate ) || { + echo "Failed to submit finetuning job" + exit 1 + } + +az ml job stream --name $parent_job_name $workspace_info || { + echo "job stream failed"; exit 1; +} + +# 5. Create model in workspace from train job output +az ml model create --name $finetuned_model_name --version $version --type mlflow_model \ + --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || { + echo "model create in workspace failed"; exit 1; +} + +# 6. 
Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy model from registry to endpoint in workspace
+# You can find the list of SKUs supported for deployment here - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $scoring_file ]; then
+    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+    cat $scoring_file
+    echo -e "\n\n"
+else
+    echo "Scoring file $scoring_file does not exist"
+    exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+# 8. Delete the endpoint
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
diff --git a/cli/foundation-models/system/finetune/text-classification/deploy.yml b/cli/foundation-models/system/finetune/text-classification/deploy.yml
new file mode 100644
index 0000000000..b5884aa6cb
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-classification/deploy.yml
@@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_DS3_v2
+instance_count: 1
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/text-classification/download-dataset.py b/cli/foundation-models/system/finetune/text-classification/download-dataset.py
new file mode 100644
index 0000000000..cbcbb32645
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-classification/download-dataset.py
@@ -0,0 +1,108 @@
+# import library to parse command line arguments
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument(
+    "--dataset", type=str, default="dair-ai/emotion", help="dataset name"
+)
+# add an argument to specify the dataset subset to download
+parser.add_argument(
+    "--dataset_subset", type=str, default="split", help="dataset subset name"
+)
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="emotion-dataset",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, split=split)
+    # save the split of the dataset to the download directory as a json lines file
+    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+
+# get label2id and id2label mapping
+
+# get any split of data
+split = get_dataset_split_names(args.dataset)[0]
+dataset = load_dataset(args.dataset, split=split)
+
+labels = dataset.features["label"].names
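+# e.g. for dair-ai/emotion the label names are
+# ["sadness", "joy", "love", "anger", "fear", "surprise"]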
+
+id2label = {}
+label2id = {}
+
+for i, label in enumerate(labels):
+    id2label[i] = label
+    label2id[label] = i
+
+label_mapping = {"id2label": id2label, "label2id": label2id}
+
+import json
+
+with open(os.path.join(args.download_dir, "label.json"), "w") as f:
+    json.dump(label_mapping, f)
+
+# load the id2label element of label.json into a pandas dataframe with an int64 'label' column (the keys) and a string 'label_string' column (the values)
+import pandas as pd
+
+with open(os.path.join(args.download_dir, "label.json")) as f:
+    id2label = json.load(f)
+    id2label = id2label["id2label"]
+    label_df = pd.DataFrame.from_dict(
+        id2label, orient="index", columns=["label_string"]
+    )
+    label_df["label"] = label_df.index.astype("int64")
+    label_df = label_df[["label", "label_string"]]
+
+test_df = pd.read_json(os.path.join(args.download_dir, "test.jsonl"), lines=True)
+train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
+validation_df = pd.read_json(
+    os.path.join(args.download_dir, "validation.jsonl"), lines=True
+)
+# join the train, validation and test dataframes with the id2label dataframe to get the label_string column
+train_df = train_df.merge(label_df, on="label", how="left")
+validation_df = validation_df.merge(label_df, on="label", how="left")
+test_df = test_df.merge(label_df, on="label", how="left")
+
+# save 10% of the rows from the train, validation and test dataframes into files with the small_ prefix in the download directory
+train_df.sample(frac=0.1).to_json(
+    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
+)
+validation_df.sample(frac=0.1).to_json(
+    os.path.join(args.download_dir, "small_validation.jsonl"),
+    orient="records",
+    lines=True,
+)
+test_df.sample(frac=0.1).to_json(
+    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+
+# read small_test.jsonl into a pandas dataframe
+test_df = pd.read_json(os.path.join(args.download_dir, "small_test.jsonl"), lines=True)
+# take 10 random samples
+test_df = test_df.sample(n=10)
+# rebuild index
+test_df.reset_index(drop=True, inplace=True)
+# rename the label_string column to ground_truth_label
+test_df = test_df.rename(columns={"label_string": "ground_truth_label"})
+
+# create a json object with "input_data" as key, holding the text column of the test dataframe in pandas "split" orientation
+test_df_copy = test_df[["text"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
+# save the json object to a file named sample_score.json in the ./emotion-dataset folder
+with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
+    json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/text-classification/emotion-detection-pipeline.yml b/cli/foundation-models/system/finetune/text-classification/emotion-detection-pipeline.yml
new file mode 100644
index 0000000000..4d31adbd66
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-classification/emotion-detection-pipeline.yml
@@ -0,0 +1,74 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: text-classification-emotion-detection
+
+inputs:
+  compute_model_import: gpu-cluster-big
+  compute_preprocess: gpu-cluster-big
+  compute_finetune: gpu-cluster-big
+  compute_model_evaluation: gpu-cluster-big
+
+  # specify the foundation model available in the azureml system registry
+  mlflow_model_path:
+    path: azureml://registries/azureml/models/bert-based-uncased/versions/3
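+    # note: the registry asset name above ("bert-based-uncased") is not the same string as the Hugging Face id; verify the exact name with `az ml model list --registry-name azureml`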
+    # huggingface_id: 'bert-base-uncased' # if you want to use a huggingface model, uncomment this line and comment the above line
+
+  # map the dataset files to parameters
+  train_file_path:
+    type: uri_file
+    path: "emotion-dataset/small_train.jsonl"
+  validation_file_path:
+    type: uri_file
+    path: "emotion-dataset/small_validation.jsonl"
+  test_file_path:
+    type: uri_file
+    path: "emotion-dataset/small_test.jsonl"
+  evaluation_config_path:
+    type: uri_file
+    path: "../../../../../sdk/python/foundation-models/system/finetune/text-classification/text-classification-config.json"
+
+
+  # The following parameters map to the dataset fields
+  sentence1_key: "text"
+  label_key: "label_string"
+
+  # training settings
+  number_of_gpu_to_use_finetuning: 2
+  num_train_epochs: 3
+  learning_rate: 2e-5
+
+outputs:
+  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
+  # registering the model is required to deploy the model to an online or batch endpoint
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+
+jobs:
+  text_classification_pipeline:
+    type: pipeline
+    component: azureml://registries/azureml/components/text_classification_pipeline/labels/latest
+    inputs:
+      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}
+
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_preprocess: ${{parent.inputs.compute_preprocess}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+      train_file_path: ${{parent.inputs.train_file_path}}
+      validation_file_path: ${{parent.inputs.validation_file_path}}
+      test_file_path: ${{parent.inputs.test_file_path}}
+      evaluation_config: ${{parent.inputs.evaluation_config_path}}
+
+      sentence1_key: ${{parent.inputs.sentence1_key}}
+      label_key: ${{parent.inputs.label_key}}
+
+      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+      num_train_epochs: ${{parent.inputs.num_train_epochs}}
+      learning_rate: ${{parent.inputs.learning_rate}}
+    outputs:
+      mlflow_model_folder: ${{parent.outputs.trained_model}}
diff --git a/cli/foundation-models/system/finetune/text-classification/emotion-detection.sh b/cli/foundation-models/system/finetune/text-classification/emotion-detection.sh
new file mode 100644
index 0000000000..66f4b6d364
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-classification/emotion-detection.sh
@@ -0,0 +1,177 @@
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
+# the data files are available in the same folder as the above notebook
+
+# script inputs
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+registry_name="azureml"
+
+compute_cluster="gpu-cluster-big"
+# if above compute cluster does not exist, create it with the following vm size
+compute_sku="Standard_ND40rs_v2"
+# This is the number of GPUs in a single node of the selected 'vm_size' compute.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
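+# For example, on Standard_ND40rs_v2 (8 V100 GPUs per node) gpus_per_node could be raised to 8.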
+gpus_per_node=2 +# This is the foundation model for finetuning +model_name="bert-base-uncased" +# using the latest version of the model - not working yet +model_version=3 + +version=$(date +%s) +finetuned_model_name=$model_name"-emotion-detection" +endpoint_name="emotion-$version" +deployment_sku="Standard_DS3_v2" + + +# training data +train_data="emotion-dataset/small_train.jsonl" +# validation data +validation_data="emotion-dataset/small_validation.jsonl" +# test data +test_data="emotion-dataset/small_test.jsonl" +# evaluation config +evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/text-classification/text-classification-config.json" +# scoring_file +scoring_file="emotion-dataset/sample_score.json" + +# finetuning job parameters +finetuning_pipeline_component="text_classification_pipeline" +# The following parameters map to the dataset fields +sentence1_key="text" +label_key="label_string" +# Training settings +number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute +num_train_epochs=3 +learning_rate=2e-5 + +# 1. Setup pre-requisites + +if [ "$subscription_id" = "" ] || \ + [ "$resource_group_name" = "" ] || \ + [ "$workspace_name" = "" ]; then + echo "Please update the script with the subscription_id, resource_group_name and workspace_name" + exit 1 +fi + +az account set -s $subscription_id +workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name" + +# check if $compute_cluster exists, else create it +if az ml compute show --name $compute_cluster $workspace_info +then + echo "Compute cluster $compute_cluster already exists" +else + echo "Creating compute cluster $compute_cluster" + az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster" + exit 1 + } +fi + +# download the dataset + +python ./download-dataset.py || { + echo "Failed to download dataset" + exit 1 +} + +# 2. Check if the model exists in the registry +# need to confirm model show command works for registries outside the tenant (aka system registry) +if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name +then + echo "Model $model_name:$model_version does not exist in registry $registry_name" + exit 1 +fi + +# 3. Check if training data, validation data and test data exist +if [ ! -f $train_data ]; then + echo "Training data $train_data does not exist" + exit 1 +fi +if [ ! -f $validation_data ]; then + echo "Validation data $validation_data does not exist" + exit 1 +fi +if [ ! -f $test_data ]; then + echo "Test data $test_data does not exist" + exit 1 +fi + +# 4. Submit finetuning job using pipeline.yml + +# check if the finetuning pipeline component exists +if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name +then + echo "Finetuning pipeline component $finetuning_pipeline_component does not exist" + exit 1 +fi + +# need to switch to using latest version for model, currently blocked with a bug. 
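+# (hypothetical) once that bug is fixed, the model input below could pin the latest label instead of a fixed version:
+#   inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/labels/latest"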
# submit finetuning job
+parent_job_name=$( az ml job create --file ./emotion-detection-pipeline.yml $workspace_info --query name -o tsv --set \
+  jobs.text_classification_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+  inputs.compute_model_import=$compute_cluster \
+  inputs.compute_preprocess=$compute_cluster \
+  inputs.compute_finetune=$compute_cluster \
+  inputs.compute_model_evaluation=$compute_cluster \
+  inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
+  inputs.train_file_path.path=$train_data \
+  inputs.validation_file_path.path=$validation_data \
+  inputs.test_file_path.path=$test_data \
+  inputs.evaluation_config.path=$evaluation_config \
+  inputs.sentence1_key=$sentence1_key \
+  inputs.label_key=$label_key \
+  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
+  inputs.num_train_epochs=$num_train_epochs \
+  inputs.learning_rate=$learning_rate ) || {
+    echo "Failed to submit finetuning job"
+    exit 1
+  }
+
+az ml job stream --name $parent_job_name $workspace_info || {
+    echo "job stream failed"; exit 1;
+}
+
+# 5. Create model in workspace from train job output
+az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
+ --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
+    echo "model create in workspace failed"; exit 1;
+}
+
+# 6. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy model from registry to endpoint in workspace
+# You can find the list of SKUs supported for deployment here - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $scoring_file ]; then
+    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+    cat $scoring_file
+    echo -e "\n\n"
+else
+    echo "Scoring file $scoring_file does not exist"
+    exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+# 8. Delete the endpoint
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
diff --git a/cli/foundation-models/system/finetune/token-classification/deploy.yml b/cli/foundation-models/system/finetune/token-classification/deploy.yml
new file mode 100644
index 0000000000..b5884aa6cb
--- /dev/null
+++ b/cli/foundation-models/system/finetune/token-classification/deploy.yml
@@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_DS3_v2
+instance_count: 1
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/token-classification/download-dataset.py b/cli/foundation-models/system/finetune/token-classification/download-dataset.py
new file mode 100644
index 0000000000..65c7644eed
--- /dev/null
+++ b/cli/foundation-models/system/finetune/token-classification/download-dataset.py
@@ -0,0 +1,82 @@
+# import library to parse command line arguments
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument("--dataset", type=str, default="conll2003", help="dataset name")
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="conll2003-dataset",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+
+def format_ner_tags(example, class_names):
+    example["text"] = " ".join(example["tokens"])
+    example["ner_tags_str"] = [class_names[id] for id in example["ner_tags"]]
+    return example
+
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+from functools import partial
+
+for split in get_dataset_split_names(args.dataset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, split=split)
+    dataset = dataset.map(
+        partial(format_ner_tags, class_names=dataset.features["ner_tags"].feature.names)
+    )
+    # save the split of the dataset to the download directory as a json lines file
+    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+
+import pandas as pd
+
+# load test.jsonl, train.jsonl and validation.jsonl from the ./conll2003-dataset folder into pandas dataframes
+test_df = pd.read_json(os.path.join(args.download_dir, "test.jsonl"), lines=True)
+train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
+validation_df = pd.read_json(
+    os.path.join(args.download_dir, "validation.jsonl"), lines=True
+)
+
+# save 10% of the rows from the train, validation and test dataframes into files with the small_ prefix in the ./conll2003-dataset folder
+train_df.sample(frac=0.1).to_json(
+    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
+)
+validation_df.sample(frac=0.1).to_json(
+    os.path.join(args.download_dir, "small_validation.jsonl"),
+    orient="records",
+    lines=True,
+)
+test_df.sample(frac=0.1).to_json(
+    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+
+
+# read ./conll2003-dataset/small_test.jsonl into a pandas dataframe
+test_df = pd.read_json(os.path.join(args.download_dir, "small_test.jsonl"), lines=True)
+# take 10 random samples
+test_df = test_df.sample(n=10)
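+# the numeric tag ids and the pos/chunk columns are not needed for scoring; keep
+# the tokens plus the readable ner_tags_str labels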
+# drop the id, pos_tags, chunk_tags and ner_tags columns
+test_df.drop(columns=["id", "pos_tags", "chunk_tags", "ner_tags"], inplace=True)
+# rebuild index
+test_df.reset_index(drop=True, inplace=True)
+# rename the ner_tags_str column to ground_truth_tags
+test_df = test_df.rename(columns={"ner_tags_str": "ground_truth_tags"})
+
+import json
+
+# create a json object with "input_data" as key, holding the tokens column of the test dataframe in pandas "split" orientation
+test_df_copy = test_df[["tokens"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
+# save the json object to a file named sample_score.json in the ./conll2003-dataset folder
+with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
+    json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/token-classification/token-classification-pipeline.yml b/cli/foundation-models/system/finetune/token-classification/token-classification-pipeline.yml
new file mode 100644
index 0000000000..05975de2f5
--- /dev/null
+++ b/cli/foundation-models/system/finetune/token-classification/token-classification-pipeline.yml
@@ -0,0 +1,74 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: token-classification-ner
+
+inputs:
+  compute_model_import: gpu-cluster-big
+  compute_preprocess: gpu-cluster-big
+  compute_finetune: gpu-cluster-big
+  compute_model_evaluation: gpu-cluster-big
+
+  # specify the foundation model available in the azureml system registry
+  mlflow_model_path:
+    path: azureml://registries/azureml/models/bert-based-uncased/versions/3
+    # huggingface_id: 'bert-base-uncased' # if you want to use a huggingface model, uncomment this line and comment the above line
+
+  # map the dataset files to parameters
+  train_file_path:
+    type: uri_file
+    path: "conll2003-dataset/small_train.jsonl"
+  validation_file_path:
+    type: uri_file
+    path: "conll2003-dataset/small_validation.jsonl"
+  test_file_path:
+    type: uri_file
+    path: "conll2003-dataset/small_test.jsonl"
+  evaluation_config_path:
+    type: uri_file
+    path: "../../../../../sdk/python/foundation-models/system/finetune/token-classification/token-classification-config.json"
+
+
+  # The following parameters map to the dataset fields
+  token_key: "tokens"
+  tag_key: "ner_tags_str"
+
+  # training settings
+  number_of_gpu_to_use_finetuning: 2
+  num_train_epochs: 3
+  learning_rate: 2e-5
+
+outputs:
+  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
+  # registering the model is required to deploy the model to an online or batch endpoint
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+
+jobs:
+  token_classification_pipeline:
+    type: pipeline
+    component: azureml://registries/azureml/components/token_classification_pipeline/labels/latest
+    inputs:
+      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}
+
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_preprocess: ${{parent.inputs.compute_preprocess}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+      train_file_path: ${{parent.inputs.train_file_path}}
+      validation_file_path: ${{parent.inputs.validation_file_path}}
+      test_file_path: ${{parent.inputs.test_file_path}}
+      evaluation_config: ${{parent.inputs.evaluation_config_path}}
+
+      token_key: ${{parent.inputs.token_key}}
+      tag_key: ${{parent.inputs.tag_key}}
+
+      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+      
num_train_epochs: ${{parent.inputs.num_train_epochs}} + learning_rate: ${{parent.inputs.learning_rate}} + outputs: + mlflow_model_folder: ${{parent.outputs.trained_model}} diff --git a/cli/foundation-models/system/finetune/token-classification/token-classification.sh b/cli/foundation-models/system/finetune/token-classification/token-classification.sh new file mode 100644 index 0000000000..8b5accf937 --- /dev/null +++ b/cli/foundation-models/system/finetune/token-classification/token-classification.sh @@ -0,0 +1,177 @@ +set -x +# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection +# the data files are available in the same folder as the above notebook + +# script inputs +subscription_id="" +resource_group_name="" +workspace_name="" +registry_name="azureml" + +compute_cluster="gpu-cluster-big" +# if above compute cluster does not exist, create it with the following vm size +compute_sku="Standard_ND40rs_v2" +# This is the number of GPUs in a single node of the selected 'vm_size' compute. +# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train. +# Setting this to more than the number of GPUs will result in an error. +gpus_per_node=2 +# This is the foundation model for finetuning +model_name="bert-base-uncased" +# using the latest version of the model - not working yet +model_version=3 + +version=$(date +%s) +finetuned_model_name=$model_name"-ner" +endpoint_name="ner-$version" +deployment_sku="Standard_DS3_v2" + + +# training data +train_data="conll2003-dataset/small_train.jsonl" +# validation data +validation_data="conll2003-dataset/small_validation.jsonl" +# test data +test_data="conll2003-dataset/small_test.jsonl" +# evaluation config +evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/token-classification/token-classification-config.json" +# scoring_file +scoring_file="conll2003-dataset/sample_score.json" + +# finetuning job parameters +finetuning_pipeline_component="token_classification_pipeline" +# The following parameters map to the dataset fields +token_key="tokens" +tag_key="ner_tags_str" +# Training settings +number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute +num_train_epochs=3 +learning_rate=2e-5 + +# 1. Setup pre-requisites + +if [ "$subscription_id" = "" ] || \ + [ "$resource_group_name" = "" ] || \ + [ "$workspace_name" = "" ]; then + echo "Please update the script with the subscription_id, resource_group_name and workspace_name" + exit 1 +fi + +az account set -s $subscription_id +workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name" + +# check if $compute_cluster exists, else create it +if az ml compute show --name $compute_cluster $workspace_info +then + echo "Compute cluster $compute_cluster already exists" +else + echo "Creating compute cluster $compute_cluster" + az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster" + exit 1 + } +fi + +# download the dataset + +python ./download-dataset.py || { + echo "Failed to download dataset" + exit 1 +} + +# 2. Check if the model exists in the registry +# need to confirm model show command works for registries outside the tenant (aka system registry) +if ! 
az ml model show --name $model_name --version $model_version --registry-name $registry_name
+then
+    echo "Model $model_name:$model_version does not exist in registry $registry_name"
+    exit 1
+fi
+
+# 3. Check if training data, validation data and test data exist
+if [ ! -f $train_data ]; then
+    echo "Training data $train_data does not exist"
+    exit 1
+fi
+if [ ! -f $validation_data ]; then
+    echo "Validation data $validation_data does not exist"
+    exit 1
+fi
+if [ ! -f $test_data ]; then
+    echo "Test data $test_data does not exist"
+    exit 1
+fi
+
+# 4. Submit finetuning job using pipeline.yml
+
+# check if the finetuning pipeline component exists
+if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
+then
+    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
+    exit 1
+fi
+
+# need to switch to using latest version for model, currently blocked with a bug.
+# submit finetuning job
+parent_job_name=$( az ml job create --file ./token-classification-pipeline.yml $workspace_info --query name -o tsv --set \
+  jobs.token_classification_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+  inputs.compute_model_import=$compute_cluster \
+  inputs.compute_preprocess=$compute_cluster \
+  inputs.compute_finetune=$compute_cluster \
+  inputs.compute_model_evaluation=$compute_cluster \
+  inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
+  inputs.train_file_path.path=$train_data \
+  inputs.validation_file_path.path=$validation_data \
+  inputs.test_file_path.path=$test_data \
+  inputs.evaluation_config.path=$evaluation_config \
+  inputs.token_key=$token_key \
+  inputs.tag_key=$tag_key \
+  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
+  inputs.num_train_epochs=$num_train_epochs \
+  inputs.learning_rate=$learning_rate ) || {
+    echo "Failed to submit finetuning job"
+    exit 1
+  }
+
+az ml job stream --name $parent_job_name $workspace_info || {
+    echo "job stream failed"; exit 1;
+}
+
+# 5. Create model in workspace from train job output
+az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
+ --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
+    echo "model create in workspace failed"; exit 1;
+}
+
+# 6. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy model from registry to endpoint in workspace
+# You can find the list of SKUs supported for deployment here - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $scoring_file ]; then
+    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+    cat $scoring_file
+    echo -e "\n\n"
+else
+    echo "Scoring file $scoring_file does not exist"
+    exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
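+# (optional) the scoring URI and auth keys can be fetched for use outside the CLI, e.g.:
+#   az ml online-endpoint show --name $endpoint_name $workspace_info --query scoring_uri -o tsv
+#   az ml online-endpoint get-credentials --name $endpoint_name $workspace_info
+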
+# 8. Delete the endpoint
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
diff --git a/cli/foundation-models/system/finetune/translation/deploy.yml b/cli/foundation-models/system/finetune/translation/deploy.yml
new file mode 100644
index 0000000000..b5884aa6cb
--- /dev/null
+++ b/cli/foundation-models/system/finetune/translation/deploy.yml
@@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_DS3_v2
+instance_count: 1
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/translation/download-dataset.py b/cli/foundation-models/system/finetune/translation/download-dataset.py
new file mode 100644
index 0000000000..e77bf56891
--- /dev/null
+++ b/cli/foundation-models/system/finetune/translation/download-dataset.py
@@ -0,0 +1,85 @@
+# import library to parse command line arguments
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument("--dataset", type=str, default="wmt16", help="dataset name")
+# add an argument to specify the dataset subset to download
+parser.add_argument(
+    "--dataset_subset", type=str, default="ro-en", help="dataset subset name"
+)
+# argument to save a fraction of the dataset
+parser.add_argument(
+    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
+)
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="wmt16-en-ro-dataset",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+
+def format_translation(example):
+    for key in example["translation"]:
+        example[key] = example["translation"][key]
+    return example
+
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset, args.dataset_subset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, args.dataset_subset, split=split)
+    dataset = dataset.map(format_translation, remove_columns=["translation"])
+    # save the split of the dataset to the download directory as a json lines file
+    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
+        os.path.join(args.download_dir, f"{split}.jsonl")
+    )
+
+# pandas is needed for the dataframe work below
+import pandas as pd
+
+# load the train.jsonl, test.jsonl and validation.jsonl files from the ./wmt16-en-ro-dataset/ folder
+train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
+validation_df = pd.read_json(
+    os.path.join(args.download_dir, "validation.jsonl"), lines=True
+)
+test_df = pd.read_json(os.path.join(args.download_dir, "test.jsonl"), lines=True)
+
+# save 20% of the rows from the dataframes into files with the small_ prefix in the ./wmt16-en-ro-dataset folder
+train_df.sample(frac=0.2).to_json(
+    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
+)
+validation_df.sample(frac=0.2).to_json(
+    os.path.join(args.download_dir, "small_validation.jsonl"),
+    orient="records",
+    lines=True,
+)
+test_df.sample(frac=0.2).to_json(
+    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+
+# read ./wmt16-en-ro-dataset/small_test.jsonl into a pandas dataframe
+import json
+
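+# build a one-record scoring request; with to_dict("split") the file comes out
+# shaped like {"input_data": {"index": [0], "columns": ["en"], "data": [["..."]]}}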
+test_df = pd.read_json(
+ os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+# take 1 random sample
+test_df = test_df.sample(n=1)
+# rebuild index
+test_df.reset_index(drop=True, inplace=True)
+
+# create a json object with the key "input_data" holding the rows from the en column of the test dataframe
+test_df_copy = test_df[["en"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
+# save the json object to a file named sample_score.json in the ./wmt16-en-ro-dataset folder
+with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
+ json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/translation/translation-pipeline.yml b/cli/foundation-models/system/finetune/translation/translation-pipeline.yml
new file mode 100644
index 0000000000..bbccbbcb55
--- /dev/null
+++ b/cli/foundation-models/system/finetune/translation/translation-pipeline.yml
@@ -0,0 +1,76 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: translation-wmt16-en-ro
+
+inputs:
+ compute_model_import: gpu-cluster-big
+ compute_preprocess: gpu-cluster-big
+ compute_finetune: gpu-cluster-big
+ compute_model_evaluation: gpu-cluster-big
+
+ # specify the foundation model available in the azureml system registry
+ mlflow_model_path:
+ path: azureml://registries/azureml/models/t5-small/versions/3
+ # huggingface_id: 't5-small' # if you want to use a huggingface model, uncomment this line and comment the above line
+
+ # map the dataset files to parameters
+ train_file_path:
+ type: uri_file
+ path: "wmt16-en-ro-dataset/small_train.jsonl"
+ validation_file_path:
+ type: uri_file
+ path: "wmt16-en-ro-dataset/small_validation.jsonl"
+ test_file_path:
+ type: uri_file
+ path: "wmt16-en-ro-dataset/small_test.jsonl"
+ evaluation_config_path:
+ type: uri_file
+ path: "../../../../../sdk/python/foundation-models/system/finetune/translation/translation-config.json"
+
+
+ # The following parameters map to the dataset fields
+ # source_lang parameter maps to the "en" field in the wmt16 dataset
+ source_lang: "en"
+ # target_lang parameter maps to the "ro" field in the wmt16 dataset
+ target_lang: "ro"
+
+ # training settings
+ number_of_gpu_to_use_finetuning: 2
+ num_train_epochs: 3
+ learning_rate: 2e-5
+
+outputs:
+ # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
+ # registering the model is required to deploy the model to an online or batch endpoint
+ trained_model:
+ type: mlflow_model
+
+settings:
+ force_rerun: true
+
+jobs:
+ translation_pipeline:
+ type: pipeline
+ component: azureml://registries/azureml/components/translation_pipeline/labels/latest
+ inputs:
+ mlflow_model_path: ${{parent.inputs.mlflow_model_path}}
+
+ compute_model_import: ${{parent.inputs.compute_model_import}}
+ compute_preprocess: ${{parent.inputs.compute_preprocess}}
+ compute_finetune: ${{parent.inputs.compute_finetune}}
+ compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+ train_file_path: ${{parent.inputs.train_file_path}}
+ validation_file_path: ${{parent.inputs.validation_file_path}}
+ test_file_path: ${{parent.inputs.test_file_path}}
+ evaluation_config: ${{parent.inputs.evaluation_config_path}}
+
+ source_lang: ${{parent.inputs.source_lang}}
+ target_lang: ${{parent.inputs.target_lang}}
+
+ number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+ num_train_epochs: ${{parent.inputs.num_train_epochs}}
+ learning_rate: ${{parent.inputs.learning_rate}}
+ outputs:
+ mlflow_model_folder: ${{parent.outputs.trained_model}}
diff --git a/cli/foundation-models/system/finetune/translation/translation.sh b/cli/foundation-models/system/finetune/translation/translation.sh
new file mode 100644
index 0000000000..623b5c024d
--- /dev/null
+++ b/cli/foundation-models/system/finetune/translation/translation.sh
@@ -0,0 +1,178 @@
+#! /bin/bash
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
+# the data files are available in the same folder as the above notebook
+
+# script inputs
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+registry_name="azureml"
+
+compute_cluster="gpu-cluster-big"
+# if the above compute cluster does not exist, create it with the following vm size
+compute_sku="Standard_ND40rs_v2"
+# This is the number of GPUs in a single node of the selected 'vm_size' compute.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
+gpus_per_node=2
+# This is the foundation model for finetuning
+model_name="t5-small"
+# using the latest version of the model is not working yet, so pin a specific version
+model_version=3
+
+version=$(date +%s)
+finetuned_model_name=$model_name"-wmt16-en-ro"
+endpoint_name="translation-en-ro-$version"
+deployment_sku="Standard_DS3_v2"
+
+
+# training data
+train_data="wmt16-en-ro-dataset/small_train.jsonl"
+# validation data
+validation_data="wmt16-en-ro-dataset/small_validation.jsonl"
+# test data
+test_data="wmt16-en-ro-dataset/small_test.jsonl"
+# evaluation config
+evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/translation/translation-config.json"
+# scoring_file
+scoring_file="wmt16-en-ro-dataset/sample_score.json"
+
+# finetuning job parameters
+finetuning_pipeline_component="translation_pipeline"
+# The following parameters map to the dataset fields
+# source_lang parameter maps to the "en" field in the wmt16 dataset
+source_lang="en"
+# target_lang parameter maps to the "ro" field in the wmt16 dataset
+target_lang="ro"
+# Training settings
+number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
+num_train_epochs=3
+learning_rate=2e-5
+
+# 1. Setup pre-requisites
+if [ "$subscription_id" = "" ] || \
+ [ "$resource_group_name" = "" ] || \
+ [ "$workspace_name" = "" ]; then
+ echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
+ exit 1
+fi
+az account set -s $subscription_id
+workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
+
+# check if $compute_cluster exists, else create it
+if az ml compute show --name $compute_cluster $workspace_info
+then
+ echo "Compute cluster $compute_cluster already exists"
+else
+ echo "Creating compute cluster $compute_cluster"
+ az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
+ echo "Failed to create compute cluster $compute_cluster"
+ exit 1
+ }
+fi
+
+# download the dataset
+
+python ./download-dataset.py || {
+ echo "Failed to download dataset"
+ exit 1
+}
+
+# 2. Check if the model exists in the registry
+# need to confirm model show command works for registries outside the tenant (aka system registry)
+if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
+then
+ echo "Model $model_name:$model_version does not exist in registry $registry_name"
+ exit 1
+fi
+
+# 3. Check if training data, validation data and test data exist
+if [ ! -f $train_data ]; then
+ echo "Training data $train_data does not exist"
+ exit 1
+fi
+if [ ! -f $validation_data ]; then
+ echo "Validation data $validation_data does not exist"
+ exit 1
+fi
+if [ ! -f $test_data ]; then
+ echo "Test data $test_data does not exist"
+ exit 1
+fi
+
+# 4. Submit finetuning job using pipeline.yml
+
+# check if the finetuning pipeline component exists
+if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
+then
+ echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
+ exit 1
+fi
+
+# need to switch to using the latest version for the model, currently blocked by a bug.
+# submit finetuning job
+parent_job_name=$( az ml job create --file ./translation-pipeline.yml $workspace_info --query name -o tsv --set \
+ jobs.translation_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+ inputs.compute_model_import=$compute_cluster \
+ inputs.compute_preprocess=$compute_cluster \
+ inputs.compute_finetune=$compute_cluster \
+ inputs.compute_model_evaluation=$compute_cluster \
+ inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
+ inputs.train_file_path.path=$train_data \
+ inputs.validation_file_path.path=$validation_data \
+ inputs.test_file_path.path=$test_data \
+ inputs.evaluation_config.path=$evaluation_config \
+ inputs.source_lang=$source_lang \
+ inputs.target_lang=$target_lang \
+ inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
+ inputs.num_train_epochs=$num_train_epochs \
+ inputs.learning_rate=$learning_rate ) || {
+ echo "Failed to submit finetuning job"
+ exit 1
+ }
+
+az ml job stream --name $parent_job_name $workspace_info || {
+ echo "job stream failed"; exit 1;
+}
+
+# 5. Create model in workspace from train job output
+az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
+ --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
+ echo "model create in workspace failed"; exit 1;
+}
+
+# 6. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+ echo "endpoint create failed"; exit 1;
+}
+
+# deploy model from registry to endpoint in workspace
+# You can find the list of SKUs supported for deployment here - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+ endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+ instance_type=$deployment_sku || {
+ echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $scoring_file ]; then
+ echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+ cat $scoring_file
+ echo -e "\n\n"
+else
+ echo "Scoring file $scoring_file does not exist"
+ exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
+ echo "endpoint invoke failed"; exit 1;
+}
+
+# 8. Delete the endpoint
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+ echo "endpoint delete failed"; exit 1;
+}
diff --git a/sdk/python/foundation-models/system/finetune/question-answering/download-dataset.py b/sdk/python/foundation-models/system/finetune/question-answering/download-dataset.py
new file mode 100644
index 0000000000..e9ffb0b999
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/question-answering/download-dataset.py
@@ -0,0 +1,28 @@
+# import library to parse command line arguments
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+ "--download_dir",
+ type=str,
+ default="data",
+ help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+ os.makedirs(args.download_dir)
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset):
+ # load the split of the dataset
+ dataset = load_dataset(args.dataset, split=split)
+ # save the split of the dataset to the download directory as json lines file
+ dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+ # print dataset features
+ print(dataset.features)
diff --git a/sdk/python/foundation-models/system/finetune/question-answering/extractive-qa.ipynb b/sdk/python/foundation-models/system/finetune/question-answering/extractive-qa.ipynb
new file mode 100644
index 0000000000..eaaf6b97ef
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/question-answering/extractive-qa.ipynb
@@ -0,0 +1,622 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Question Answering - Extractive Q&A with the SQuAD (Wikipedia Q&A) dataset\n",
+ "\n",
+ "This sample shows how to use `question-answering` components from the `azureml` system registry to fine tune a model to extract answers from a given context using the SQuAD dataset. We then deploy it to an online endpoint for real time inference. The model is trained on a tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n",
+ "\n",
+ "### Training data\n",
+ "We will use the [SQuAD](https://huggingface.co/datasets/squad) dataset. The [original source](https://rajpurkar.github.io/SQuAD-explorer/) of the dataset describes it as follows: _\"Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\"_\n",
+ "\n",
+ "### Model\n",
+ "Models that can perform the `fill-mask` task are generally good foundation models to fine tune for `question-answering`, specifically the extractive Q&A type. We will use the `bert-base-uncased` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. Optionally, if you need to fine tune a model that is available on HuggingFace, but not available in the `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter to instruct the components to pull the model directly from HuggingFace. \n",
+ "\n",
+ "### Outline\n",
+ "* Setup pre-requisites such as compute.\n",
+ "* Pick a model to fine tune.\n",
+ "* Pick and explore training data.\n",
+ "* Configure the fine tuning job.\n",
+ "* Run the fine tuning job.\n",
+ "* Register the fine tuned model. \n",
+ "* Deploy the fine tuned model for real time inference.\n",
+ "* Clean up resources. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Setup pre-requisites\n",
+ "* Install dependencies\n",
+ "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace `<SUBSCRIPTION_ID>`, `<RESOURCE_GROUP>` and `<WORKSPACE_NAME>` below.\n",
+ "* Connect to `azureml` system registry\n",
+ "* Set an optional experiment name\n",
+ "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install dependencies by running the cell below. This is a required step when running in a new environment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install azure-ai-ml\n",
+ "%pip install azure-identity\n",
+ "%pip install datasets==2.9.0\n",
+ "%pip install mlflow\n",
+ "%pip install azureml-mlflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml import MLClient\n",
+ "from azure.identity import (\n",
+ " DefaultAzureCredential,\n",
+ " InteractiveBrowserCredential,\n",
+ " ClientSecretCredential,\n",
+ ")\n",
+ "from azure.ai.ml.entities import AmlCompute\n",
+ "import time\n",
+ "\n",
+ "try:\n",
+ " credential = DefaultAzureCredential()\n",
+ " credential.get_token(\"https://management.azure.com/.default\")\n",
+ "except Exception as ex:\n",
+ " credential = InteractiveBrowserCredential()\n",
+ "\n",
+ "try:\n",
+ " workspace_ml_client = MLClient.from_config(credential=credential)\n",
+ "except:\n",
+ " workspace_ml_client = MLClient(\n",
+ " credential,\n",
+ " subscription_id=\"<SUBSCRIPTION_ID>\",\n",
+ " resource_group_name=\"<RESOURCE_GROUP>\",\n",
+ " workspace_name=\"<WORKSPACE_NAME>\",\n",
+ " )\n",
+ "\n",
+ "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n",
+ "registry_ml_client = MLClient(credential, registry_name=\"azureml\")\n",
+ "\n",
+ "experiment_name = \"question-answering-extractive-qna\"\n",
+ "\n",
+ "# If you already have a gpu cluster, mention it here. Otherwise a new one named 'gpu-cluster-big' will be created.\n",
+ "compute_cluster = \"gpu-cluster-big\"\n",
+ "try:\n",
+ " compute = workspace_ml_client.compute.get(compute_cluster)\n",
+ "except Exception as ex:\n",
+ " compute = AmlCompute(\n",
+ " name=compute_cluster,\n",
+ " size=\"Standard_ND40rs_v2\",\n",
+ " max_instances=2, # For multi node training set this to an integer value more than 1\n",
+ " )\n",
+ " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n",
+ "\n",
+ "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n",
+ "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n",
+ "# Setting this to more than the number of GPUs will result in an error.\n",
+ "gpu_count_found = False\n",
+ "workspace_compute_sku_list = workspace_ml_client.compute.list_sizes()\n",
+ "available_sku_sizes = []\n",
+ "for compute_sku in workspace_compute_sku_list:\n",
+ " available_sku_sizes.append(compute_sku.name)\n",
+ " if compute_sku.name.lower() == compute.size.lower():\n",
+ " gpus_per_node = compute_sku.gpus\n",
+ " gpu_count_found = True\n",
+ "# if the gpu count was not found, raise an error\n",
+ "if gpu_count_found:\n",
+ " print(f\"Number of GPUs in compute {compute.size}: {gpus_per_node}\")\n",
+ "else:\n",
+ " raise ValueError(\n",
+ " f\"Number of GPUs in compute {compute.size} not found. Available skus are: {available_sku_sizes}.\"\n",
+ " f\"This should not happen. Please check the selected compute cluster: {compute_cluster} and try again.\"\n",
+ " )\n",
+ "# CPU based finetune works only for single-node single-process\n",
+ "if gpus_per_node == 0:\n",
+ " print(\n",
+ " \"WARNING! Selected compute doesn't have GPU. CPU based finetune is experimental and works on a single process in a single node\"\n",
+ " )\n",
+ " gpus_per_node = 1\n",
+ "\n",
+ "# generating a unique timestamp that can be used for names and versions that need to be unique\n",
+ "timestamp = str(int(time.time()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Pick a foundation model to fine tune\n",
+ "\n",
+ "Models that support `fill-mask` tasks are good candidates to fine tune for extractive Q&A style `question answering`. You can browse these models in the Model Catalog in the AzureML Studio, filtering by the `fill-mask` task. In this example, we use the `bert-base-uncased` model. If you have opened this notebook for a different model, replace the model name and version accordingly. \n",
+ "\n",
+ "Note the model id property of the model. This will be passed as input to the fine tuning job. This is also available as the `Asset ID` field in the model details page in the AzureML Studio Model Catalog. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = \"bert-base-uncased\"\n",
+ "model_version = \"3\"\n",
+ "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
+ "print(\n",
+ " \"\\n\\nUsing model name: {0}, version: {1}, id: {2} for fine tuning\".format(\n",
+ " foundation_model.name, foundation_model.version, foundation_model.id\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Pick the dataset for fine-tuning the model\n",
+ "\n",
+ "We use the [SQuAD](https://huggingface.co/datasets/squad) dataset. The next few cells show basic data preparation for fine tuning:\n",
+ "* Visualize some data rows. Take note of the dataset fields: `question`, `context`, `answers`, `id` and `title`. The `answers` field contains the nested keys `answer_start` and `text` in JSON format. The `question`, `context`, `answers`, `answer_start` and `text` fields are the ones that need to be mapped to the parameters of the fine tuning pipeline.\n",
+ "* The dataset does not have a test split, so we split the validation data into two halves, one for validation and the other for test.\n",
+ "* We want this sample to run quickly, so we save smaller `train` and `validation` files containing 5% of the original rows. This means the fine tuned model will have lower accuracy, hence it should not be put to real-world use. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# download the dataset using the helper script. This needs the datasets library: https://pypi.org/project/datasets/\n",
+ "import os\n",
+ "\n",
+ "exit_status = os.system(\"python ./download-dataset.py --download_dir squad-dataset\")\n",
+ "if exit_status != 0:\n",
+ " raise Exception(\"Error downloading dataset\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load the train.jsonl and validation.jsonl files from the ./squad-dataset/ folder and show first 5 rows\n",
+ "import pandas as pd\n",
+ "\n",
+ "pd.set_option(\n",
+ " \"display.max_colwidth\", 0\n",
+ ") # set the max column width to 0 to display the full text\n",
+ "train_df = pd.read_json(\"squad-dataset/train.jsonl\", lines=True)\n",
+ "validation_df = pd.read_json(\"squad-dataset/validation.jsonl\", lines=True)\n",
+ "train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save 5% of the rows from the train dataframe into files with small_ prefix in the ./squad-dataset folder\n",
+ "train_df.sample(frac=0.05).to_json(\n",
+ " \"./squad-dataset/small_train.jsonl\", orient=\"records\", lines=True\n",
+ ")\n",
+ "# the original dataset does not have a test split, so split the validation dataframe into validation and test dataframes equally\n",
+ "validation_df, test_df = (\n",
+ " validation_df[: len(validation_df) // 2],\n",
+ " validation_df[len(validation_df) // 2 :],\n",
+ ")\n",
+ "# save 5% of the rows from the validation and test dataframes into files with small_ prefix in the ./squad-dataset folder\n",
+ "validation_df.sample(frac=0.05).to_json(\n",
+ " \"./squad-dataset/small_validation.jsonl\", orient=\"records\", lines=True\n",
+ ")\n",
+ "test_df.sample(frac=0.05).to_json(\n",
+ " \"./squad-dataset/small_test.jsonl\", orient=\"records\", lines=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Submit the fine tuning job using the model and data as inputs\n",
+ " \n",
+ "Create the job that uses the `question-answering` pipeline component. [Learn more]() about all the parameters supported for fine tuning.\n",
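+ "\n",
+ "For reference, each line in the prepared SQuAD jsonl files is a record of the following shape (illustrative values). The `question_key`, `context_key`, `answers_key`, `answer_start_key` and `answer_text_key` parameters below map to these fields:\n",
+ "\n",
+ "```json\n",
+ "{\"id\": \"...\", \"title\": \"...\", \"context\": \"The passage to read ...\", \"question\": \"A question about the passage?\", \"answers\": {\"text\": [\"an answer span\"], \"answer_start\": [42]}}\n",
+ "```"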
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.dsl import pipeline\n",
+ "from azure.ai.ml.entities import CommandComponent, PipelineComponent, Job, Component\n",
+ "from azure.ai.ml import PyTorchDistribution, Input\n",
+ "\n",
+ "# fetch the pipeline component\n",
+ "pipeline_component_func = registry_ml_client.components.get(\n",
+ " name=\"question_answering_pipeline\", label=\"latest\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# define the pipeline job\n",
+ "@pipeline()\n",
+ "def create_pipeline():\n",
+ " question_answering_pipeline = pipeline_component_func(\n",
+ " # specify the foundation model available in the azureml system registry, identified in step 2\n",
+ " mlflow_model_path=foundation_model.id,\n",
+ " # huggingface_id = 'bert-base-uncased', # if you want to use a huggingface model, uncomment this line and comment the above line\n",
+ " compute_model_import=compute_cluster,\n",
+ " compute_preprocess=compute_cluster,\n",
+ " compute_finetune=compute_cluster,\n",
+ " compute_model_evaluation=compute_cluster,\n",
+ " # map the dataset splits to parameters\n",
+ " train_file_path=Input(\n",
+ " type=\"uri_file\", path=\"./squad-dataset/small_train.jsonl\"\n",
+ " ),\n",
+ " validation_file_path=Input(\n",
+ " type=\"uri_file\", path=\"./squad-dataset/small_validation.jsonl\"\n",
+ " ),\n",
+ " test_file_path=Input(type=\"uri_file\", path=\"./squad-dataset/small_test.jsonl\"),\n",
+ " evaluation_config=Input(\n",
+ " type=\"uri_file\", path=\"./question-answering-config.json\"\n",
+ " ),\n",
+ " # The following parameters map to the dataset fields\n",
+ " # the question whose answer needs to be extracted from the provided context\n",
+ " # question_key parameter maps to the \"question\" field in the SQuAD dataset\n",
+ " question_key=\"question\",\n",
+ " # the context that contains the answer to the question\n",
+ " # context_key parameter maps to the \"context\" field in the SQuAD dataset\n",
+ " context_key=\"context\",\n",
+ " # The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key, with their corresponding values\n",
+ " # answers_key parameter maps to the \"answers\" field in the SQuAD dataset\n",
+ " answers_key=\"answers\",\n",
+ " # Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.\n",
+ " # in the SQuAD dataset, the answer_start_key maps to \"answer_start\" under \"answers\"\n",
+ " answer_start_key=\"answer_start\",\n",
+ " # Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter\n",
+ " # in the SQuAD dataset, the answer_text_key maps to \"text\" under \"answers\"\n",
+ " answer_text_key=\"text\",\n",
+ " # training settings\n",
+ " number_of_gpu_to_use_finetuning=gpus_per_node, # set to the number of GPUs available in the compute\n",
+ " num_train_epochs=2,\n",
+ " learning_rate=2e-5,\n",
+ " )\n",
+ " return {\n",
+ " # map the output of the fine tuning job to the output of the pipeline job so that we can easily register the fine tuned model\n",
+ " # registering the model is required to deploy the model to an online or batch endpoint\n",
+ " \"trained_model\": question_answering_pipeline.outputs.mlflow_model_folder\n",
+ " }\n",
+ "\n",
+ "\n",
+ "pipeline_object = create_pipeline()\n",
+ "\n",
+ "# don't use cached results from previous jobs\n",
+ "pipeline_object.settings.force_rerun = True"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Submit the job"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# submit the pipeline job\n",
+ "pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
+ " pipeline_object, experiment_name=experiment_name\n",
+ ")\n",
+ "# wait for the pipeline job to complete\n",
+ "workspace_ml_client.jobs.stream(pipeline_job.name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Review training and evaluation metrics\n",
+ "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metrics across different jobs. See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more. \n",
+ "\n",
+ "However, we may need to access and review metrics programmatically; for this we will use MLflow, the recommended client for logging and querying metrics."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlflow, json\n",
+ "\n",
+ "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n",
+ " workspace_ml_client.workspace_name\n",
+ ").mlflow_tracking_uri\n",
+ "mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
+ "# concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n",
+ "filter = \"tags.mlflow.rootRunId='\" + pipeline_job.name + \"'\"\n",
+ "runs = mlflow.search_runs(\n",
+ " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n",
+ ")\n",
+ "training_run = None\n",
+ "evaluation_run = None\n",
+ "# get the training and evaluation runs.\n",
+ "# using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n",
+ "for run in runs:\n",
+ " # check if run.data.metrics.epoch exists\n",
+ " if \"epoch\" in run.data.metrics:\n",
+ " training_run = run\n",
+ " # else, check if run.data.metrics.exact_match exists\n",
+ " elif \"exact_match\" in run.data.metrics:\n",
+ " evaluation_run = run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if training_run:\n",
+ " print(\"Training metrics:\\n\\n\")\n",
+ " print(json.dumps(training_run.data.metrics, indent=2))\n",
+ "else:\n",
+ " print(\"No Training job found\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if evaluation_run:\n",
+ " print(\"Evaluation metrics:\\n\\n\")\n",
+ " print(json.dumps(evaluation_run.data.metrics, indent=2))\n",
+ "else:\n",
+ " print(\"No Evaluation job found\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 6. Register the fine tuned model with the workspace\n",
+ "\n",
+ "We will register the model from the output of the fine tuning job. This will track lineage between the fine tuned model and the fine tuning job. The fine tuning job, in turn, tracks lineage to the foundation model, data and training code.\n",
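+ "\n",
+ "The `trained_model` output of the pipeline job is addressable with a job-output URI of the form `azureml://jobs/<job_name>/outputs/trained_model`, which is what the cell below constructs and passes as the model `path`."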
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.entities import Model\n",
+ "from azure.ai.ml.constants import AssetTypes\n",
+ "\n",
+ "# check if the `trained_model` output is available\n",
+ "print(\"pipeline job outputs: \", workspace_ml_client.jobs.get(pipeline_job.name).outputs)\n",
+ "\n",
+ "# fetch the model from the pipeline job output\n",
+ "model_path_from_job = \"azureml://jobs/{0}/outputs/{1}\".format(\n",
+ " pipeline_job.name, \"trained_model\"\n",
+ ")\n",
+ "\n",
+ "finetuned_model_name = model_name + \"-extractive-qna\"\n",
+ "finetuned_model_name = finetuned_model_name.replace(\"/\", \"-\")\n",
+ "print(\"path to register model: \", model_path_from_job)\n",
+ "prepare_to_register_model = Model(\n",
+ " path=model_path_from_job,\n",
+ " type=AssetTypes.MLFLOW_MODEL,\n",
+ " name=finetuned_model_name,\n",
+ " version=timestamp, # use timestamp as version to avoid version conflict\n",
+ " description=model_name + \" fine tuned model for extractive Q&A\",\n",
+ ")\n",
+ "print(\"prepare to register model: \\n\", prepare_to_register_model)\n",
+ "# register the model from pipeline job output\n",
+ "registered_model = workspace_ml_client.models.create_or_update(\n",
+ " prepare_to_register_model\n",
+ ")\n",
+ "print(\"registered model: \\n\", registered_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7. Deploy the fine tuned model to an online endpoint\n",
+ "Online endpoints provide a durable REST API that can be used to integrate with applications that need to use the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time, sys\n",
+ "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n",
+ "\n",
+ "# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n",
+ "\n",
+ "online_endpoint_name = \"ext-qna-\" + timestamp\n",
+ "# create an online endpoint\n",
+ "endpoint = ManagedOnlineEndpoint(\n",
+ " name=online_endpoint_name,\n",
+ " description=\"Online endpoint for \"\n",
+ " + registered_model.name\n",
+ " + \", fine tuned model for extractive Q&A\",\n",
+ " auth_mode=\"key\",\n",
+ ")\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).wait()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can find the list of SKUs supported for deployment here - [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a deployment\n",
+ "demo_deployment = ManagedOnlineDeployment(\n",
+ " name=\"demo\",\n",
+ " endpoint_name=online_endpoint_name,\n",
+ " model=registered_model.id,\n",
+ " instance_type=\"Standard_DS3_v2\",\n",
+ " instance_count=1,\n",
+ ")\n",
+ "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
+ "endpoint.traffic = {\"demo\": 100}\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).result()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8. Test the endpoint with sample data\n",
+ "\n",
+ "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. We will then display the scored answers alongside the ground truth answers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read ./squad-dataset/small_test.jsonl into a pandas dataframe\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "\n",
+ "test_df = pd.read_json(\"./squad-dataset/small_test.jsonl\", orient=\"records\", lines=True)\n",
+ "# take 10 random samples\n",
+ "test_df = test_df.sample(n=10)\n",
+ "# rebuild index\n",
+ "test_df.reset_index(drop=True, inplace=True)\n",
+ "# flatten the json object in the \"answers\" column with the keys \"answer_start\" and \"text\"\n",
+ "json_struct = json.loads(test_df.to_json(orient=\"records\"))\n",
+ "test_df = pd.json_normalize(json_struct)\n",
+ "# drop id and title columns\n",
+ "test_df = test_df.drop(columns=[\"id\", \"title\"])\n",
+ "test_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a json object with \"input_data\" as key, holding the \"question\" and \"context\" columns\n",
+ "test_df_copy = test_df[[\"question\", \"context\"]]\n",
+ "test_json = {\"input_data\": test_df_copy.to_dict(\"split\")}\n",
+ "print(test_json)\n",
+ "# write the json object to a file named sample_score.json in the ./squad-dataset folder\n",
+ "with open(\"./squad-dataset/sample_score.json\", \"w\") as f:\n",
+ " json.dump(test_json, f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method\n",
+ "response = workspace_ml_client.online_endpoints.invoke(\n",
+ " endpoint_name=online_endpoint_name,\n",
+ " deployment_name=\"demo\",\n",
+ " request_file=\"./squad-dataset/sample_score.json\",\n",
+ ")\n",
+ "print(\"raw response: \\n\", response, \"\\n\")\n",
+ "# convert the response to a pandas dataframe and rename the response column to scored_answer\n",
+ "response_df = pd.read_json(response)\n",
+ "response_df = response_df.rename(columns={0: \"scored_answer\"})\n",
+ "response_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# merge the test dataframe and the response dataframe on the index\n",
+ "merged_df = pd.merge(test_df, response_df, left_index=True, right_index=True)\n",
+ "# drop the answers.answer_start column\n",
+ "merged_df = merged_df.drop(columns=[\"answers.answer_start\"])\n",
+ "# rename the answers.text column to ground_truth_answers\n",
+ "merged_df = merged_df.rename(columns={\"answers.text\": \"ground_truth_answers\"})\n",
+ "merged_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 9. Delete the online endpoint\n",
+ "Don't forget to delete the online endpoint, else you will leave the billing meter running for the compute used by the endpoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json b/sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json
new file mode 100644
index 0000000000..15165acfe5
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json
@@ -0,0 +1,7 @@
+{
+ "metrics": ["exact_match", "f1_score"],
+ "regexes_to_ignore": ["$[A-Z]+"],
+ "ignore_case": false,
+ "ignore_numbers": false,
+ "ignore_punctuations": true
+}
\ No newline at end of file
diff --git a/sdk/python/foundation-models/system/finetune/summarization/download-dataset.py b/sdk/python/foundation-models/system/finetune/summarization/download-dataset.py
index cfea8436b4..f7796cbc1e 100644
--- a/sdk/python/foundation-models/system/finetune/summarization/download-dataset.py
+++ b/sdk/python/foundation-models/system/finetune/summarization/download-dataset.py
@@ -16,7 +16,7 @@
 parser.add_argument(
 "--download_dir",
 type=str,
- default="./news-summary-dataset",
+ default="data",
 help="directory to download the dataset to",
 )
 args = parser.parse_args()
@@ -32,6 +32,7 @@
 print(f"Loading {split} split of {args.dataset} dataset...")
 # load the split of the dataset
 dataset = load_dataset(args.dataset, args.config_name, split=split)
+ print(dataset.features)
 # save the split of the dataset to the download directory as json lines file
 dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
 os.path.join(args.download_dir, f"{split}.jsonl")
diff --git a/sdk/python/foundation-models/system/finetune/summarization/news-summary.ipynb b/sdk/python/foundation-models/system/finetune/summarization/news-summary.ipynb
new file mode 100644
index 0000000000..ea3282e465
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/summarization/news-summary.ipynb
@@ -0,0 +1,617 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Summarization - Generate news headline style summaries\n",
+ "\n",
+ "This sample shows how to use `summarization` components from the `azureml` system registry to fine tune a model to generate a summary of a news article. We then deploy it to an online endpoint for real time inference. The model is trained on a tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n",
+ "\n",
+ "### Training data\n",
+ "We will use the [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. \n",
+ "\n",
+ "### Model\n",
+ "Models that can perform the `translation` task are generally good foundation models to fine tune for `summarization`. We will use the `t5-small` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. Optionally, if you need to fine tune a model that is available on HuggingFace, but not available in the `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter to instruct the components to pull the model directly from HuggingFace. \n",
+ "\n",
+ "### Outline\n",
+ "* Setup pre-requisites such as compute.\n",
+ "* Pick a model to fine tune.\n",
+ "* Pick and explore training data.\n",
+ "* Configure the fine tuning job.\n",
+ "* Run the fine tuning job.\n",
+ "* Register the fine tuned model. \n",
+ "* Deploy the fine tuned model for real time inference.\n",
+ "* Clean up resources."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Setup pre-requisites\n",
+ "* Install dependencies\n",
+ "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace `<SUBSCRIPTION_ID>`, `<RESOURCE_GROUP>` and `<WORKSPACE_NAME>` below.\n",
+ "* Connect to `azureml` system registry\n",
+ "* Set an optional experiment name\n",
+ "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install dependencies by running the cell below. This is a required step when running in a new environment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install azure-ai-ml\n",
+ "%pip install azure-identity\n",
+ "%pip install datasets==2.9.0\n",
+ "%pip install mlflow\n",
+ "%pip install azureml-mlflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml import MLClient\n",
+ "from azure.identity import (\n",
+ " DefaultAzureCredential,\n",
+ " InteractiveBrowserCredential,\n",
+ " ClientSecretCredential,\n",
+ ")\n",
+ "from azure.ai.ml.entities import AmlCompute\n",
+ "import time\n",
+ "\n",
+ "try:\n",
+ " credential = DefaultAzureCredential()\n",
+ " credential.get_token(\"https://management.azure.com/.default\")\n",
+ "except Exception as ex:\n",
+ " credential = InteractiveBrowserCredential()\n",
+ "\n",
+ "try:\n",
+ " workspace_ml_client = MLClient.from_config(credential=credential)\n",
+ "except:\n",
+ " workspace_ml_client = MLClient(\n",
+ " credential,\n",
+ " subscription_id=\"<SUBSCRIPTION_ID>\",\n",
+ " resource_group_name=\"<RESOURCE_GROUP>\",\n",
+ " workspace_name=\"<WORKSPACE_NAME>\",\n",
+ " )\n",
+ "\n",
+ "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n",
+ "registry_ml_client = MLClient(credential, registry_name=\"azureml\")\n",
+ "\n",
+ "experiment_name = \"summarization-news-summary\"\n",
+ "\n",
+ "# If you already have a gpu cluster, mention it here. Otherwise a new one named 'gpu-cluster-big' will be created.\n",
+ "compute_cluster = \"gpu-cluster-big\"\n",
+ "try:\n",
+ " compute = workspace_ml_client.compute.get(compute_cluster)\n",
+ "except Exception as ex:\n",
+ " compute = AmlCompute(\n",
+ " name=compute_cluster,\n",
+ " size=\"Standard_ND40rs_v2\",\n",
+ " max_instances=2, # For multi node training set this to an integer value more than 1\n",
+ " )\n",
+ " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n",
+ "\n",
+ "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n",
+ "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n",
+ "# Setting this to more than the number of GPUs will result in an error.\n",
+ "gpu_count_found = False\n",
+ "workspace_compute_sku_list = workspace_ml_client.compute.list_sizes()\n",
+ "available_sku_sizes = []\n",
+ "for compute_sku in workspace_compute_sku_list:\n",
+ " available_sku_sizes.append(compute_sku.name)\n",
+ " if compute_sku.name.lower() == compute.size.lower():\n",
+ " gpus_per_node = compute_sku.gpus\n",
+ " gpu_count_found = True\n",
+ "# if the gpu count was not found, raise an error\n",
+ "if gpu_count_found:\n",
+ " print(f\"Number of GPUs in compute {compute.size}: {gpus_per_node}\")\n",
+ "else:\n",
+ " raise ValueError(\n",
+ " f\"Number of GPUs in compute {compute.size} not found. Available skus are: {available_sku_sizes}.\"\n",
+ " f\"This should not happen. Please check the selected compute cluster: {compute_cluster} and try again.\"\n",
+ " )\n",
+ "# CPU based finetune works only for single-node single-process\n",
+ "if gpus_per_node == 0:\n",
+ " print(\n",
+ " \"WARNING! Selected compute doesn't have GPU. CPU based finetune is experimental and works on a single process in a single node\"\n",
+ " )\n",
+ " gpus_per_node = 1\n",
+ "\n",
+ "# generating a unique timestamp that can be used for names and versions that need to be unique\n",
+ "timestamp = str(int(time.time()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Pick a foundation model to fine tune\n",
+ "\n",
+ "Models that support `translation` tasks are good candidates to fine tune for `summarization`. You can browse these models in the Model Catalog in the AzureML Studio, filtering by the `translation` task. In this example, we use the `t5-small` model. If you have opened this notebook for a different model, replace the model name and version accordingly. \n",
+ "\n",
+ "Note the model id property of the model. This will be passed as input to the fine tuning job. This is also available as the `Asset ID` field in the model details page in the AzureML Studio Model Catalog. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = \"t5-small\"\n",
+ "model_version = \"3\"\n",
+ "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
+ "print(\n",
+ " \"\\n\\nUsing model name: {0}, version: {1}, id: {2} for fine tuning\".format(\n",
+ " foundation_model.name, foundation_model.version, foundation_model.id\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Pick the dataset for fine-tuning the model\n",
+ "\n",
+ "> The [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset is larger than 1GB when uncompressed. The [download-dataset.py](./download-dataset.py) script supports downloading a smaller fraction of the dataset. The files in the [news-summary-dataset](./news-summary-dataset/) folder contain about 3% of the original dataset rows. \n",
+ "\n",
+ "* Download the dataset.\n",
+ "* Visualize some data rows. \n",
+ "* We want this sample to run quickly, so we save smaller `train`, `validation` and `test` files containing 20% of the already trimmed rows. This means the fine tuned model will have lower accuracy, hence it should not be put to real-world use. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# download the dataset using the helper script. This needs the datasets library: https://pypi.org/project/datasets/\n",
+ "import os\n",
+ "\n",
+ "exit_status = os.system(\n",
+ " \"python ./download-dataset.py --download_dir news-summary-dataset\"\n",
+ ")\n",
+ "if exit_status != 0:\n",
+ " raise Exception(\"Error downloading dataset\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "pd.set_option(\n",
+ " \"display.max_colwidth\", 0\n",
+ ") # set the max column width to 0 to display the full text\n",
+ "# load the train.jsonl, test.jsonl and validation.jsonl files from the ./news-summary-dataset/ folder and show first 5 rows\n",
+ "train_df = pd.read_json(\"./news-summary-dataset/train.jsonl\", lines=True)\n",
+ "validation_df = pd.read_json(\"./news-summary-dataset/validation.jsonl\", lines=True)\n",
+ "# this dataset doesn't have test data, so split the validation_df into test_df and validation_df\n",
+ "test_df = validation_df.sample(frac=0.5, random_state=42)\n",
+ "validation_df.drop(test_df.index, inplace=True)\n",
+ "# drop the id column as it is not needed for fine tuning\n",
+ "train_df.drop(columns=[\"id\"], inplace=True)\n",
+ "validation_df.drop(columns=[\"id\"], inplace=True)\n",
+ "test_df.drop(columns=[\"id\"], inplace=True)\n",
+ "train_df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save 20% of the rows from the dataframes into files with small_ prefix in the ./news-summary-dataset folder\n",
+ "train_df.sample(frac=0.2).to_json(\n",
+ " \"./news-summary-dataset/small_train.jsonl\", orient=\"records\", lines=True\n",
+ ")\n",
+ "validation_df.sample(frac=0.2).to_json(\n",
+ " \"./news-summary-dataset/small_validation.jsonl\", orient=\"records\", lines=True\n",
+ ")\n",
+ "test_df.sample(frac=0.2).to_json(\n",
+ " \"./news-summary-dataset/small_test.jsonl\", orient=\"records\", lines=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Submit the fine tuning job using the model and data as inputs\n",
+ " \n",
+ "Create the job that uses the `summarization` pipeline component. [Learn more]() about all the parameters supported for fine tuning.\n",
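+ "\n",
+ "For reference, after the `id` column is dropped, each line in the prepared jsonl files is a record of the following shape (illustrative values). The `document_key` and `summary_key` parameters below map to these fields:\n",
+ "\n",
+ "```json\n",
+ "{\"article\": \"(CNN) -- The full text of a news article ...\", \"highlights\": \"A short headline style summary ...\"}\n",
+ "```"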
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.dsl import pipeline\n",
+ "from azure.ai.ml.entities import CommandComponent, PipelineComponent, Job, Component\n",
+ "from azure.ai.ml import PyTorchDistribution, Input\n",
+ "\n",
+ "# fetch the pipeline component\n",
+ "pipeline_component_func = registry_ml_client.components.get(\n",
+ " name=\"summarization_pipeline\", label=\"latest\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# define the pipeline job\n",
+ "@pipeline()\n",
+ "def create_pipeline():\n",
+ " summarization_pipeline = pipeline_component_func(\n",
+ " # specify the foundation model available in the azureml system registry, identified in step 2\n",
+ " mlflow_model_path=foundation_model.id,\n",
+ " # huggingface_id = 't5-small', # if you want to use a huggingface model, uncomment this line and comment the above line\n",
+ " compute_model_import=compute_cluster,\n",
+ " compute_preprocess=compute_cluster,\n",
+ " compute_finetune=compute_cluster,\n",
+ " compute_model_evaluation=compute_cluster,\n",
+ " # map the dataset splits to parameters\n",
+ " train_file_path=Input(\n",
+ " type=\"uri_file\", path=\"./news-summary-dataset/small_train.jsonl\"\n",
+ " ),\n",
+ " validation_file_path=Input(\n",
+ " type=\"uri_file\", path=\"./news-summary-dataset/small_validation.jsonl\"\n",
+ " ),\n",
+ " test_file_path=Input(\n",
+ " type=\"uri_file\", path=\"./news-summary-dataset/small_test.jsonl\"\n",
+ " ),\n",
+ " evaluation_config=Input(type=\"uri_file\", path=\"./summarization-config.json\"),\n",
+ " # The following parameters map to the dataset fields\n",
+ " # document_key parameter maps to the \"article\" field in the news summary dataset\n",
+ " document_key=\"article\",\n",
+ " # summary_key parameter maps to the \"highlights\" field in the news summary dataset\n",
+ " summary_key=\"highlights\",\n",
+ " # training settings\n",
+ " number_of_gpu_to_use_finetuning=gpus_per_node, # set to the number of GPUs available in the compute\n",
+ " num_train_epochs=2,\n",
+ " learning_rate=2e-5,\n",
+ " )\n",
+ " return {\n",
+ " # map the output of the fine tuning job to the output of the pipeline job so that we can easily register the fine tuned model\n",
+ " # registering the model is required to deploy the model to an online or batch endpoint\n",
+ " \"trained_model\": summarization_pipeline.outputs.mlflow_model_folder\n",
+ " }\n",
+ "\n",
+ "\n",
+ "pipeline_object = create_pipeline()\n",
+ "\n",
+ "# don't use cached results from previous jobs\n",
+ "pipeline_object.settings.force_rerun = True"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Submit the job"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# submit the pipeline job\n",
+ "pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
+ " pipeline_object, experiment_name=experiment_name\n",
+ ")\n",
+ "# wait for the pipeline job to complete\n",
+ "workspace_ml_client.jobs.stream(pipeline_job.name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Review training and evaluation metrics\n",
+ "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metrics across different jobs. See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more. \n",
+ "\n",
+ "However, we may need to access and review metrics programmatically; for this we will use MLflow, the recommended client for logging and querying metrics."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlflow, json\n",
+ "\n",
+ "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n",
+ " workspace_ml_client.workspace_name\n",
+ ").mlflow_tracking_uri\n",
+ "mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
+ "# concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n",
+ "filter = \"tags.mlflow.rootRunId='\" + pipeline_job.name + \"'\"\n",
+ "runs = mlflow.search_runs(\n",
+ " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n",
+ ")\n",
+ "training_run = None\n",
+ "evaluation_run = None\n",
+ "# get the training and evaluation runs.\n",
+ "# using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n",
+ "for run in runs:\n",
+ " # check if run.data.metrics.epoch exists\n",
+ " if \"epoch\" in run.data.metrics:\n",
+ " training_run = run\n",
+ " # else, check if run.data.metrics.rouge1 exists\n",
+ " elif \"rouge1\" in run.data.metrics:\n",
+ " evaluation_run = run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if training_run:\n",
+ " print(\"Training metrics:\\n\\n\")\n",
+ " print(json.dumps(training_run.data.metrics, indent=2))\n",
+ "else:\n",
+ " print(\"No Training job found\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if evaluation_run:\n",
+ " print(\"Evaluation metrics:\\n\\n\")\n",
+ " print(json.dumps(evaluation_run.data.metrics, indent=2))\n",
+ "else:\n",
+ " print(\"No Evaluation job found\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 6. Register the fine tuned model with the workspace\n",
+ "\n",
+ "We will register the model from the output of the fine tuning job. This will track lineage between the fine tuned model and the fine tuning job. The fine tuning job, in turn, tracks lineage to the foundation model, data and training code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.entities import Model\n",
+ "from azure.ai.ml.constants import AssetTypes\n",
+ "\n",
+ "# check if the `trained_model` output is available\n",
+ "print(\"pipeline job outputs: \", workspace_ml_client.jobs.get(pipeline_job.name).outputs)\n",
+ "\n",
+ "# fetch the model from the pipeline job output\n",
+ "model_path_from_job = \"azureml://jobs/{0}/outputs/{1}\".format(\n",
+ " pipeline_job.name, \"trained_model\"\n",
+ ")\n",
+ "\n",
+ "finetuned_model_name = model_name + \"-news-summary\"\n",
+ "finetuned_model_name = finetuned_model_name.replace(\"/\", \"-\")\n",
+ "print(\"path to register model: \", model_path_from_job)\n",
+ "prepare_to_register_model = Model(\n",
+ " path=model_path_from_job,\n",
+ " type=AssetTypes.MLFLOW_MODEL,\n",
+ " name=finetuned_model_name,\n",
+ " version=timestamp, # use timestamp as version to avoid version conflict\n",
+ " description=model_name + \" fine tuned model for summarizing news articles\",\n",
+ ")\n",
+ "print(\"prepare to register model: \\n\", prepare_to_register_model)\n",
+ "# register the model from pipeline job output\n",
+ "registered_model = workspace_ml_client.models.create_or_update(\n",
+ " prepare_to_register_model\n",
+ ")\n",
+ "print(\"registered model: \\n\", registered_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7. Deploy the fine tuned model to an online endpoint\n",
+ "Online endpoints provide a durable REST API that can be used to integrate with applications that need to use the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time, sys\n",
+ "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n",
+ "\n",
+ "# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n",
+ "\n",
+ "online_endpoint_name = \"news-summary-\" + timestamp\n",
+ "# create an online endpoint\n",
+ "endpoint = ManagedOnlineEndpoint(\n",
+ " name=online_endpoint_name,\n",
+ " description=\"Online endpoint for \"\n",
+ " + registered_model.name\n",
+ " + \", fine tuned model for news summarization\",\n",
+ " auth_mode=\"key\",\n",
+ ")\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).wait()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can find the list of SKUs supported for deployment here - [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a deployment\n",
+ "demo_deployment = ManagedOnlineDeployment(\n",
+ " name=\"demo\",\n",
+ " endpoint_name=online_endpoint_name,\n",
+ " model=registered_model.id,\n",
+ " instance_type=\"Standard_DS3_v2\",\n",
+ " instance_count=1,\n",
+ ")\n",
+ "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
+ "endpoint.traffic = {\"demo\": 100}\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).result()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8. Test the endpoint with sample data\n",
Test the endpoint with sample data\n", + "\n", + "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. We will then display the scored summary alongside the ground truth summary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read ./news-summary-dataset/small_test.jsonl into a pandas dataframe\n", + "import pandas as pd\n", + "import json\n", + "\n", + "test_df = pd.read_json(\n", + " \"./news-summary-dataset/small_test.jsonl\", orient=\"records\", lines=True\n", + ")\n", + "# take 1 random sample\n", + "test_df = test_df.sample(n=1)\n", + "# rebuild index\n", + "test_df.reset_index(drop=True, inplace=True)\n", + "# rename the highlights column to ground_truth_summary\n", + "test_df.rename(columns={\"highlights\": \"ground_truth_summary\"}, inplace=True)\n", + "test_df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a json object with the key \"input_data\" and value as a list of values from the article column of the test dataframe\n", + "test_df_copy = test_df[[\"article\"]]\n", + "test_json = {\"input_data\": test_df_copy.to_dict(\"split\")}\n", + "# save the json object to a file named sample_score.json in the ./news-summary-dataset folder\n", + "with open(\"./news-summary-dataset/sample_score.json\", \"w\") as f:\n", + " json.dump(test_json, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> If the input data is long or the number of records is too many, you may run into the following error: \"Failed to test real-time endpoint: upstream request timeout\". Check this guide to understand why this error code might have been returned: [https://docs.microsoft.com/en-us/azure/machine-learning/how-to-troubleshoot-online-endpoints#http-status-codes]. Try to submit smaller and fewer inputs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method\n", + "response = workspace_ml_client.online_endpoints.invoke(\n", + " endpoint_name=online_endpoint_name,\n", + " deployment_name=\"demo\",\n", + " request_file=\"./news-summary-dataset/sample_score.json\",\n", + ")\n", + "print(\"raw response: \\n\", response, \"\\n\")\n", + "# convert the response to a pandas dataframe\n", + "response_df = pd.read_json(response)\n", + "# rename the first column to scored_summary\n", + "response_df.rename(columns={0: \"scored_summary\"}, inplace=True)\n", + "response_df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the test dataframe and the response dataframe on the index\n", + "merged_df = pd.merge(test_df, response_df, left_index=True, right_index=True)\n", + "merged_df.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 9. 
Delete the online endpoint\n", + "Don't forget to delete the online endpoint, otherwise you will leave the billing meter running for the compute used by the endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/foundation-models/system/finetune/summarization/summarization-config.json b/sdk/python/foundation-models/system/finetune/summarization/summarization-config.json new file mode 100644 index 0000000000..899d0c33b8 --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/summarization/summarization-config.json @@ -0,0 +1,5 @@ +{ + "metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"], + "aggregator": true, + "stemmer": true +} diff --git a/sdk/python/foundation-models/system/finetune/text-classification/download-dataset.py b/sdk/python/foundation-models/system/finetune/text-classification/download-dataset.py new file mode 100644 index 0000000000..08cafa7557 --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/text-classification/download-dataset.py @@ -0,0 +1,58 @@ +# import library to parse command line arguments +import argparse, os + +parser = argparse.ArgumentParser() +# add an argument to specify a dataset name to download +parser.add_argument( + "--dataset", type=str, default="dair-ai/emotion", help="dataset name" +) +# add an argument to specify a dataset subset name to download +parser.add_argument( + "--dataset_subset", type=str, default="split", help="dataset subset name" +) +# add an argument to specify the directory to download the dataset to +parser.add_argument( + "--download_dir", + type=str, + default="data", + help="directory to download the dataset to", +) +args = parser.parse_args() + +# create the download directory if it does not exist +if not os.path.exists(args.download_dir): + os.makedirs(args.download_dir) + + +# import hugging face datasets library +from datasets import load_dataset, get_dataset_split_names +from functools import partial + +for split in get_dataset_split_names(args.dataset): + # load the split of the dataset + dataset = load_dataset(args.dataset, split=split) + # save the split of the dataset to the download directory as json lines file + dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl")) + # print dataset features + +# get label2id and id2label mapping + +# get any split of data +split = get_dataset_split_names(args.dataset)[0] +dataset = load_dataset(args.dataset, split=split) + +labels = dataset.features["label"].names + +id2label = {} +label2id = {} + +for i, label in enumerate(labels): + id2label[i] = label + label2id[label] = i + +label_mapping = {"id2label": id2label, "label2id": label2id} + +import json + +with open(os.path.join(args.download_dir, "label.json"), "w") as f: + json.dump(label_mapping, f) diff --git a/sdk/python/foundation-models/system/finetune/text-classification/emotion-detection.ipynb b/sdk/python/foundation-models/system/finetune/text-classification/emotion-detection.ipynb new file 
mode 100644 index 0000000000..b667b693fd --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/text-classification/emotion-detection.ipynb @@ -0,0 +1,631 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Classification - Emotion Detection \n", + "\n", + "This sample shows how to use `text-classification` components from the `azureml` system registry to fine tune a model to detect emotions using the emotion dataset. We then deploy the fine tuned model to an online endpoint for real time inference. The model is trained on a tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n", + "\n", + "### Training data\n", + "We will use the [emotion](https://huggingface.co/datasets/dair-ai/emotion) dataset.\n", + "\n", + "### Model\n", + "Models that can perform the `fill-mask` task are generally good foundation models to fine tune for `text-classification`. We will use the `bert-base-uncased` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. Optionally, if you need to fine tune a model that is available on HuggingFace, but not available in the `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter to instruct the components to pull the model directly from HuggingFace. \n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick a model to fine tune.\n", + "* Pick and explore training data.\n", + "* Configure the fine tuning job.\n", + "* Run the fine tuning job.\n", + "* Review training and evaluation metrics. \n", + "* Register the fine tuned model. \n", + "* Deploy the fine tuned model for real time inference.\n", + "* Clean up resources. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace `<SUBSCRIPTION_ID>`, `<RESOURCE_GROUP>` and `<WORKSPACE_NAME>` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running the cell below. This step is required when running in a new environment."
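+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell is an optional sketch added for illustration (not part of the original notebook): it checks which of the required packages are already importable, so you can tell whether the install cell that follows is needed. The `azureml-mlflow` package is an MLflow plugin with a different import name, so it is not checked here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optional sketch: check which required packages are already importable\n", + "import importlib\n", + "\n", + "for pkg in [\"azure.ai.ml\", \"azure.identity\", \"datasets\", \"mlflow\"]:\n", + " try:\n", + " importlib.import_module(pkg)\n", + " print(f\"{pkg}: available\")\n", + " except ImportError:\n", + " print(f\"{pkg}: missing - install it with the next cell\")"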
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install azure-ai-ml\n", + "%pip install azure-identity\n", + "%pip install datasets==2.9.0\n", + "%pip install mlflow\n", + "%pip install azureml-mlflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import (\n", + " DefaultAzureCredential,\n", + " InteractiveBrowserCredential,\n", + " ClientSecretCredential,\n", + ")\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential=credential)\n", + "except:\n", + " workspace_ml_client = MLClient(\n", + " credential,\n", + " subscription_id=\"<SUBSCRIPTION_ID>\",\n", + " resource_group_name=\"<RESOURCE_GROUP>\",\n", + " workspace_name=\"<WORKSPACE_NAME>\",\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n", + "registry_ml_client = MLClient(credential, registry_name=\"azureml\")\n", + "\n", + "experiment_name = \"text-classification-emotion-detection\"\n", + "\n", + "# If you already have a gpu cluster, mention it here. Otherwise, a new one named 'gpu-cluster-big' will be created\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + "except Exception as ex:\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpu_count_found = False\n", + "workspace_compute_sku_list = workspace_ml_client.compute.list_sizes()\n", + "available_sku_sizes = []\n", + "for compute_sku in workspace_compute_sku_list:\n", + " available_sku_sizes.append(compute_sku.name)\n", + " if compute_sku.name.lower() == compute.size.lower():\n", + " gpus_per_node = compute_sku.gpus\n", + " gpu_count_found = True\n", + "# if the gpu count was not found, raise an error\n", + "if gpu_count_found:\n", + " print(f\"Number of GPUs in compute {compute.size}: {gpus_per_node}\")\n", + "else:\n", + " raise ValueError(\n", + " f\"Number of GPUs in compute {compute.size} not found. Available SKUs are: {available_sku_sizes}.\"\n", + " f\"This should not happen. Please check the selected compute cluster: {compute_cluster} and try again.\"\n", + " )\n", + "# CPU based finetune works only for single-node single-process\n", + "if gpus_per_node == 0:\n", + " print(\n", + " \"WARNING! Selected compute doesn't have GPU. 
CPU based finetune is experimental and works on a single process in a single node\"\n", + " )\n", + " gpus_per_node = 1\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick a foundation model to fine tune\n", + "\n", + "Models that support `fill-mask` tasks are good candidates to fine tune for `text-classification`. You can browse these models in the Model Catalog in the AzureML Studio, filtering by the `fill-mask` task. In this example, we use the `bert-base-uncased` model. If you have opened this notebook for a different model, replace the model name and version accordingly. \n", + "\n", + "Note the model's `id` property. It will be passed as input to the fine tuning job, and is also shown as the `Asset ID` field on the model details page in the AzureML Studio Model Catalog. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"bert-base-uncased\"\n", + "model_version = \"3\"\n", + "foundation_model = registry_ml_client.models.get(model_name, model_version)\n", + "print(\n", + " \"\\n\\nUsing model name: {0}, version: {1}, id: {2} for fine tuning\".format(\n", + " foundation_model.name, foundation_model.version, foundation_model.id\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the dataset for fine-tuning the model\n", + "\n", + "We use the [emotion](https://huggingface.co/datasets/dair-ai/emotion) dataset. The next few cells show basic data preparation for fine tuning:\n", + "* Visualize some data rows\n", + "* Replace the numerical categories in the data with the actual string labels. This mapping is available in [./emotion-dataset/label.json](./emotion-dataset/label.json). This step is needed if you want string labels such as `anger`, `joy`, etc. returned when scoring the model. If you skip this step, the model will return numerical categories such as 0, 1, 2, etc. and you will have to map them to what the category represents yourself. \n", + "* We want this sample to run quickly, so we save smaller `train`, `validation` and `test` files containing 10% of the original data. The fine tuned model will therefore have lower accuracy and should not be used for real-world tasks. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# download the dataset using the helper script. 
This needs datasets library: https://pypi.org/project/datasets/\n", + "import os\n", + "\n", + "exit_status = os.system(\"python ./download-dataset.py --download_dir emotion-dataset\")\n", + "if exit_status != 0:\n", + " raise Exception(\"Error downloading dataset\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load the ./emotion-dataset/train.jsonl file into a pandas dataframe and show the first 5 rows\n", + "import pandas as pd\n", + "\n", + "pd.set_option(\n", + " \"display.max_colwidth\", 0\n", + ") # set the max column width to 0 to display the full text\n", + "df = pd.read_json(\"./emotion-dataset/train.jsonl\", lines=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load the id2label element of ./emotion-dataset/label.json into a pandas dataframe with an int64 'label' column and a string 'label_string' column\n", + "import json\n", + "\n", + "with open(\"./emotion-dataset/label.json\") as f:\n", + " id2label = json.load(f)\n", + " id2label = id2label[\"id2label\"]\n", + " label_df = pd.DataFrame.from_dict(\n", + " id2label, orient=\"index\", columns=[\"label_string\"]\n", + " )\n", + " label_df[\"label\"] = label_df.index.astype(\"int64\")\n", + " label_df = label_df[[\"label\", \"label_string\"]]\n", + "label_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load test.jsonl, train.jsonl and validation.jsonl from the ./emotion-dataset folder into pandas dataframes\n", + "test_df = pd.read_json(\"./emotion-dataset/test.jsonl\", lines=True)\n", + "train_df = pd.read_json(\"./emotion-dataset/train.jsonl\", lines=True)\n", + "validation_df = pd.read_json(\"./emotion-dataset/validation.jsonl\", lines=True)\n", + "# join the train, validation and test dataframes with the id2label dataframe to get the label_string column\n", + "train_df = train_df.merge(label_df, on=\"label\", how=\"left\")\n", + "validation_df = validation_df.merge(label_df, on=\"label\", how=\"left\")\n", + "test_df = test_df.merge(label_df, on=\"label\", how=\"left\")\n", + "# show the first 5 rows of the train dataframe\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save 10% of the rows from the train, validation and test dataframes into files with small_ prefix in the ./emotion-dataset folder\n", + "train_df.sample(frac=0.1).to_json(\n", + " \"./emotion-dataset/small_train.jsonl\", orient=\"records\", lines=True\n", + ")\n", + "validation_df.sample(frac=0.1).to_json(\n", + " \"./emotion-dataset/small_validation.jsonl\", orient=\"records\", lines=True\n", + ")\n", + "test_df.sample(frac=0.1).to_json(\n", + " \"./emotion-dataset/small_test.jsonl\", orient=\"records\", lines=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the fine tuning job using the model and data as inputs\n", + " \n", + "Create the job that uses the `text-classification` pipeline component. [Learn more]() about all the parameters supported for fine tuning."
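+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an optional aside (a sketch added for illustration, not part of the original flow), you can list the versions of the pipeline component available in the registry before fetching it with `label=\"latest\"` in the next cell. This assumes `components.list(name=...)` is supported by the installed `azure-ai-ml` version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optional sketch: list the available versions of the text classification pipeline component\n", + "# (assumes components.list supports the name filter in the installed azure-ai-ml version)\n", + "for component in registry_ml_client.components.list(name=\"text_classification_pipeline\"):\n", + " print(component.name, component.version)"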
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml.entities import CommandComponent, PipelineComponent, Job, Component\n", + "from azure.ai.ml import PyTorchDistribution, Input\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"text_classification_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def create_pipeline():\n", + " text_classification_pipeline = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry, identified in step 2 above\n", + " mlflow_model_path=foundation_model.id,\n", + " # huggingface_id = 'bert-base-uncased', # if you want to use a huggingface model, uncomment this line and comment the above line\n", + " compute_model_import=compute_cluster,\n", + " compute_preprocess=compute_cluster,\n", + " compute_finetune=compute_cluster,\n", + " compute_model_evaluation=compute_cluster,\n", + " # map the dataset splits to parameters\n", + " train_file_path=Input(\n", + " type=\"uri_file\", path=\"./emotion-dataset/small_train.jsonl\"\n", + " ),\n", + " validation_file_path=Input(\n", + " type=\"uri_file\", path=\"./emotion-dataset/small_validation.jsonl\"\n", + " ),\n", + " test_file_path=Input(\n", + " type=\"uri_file\", path=\"./emotion-dataset/small_test.jsonl\"\n", + " ),\n", + " evaluation_config=Input(\n", + " type=\"uri_file\", path=\"./text-classification-config.json\"\n", + " ),\n", + " # The following parameters map to the dataset fields\n", + " sentence1_key=\"text\",\n", + " label_key=\"label_string\",\n", + " # Training settings\n", + " number_of_gpu_to_use_finetuning=gpus_per_node, # set to the number of GPUs available in the compute\n", + " num_train_epochs=3,\n", + " learning_rate=2e-5,\n", + " )\n", + " return {\n", + " # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model\n", + " # registering the model is required to deploy the model to an online or batch endpoint\n", + " \"trained_model\": text_classification_pipeline.outputs.mlflow_model_folder\n", + " }\n", + "\n", + "\n", + "pipeline_object = create_pipeline()\n", + "\n", + "# don't use cached results from previous jobs\n", + "pipeline_object.settings.force_rerun = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# submit the pipeline job\n", + "pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + ")\n", + "# wait for the pipeline job to complete\n", + "workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review training and evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metrics across different jobs. See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more. 
\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "# concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + "filter = \"tags.mlflow.rootRunId='\" + pipeline_job.name + \"'\"\n", + "runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + ")\n", + "training_run = None\n", + "evaluation_run = None\n", + "# get the training and evaluation runs.\n", + "# using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + "for run in runs:\n", + " # check if run.data.metrics.epoch exists\n", + " if \"epoch\" in run.data.metrics:\n", + " training_run = run\n", + " # else, check if run.data.metrics.accuracy exists\n", + " elif \"accuracy\" in run.data.metrics:\n", + " evaluation_run = run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if training_run:\n", + " print(\"Training metrics:\\n\\n\")\n", + " print(json.dumps(training_run.data.metrics, indent=2))\n", + "else:\n", + " print(\"No Training job found\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if evaluation_run:\n", + " print(\"Evaluation metrics:\\n\\n\")\n", + " print(json.dumps(evaluation_run.data.metrics, indent=2))\n", + "else:\n", + " print(\"No Evaluation job found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. Register the fine tuned model with the workspace\n", + "\n", + "We will register the model from the output of the fine tuning job. This will track lineage between the fine tuned model and the fine tuning job. The fine tuning job, further, tracks lineage to the foundation model, data and training code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import Model\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "# check if the `trained_model` output is available\n", + "print(\"pipeline job outputs: \", workspace_ml_client.jobs.get(pipeline_job.name).outputs)\n", + "\n", + "# construct the path to the `trained_model` output of the pipeline job\n", + "model_path_from_job = \"azureml://jobs/{0}/outputs/{1}\".format(\n", + " pipeline_job.name, \"trained_model\"\n", + ")\n", + "\n", + "finetuned_model_name = model_name + \"-emotion-detection\"\n", + "finetuned_model_name = finetuned_model_name.replace(\"/\", \"-\")\n", + "print(\"path to register model: \", model_path_from_job)\n", + "prepare_to_register_model = Model(\n", + " path=model_path_from_job,\n", + " type=AssetTypes.MLFLOW_MODEL,\n", + " name=finetuned_model_name,\n", + " version=timestamp, # use timestamp as version to avoid version conflict\n", + " description=model_name + \" fine tuned model for emotion detection\",\n", + ")\n", + "print(\"prepare to register model: \\n\", prepare_to_register_model)\n", + "# register the model from pipeline job output\n", + "registered_model = workspace_ml_client.models.create_or_update(\n", + " prepare_to_register_model\n", + ")\n", + "print(\"registered model: \\n\", registered_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. Deploy the fine tuned model to an online endpoint\n", + "Online endpoints provide a durable REST API that can be used to integrate with applications that need to use the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time, sys\n", + "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n", + "\n", + "# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n", + "\n", + "online_endpoint_name = \"emotion-\" + timestamp\n", + "# create an online endpoint\n", + "endpoint = ManagedOnlineEndpoint(\n", + " name=online_endpoint_name,\n", + " description=\"Online endpoint for \"\n", + " + registered_model.name\n", + " + \", fine tuned model for emotion detection\",\n", + " auth_mode=\"key\",\n", + ")\n", + "workspace_ml_client.begin_create_or_update(endpoint).wait()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find the list of SKUs supported for deployment here - [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a deployment\n", + "demo_deployment = ManagedOnlineDeployment(\n", + " name=\"demo\",\n", + " endpoint_name=online_endpoint_name,\n", + " model=registered_model.id,\n", + " instance_type=\"Standard_DS3_v2\",\n", + " instance_count=1,\n", + ")\n", + "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n", + "endpoint.traffic = {\"demo\": 100}\n", + "workspace_ml_client.begin_create_or_update(endpoint).result()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 8. Test the endpoint with sample data\n", + "\n", + "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. 
We will then display the scored labels alongside the ground truth labels." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read ./emotion-dataset/small_test.jsonl into a pandas dataframe\n", + "test_df = pd.read_json(\"./emotion-dataset/small_test.jsonl\", lines=True)\n", + "# take 10 random samples\n", + "test_df = test_df.sample(n=10)\n", + "# rebuild index\n", + "test_df.reset_index(drop=True, inplace=True)\n", + "# rename the label_string column to ground_truth_label\n", + "test_df = test_df.rename(columns={\"label_string\": \"ground_truth_label\"})\n", + "test_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a json object with the key \"input_data\" and value as a list of values from the text column of the test dataframe\n", + "test_df_copy = test_df[[\"text\"]]\n", + "test_json = {\"input_data\": test_df_copy.to_dict(\"split\")}\n", + "# save the json object to a file named sample_score.json in the ./emotion-dataset folder\n", + "with open(\"./emotion-dataset/sample_score.json\", \"w\") as f:\n", + " json.dump(test_json, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method\n", + "response = workspace_ml_client.online_endpoints.invoke(\n", + " endpoint_name=online_endpoint_name,\n", + " deployment_name=\"demo\",\n", + " request_file=\"./emotion-dataset/sample_score.json\",\n", + ")\n", + "print(\"raw response: \\n\", response, \"\\n\")\n", + "# convert the response to a pandas dataframe and rename the label column as scored_label\n", + "response_df = pd.read_json(response)\n", + "response_df = response_df.rename(columns={0: \"scored_label\"})\n", + "response_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the test dataframe and the response dataframe on the index\n", + "merged_df = pd.merge(test_df, response_df, left_index=True, right_index=True)\n", + "merged_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 9. 
Delete the online endpoint\n", + "Don't forget to delete the online endpoint, otherwise you will leave the billing meter running for the compute used by the endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/foundation-models/system/finetune/text-classification/text-classification-config.json b/sdk/python/foundation-models/system/finetune/text-classification/text-classification-config.json new file mode 100644 index 0000000000..597603459e --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/text-classification/text-classification-config.json @@ -0,0 +1,7 @@ +{ + "metrics": ["average_precision_score_macro", "AUC_macro", "recall_score_macro", "average_precision_score_binary", "average_precision_score_micro", "AUC_binary", "recall_score_micro", "AUC_micro", "norm_macro_recall", "average_precision_score_weighted", "weighted_accuracy", "precision_score_micro", "f1_score_binary", "accuracy_table", "precision_score_macro", "f1_score_micro", "precision_score_weighted", "f1_score_weighted", "confusion_matrix", "recall_score_binary", "matthews_correlation", "log_loss", "accuracy", "precision_score_binary", "balanced_accuracy", "AUC_weighted", "f1_score_macro", "recall_score_weighted"], + "multilabel": false, + "enable_metric_confidence": true, + "confidence_metrics": ["accuracy", "f1_score_micro"], + "use_binary": false +} \ No newline at end of file diff --git a/sdk/python/foundation-models/system/finetune/token-classification/download-dataset.py b/sdk/python/foundation-models/system/finetune/token-classification/download-dataset.py new file mode 100644 index 0000000000..3702e2a14f --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/token-classification/download-dataset.py @@ -0,0 +1,39 @@ +# import library to parse command line arguments +import argparse, os + +parser = argparse.ArgumentParser() +# add an argument to specify a dataset name to download +parser.add_argument("--dataset", type=str, default="conll2003", help="dataset name") +# add an argument to specify the directory to download the dataset to +parser.add_argument( + "--download_dir", + type=str, + default="data", + help="directory to download the dataset to", +) +args = parser.parse_args() + +# create the download directory if it does not exist +if not os.path.exists(args.download_dir): + os.makedirs(args.download_dir) + + +def format_ner_tags(example, class_names): + example["text"] = " ".join(example["tokens"]) + example["ner_tags_str"] = [class_names[id] for id in example["ner_tags"]] + return example + + +# import hugging face datasets library +from datasets import load_dataset, get_dataset_split_names +from functools import partial + +for split in get_dataset_split_names(args.dataset): + # load the split of the dataset + dataset = load_dataset(args.dataset, split=split) + dataset = dataset.map( + partial(format_ner_tags, class_names=dataset.features["ner_tags"].feature.names) + ) + # save the split 
of the dataset to the download directory as json lines file + dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl")) + # print dataset features diff --git a/sdk/python/foundation-models/system/finetune/token-classification/token-classification-config.json b/sdk/python/foundation-models/system/finetune/token-classification/token-classification-config.json new file mode 100644 index 0000000000..23efa790c7 --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/token-classification/token-classification-config.json @@ -0,0 +1,3 @@ +{ + "metrics": ["precision_score_macro", "f1_score_micro", "recall_score_macro", "f1_score_weighted", "recall_score_micro", "accuracy", "precision_score_weighted", "precision_score_micro", "f1_score_macro", "recall_score_weighted"] +} \ No newline at end of file diff --git a/sdk/python/foundation-models/system/finetune/token-classification/token-classification.ipynb b/sdk/python/foundation-models/system/finetune/token-classification/token-classification.ipynb new file mode 100644 index 0000000000..bd85ad6482 --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/token-classification/token-classification.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Token Classification - Named Entity Recognition (NER)\n", + "\n", + "This sample shows how to use `token-classification` components from the `azureml` system registry to fine tune a model to detect entities using the conll2003 dataset. We then deploy the fine tuned model to an online endpoint for real time inference. The model is trained on a tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n", + "\n", + "### Training data\n", + "We will use the [conll2003](https://huggingface.co/datasets/conll2003) dataset.\n", + "\n", + "### Model\n", + "Models that can perform the `fill-mask` task are generally good foundation models to fine tune for `token-classification`. We will use the `bert-base-uncased` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. Optionally, if you need to fine tune a model that is available on HuggingFace, but not available in the `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter to instruct the components to pull the model directly from HuggingFace. \n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick a model to fine tune.\n", + "* Pick and explore training data.\n", + "* Configure the fine tuning job.\n", + "* Run the fine tuning job.\n", + "* Register the fine tuned model. \n", + "* Deploy the fine tuned model for real time inference.\n", + "* Clean up resources. \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace `<SUBSCRIPTION_ID>`, `<RESOURCE_GROUP>` and `<WORKSPACE_NAME>` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. 
Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running the cell below. This step is required when running in a new environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install azure-ai-ml\n", + "%pip install azure-identity\n", + "%pip install datasets==2.9.0\n", + "%pip install mlflow\n", + "%pip install azureml-mlflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import (\n", + " DefaultAzureCredential,\n", + " InteractiveBrowserCredential,\n", + " ClientSecretCredential,\n", + ")\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential=credential)\n", + "except:\n", + " workspace_ml_client = MLClient(\n", + " credential,\n", + " subscription_id=\"<SUBSCRIPTION_ID>\",\n", + " resource_group_name=\"<RESOURCE_GROUP>\",\n", + " workspace_name=\"<WORKSPACE_NAME>\",\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n", + "registry_ml_client = MLClient(credential, registry_name=\"azureml\")\n", + "\n", + "experiment_name = \"token-classification-ner\"\n", + "\n", + "# If you already have a gpu cluster, mention it here. Otherwise, a new one named 'gpu-cluster-big' will be created\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + "except Exception as ex:\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpu_count_found = False\n", + "workspace_compute_sku_list = workspace_ml_client.compute.list_sizes()\n", + "available_sku_sizes = []\n", + "for compute_sku in workspace_compute_sku_list:\n", + " available_sku_sizes.append(compute_sku.name)\n", + " if compute_sku.name.lower() == compute.size.lower():\n", + " gpus_per_node = compute_sku.gpus\n", + " gpu_count_found = True\n", + "# if the gpu count was not found, raise an error\n", + "if gpu_count_found:\n", + " print(f\"Number of GPUs in compute {compute.size}: {gpus_per_node}\")\n", + "else:\n", + " raise ValueError(\n", + " f\"Number of GPUs in compute {compute.size} not found. 
Available SKUs are: {available_sku_sizes}.\"\n", + " f\"This should not happen. Please check the selected compute cluster: {compute_cluster} and try again.\"\n", + " )\n", + "# CPU based finetune works only for single-node single-process\n", + "if gpus_per_node == 0:\n", + " print(\n", + " \"WARNING! Selected compute doesn't have GPU. CPU based finetune is experimental and works on a single process in a single node\"\n", + " )\n", + " gpus_per_node = 1\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick a foundation model to fine tune\n", + "\n", + "Models that support `fill-mask` tasks are good candidates to fine tune for `token-classification`. You can browse these models in the Model Catalog in the AzureML Studio, filtering by the `fill-mask` task. In this example, we use the `bert-base-uncased` model. If you have opened this notebook for a different model, replace the model name and version accordingly. \n", + "\n", + "Note the model's `id` property. It will be passed as input to the fine tuning job, and is also shown as the `Asset ID` field on the model details page in the AzureML Studio Model Catalog. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"bert-base-uncased\"\n", + "model_version = \"3\"\n", + "foundation_model = registry_ml_client.models.get(model_name, model_version)\n", + "print(\n", + " \"\\n\\nUsing model name: {0}, version: {1}, id: {2} for fine tuning\".format(\n", + " foundation_model.name, foundation_model.version, foundation_model.id\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the dataset for fine-tuning the model\n", + "\n", + "We will use the [conll2003](https://huggingface.co/datasets/conll2003) dataset. The next few cells show basic data preparation for fine tuning:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we save smaller `train`, `validation` and `test` files containing 10% of the original data. The fine tuned model will therefore have lower accuracy and should not be used for real-world tasks. \n", + "\n", + "> The [download-dataset.py](./download-dataset.py) script is used to download the conll2003 dataset and transform it into a format the finetune pipeline component can consume." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# download the dataset using the helper script. 
This needs datasets library: https://pypi.org/project/datasets/\n", + "import os\n", + "\n", + "exit_status = os.system(\"python ./download-dataset.py --download_dir conll2003-dataset\")\n", + "if exit_status != 0:\n", + " raise Exception(\"Error downloading dataset\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load the ./conll2003-dataset/train.jsonl file into a pandas dataframe and show the first 5 rows\n", + "import pandas as pd\n", + "\n", + "pd.set_option(\n", + " \"display.max_colwidth\", 0\n", + ") # set the max column width to 0 to display the full text\n", + "df = pd.read_json(\"./conll2003-dataset/train.jsonl\", lines=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load test.jsonl, train.jsonl and validation.jsonl from the ./conll2003-dataset folder into pandas dataframes\n", + "test_df = pd.read_json(\"./conll2003-dataset/test.jsonl\", lines=True)\n", + "train_df = pd.read_json(\"./conll2003-dataset/train.jsonl\", lines=True)\n", + "validation_df = pd.read_json(\"./conll2003-dataset/validation.jsonl\", lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save 10% of the rows from the train, validation and test dataframes into files with small_ prefix in the ./conll2003-dataset folder\n", + "train_df.sample(frac=0.1).to_json(\n", + " \"./conll2003-dataset/small_train.jsonl\", orient=\"records\", lines=True\n", + ")\n", + "validation_df.sample(frac=0.1).to_json(\n", + " \"./conll2003-dataset/small_validation.jsonl\", orient=\"records\", lines=True\n", + ")\n", + "test_df.sample(frac=0.1).to_json(\n", + " \"./conll2003-dataset/small_test.jsonl\", orient=\"records\", lines=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the fine tuning job using the model and data as inputs\n", + " \n", + "Create the job that uses the `token-classification` pipeline component. [Learn more]() about all the parameters supported for fine tuning."
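+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before submitting the job, it can be useful to sanity check that tokens and NER tag strings stay aligned in the prepared files. The next cell is an optional sketch added for illustration (not part of the original flow)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optional sketch: verify token/tag alignment in the first row of the sampled train file\n", + "sample_row = pd.read_json(\"./conll2003-dataset/small_train.jsonl\", lines=True).iloc[0]\n", + "assert len(sample_row[\"tokens\"]) == len(sample_row[\"ner_tags_str\"]), \"tokens and tags are misaligned\"\n", + "for token, tag in zip(sample_row[\"tokens\"][:10], sample_row[\"ner_tags_str\"][:10]):\n", + " print(f\"{token}\\t{tag}\")"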
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml.entities import CommandComponent, PipelineComponent, Job, Component\n", + "from azure.ai.ml import PyTorchDistribution, Input\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"token_classification_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def create_pipeline():\n", + " token_classification_pipeline = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry, identified in step 2 above\n", + " mlflow_model_path=foundation_model.id,\n", + " # huggingface_id = 'bert-base-uncased', # if you want to use a huggingface model, uncomment this line and comment the above line\n", + " compute_model_import=compute_cluster,\n", + " compute_preprocess=compute_cluster,\n", + " compute_finetune=compute_cluster,\n", + " compute_model_evaluation=compute_cluster,\n", + " # map the dataset splits to parameters\n", + " train_file_path=Input(\n", + " type=\"uri_file\", path=\"./conll2003-dataset/small_train.jsonl\"\n", + " ),\n", + " validation_file_path=Input(\n", + " type=\"uri_file\", path=\"./conll2003-dataset/small_validation.jsonl\"\n", + " ),\n", + " test_file_path=Input(\n", + " type=\"uri_file\", path=\"./conll2003-dataset/small_test.jsonl\"\n", + " ),\n", + " evaluation_config=Input(\n", + " type=\"uri_file\", path=\"./token-classification-config.json\"\n", + " ),\n", + " # The following parameters map to the dataset fields\n", + " token_key=\"tokens\",\n", + " tag_key=\"ner_tags_str\",\n", + " # Training settings\n", + " number_of_gpu_to_use_finetuning=gpus_per_node, # set to the number of GPUs available in the compute\n", + " num_train_epochs=3,\n", + " learning_rate=2e-5,\n", + " )\n", + " return {\n", + " # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model\n", + " # registering the model is required to deploy the model to an online or batch endpoint\n", + " \"trained_model\": token_classification_pipeline.outputs.mlflow_model_folder\n", + " }\n", + "\n", + "\n", + "pipeline_object = create_pipeline()\n", + "\n", + "# don't use cached results from previous jobs\n", + "pipeline_object.settings.force_rerun = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# submit the pipeline job\n", + "pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + ")\n", + "# wait for the pipeline job to complete\n", + "workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review training and evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metrics across different jobs. See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more. 
\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "# concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + "filter = \"tags.mlflow.rootRunId='\" + pipeline_job.name + \"'\"\n", + "runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + ")\n", + "training_run = None\n", + "evaluation_run = None\n", + "# get the training and evaluation runs.\n", + "# using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + "for run in runs:\n", + " # check if run.data.metrics.epoch exists\n", + " if \"epoch\" in run.data.metrics:\n", + " training_run = run\n", + " # else, check if run.data.metrics.accuracy exists\n", + " elif \"accuracy\" in run.data.metrics:\n", + " evaluation_run = run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if training_run:\n", + " print(\"Training metrics:\\n\\n\")\n", + " print(json.dumps(training_run.data.metrics, indent=2))\n", + "else:\n", + " print(\"No Training job found\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if evaluation_run:\n", + " print(\"Evaluation metrics:\\n\\n\")\n", + " print(json.dumps(evaluation_run.data.metrics, indent=2))\n", + "else:\n", + " print(\"No Evaluation job found\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. Register the fine tuned model with the workspace\n", + "\n", + "We will register the model from the output of the fine tuning job. This will track lineage between the fine tuned model and the fine tuning job. The fine tuning job, further, tracks lineage to the foundation model, data and training code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import Model\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "# check if the `trained_model` output is available\n", + "print(\"pipeline job outputs: \", workspace_ml_client.jobs.get(pipeline_job.name).outputs)\n", + "\n", + "# construct the path to the `trained_model` output of the pipeline job\n", + "model_path_from_job = \"azureml://jobs/{0}/outputs/{1}\".format(\n", + " pipeline_job.name, \"trained_model\"\n", + ")\n", + "\n", + "finetuned_model_name = model_name + \"-ner\"\n", + "finetuned_model_name = finetuned_model_name.replace(\"/\", \"-\")\n", + "print(\"path to register model: \", model_path_from_job)\n", + "prepare_to_register_model = Model(\n", + " path=model_path_from_job,\n", + " type=AssetTypes.MLFLOW_MODEL,\n", + " name=finetuned_model_name,\n", + " version=timestamp, # use timestamp as version to avoid version conflict\n", + " description=model_name + \" fine tuned model for named entity recognition\",\n", + ")\n", + "print(\"prepare to register model: \\n\", prepare_to_register_model)\n", + "# register the model from pipeline job output\n", + "registered_model = workspace_ml_client.models.create_or_update(\n", + " prepare_to_register_model\n", + ")\n", + "print(\"registered model: \\n\", registered_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. Deploy the fine tuned model to an online endpoint\n", + "Online endpoints provide a durable REST API that can be used to integrate with applications that need to use the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time, sys\n", + "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n", + "\n", + "# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n", + "\n", + "online_endpoint_name = \"ner-\" + timestamp\n", + "# create an online endpoint\n", + "endpoint = ManagedOnlineEndpoint(\n", + " name=online_endpoint_name,\n", + " description=\"Online endpoint for \"\n", + " + registered_model.name\n", + " + \", fine tuned model for named entity recognition\",\n", + " auth_mode=\"key\",\n", + ")\n", + "workspace_ml_client.begin_create_or_update(endpoint).wait()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find the list of SKUs supported for deployment here - [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a deployment\n", + "demo_deployment = ManagedOnlineDeployment(\n", + " name=\"demo\",\n", + " endpoint_name=online_endpoint_name,\n", + " model=registered_model.id,\n", + " instance_type=\"Standard_DS3_v2\",\n", + " instance_count=1,\n", + ")\n", + "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n", + "endpoint.traffic = {\"demo\": 100}\n", + "workspace_ml_client.begin_create_or_update(endpoint).result()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 8. Test the endpoint with sample data\n", + "\n", + "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. 
We will then display the predicted tags alongside the ground truth tags." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read ./conll2003-dataset/small_test.jsonl into a pandas dataframe\n", + "test_df = pd.read_json(\"./conll2003-dataset/small_test.jsonl\", lines=True)\n", + "# take 10 random samples\n", + "test_df = test_df.sample(n=10)\n", + "# drop the id, pos_tags, chunk_tags, ner_tags columns\n", + "test_df.drop(columns=[\"id\", \"pos_tags\", \"chunk_tags\", \"ner_tags\"], inplace=True)\n", + "# rebuild index\n", + "test_df.reset_index(drop=True, inplace=True)\n", + "# rename the ner_tags_str column to ground_truth_tags\n", + "test_df = test_df.rename(columns={\"ner_tags_str\": \"ground_truth_tags\"})\n", + "test_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# create a json object with the key \"input_data\" and value as a list of values from the tokens column of the test dataframe\n", + "test_df_copy = test_df[[\"tokens\"]]\n", + "test_json = {\"input_data\": test_df_copy.to_dict(\"split\")}\n", + "# save the json object to a file named sample_score.json in the ./conll2003-dataset folder\n", + "with open(\"./conll2003-dataset/sample_score.json\", \"w\") as f:\n", + " json.dump(test_json, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method\n", + "response = workspace_ml_client.online_endpoints.invoke(\n", + " endpoint_name=online_endpoint_name,\n", + " deployment_name=\"demo\",\n", + " request_file=\"./conll2003-dataset/sample_score.json\",\n", + ")\n", + "print(\"raw response: \\n\", response, \"\\n\")\n", + "# convert the response to a pandas dataframe\n", + "response_df = pd.read_json(response)\n", + "# rename the column to predicted_tags\n", + "response_df.rename(columns={0: \"predicted_tags\"}, inplace=True)\n", + "response_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the test dataframe and the response dataframe on the index\n", + "merged_df = pd.merge(test_df, response_df, left_index=True, right_index=True)\n", + "merged_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 9. 
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 9. Delete the online endpoint\n",
+ "Don't forget to delete the online endpoint; otherwise you will leave the billing meter running for the compute used by the endpoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sdk/python/foundation-models/system/finetune/translation/download-dataset.py b/sdk/python/foundation-models/system/finetune/translation/download-dataset.py
new file mode 100644
index 0000000000..d945ea740e
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/translation/download-dataset.py
@@ -0,0 +1,45 @@
+# import library to parse command line arguments
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument("--dataset", type=str, default="wmt16", help="dataset name")
+# add an argument to specify a dataset subset name to download
+parser.add_argument(
+    "--dataset_subset", type=str, default="ro-en", help="dataset subset name"
+)
+# add an argument to specify the fraction of the dataset to save
+parser.add_argument(
+    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
+)
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="data",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+
+def format_translation(example):
+    for key in example["translation"]:
+        example[key] = example["translation"][key]
+    return example
+
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset, args.dataset_subset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, args.dataset_subset, split=split)
+    dataset = dataset.map(format_translation, remove_columns=["translation"])
+    # save the split of the dataset to the download directory as a json lines file
+    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
+        os.path.join(args.download_dir, f"{split}.jsonl")
+    )
diff --git a/sdk/python/foundation-models/system/finetune/translation/translation-config.json b/sdk/python/foundation-models/system/finetune/translation/translation-config.json
new file mode 100644
index 0000000000..f293ed61bb
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/translation/translation-config.json
@@ -0,0 +1,4 @@
+{
+    "metrics": ["bleu_1", "bleu_2", "bleu_3", "bleu_4"],
+    "smoothing": false
+}
\ No newline at end of file
diff --git a/sdk/python/foundation-models/system/finetune/translation/translation.ipynb b/sdk/python/foundation-models/system/finetune/translation/translation.ipynb
new file mode 100644
index 0000000000..f8f5555581
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/translation/translation.ipynb
@@ -0,0 +1,608 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Translation - Translate English to Romanian\n",
+ "\n",
+ "This sample shows how to use `translation` components from the `azureml` system registry to fine tune a model to translate English to Romanian. We then deploy the fine tuned model to an online endpoint for real time inference. The model is trained on a tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n",
+ "\n",
+ "### Training data\n",
+ "We will use the [wmt16 (ro-en)](https://huggingface.co/datasets/wmt16) dataset.\n",
+ "\n",
+ "### Model\n",
+ "Models that can perform the `translation` task are used here. We will use the `t5-small` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name below. Optionally, if you need to fine tune a model that is available on HuggingFace but not in the `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter to instruct the components to pull the model directly from HuggingFace.\n",
+ "\n",
+ "### Outline\n",
+ "* Set up pre-requisites such as compute.\n",
+ "* Pick a model to fine tune.\n",
+ "* Pick and explore training data.\n",
+ "* Configure the fine tuning job.\n",
+ "* Run the fine tuning job.\n",
+ "* Register the fine tuned model.\n",
+ "* Deploy the fine tuned model for real time inference.\n",
+ "* Clean up resources."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Set up pre-requisites\n",
+ "* Install dependencies.\n",
+ "* Connect to the AzureML workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace `<SUBSCRIPTION_ID>`, `<RESOURCE_GROUP_NAME>` and `<WORKSPACE_NAME>` below.\n",
+ "* Connect to the `azureml` system registry.\n",
+ "* Set an optional experiment name.\n",
+ "* Check or create compute. A single GPU node can have multiple GPU cards. For example, one node of `Standard_ND40rs_v2` has 8 NVIDIA V100 GPUs, while `Standard_NC12s_v3` has 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs on the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install dependencies by running the cell below. This step is not optional when running in a new environment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install azure-ai-ml\n",
+ "%pip install azure-identity\n",
+ "%pip install datasets==2.9.0\n",
+ "%pip install mlflow\n",
+ "%pip install azureml-mlflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml import MLClient\n",
+ "from azure.identity import (\n",
+ "    DefaultAzureCredential,\n",
+ "    InteractiveBrowserCredential,\n",
+ "    ClientSecretCredential,\n",
+ ")\n",
+ "from azure.ai.ml.entities import AmlCompute\n",
+ "import time\n",
+ "\n",
+ "try:\n",
+ "    credential = DefaultAzureCredential()\n",
+ "    credential.get_token(\"https://management.azure.com/.default\")\n",
+ "except Exception as ex:\n",
+ "    credential = InteractiveBrowserCredential()\n",
+ "\n",
+ "try:\n",
+ "    workspace_ml_client = MLClient.from_config(credential=credential)\n",
+ "except Exception:\n",
+ "    workspace_ml_client = MLClient(\n",
+ "        credential,\n",
+ "        subscription_id=\"<SUBSCRIPTION_ID>\",\n",
+ "        resource_group_name=\"<RESOURCE_GROUP_NAME>\",\n",
+ "        workspace_name=\"<WORKSPACE_NAME>\",\n",
+ "    )\n",
+ "\n",
+ "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n",
+ "registry_ml_client = MLClient(credential, registry_name=\"azureml\")\n",
+ "\n",
+ "experiment_name = \"translation-wmt16-en-ro\"\n",
+ "\n",
+ "# If you already have a gpu cluster, mention it here. Otherwise, a new one named 'gpu-cluster-big' will be created.\n",
+ "compute_cluster = \"gpu-cluster-big\"\n",
+ "try:\n",
+ "    compute = workspace_ml_client.compute.get(compute_cluster)\n",
+ "except Exception as ex:\n",
+ "    compute = AmlCompute(\n",
+ "        name=compute_cluster,\n",
+ "        size=\"Standard_ND40rs_v2\",\n",
+ "        max_instances=2,  # For multi node training set this to an integer value more than 1\n",
+ "    )\n",
+ "    workspace_ml_client.compute.begin_create_or_update(compute).wait()\n",
+ "\n",
+ "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n",
+ "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n",
+ "# Setting this to more than the number of GPUs will result in an error.\n",
+ "gpu_count_found = False\n",
+ "workspace_compute_sku_list = workspace_ml_client.compute.list_sizes()\n",
+ "available_sku_sizes = []\n",
+ "for compute_sku in workspace_compute_sku_list:\n",
+ "    available_sku_sizes.append(compute_sku.name)\n",
+ "    if compute_sku.name.lower() == compute.size.lower():\n",
+ "        gpus_per_node = compute_sku.gpus\n",
+ "        gpu_count_found = True\n",
+ "# if the GPU count was not found, raise an error\n",
+ "if gpu_count_found:\n",
+ "    print(f\"Number of GPUs in compute {compute.size}: {gpus_per_node}\")\n",
+ "else:\n",
+ "    raise ValueError(\n",
+ "        f\"Number of GPUs in compute {compute.size} not found. Available SKUs are: {available_sku_sizes}. \"\n",
+ "        f\"This should not happen. Please check the selected compute cluster: {compute_cluster} and try again.\"\n",
+ "    )\n",
+ "# CPU based finetune works only for single-node single-process\n",
+ "if gpus_per_node == 0:\n",
+ "    print(\n",
+ "        \"WARNING! Selected compute doesn't have GPU. CPU based finetune is experimental and works on a single process in a single node\"\n",
+ "    )\n",
+ "    gpus_per_node = 1\n",
+ "\n",
+ "# generating a unique timestamp that can be used for names and versions that need to be unique\n",
+ "timestamp = str(int(time.time()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Pick a foundation model to fine tune\n",
+ "\n",
+ "Models that support the `translation` task are picked for fine tuning. You can browse these models in the Model Catalog in AzureML Studio, filtering by the `translation` task. In this example, we use the `t5-small` model. If you have opened this notebook for a different model, replace the model name and version accordingly.\n",
+ "\n",
+ "Note the model id property of the model. This will be passed as input to the fine tuning job. It is also available as the `Asset ID` field in the model details page in the AzureML Studio Model Catalog."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = \"t5-small\"\n",
+ "model_version = \"3\"\n",
+ "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
+ "print(\n",
+ "    \"\\n\\nUsing model name: {0}, version: {1}, id: {2} for fine tuning\".format(\n",
+ "        foundation_model.name, foundation_model.version, foundation_model.id\n",
+ "    )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Pick the dataset for fine-tuning the model\n",
+ "\n",
+ "We will use the [wmt16 (ro-en)](https://huggingface.co/datasets/wmt16) dataset.\n",
+ "* Visualize some data rows.\n",
+ "* We want this sample to run quickly, so we save smaller `train`, `validation` and `test` files containing 20% of the already trimmed rows. This means the fine tuned model will have lower accuracy, hence it should not be put to real-world use.\n",
+ "\n",
+ "> The [download-dataset.py](./download-dataset.py) script is used to download the wmt16 (ro-en) dataset and transform it into a format the fine tuning pipeline component can consume. As the dataset is large, only a fraction of it is downloaded here.\n",
+ "\n",
+ "> **Note**: Some language models use different language codes, and the column names in the dataset should reflect them; see the illustrative sketch in the next cell."
+ ]
+ },
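+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The cell below is a hypothetical, self-contained sketch of such a renaming: it assumes a model expecting mBART-style language codes (`en_XX`, `ro_RO`), which the `t5-small` model used in this notebook does not require. Adapt the mapping to the language codes of the model you picked."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# hypothetical sketch: rename dataset columns for a model that expects\n",
+ "# mBART-style language codes (en_XX / ro_RO) instead of the wmt16 names (en / ro).\n",
+ "# t5-small, used in this notebook, does NOT require this step.\n",
+ "example = pd.DataFrame({\"en\": [\"Hello\"], \"ro\": [\"Salut\"]})\n",
+ "example = example.rename(columns={\"en\": \"en_XX\", \"ro\": \"ro_RO\"})  # illustrative mapping only\n",
+ "print(example.columns.tolist())"
+ ]
+ },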
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# download the dataset using the helper script. This needs the datasets library: https://pypi.org/project/datasets/\n",
+ "import os\n",
+ "\n",
+ "exit_status = os.system(\n",
+ "    \"python ./download-dataset.py --download_dir wmt16-en-ro-dataset\"\n",
+ ")\n",
+ "if exit_status != 0:\n",
+ "    raise Exception(\"Error downloading dataset\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "pd.set_option(\n",
+ "    \"display.max_colwidth\", 0\n",
+ ")  # set the max column width to 0 to display the full text\n",
+ "# load the train.jsonl, test.jsonl and validation.jsonl files from the ./wmt16-en-ro-dataset/ folder\n",
+ "train_df = pd.read_json(\"./wmt16-en-ro-dataset/train.jsonl\", lines=True)\n",
+ "validation_df = pd.read_json(\"./wmt16-en-ro-dataset/validation.jsonl\", lines=True)\n",
+ "test_df = pd.read_json(\"./wmt16-en-ro-dataset/test.jsonl\", lines=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save 20% of the rows from the dataframes into files with small_ prefix in the ./wmt16-en-ro-dataset folder\n",
+ "train_df.sample(frac=0.2).to_json(\n",
+ "    \"./wmt16-en-ro-dataset/small_train.jsonl\", orient=\"records\", lines=True\n",
+ ")\n",
+ "validation_df.sample(frac=0.2).to_json(\n",
+ "    \"./wmt16-en-ro-dataset/small_validation.jsonl\", orient=\"records\", lines=True\n",
+ ")\n",
+ "test_df.sample(frac=0.2).to_json(\n",
+ "    \"./wmt16-en-ro-dataset/small_test.jsonl\", orient=\"records\", lines=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Submit the fine tuning job using the model and data as inputs\n",
+ "\n",
+ "Create the job that uses the `translation` pipeline component. [Learn more]() about all the parameters supported for fine tuning. One way to discover the supported parameters is sketched in the next cell."
+ ]
+ },
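+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The cell below is an optional sketch for discovering the parameters the pipeline component supports; it assumes the component entity returned by `registry_ml_client.components.get` exposes its declared inputs via an `inputs` dictionary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# optional: list the inputs declared by the translation pipeline component\n",
+ "# to see which fine tuning parameters it supports\n",
+ "translation_component = registry_ml_client.components.get(\n",
+ "    name=\"translation_pipeline\", label=\"latest\"\n",
+ ")\n",
+ "for input_name, input_spec in translation_component.inputs.items():\n",
+ "    print(input_name, \":\", input_spec)"
+ ]
+ },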
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.dsl import pipeline\n",
+ "from azure.ai.ml.entities import CommandComponent, PipelineComponent, Job, Component\n",
+ "from azure.ai.ml import PyTorchDistribution, Input\n",
+ "\n",
+ "# fetch the pipeline component\n",
+ "pipeline_component_func = registry_ml_client.components.get(\n",
+ "    name=\"translation_pipeline\", label=\"latest\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# define the pipeline job\n",
+ "@pipeline()\n",
+ "def create_pipeline():\n",
+ "    translation_pipeline = pipeline_component_func(\n",
+ "        # specify the foundation model from the azureml system registry, identified in step 2\n",
+ "        mlflow_model_path=foundation_model.id,\n",
+ "        # huggingface_id = 't5-small', # if you want to use a huggingface model, uncomment this line and comment the above line\n",
+ "        compute_model_import=compute_cluster,\n",
+ "        compute_preprocess=compute_cluster,\n",
+ "        compute_finetune=compute_cluster,\n",
+ "        compute_model_evaluation=compute_cluster,\n",
+ "        # map the dataset splits to parameters\n",
+ "        train_file_path=Input(\n",
+ "            type=\"uri_file\", path=\"./wmt16-en-ro-dataset/small_train.jsonl\"\n",
+ "        ),\n",
+ "        validation_file_path=Input(\n",
+ "            type=\"uri_file\", path=\"./wmt16-en-ro-dataset/small_validation.jsonl\"\n",
+ "        ),\n",
+ "        test_file_path=Input(\n",
+ "            type=\"uri_file\", path=\"./wmt16-en-ro-dataset/small_test.jsonl\"\n",
+ "        ),\n",
+ "        evaluation_config=Input(type=\"uri_file\", path=\"./translation-config.json\"),\n",
+ "        # The following parameters map to the dataset fields\n",
+ "        # source_lang parameter maps to the \"en\" field in the wmt16 dataset\n",
+ "        source_lang=\"en\",\n",
+ "        # target_lang parameter maps to the \"ro\" field in the wmt16 dataset\n",
+ "        target_lang=\"ro\",\n",
+ "        # training settings\n",
+ "        number_of_gpu_to_use_finetuning=gpus_per_node,  # set to the number of GPUs available in the compute\n",
+ "        num_train_epochs=3,\n",
+ "        learning_rate=2e-5,\n",
+ "    )\n",
+ "    return {\n",
+ "        # map the output of the fine tuning job to the output of the pipeline job so that we can easily register the fine tuned model\n",
+ "        # registering the model is required to deploy the model to an online or batch endpoint\n",
+ "        \"trained_model\": translation_pipeline.outputs.mlflow_model_folder\n",
+ "    }\n",
+ "\n",
+ "\n",
+ "pipeline_object = create_pipeline()\n",
+ "\n",
+ "# don't use cached results from previous jobs\n",
+ "pipeline_object.settings.force_rerun = True"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Submit the job"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# submit the pipeline job\n",
+ "pipeline_job = workspace_ml_client.jobs.create_or_update(\n",
+ "    pipeline_object, experiment_name=experiment_name\n",
+ ")\n",
+ "# wait for the pipeline job to complete\n",
+ "workspace_ml_client.jobs.stream(pipeline_job.name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Review training and evaluation metrics\n",
+ "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metrics across different jobs. See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n",
+ "\n",
+ "However, we may need to access and review metrics programmatically, for which we will use MLflow, the recommended client for logging and querying metrics."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlflow, json\n",
+ "\n",
+ "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n",
+ "    workspace_ml_client.workspace_name\n",
+ ").mlflow_tracking_uri\n",
+ "mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
+ "# filter runs whose root run id matches the pipeline job name\n",
+ "filter_string = \"tags.mlflow.rootRunId='\" + pipeline_job.name + \"'\"\n",
+ "runs = mlflow.search_runs(\n",
+ "    experiment_names=[experiment_name], filter_string=filter_string, output_format=\"list\"\n",
+ ")\n",
+ "training_run = None\n",
+ "evaluation_run = None\n",
+ "# get the training and evaluation runs.\n",
+ "# using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n",
+ "for run in runs:\n",
+ "    # check if run.data.metrics.epoch exists; if so, it is the training run\n",
+ "    if \"epoch\" in run.data.metrics:\n",
+ "        training_run = run\n",
+ "    # else, check if run.data.metrics.bleu_1 exists; if so, it is the evaluation run\n",
+ "    elif \"bleu_1\" in run.data.metrics:\n",
+ "        evaluation_run = run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if training_run:\n",
+ "    print(\"Training metrics:\\n\\n\")\n",
+ "    print(json.dumps(training_run.data.metrics, indent=2))\n",
+ "else:\n",
+ "    print(\"No Training job found\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if evaluation_run:\n",
+ "    print(\"Evaluation metrics:\\n\\n\")\n",
+ "    print(json.dumps(evaluation_run.data.metrics, indent=2))\n",
+ "else:\n",
+ "    print(\"No Evaluation job found\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 6. Register the fine tuned model with the workspace\n",
+ "\n",
+ "We will register the model from the output of the fine tuning job. This tracks lineage between the fine tuned model and the fine tuning job, which in turn tracks lineage to the foundation model, data and training code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.entities import Model\n",
+ "from azure.ai.ml.constants import AssetTypes\n",
+ "\n",
+ "# check if the `trained_model` output is available\n",
+ "print(\"pipeline job outputs: \", workspace_ml_client.jobs.get(pipeline_job.name).outputs)\n",
+ "\n",
+ "# construct the path to the model in the `trained_model` output of the pipeline job\n",
+ "model_path_from_job = \"azureml://jobs/{0}/outputs/{1}\".format(\n",
+ "    pipeline_job.name, \"trained_model\"\n",
+ ")\n",
+ "\n",
+ "finetuned_model_name = model_name + \"-wmt16-en-ro-src\"\n",
+ "finetuned_model_name = finetuned_model_name.replace(\"/\", \"-\")\n",
+ "print(\"path to register model: \", model_path_from_job)\n",
+ "prepare_to_register_model = Model(\n",
+ "    path=model_path_from_job,\n",
+ "    type=AssetTypes.MLFLOW_MODEL,\n",
+ "    name=finetuned_model_name,\n",
+ "    version=timestamp,  # use timestamp as version to avoid version conflicts\n",
+ "    description=model_name + \" fine tuned model for translation wmt16 en to ro\",\n",
+ ")\n",
+ "print(\"prepare to register model: \\n\", prepare_to_register_model)\n",
+ "# register the model from the pipeline job output\n",
+ "registered_model = workspace_ml_client.models.create_or_update(\n",
+ "    prepare_to_register_model\n",
+ ")\n",
+ "print(\"registered model: \\n\", registered_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7. Deploy the fine tuned model to an online endpoint\n",
+ "Online endpoints provide a durable REST API that applications which need to use the model can integrate with."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time, sys\n",
+ "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n",
+ "\n",
+ "# Create online endpoint - endpoint names need to be unique in a region, hence using the timestamp to create a unique endpoint name\n",
+ "\n",
+ "online_endpoint_name = \"translation-en-ro-\" + timestamp\n",
+ "# create an online endpoint\n",
+ "endpoint = ManagedOnlineEndpoint(\n",
+ "    name=online_endpoint_name,\n",
+ "    description=\"Online endpoint for \"\n",
+ "    + registered_model.name\n",
+ "    + \", fine tuned model for translation wmt16 en to ro\",\n",
+ "    auth_mode=\"key\",\n",
+ ")\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).wait()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can find the list of SKUs supported for deployment in the [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a deployment\n",
+ "demo_deployment = ManagedOnlineDeployment(\n",
+ "    name=\"demo\",\n",
+ "    endpoint_name=online_endpoint_name,\n",
+ "    model=registered_model.id,\n",
+ "    instance_type=\"Standard_DS3_v2\",\n",
+ "    instance_count=1,\n",
+ ")\n",
+ "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
+ "endpoint.traffic = {\"demo\": 100}\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).result()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8. Test the endpoint with sample data\n",
+ "\n",
+ "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. We will then display the predicted translations alongside the ground truth translations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read ./wmt16-en-ro-dataset/test.jsonl into a pandas dataframe\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "\n",
+ "test_df = pd.read_json(\"./wmt16-en-ro-dataset/test.jsonl\", orient=\"records\", lines=True)\n",
+ "# take 1 random sample\n",
+ "test_df = test_df.sample(n=1)\n",
+ "# rebuild index\n",
+ "test_df.reset_index(drop=True, inplace=True)\n",
+ "test_df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a json object with \"input_data\" as the key and the en column of the test dataframe as the value\n",
+ "test_df_copy = test_df[[\"en\"]]\n",
+ "test_json = {\"input_data\": test_df_copy.to_dict(\"split\")}\n",
+ "# save the json object to a file named sample_score.json in the ./wmt16-en-ro-dataset folder\n",
+ "with open(\"./wmt16-en-ro-dataset/sample_score.json\", \"w\") as f:\n",
+ "    json.dump(test_json, f)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> If the input data is long or the number of records is too large, you may run into the following error: \"Failed to test real-time endpoint: upstream request timeout. Please check this guide to understand why this error code might have been returned [https://docs.microsoft.com/en-us/azure/machine-learning/how-to-troubleshoot-online-endpoints#http-status-codes]\". Try submitting smaller and fewer inputs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method\n",
+ "response = workspace_ml_client.online_endpoints.invoke(\n",
+ "    endpoint_name=online_endpoint_name,\n",
+ "    deployment_name=\"demo\",\n",
+ "    request_file=\"./wmt16-en-ro-dataset/sample_score.json\",\n",
+ ")\n",
+ "print(\"raw response: \\n\", response, \"\\n\")\n",
+ "# convert the response to a pandas dataframe\n",
+ "response_df = pd.read_json(response)\n",
+ "# rename the column to predicted_translation\n",
+ "response_df.rename(columns={0: \"predicted_translation\"}, inplace=True)\n",
+ "response_df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# merge the test dataframe and the response dataframe on the index\n",
+ "merged_df = pd.merge(test_df, response_df, left_index=True, right_index=True)\n",
+ "merged_df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 9. Delete the online endpoint\n",
+ "Don't forget to delete the online endpoint; otherwise you will leave the billing meter running for the compute used by the endpoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}