Update notebooks (#2315)
* Update notebooks

* remove datasets

* fixing formatting issues

---------

Co-authored-by: Pavan Manoj Jonnalagadda <pavanmanojj@microsoft.com>
aggarwal-k and jpmann committed May 19, 2023
1 parent 214d603 commit e4da127
Showing 35 changed files with 4,956 additions and 2 deletions.
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
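
This deployment spec (deploy.yml) is consumed by the shell script later in this commit; as a rough standalone sketch, assuming an existing endpoint and registered model (all angle-bracket names are placeholders):

az ml online-deployment create --file deploy.yml --all-traffic \
    --set endpoint_name=<ENDPOINT_NAME> model=azureml:<MODEL_NAME>:<VERSION> \
    --resource-group <RESOURCE_GROUP> --workspace-name <WORKSPACE_NAME>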
@@ -0,0 +1,77 @@
# import library to parse command line arguments
import argparse, os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./squad-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as a JSON Lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
# load the train.jsonl and validation.jsonl files from the download directory
import pandas as pd

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)

# save 5% of the rows from the train dataframe into files with small_ prefix in the ./squad-dataset folder
train_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
# the original dataset does not have a test split, so split the validation dataframe into validation and test dataframes equally
validation_df, test_df = (
    validation_df[: len(validation_df) // 2],
    validation_df[len(validation_df) // 2 :],
)
# save 5% of the rows from the validation and test dataframes into files with small_ prefix in the ./squad-dataset folder
validation_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)

# read ./squad-dataset/small_test.jsonl into a pandas dataframe
import json

test_df = pd.read_json("./squad-dataset/small_test.jsonl", orient="records", lines=True)
# take 10 random samples
test_df = test_df.sample(n=10)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# flatten the nested json object in the "answers" column (keys "answer_start" and "text") into separate columns
json_struct = json.loads(test_df.to_json(orient="records"))
test_df = pd.json_normalize(json_struct)
# drop id and title columns
test_df = test_df.drop(columns=["id", "title"])

# create a json object with "input_data" as key, holding the "question" and "context" columns in pandas "split" orientation
test_df_copy = test_df[["question", "context"]]
test_json = {"input_data": test_df_copy.to_dict("split")}

# write the json object to a file named sample_score.json in the ./squad-dataset folder
with open("./squad-dataset/sample_score.json", "w") as f:
    json.dump(test_json, f)
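
For reference, pandas' to_dict("split") emits parallel index/columns/data lists, so the script above can be run and its output inspected roughly as follows (a sketch; the JSON values are elided):

python ./download-dataset.py --dataset squad --download_dir ./squad-dataset
# sample_score.json then has the pandas "split" layout, e.g.:
# {"input_data": {"index": [0, 1], "columns": ["question", "context"],
#                 "data": [["<question>", "<context>"], ["<question>", "<context>"]]}}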
@@ -0,0 +1,89 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

experiment_name: question-answering-extractive-qna

inputs:
  compute_model_import: gpu-cluster-big
  compute_preprocess: gpu-cluster-big
  compute_finetune: gpu-cluster-big
  compute_model_evaluation: gpu-cluster-big

  # specify the foundation model available in the azureml system registry
  mlflow_model_path:
    path: azureml://registries/azureml/models/bert-base-uncased/versions/3
  # huggingface_id: 'bert-base-uncased' # if you want to use a huggingface model, uncomment this line and comment out the path above

  # map the dataset files to parameters
  train_file_path:
    type: uri_file
    path: "squad-dataset/small_train.jsonl"
  validation_file_path:
    type: uri_file
    path: "squad-dataset/small_validation.jsonl"
  test_file_path:
    type: uri_file
    path: "squad-dataset/small_test.jsonl"
  evaluation_config_path:
    type: uri_file
    path: "../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"

  # The following parameters map to the dataset fields
  # the question whose answer needs to be extracted from the provided context
  # question_key parameter maps to the "question" field in the SQuAD dataset
  question_key: "question"
  # the context that contains the answer to the question
  # context_key parameter maps to the "context" field in the SQuAD dataset
  context_key: "context"
  # The value of this field is JSON with two nested keys, answer_start_key and answer_text_key, and their corresponding values
  # answers_key parameter maps to the "answers" field in the SQuAD dataset
  answers_key: "answers"
  # Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
  # in the SQuAD dataset, answer_start_key maps to "answer_start" under "answers"
  answer_start_key: "answer_start"
  # Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter.
  # in the SQuAD dataset, answer_text_key maps to "text" under "answers"
  answer_text_key: "text"
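  # For reference (not part of the pipeline spec): a single SQuAD record in the JSONL files
  # produced by download-dataset.py nests the two keys above under "answers", roughly:
  # {"id": "...", "title": "...", "context": "...", "question": "...",
  #  "answers": {"text": ["Denver Broncos"], "answer_start": [177]}}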

  # training settings
  number_of_gpu_to_use_finetuning: 2
  num_train_epochs: 3
  learning_rate: 2e-5

outputs:
  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
  # registering the model is required to deploy the model to an online or batch endpoint
  trained_model:
    type: mlflow_model

settings:
  force_rerun: true

jobs:
  question_answering_pipeline:
    type: pipeline
    component: azureml://registries/azureml/components/question_answering_pipeline/labels/latest
    inputs:
      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}

      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_preprocess: ${{parent.inputs.compute_preprocess}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}

      train_file_path: ${{parent.inputs.train_file_path}}
      validation_file_path: ${{parent.inputs.validation_file_path}}
      test_file_path: ${{parent.inputs.test_file_path}}
      evaluation_config: ${{parent.inputs.evaluation_config_path}}

      question_key: ${{parent.inputs.question_key}}
      context_key: ${{parent.inputs.context_key}}
      answers_key: ${{parent.inputs.answers_key}}
      answer_start_key: ${{parent.inputs.answer_start_key}}
      answer_text_key: ${{parent.inputs.answer_text_key}}

      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      learning_rate: ${{parent.inputs.learning_rate}}
    outputs:
      mlflow_model_folder: ${{parent.outputs.trained_model}}
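
The shell script below overrides these inputs at submission time; as a minimal sketch, the pipeline could also be submitted as-is (assuming the data paths resolve relative to the working directory; angle-bracket names are placeholders):

az ml job create --file ./extractive-qa-pipeline.yml \
    --resource-group <RESOURCE_GROUP> --workspace-name <WORKSPACE_NAME>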
@@ -0,0 +1,193 @@
set -x
# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
registry_name="azureml"

compute_cluster="gpu-cluster-big"
# if above compute cluster does not exist, create it with the following vm size
compute_sku="Standard_ND40rs_v2"
# This is the number of GPUs in a single node of the selected '$compute_sku' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node=2
# This is the foundation model for finetuning
model_name="bert-base-uncased"
# pinned model version - resolving the latest version automatically is not working yet (see note before job submission)
model_version=3

version=$(date +%s)
finetuned_model_name="${model_name}-extractive-qna"
endpoint_name="ext-qna-$version"
deployment_sku="Standard_DS3_v2"


# training data
train_data="squad-dataset/small_train.jsonl"
# validation data
validation_data="squad-dataset/small_validation.jsonl"
# test data
test_data="squad-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"
# scoring_file
scoring_file="squad-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="question_answering_pipeline"
# The following parameters map to the dataset fields
# the question whose answer needs to be extracted from the provided context 
# question_key parameter maps to the "question" field in the SQuAD dataset
question_key="question"
# the context that contains the answer to the question
# context_key parameter maps to the "context" field in the SQuAD dataset
context_key="context"
# The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key with their corresponding values
# answers_key parameter maps to the "answers" field in the SQuAD dataset
answers_key="answers"
# Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
# in the SQuAD dataset, answer_start_key maps to "answer_start" under "answers"
answer_start_key="answer_start"
# Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter.
# in the SQuAD dataset, answer_text_key maps to "text" under "answers"
answer_text_key="text"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
num_train_epochs=3
learning_rate=2e-5

# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
[ "$resource_group_name" = "<RESOURCE_GROUP>" ] || \
[ "$workspace_name" = "<WORKSPACE_NAME>" ]; then
echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
exit 1
fi

az account set -s $subscription_id
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster exists, else create it
if az ml compute show --name $compute_cluster $workspace_info
then
    echo "Compute cluster $compute_cluster already exists"
else
    echo "Creating compute cluster $compute_cluster"
    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster"
        exit 1
    }
fi

# download the dataset
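# note: download-dataset.py defaults to --dataset squad and --download_dir ./squad-dataset,
# which matches the small_*.jsonl and sample_score.json paths used in this script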

python ./download-dataset.py || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
then
    echo "Model $model_name:$model_version does not exist in registry $registry_name"
    exit 1
fi

# 3. Check if training data, validation data and test data exist
if [ ! -f $train_data ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -f $validation_data ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi
if [ ! -f $test_data ]; then
    echo "Test data $test_data does not exist"
    exit 1
fi

# 4. Submit finetuning job using pipeline.yml

# check if the finetuning pipeline component exists
if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# need to switch to using latest version for model, currently blocked with a bug.
# submit finetuning job
parent_job_name=$( az ml job create --file ./extractive-qa-pipeline.yml $workspace_info --query name -o tsv --set \
    jobs.question_answering_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
    inputs.compute_model_import=$compute_cluster \
    inputs.compute_preprocess=$compute_cluster \
    inputs.compute_finetune=$compute_cluster \
    inputs.compute_model_evaluation=$compute_cluster \
    inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
    inputs.train_file_path.path=$train_data \
    inputs.validation_file_path.path=$validation_data \
    inputs.test_file_path.path=$test_data \
    inputs.evaluation_config.path=$evaluation_config \
    inputs.question_key=$question_key \
    inputs.context_key=$context_key \
    inputs.answers_key=$answers_key \
    inputs.answer_start_key=$answer_start_key \
    inputs.answer_text_key=$answer_text_key \
    inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
    inputs.num_train_epochs=$num_train_epochs \
    inputs.learning_rate=$learning_rate ) || {
    echo "Failed to submit finetuning job"
    exit 1
}

az ml job stream --name $parent_job_name $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 5. Create model in workspace from train job output
az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
    --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 6. Deploy the model to an endpoint
# create online endpoint
az ml online-endpoint create --name $endpoint_name $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy model from registry to endpoint in workspace
# You can find here the list of SKU's supported for deployment - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
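# --all-traffic routes 100% of the endpoint's traffic to this new deployment once it is ready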
az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
    endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
    instance_type=$deployment_sku || {
    echo "deployment create failed"; exit 1;
}

# 7. Try a sample scoring request

# Check if scoring data file exists
if [ -f $scoring_file ]; then
    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
    cat $scoring_file
    echo -e "\n\n"
else
    echo "Scoring file $scoring_file does not exist"
    exit 1
fi

az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 8. Delete the endpoint
az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
Original file line number Diff line number Diff line change
@@ -81,7 +81,8 @@
 # rename the highlights column to ground_truth_summary
 test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
 # create a json object with "input_data" as key, holding the "article" column in pandas "split" orientation
-test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
+test_df_copy = test_df[["article"]]
+test_json = {"input_data": test_df_copy.to_dict("split")}
 # save the json object to a file named sample_score.json in the download directory
 with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
     json.dump(test_json, f)