Update notebooks #2315

Merged · 3 commits · May 19, 2023

Changes from 2 commits
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
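
Note that deploy.yml pins only the instance type and count; the endpoint name and model are supplied at deployment time via --set overrides (see step 6 in the shell script below). For readers using the Python SDK instead of the CLI, a rough sketch of an equivalent of this YAML, assuming the azure-ai-ml package and placeholder names:

# a minimal sketch of the same deployment as an azure-ai-ml v2 entity;
# <ENDPOINT_NAME> and <MODEL_ASSET_ID> are placeholders filled in at deploy time
from azure.ai.ml.entities import ManagedOnlineDeployment

deployment = ManagedOnlineDeployment(
    name="demo",
    endpoint_name="<ENDPOINT_NAME>",
    model="<MODEL_ASSET_ID>",  # e.g. azureml:<model-name>:<version>
    instance_type="Standard_DS3_v2",
    instance_count=1,
)
# an MLClient would then submit it with:
# ml_client.online_deployments.begin_create_or_update(deployment)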
@@ -0,0 +1,73 @@
# import library to parse command line arguments
import argparse, os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./squad-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as a json lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
    # print dataset features
    print(dataset.features)

# load the train.jsonl and validation.jsonl files from the download directory
import pandas as pd

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(os.path.join(args.download_dir, "validation.jsonl"), lines=True)

# save 5% of the rows from the train dataframe into files with small_ prefix in the ./squad-dataset folder
train_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
# the original dataset does not have a test split, so split the validation dataframe into validation and test dataframes equally
validation_df, test_df = (
    validation_df[: len(validation_df) // 2],
    validation_df[len(validation_df) // 2 :],
)
# save 5% of the rows from the validation and test dataframes into files with small_ prefix in the ./squad-dataset folder
validation_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"), orient="records", lines=True
)
test_df.sample(frac=0.05).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)

# read ./squad-dataset/small_test.jsonl into a pandas dataframe
import json

test_df = pd.read_json("./squad-dataset/small_test.jsonl", orient="records", lines=True)
# take 10 random samples
test_df = test_df.sample(n=10)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# flatten the json object in the "answers" column with the keys "answer_start" and "text"
json_struct = json.loads(test_df.to_json(orient="records"))
test_df = pd.json_normalize(json_struct)
# drop id and title columns
test_df = test_df.drop(columns=["id", "title"])

# create a json object with "input_data" as the key and the question and context columns in pandas 'split' orientation
test_df_copy = test_df[['question', 'context']]
test_json = {"input_data": test_df_copy.to_dict('split')}

# write the json object to a file named sample_score.json in the ./squad-dataset folder
with open("./squad-dataset/sample_score.json", "w") as f:
    json.dump(test_json, f)
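
Since to_dict('split') defines the request schema the endpoint receives, here is a quick sketch of the structure it produces (the row values are illustrative, not taken from SQuAD):

import pandas as pd

df = pd.DataFrame(
    {
        "question": ["Who wrote Hamlet?"],
        "context": ["Hamlet is a tragedy written by William Shakespeare."],
    }
)

# 'split' orientation separates column names from row values:
# {'index': [0],
#  'columns': ['question', 'context'],
#  'data': [['Who wrote Hamlet?', 'Hamlet is a tragedy written by William Shakespeare.']]}
print({"input_data": df.to_dict("split")})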
@@ -0,0 +1,89 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

experiment_name: question-answering-extractive-qna

inputs:
  compute_model_import: gpu-cluster-big
  compute_preprocess: gpu-cluster-big
  compute_finetune: gpu-cluster-big
  compute_model_evaluation: gpu-cluster-big

  # specify the foundation model available in the azureml system registry
  mlflow_model_path:
    path: azureml://registries/azureml/models/bert-based-uncased/versions/3
  # huggingface_id: 'bert-base-uncased' # if you want to use a huggingface model, uncomment this line and comment the above line

  # map the dataset files to parameters
  train_file_path:
    type: uri_file
    path: "squad-dataset/small_train.jsonl"
  validation_file_path:
    type: uri_file
    path: "squad-dataset/small_validation.jsonl"
  test_file_path:
    type: uri_file
    path: "squad-dataset/small_test.jsonl"
  evaluation_config_path:
    type: uri_file
    path: "../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"

  # The following parameters map to the dataset fields
  # the question whose answer needs to be extracted from the provided context
  # question_key parameter maps to the "question" field in the SQuAD dataset
  question_key: "question"
  # the context that contains the answer to the question
  # context_key parameter maps to the "context" field in the SQuAD dataset
  context_key: "context"
  # The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key, with their corresponding values
  # answers_key parameter maps to the "answers" field in the SQuAD dataset
  answers_key: "answers"
  # Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
  # in the SQuAD dataset, the answer_start_key maps to "answer_start" under "answers"
  answer_start_key: "answer_start"
  # Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter
  # in the SQuAD dataset, the answer_text_key maps to "text" under "answers"
  answer_text_key: "text"

  # training settings
  number_of_gpu_to_use_finetuning: 2
  num_train_epochs: 3
  learning_rate: 2e-5

outputs:
  # map the output of the fine tuning job to the output of the pipeline job so that we can easily register the fine tuned model
  # registering the model is required to deploy the model to an online or batch endpoint
  trained_model:
    type: mlflow_model

settings:
  force_rerun: true

jobs:
  question_answering_pipeline:
    type: pipeline
    component: azureml://registries/azureml/components/question_answering_pipeline/labels/latest
    inputs:
      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}

      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_preprocess: ${{parent.inputs.compute_preprocess}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}

      train_file_path: ${{parent.inputs.train_file_path}}
      validation_file_path: ${{parent.inputs.validation_file_path}}
      test_file_path: ${{parent.inputs.test_file_path}}
      evaluation_config: ${{parent.inputs.evaluation_config_path}}

      question_key: ${{parent.inputs.question_key}}
      context_key: ${{parent.inputs.context_key}}
      answers_key: ${{parent.inputs.answers_key}}
      answer_start_key: ${{parent.inputs.answer_start_key}}
      answer_text_key: ${{parent.inputs.answer_text_key}}

      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      learning_rate: ${{parent.inputs.learning_rate}}
    outputs:
      mlflow_model_folder: ${{parent.outputs.trained_model}}
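
To make the question_key/context_key/answers_key mapping above concrete, a single SQuAD-style record looks roughly like this (the text values are illustrative, not taken from the dataset):

# one SQuAD-style record; "answers" nests the two fields that
# answer_start_key and answer_text_key point at
record = {
    "question": "Who wrote Hamlet?",
    "context": "Hamlet is a tragedy written by William Shakespeare.",
    "answers": {
        "answer_start": [31],  # character offset of the answer within the context
        "text": ["William Shakespeare"],
    },
}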
@@ -0,0 +1,193 @@
set -x
# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-emotion-detection
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
registry_name="azureml"

compute_cluster="gpu-cluster-big"
# if the above compute cluster does not exist, create it with the following vm size
compute_sku="Standard_ND40rs_v2"
# This is the number of GPUs in a single node of the selected 'vm_size' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node=2
# This is the foundation model for finetuning
model_name="bert-base-uncased"
# pinning to a specific model version - using the latest version is not working yet
model_version=3

version=$(date +%s)
finetuned_model_name=$model_name"-extractive-qna"
endpoint_name="ext-qna-$version"
deployment_sku="Standard_DS3_v2"


# training data
train_data="squad-dataset/small_train.jsonl"
# validation data
validation_data="squad-dataset/small_validation.jsonl"
# test data
test_data="squad-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/question-answering/question-answering-config.json"
# scoring_file
scoring_file="squad-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="question_answering_pipeline"
# The following parameters map to the dataset fields
# the question whose answer needs to be extracted from the provided context 
# question_key parameter maps to the "question" field in the SQuAD dataset
question_key="question"
# the context that contains the answer to the question
# context_key parameter maps to the "context" field in the SQuAD dataset
context_key="context"
# The value of this field is text in json format with two nested keys, answer_start_key and answer_text_key with their corresponding values
# answers_key parameter maps to the "answers" field in the SQuAD dataset
answers_key="answers"
# Refers to the position where the answer begins in the context. Needs a value that maps to a nested key in the values of the answers_key parameter.
# in the SQuAD dataset, the answer_start_key maps to "answer_start" under "answers"
answer_start_key="answer_start"
# Contains the answer to the question. Needs a value that maps to a nested key in the values of the answers_key parameter
# in the SQuAD dataset, the answer_text_key maps to "text" under "answers"
answer_text_key="text"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
num_train_epochs=3
learning_rate=2e-5

# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
[ "$resource_group_name" = "<RESOURCE_GROUP>" ] || \
[ "$workspace_name" = "<WORKSPACE_NAME>" ]; then
echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
exit 1
fi

az account set -s $subscription_id
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster exists, else create it
if az ml compute show --name $compute_cluster $workspace_info
then
    echo "Compute cluster $compute_cluster already exists"
else
    echo "Creating compute cluster $compute_cluster"
    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster"
        exit 1
    }
fi

# download the dataset

python ./download-dataset.py || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
then
    echo "Model $model_name:$model_version does not exist in registry $registry_name"
    exit 1
fi

# 3. Check if training data, validation data and test data exist
if [ ! -f $train_data ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -f $validation_data ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi
if [ ! -f $test_data ]; then
    echo "Test data $test_data does not exist"
    exit 1
fi

# 4. Submit finetuning job using pipeline.yml

# check if the finetuning pipeline component exists
if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# need to switch to using the latest version for the model, currently blocked by a bug.
# submit finetuning job
parent_job_name=$( az ml job create --file ./extractive-qa-pipeline.yml $workspace_info --query name -o tsv --set \
  jobs.question_answering_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
  inputs.compute_model_import=$compute_cluster \
  inputs.compute_preprocess=$compute_cluster \
  inputs.compute_finetune=$compute_cluster \
  inputs.compute_model_evaluation=$compute_cluster \
  inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$model_name/versions/$model_version" \
  inputs.train_file_path.path=$train_data \
  inputs.validation_file_path.path=$validation_data \
  inputs.test_file_path.path=$test_data \
  inputs.evaluation_config.path=$evaluation_config \
  inputs.question_key=$question_key \
  inputs.context_key=$context_key \
  inputs.answers_key=$answers_key \
  inputs.answer_start_key=$answer_start_key \
  inputs.answer_text_key=$answer_text_key \
  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
  inputs.num_train_epochs=$num_train_epochs \
  inputs.learning_rate=$learning_rate ) || {
    echo "Failed to submit finetuning job"
    exit 1
}

az ml job stream --name $parent_job_name $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 5. Create model in workspace from train job output
az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
  --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 6. Deploy the model to an endpoint
# create online endpoint
az ml online-endpoint create --name $endpoint_name $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy model from registry to endpoint in workspace
# You can find the list of SKUs supported for deployment here: https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
  instance_type=$deployment_sku || {
    echo "deployment create failed"; exit 1;
}

# 7. Try a sample scoring request

# Check if scoring data file exists
if [ -f $scoring_file ]; then
    printf "Invoking endpoint %s with the following input:\n\n" "$endpoint_name"
    cat $scoring_file
    printf "\n\n"
else
    echo "Scoring file $scoring_file does not exist"
    exit 1
fi

az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 8. Delete the endpoint
az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}
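
For completeness, the CLI invoke in step 7 has a one-line SDK counterpart; a hedged sketch, assuming the azure-ai-ml package and the same placeholder workspace values as above:

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

# score the endpoint with the request file built by download-dataset.py;
# <ENDPOINT_NAME> stands in for the ext-qna-$version name from the script
response = ml_client.online_endpoints.invoke(
    endpoint_name="<ENDPOINT_NAME>",
    request_file="squad-dataset/sample_score.json",
)
print(response)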
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
@@ -81,7 +81,8 @@
 # rename the highlights column to ground_truth_summary
 test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
 # create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
-test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
+test_df_copy = test_df[['article']]
+test_json = {"input_data": test_df_copy.to_dict('split')}
 # save the json object to a file named sample_score.json in the ./emotion-dataset folder
 with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
     json.dump(test_json, f)
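
This payload change is the substance of the notebook update: the old format sent a bare list of strings, while the new one sends a dataframe in pandas 'split' orientation. A small sketch contrasting the two (the article text is illustrative):

import pandas as pd

test_df = pd.DataFrame({"article": ["Text of a news article..."]})

# old request format: a flat list of strings under "inputs"
old_json = {"inputs": {"input_string": test_df["article"].tolist()}}

# new request format: 'split' orientation under "input_data"
new_json = {"input_data": test_df[["article"]].to_dict("split")}

print(old_json)  # {'inputs': {'input_string': ['Text of a news article...']}}
print(new_json)  # {'input_data': {'index': [0], 'columns': ['article'],
                 #                 'data': [['Text of a news article...']]}}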