diff --git a/sdk/python/foundation-models/system/evaluation/fill-mask/README.md b/sdk/python/foundation-models/system/evaluation/fill-mask/README.md new file mode 100644 index 0000000000..5609ee1a3b --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/fill-mask/README.md @@ -0,0 +1,15 @@ +## Fill Mask + +### List of supported keyword arguments: + +| Keyword Argument | Description | Type | Sample | +|:-----------------:|:-----------------------------------------------------------------------------------------------------------------|-----------|-----------------------------| +| metrics | List for subset of metrics to be computed. All supported metrics listed below. | list | ["perplexities"] | +| model_id | Model used for calculating Perplexity. Perplexity can only be calculated for causal language models. | str | "gpt2", "bert-base-uncased" | +| batch_size | The batch size to run texts through the model | int | 16 | +| add_start_token | Boolean flag to add the start token to the texts so the perplexity can include the probability of the first word | boolean | true, false | +| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | | + +### List of supported metrics: + +* perplexities \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/fill-mask/eval-config.json b/sdk/python/foundation-models/system/evaluation/fill-mask/eval-config.json new file mode 100644 index 0000000000..3a74810200 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/fill-mask/eval-config.json @@ -0,0 +1,8 @@ +{ + "metrics": ["perplexities"], + "model_id": "gpt2", + "add_start_token": true, + "tokenizer_config": { + "truncation": true + } +} \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/fill-mask/fill-mask-eval-dashboard.png b/sdk/python/foundation-models/system/evaluation/fill-mask/fill-mask-eval-dashboard.png new file mode 100644 index 0000000000..3eeb20923f Binary files /dev/null and b/sdk/python/foundation-models/system/evaluation/fill-mask/fill-mask-eval-dashboard.png differ diff --git a/sdk/python/foundation-models/system/evaluation/fill-mask/fill-mask.ipynb b/sdk/python/foundation-models/system/evaluation/fill-mask/fill-mask.ipynb new file mode 100644 index 0000000000..2940a355d3 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/fill-mask/fill-mask.ipynb @@ -0,0 +1,519 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fill Mask Evaluation\n", + "\n", + "This sample shows how use the evaluate a group of models against a given set of metrics for the `fill-mask` task. \n", + "\n", + "### Evaluation dataset\n", + "Contains ~70k pages from wikipedia, each describing a person. For each page, the person described in the text is masked with a mask token. The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License. Compared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over 110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies. 
Reference [rcds/wikipedia-for-mask-filling](https://huggingface.co/datasets/rcds/wikipedia-for-mask-filling/viewer/original_512/train).\n", + "\n", + "### Model\n", + "The goal of evaluating models is to compare their performance on a variety of metrics. `fill-mask` is a generic task type that can be used to predict which words should replace the masked words in a sentence, based on the context provided. As such, the models you pick to compare must be finetuned for the same scenario. Given that we have the rcds/wikipedia-for-mask-filling dataset, we would like to look for models finetuned for this specific scenario. We will compare `bert-base-uncased`, `distilbert-base-uncased` and `microsoft-deberta-large` in this sample, which are available in the `azureml` system registry.\n", + "\n", + "If you'd like to evaluate models that are not in the system registry, you can import those models to your workspace or organization registry and then evaluate them using the approach outlined in this sample.\n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick the models to evaluate.\n", + "* Pick and explore evaluation data.\n", + "* Configure the evaluation jobs.\n", + "* Run the evaluation jobs.\n", + "* Review the evaluation metrics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace the subscription ID, resource group and workspace name placeholders below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running the cell below. This step is required when running in a new environment."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "%pip install --upgrade azure-ai-ml\n", + "%pip install --upgrade azure-identity\n", + "%pip install --upgrade datasets==2.9.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319346668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " workspace = workspace_ml_client.workspace_name\n", + " resource_group = workspace_ml_client.resource_group_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"\"\n", + " resource_group = \"\"\n", + " workspace = \"\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n", + "registry = \"azureml\"\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, registry_name=registry\n", + ")\n", + "registry_ml_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you already have a gpu cluster, mention it here. Else will create a new one with the name 'gpu-cluster-big'\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + " print(f\"GPU compute '{compute_cluster}' found.\")\n", + "except Exception as ex:\n", + " print(f\"GPU compute '{compute_cluster}' not found. Creating new one.\")\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below snippet will allow us to query number of GPU's present on the compute. We can use it to set `gpu_per_node` to ensure utilization of all GPUs in the node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpus_per_node = 1 # default value\n", + "gpu_count_found = False\n", + "ws_computes = workspace_ml_client.compute.list_sizes()\n", + "for ws_compute in ws_computes:\n", + " if ws_compute.name.lower() == compute.size.lower():\n", + " gpus_per_node = ws_compute.gpus\n", + " print(f\"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}\")\n", + "# if gpu_count_found not found, then print an error\n", + "if gpus_per_node > 0:\n", + " gpu_count_found = True\n", + "else:\n", + " gpu_count_found = False\n", + " print(f\"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick the models to evaluate\n", + "\n", + "Verify that the models selected for evaluation are available in system registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319354708 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n", + "models = [\n", + " {\"name\": \"bert-base-cased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"bert-base-uncased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"bert-large-cased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"bert-large-uncased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"camembert-base\", \"version\": \"4\", \"mask\": \"\"},\n", + " {\"name\": \"distilbert-base-cased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"distilbert-base-uncased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"distilroberta-base\", \"version\": \"4\", \"mask\": \"\"},\n", + " {\"name\": \"microsoft-deberta-base\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"microsoft-deberta-large\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"microsoft-deberta-xlarge\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n", + " {\"name\": \"roberta-base\", \"version\": \"4\", \"mask\": \"\"},\n", + " {\"name\": \"roberta-large\", \"version\": \"4\", \"mask\": \"\"},\n", + "]\n", + "for model in models:\n", + " model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n", + " print(model.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the test dataset for evaluation\n", + "The next few cells show basic data preparation:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n", + "* To use the entire dataset, uncomment the cells below and run." 
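Two test files are prepared below because the models listed above expect different mask tokens: BERT-style models use `[MASK]`, while RoBERTa- and CamemBERT-style models use `<mask>`. If you are unsure which token a particular model expects, one way to check is to read it from the model's tokenizer. This is only an illustrative sketch; it assumes the Hugging Face `transformers` package is installed locally and uses upstream Hugging Face checkpoint names rather than the registry assets used in this sample.

```python
# Illustrative sketch only (not part of this sample's pipeline): look up the mask token a model expects.
# Assumes the Hugging Face `transformers` package is installed; checkpoint names are examples.
from transformers import AutoTokenizer

for checkpoint in ["bert-base-uncased", "roberta-base"]:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # BERT-style tokenizers report "[MASK]"; RoBERTa/CamemBERT-style tokenizers report "<mask>"
    print(checkpoint, "->", tokenizer.mask_token)
```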
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "hf_test_data = load_dataset(\n", + " \"rcds/wikipedia-for-mask-filling\", \"original_512\", split=\"train\", streaming=True\n", + ")\n", + "\n", + "test_data_df = pd.DataFrame(hf_test_data.take(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df[\"input_string\"] = test_data_df[\"texts\"]\n", + "test_data_df[\"title\"] = test_data_df[\"masks\"].apply(\n", + " lambda x: x[0] if len(x) > 0 else \"\"\n", + ")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_mask_2_df = test_data_df\n", + "test_data_mask_1_df = pd.DataFrame(test_data_df)\n", + "test_data_mask_1_df[\"input_string\"] = test_data_mask_1_df[\"input_string\"].apply(\n", + " lambda x: x.replace(\"\", \"[MASK]\")\n", + ")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_mask_1_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_mask_2_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_data_mask_1 = \"./small-test-[MASK].jsonl\" # [MASK]\n", + "test_data_mask_2 = \"./small-test-mask.jsonl\" # " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_mask_1_df.to_json(test_data_mask_1, lines=True, orient=\"records\")\n", + "test_data_mask_2_df.to_json(test_data_mask_2, lines=True, orient=\"records\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.read_json(test_data_mask_1, lines=True).head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "pd.read_json(test_data_mask_2, lines=True).head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the evaluation jobs using the model and data as inputs\n", + "\n", + "Create the job that uses the `model_evaluation_pipeline` component. We will submit one job per model.\n", + "\n", + "Note that the metrics that the evaluation jobs need to calculate are specified in the [eval_config.json](./eval_config.json) file. We calculate `perplexities` in this sample.\n", + "\n", + "All supported evaluation configurations for `fill-mask` can be found in [README](./README.md)." 
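For intuition about what the `perplexities` metric configured in [eval-config.json](./eval-config.json) measures, a conceptually similar computation can be run locally with the Hugging Face `evaluate` library. This is only a rough sketch and is not what the pipeline component executes; it assumes the `evaluate` and `torch` packages are installed, which the cells in this notebook do not do.

```python
# Rough local sketch of the "perplexities" metric, for intuition only; the evaluation
# component has its own implementation driven by eval-config.json.
import evaluate

perplexity = evaluate.load("perplexity", module_type="metric")
results = perplexity.compute(
    model_id="gpt2",        # corresponds to the "model_id" key in eval-config.json
    add_start_token=True,   # corresponds to "add_start_token"
    predictions=["The quick brown fox jumps over the lazy dog."],
)
print(results["mean_perplexity"])
```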
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "import json\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"model_evaluation_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "with open(\"./eval-config.json\") as f:\n", + " evaluation_config_params = json.dumps(json.load(f))\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def evaluation_pipeline(test_data, mlflow_model):\n", + " evaluation_job = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry or a model from the workspace\n", + " # mlflow_model = Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{mlflow_model_path}\"),\n", + " mlflow_model=mlflow_model,\n", + " # test data\n", + " test_data=test_data,\n", + " # The following parameters map to the dataset fields\n", + " input_column_names=\"input_string\",\n", + " label_column_name=\"title\",\n", + " # Evaluation settings\n", + " task=\"fill-mask\",\n", + " # config file containing the details of evaluation metrics to calculate\n", + " # evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n", + " evaluation_config_params=evaluation_config_params,\n", + " # config cluster/device job is running on\n", + " # set device to GPU/CPU on basis if GPU count was found\n", + " device=\"gpu\" if gpu_count_found else \"cpu\",\n", + " )\n", + " return {\"evaluation_result\": evaluation_job.outputs.evaluation_result}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the jobs, passing the model as a parameter to the pipeline created in the above step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# submit the pipeline job for each model that we want to evaluate\n", + "# you could consider submitting the pipeline jobs in parallel, provided your cluster has multiple nodes\n", + "pipeline_jobs = []\n", + "\n", + "experiment_name = \"fill-mask-evaluation\"\n", + "\n", + "for model in models:\n", + " model_object = registry_ml_client.models.get(\n", + " model[\"name\"], version=model[\"version\"]\n", + " )\n", + " if model[\"mask\"] == \"[MASK]\":\n", + " test_data = Input(type=AssetTypes.URI_FILE, path=test_data_mask_1)\n", + " else:\n", + " test_data = Input(type=AssetTypes.URI_FILE, path=test_data_mask_2)\n", + " pipeline_object = evaluation_pipeline(\n", + " test_data=test_data,\n", + " mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n", + " )\n", + " # don't reuse cached results from previous jobs\n", + " pipeline_object.settings.force_rerun = True\n", + " pipeline_object.settings.default_compute = compute_cluster\n", + " pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + " )\n", + " # add model['name'] and pipeline_job.name as key value pairs to a dictionary\n", + " pipeline_jobs.append({\"model_name\": model[\"name\"], \"job_name\": pipeline_job.name})\n", + " # wait for the pipeline job to complete\n", + " workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. 
Review evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metics across different jobs. See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n", + "\n", + "![Model evaluation dashboard in AzureML studio](./fill-mask-eval-dashboard.png)\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "\n", + "metrics_df = pd.DataFrame()\n", + "for job in pipeline_jobs:\n", + " # concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + " filter = \"tags.mlflow.rootRunId='\" + job[\"job_name\"] + \"'\"\n", + " runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + " )\n", + " # get the compute_metrics runs.\n", + " # using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + " for run in runs:\n", + " # else, check if run.data.metrics.accuracy exists\n", + " if \"exact_match\" in run.data.metrics:\n", + " # get the metrics from the mlflow run\n", + " run_metric = run.data.metrics\n", + " # add the model name to the run_metric dictionary\n", + " run_metric[\"model_name\"] = job[\"model_name\"]\n", + " # convert the run_metric dictionary to a pandas dataframe\n", + " temp_df = pd.DataFrame(run_metric, index=[0])\n", + " # concat the temp_df to the metrics_df\n", + " metrics_df = pd.concat([metrics_df, temp_df], ignore_index=True)\n", + "\n", + "# move the model_name columns to the first column\n", + "cols = metrics_df.columns.tolist()\n", + "cols = cols[-1:] + cols[:-1]\n", + "metrics_df = metrics_df[cols]\n", + "metrics_df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/sdk/python/foundation-models/system/evaluation/question-answering/README.md b/sdk/python/foundation-models/system/evaluation/question-answering/README.md new file mode 100644 index 0000000000..e6020e1d50 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/question-answering/README.md @@ -0,0 +1,20 @@ +## Question Answering + +### List of supported keyword arguments: + +| Keyword Argument | Description | Type | Sample | +|:------------------:|:-------------------------------------------------------------------------------|-----------|-----------------------------| +| metrics | List for subset of metrics to be computed. All supported metrics listed below. 
| list | ["exact_match", "f1_score"] | +| tokenizer | Tokenizer object to perform tokenization on provided input text | boolean | false, true | +| regexes_to_ignore | List of regex to ignore in our input data points | list | ["$[A-Z]+"] | +| ignore_case | Boolean flag to indicate whether we need to ignore case | boolean | false, true | +| ignore_punctuation | Boolean flag to indicate whether we need to ignore punctuation | boolean | false, true | +| ignore_numbers | Boolean flag to indicate whether we need to ignore numbers | boolean | false, true | +| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | | + +### List of supported metrics: + +* rouge1 +* rouge2 +* rougeLsum +* rougeL \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/question-answering/eval-config.json b/sdk/python/foundation-models/system/evaluation/question-answering/eval-config.json new file mode 100644 index 0000000000..15165acfe5 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/question-answering/eval-config.json @@ -0,0 +1,7 @@ +{ + "metrics": ["exact_match", "f1_score"], + "regexes_to_ignore": ["$[A-Z]+"], + "ignore_case": false, + "ignore_numbers": false, + "ignore_punctuations": true +} \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/question-answering/question-answering-eval-dashboard.png b/sdk/python/foundation-models/system/evaluation/question-answering/question-answering-eval-dashboard.png new file mode 100644 index 0000000000..dc76733da1 Binary files /dev/null and b/sdk/python/foundation-models/system/evaluation/question-answering/question-answering-eval-dashboard.png differ diff --git a/sdk/python/foundation-models/system/evaluation/question-answering/question-answering.ipynb b/sdk/python/foundation-models/system/evaluation/question-answering/question-answering.ipynb new file mode 100644 index 0000000000..f17db2fe80 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/question-answering/question-answering.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question Answering Evaluation\n", + "\n", + "This sample shows how use the evaluate a group of models against a given set of metrics for the `question-answering` task. \n", + "\n", + "### Evaluation dataset\n", + "The version 2 of Stanford Question Answering Dataset (SQuAD), SQuAD 2.0, combines the 100,000 questions in SQuAD 1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering. Reference [SQuAD_v2](https://huggingface.co/datasets/squad_v2).\n", + "\n", + "### Model\n", + "The goal of evaluating models is to compare their performance on a variety of metrics. `question-answering` is generic task type that can be used for scenarios to answer questions based on context provided. As such, the models you pick to compare must be finetuned for same scenario. Given that we have the SQuAD_v2 dataset, we would like to look for models finetuned for this specific scenario. 
We will compare `distilbert-base-uncased-distilled-squad`, `deepset-roberta-base-squad2` and `deepset-minilm-uncased-squad2` in this sample, which are available in the `azureml` system registry.\n", + "\n", + "If you'd like to evaluate models that are not in the system registry, you can import those models to your workspace or organization registry and then evaluate them using the approach outlined in this sample.\n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick the models to evaluate.\n", + "* Pick and explore evaluate data.\n", + "* Configure the evaluation jobs.\n", + "* Run the evaluation jobs.\n", + "* Review the evaluation metrics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace ``, `` and `` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running below cell. This is not an optional step if running in a new environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "%pip install --upgrade azure-ai-ml\n", + "%pip install --upgrade azure-identity\n", + "%pip install --upgrade datasets==2.9.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319346668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " workspace = workspace_ml_client.workspace_name\n", + " resource_group = workspace_ml_client.resource_group_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"\"\n", + " resource_group = \"\"\n", + " workspace = \"\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n", + "registry = \"azureml\"\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, registry_name=registry\n", + ")\n", + "registry_ml_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you already have a gpu cluster, mention it here. Else will create a new one with the name 'gpu-cluster-big'\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + " print(f\"GPU compute '{compute_cluster}' found.\")\n", + "except Exception as ex:\n", + " print(f\"GPU compute '{compute_cluster}' not found. Creating new one.\")\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below snippet will allow us to query number of GPU's present on the compute. We can use it to set `gpu_per_node` to ensure utilization of all GPUs in the node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpus_per_node = 1 # default value\n", + "gpu_count_found = False\n", + "ws_computes = workspace_ml_client.compute.list_sizes()\n", + "for ws_compute in ws_computes:\n", + " if ws_compute.name.lower() == compute.size.lower():\n", + " gpus_per_node = ws_compute.gpus\n", + " print(f\"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}\")\n", + "# if gpu_count_found not found, then print an error\n", + "if gpus_per_node > 0:\n", + " gpu_count_found = True\n", + "else:\n", + " gpu_count_found = False\n", + " print(f\"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick the models to evaluate\n", + "\n", + "Verify that the models selected for evaluation are available in system registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319354708 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n", + "models = [\n", + " {\"name\": \"deepset-minilm-uncased-squad2\", \"version\": \"4\"},\n", + " {\"name\": \"deepset-roberta-base-squad2\", \"version\": \"4\"},\n", + " {\"name\": \"distilbert-base-cased-distilled-squad\", \"version\": \"4\"},\n", + " {\"name\": \"distilbert-base-uncased-distilled-squad\", \"version\": \"4\"},\n", + "]\n", + "for model in models:\n", + " model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n", + " print(model.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the test dataset for evaluation\n", + "The next few cells show basic data preparation:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n", + "* To use the entire dataset, uncomment the cells below and run." 
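For reference, after the preparation cells below each JSONL record is expected to carry the squad_v2 fields plus the flattened `answer_text` column; the evaluation job later reads `context` and `question` as inputs and `answer_text` as the label. An illustrative, made-up record might look like this:

```python
# Illustrative, made-up record; real rows come from squad_v2 and also carry
# columns such as "id", "title" and "answers".
example_record = {
    "context": "The Amazon rainforest covers much of the Amazon basin of South America.",
    "question": "Which continent is the Amazon rainforest located on?",
    "answer_text": "South America",
}
```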
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "hf_test_data = load_dataset(\"squad_v2\", split=\"validation\", streaming=True)\n", + "\n", + "test_data_df = pd.DataFrame(hf_test_data.take(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df[\"answer_text\"] = test_data_df[\"answers\"].apply(\n", + " lambda x: x[\"text\"][0] if len(x[\"text\"]) > 0 else \"\"\n", + ")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_data = \"./small-test.jsonl\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.to_json(test_data, lines=True, orient=\"records\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.read_json(test_data, lines=True).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the evaluation jobs using the model and data as inputs\n", + "\n", + "Create the job that uses the `model_evaluation_pipeline` component. We will submit one job per model.\n", + "\n", + "Note that the metrics that the evaluation jobs need to calculate are specified in the [eval_config.json](./eval_config.json) file. We calculate `exact_match` and `f1_score` in this sample.\n", + "\n", + "All supported evaluation configurations for `question-answering` can be found in [README](./README.md)." 
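For intuition, `exact_match` and token-level `f1_score` roughly correspond to the computation below. This is only a sketch; the evaluation component applies its own answer normalization, controlled by options such as `ignore_case` and `regexes_to_ignore` in the config.

```python
# Sketch of what exact_match and token-level f1_score measure, for intuition only.
# The evaluation component applies its own normalization rules.
from collections import Counter


def exact_match(prediction: str, reference: str) -> float:
    return float(prediction.strip().lower() == reference.strip().lower())


def f1_score(prediction: str, reference: str) -> float:
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


print(exact_match("South America", "south america"))   # 1.0
print(f1_score("the South America", "South America"))  # 0.8
```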
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"model_evaluation_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def evaluation_pipeline(mlflow_model):\n", + " evaluation_job = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry or a model from the workspace\n", + " # mlflow_model = Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{mlflow_model_path}\"),\n", + " mlflow_model=mlflow_model,\n", + " # test data\n", + " test_data=Input(type=AssetTypes.URI_FILE, path=test_data),\n", + " # The following parameters map to the dataset fields\n", + " input_column_names=\"context,question\",\n", + " label_column_name=\"answer_text\",\n", + " # Evaluation settings\n", + " task=\"question-answering\",\n", + " # config file containing the details of evaluation metrics to calculate\n", + " evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n", + " # config cluster/device job is running on\n", + " # set device to GPU/CPU on basis if GPU count was found\n", + " device=\"gpu\" if gpu_count_found else \"cpu\",\n", + " )\n", + " return {\"evaluation_result\": evaluation_job.outputs.evaluation_result}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the jobs, passing the model as a parameter to the pipeline created in the above step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# submit the pipeline job for each model that we want to evaluate\n", + "# you could consider submitting the pipeline jobs in parallel, provided your cluster has multiple nodes\n", + "pipeline_jobs = []\n", + "\n", + "experiment_name = \"question-answering-evaluation\"\n", + "\n", + "for model in models:\n", + " model_object = registry_ml_client.models.get(\n", + " model[\"name\"], version=model[\"version\"]\n", + " )\n", + " pipeline_object = evaluation_pipeline(\n", + " mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n", + " )\n", + " # don't reuse cached results from previous jobs\n", + " pipeline_object.settings.force_rerun = True\n", + " pipeline_object.settings.default_compute = compute_cluster\n", + " pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n", + " pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + " )\n", + " # add model['name'] and pipeline_job.name as key value pairs to a dictionary\n", + " pipeline_jobs.append({\"model_name\": model[\"name\"], \"job_name\": pipeline_job.name})\n", + " # wait for the pipeline job to complete\n", + " workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metics across different jobs. 
See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n", + "\n", + "![Model evaluation dashboard in AzureML studio](./question-answering-eval-dashboard.png)\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "\n", + "metrics_df = pd.DataFrame()\n", + "for job in pipeline_jobs:\n", + " # concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + " filter = \"tags.mlflow.rootRunId='\" + job[\"job_name\"] + \"'\"\n", + " runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + " )\n", + " # get the compute_metrics runs.\n", + " # using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + " for run in runs:\n", + " # else, check if run.data.metrics.accuracy exists\n", + " if \"exact_match\" in run.data.metrics:\n", + " # get the metrics from the mlflow run\n", + " run_metric = run.data.metrics\n", + " # add the model name to the run_metric dictionary\n", + " run_metric[\"model_name\"] = job[\"model_name\"]\n", + " # convert the run_metric dictionary to a pandas dataframe\n", + " temp_df = pd.DataFrame(run_metric, index=[0])\n", + " # concat the temp_df to the metrics_df\n", + " metrics_df = pd.concat([metrics_df, temp_df], ignore_index=True)\n", + "\n", + "# move the model_name columns to the first column\n", + "cols = metrics_df.columns.tolist()\n", + "cols = cols[-1:] + cols[:-1]\n", + "metrics_df = metrics_df[cols]\n", + "metrics_df.head()" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/foundation-models/system/evaluation/summarization/README.md b/sdk/python/foundation-models/system/evaluation/summarization/README.md new file mode 100644 index 0000000000..4973b2a713 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/summarization/README.md @@ -0,0 +1,17 @@ +## Summarization + +### List of supported keyword arguments: + +| Keyword Argument | Description | Type | Sample | +|:-----------------:|:--------------------------------------------------------------------------------------|-----------|---------------------------------------------| +| metrics | List for subset of metrics to be computed. All supported metrics listed below. 
| list | ["rouge1", "rouge2", "rougeL", "rougeLsum"] | +| aggregator | Boolean flag to indicate if need to aggregate rouge scores for individual data points | boolean | true, false | +| stemmer | Boolean flag to indicate whether to use Porter Stemmer for suffixes | boolean | true, false | +| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | | + +### List of supported metrics: + +* rouge1 +* rouge2 +* rougeLsum +* rougeL \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/summarization/abstractive-and-extractive-summarization.ipynb b/sdk/python/foundation-models/system/evaluation/summarization/abstractive-and-extractive-summarization.ipynb new file mode 100644 index 0000000000..a2df81d607 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/summarization/abstractive-and-extractive-summarization.ipynb @@ -0,0 +1,473 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Summarization Evaluation - Abstractive and Extractive Summarization\n", + "\n", + "This sample shows how use the evaluate a group of models against a given set of metrics for the `text-summarization` task. \n", + "\n", + "### Evaluation dataset\n", + "The CNN / DailyMail Dataset is an English-language dataset containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail. The current version supports both extractive and abstractive summarization, though the original version was created for machine reading and comprehension and abstractive question answering. Reference [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail).\n", + "\n", + "### Model\n", + "The goal of evaluating models is to compare their performance on a variety of metrics. `text-summarization` is generic task type that can be used for scenarios such as abstractive and extractive summarization. As such, the models you pick to compare must be finetuned for same scenario. Given that we have the CNN_Dailymail dataset, we would like to look for models finetuned for this specific scenario. We will compare `sshleifer-distilbart-cnn-12-6`, and `facebook-bart-large-cnn` in this sample, which are available in the `azureml` system registry.\n", + "\n", + "If you'd like to evaluate models that are not in the system registry, you can import those models to your workspace or organization registry and then evaluate them using the approach outlined in this sample.\n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick the models to evaluate.\n", + "* Pick and explore evaluate data.\n", + "* Configure the evaluation jobs.\n", + "* Run the evaluation jobs.\n", + "* Review the evaluation metrics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace ``, `` and `` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. 
The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running below cell. This is not an optional step if running in a new environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "%pip install --upgrade azure-ai-ml\n", + "%pip install --upgrade azure-identity\n", + "%pip install --upgrade datasets==2.9.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319346668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " workspace = workspace_ml_client.workspace_name\n", + " resource_group = workspace_ml_client.resource_group_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"\"\n", + " resource_group = \"\"\n", + " workspace = \"\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n", + "registry = \"azureml\"\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, registry_name=registry\n", + ")\n", + "registry_ml_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you already have a gpu cluster, mention it here. Else will create a new one with the name 'gpu-cluster-big'\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + " print(f\"GPU compute '{compute_cluster}' found.\")\n", + "except Exception as ex:\n", + " print(f\"GPU compute '{compute_cluster}' not found. Creating new one.\")\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below snippet will allow us to query number of GPU's present on the compute. 
We can use it to set `gpu_per_node` to ensure utilization of all GPUs in the node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpus_per_node = 1 # default value\n", + "gpu_count_found = False\n", + "ws_computes = workspace_ml_client.compute.list_sizes()\n", + "for ws_compute in ws_computes:\n", + " if ws_compute.name.lower() == compute.size.lower():\n", + " gpus_per_node = ws_compute.gpus\n", + " print(f\"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}\")\n", + "# if gpu_count_found not found, then print an error\n", + "if gpus_per_node > 0:\n", + " gpu_count_found = True\n", + "else:\n", + " gpu_count_found = False\n", + " print(f\"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick the models to evaluate\n", + "\n", + "Verify that the models selected for evaluation are available in system registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319354708 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n", + "models = [\n", + " {\"name\": \"facebook-bart-large-cnn\", \"version\": \"4\"},\n", + " {\"name\": \"sshleifer-distilbart-cnn-12-6\", \"version\": \"4\"},\n", + "]\n", + "for model in models:\n", + " model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n", + " print(model.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the test dataset for evaluation\n", + "The next few cells show basic data preparation:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n", + "* To use the entire dataset, uncomment the cells below and run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "hf_test_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"test\", streaming=True)\n", + "\n", + "test_data_df = pd.DataFrame(hf_test_data.take(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df[\"input_string\"] = test_data_df[\"article\"]\n", + "test_data_df[\"summary\"] = test_data_df[\"highlights\"]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_data = \"./small-test.jsonl\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.to_json(test_data, lines=True, orient=\"records\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.read_json(test_data, lines=True).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the evaluation jobs using the model and data as inputs\n", + "\n", + "Create the job that uses the `model_evaluation_pipeline` component. We will submit one job per model.\n", + "\n", + "Note that the metrics that the evaluation jobs need to calculate are specified in the [eval_config.json](./eval_config.json) file. We calculate `rouge1`, `rouge2`, `rougeL` and `rougeLsum` in this sample.\n", + "\n", + "All supported evaluation configurations for `text-summarization` can be found in [README](./README.md)." 
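The `rouge1`, `rouge2`, `rougeL` and `rougeLsum` metrics, together with the `aggregator` and `stemmer` flags in [eval-config.json](./eval-config.json), mirror the Hugging Face `evaluate` ROUGE implementation. The sketch below is for intuition only and assumes the `evaluate` and `rouge_score` packages are installed locally; the pipeline component computes these metrics itself.

```python
# Quick local sketch of the configured ROUGE metrics, for intuition only; the
# evaluation component computes them as part of the pipeline job.
import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["The cat sat on the mat."],
    references=["A cat was sitting on the mat."],
    use_stemmer=True,     # corresponds to the "stemmer" flag in eval-config.json
    use_aggregator=True,  # corresponds to the "aggregator" flag
)
print(scores)  # keys: rouge1, rouge2, rougeL, rougeLsum
```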
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "import json\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"model_evaluation_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "with open(\"./eval-config.json\") as f:\n", + " evaluation_config_params = json.dumps(json.load(f))\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def evaluation_pipeline(mlflow_model):\n", + " evaluation_job = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry or a model from the workspace\n", + " # mlflow_model = Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{mlflow_model_path}\"),\n", + " mlflow_model=mlflow_model,\n", + " # test data\n", + " test_data=Input(type=AssetTypes.URI_FILE, path=test_data),\n", + " # The following parameters map to the dataset fields\n", + " input_column_names=\"input_string\",\n", + " label_column_name=\"summary\",\n", + " # Evaluation settings\n", + " task=\"text-summarization\",\n", + " # config file containing the details of evaluation metrics to calculate\n", + " # evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n", + " evaluation_config_params=evaluation_config_params,\n", + " # config cluster/device job is running on\n", + " # set device to GPU/CPU on basis if GPU count was found\n", + " device=\"gpu\" if gpu_count_found else \"cpu\",\n", + " )\n", + " return {\"evaluation_result\": evaluation_job.outputs.evaluation_result}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the jobs, passing the model as a parameter to the pipeline created in the above step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# submit the pipeline job for each model that we want to evaluate\n", + "# you could consider submitting the pipeline jobs in parallel, provided your cluster has multiple nodes\n", + "pipeline_jobs = []\n", + "\n", + "experiment_name = \"summarization-evaluation\"\n", + "\n", + "for model in models:\n", + " model_object = registry_ml_client.models.get(\n", + " model[\"name\"], version=model[\"version\"]\n", + " )\n", + " pipeline_object = evaluation_pipeline(\n", + " mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n", + " )\n", + " # don't reuse cached results from previous jobs\n", + " pipeline_object.settings.force_rerun = True\n", + " pipeline_object.settings.default_compute = compute_cluster\n", + " pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n", + " pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + " )\n", + " # add model['name'] and pipeline_job.name as key value pairs to a dictionary\n", + " pipeline_jobs.append({\"model_name\": model[\"name\"], \"job_name\": pipeline_job.name})\n", + " # wait for the pipeline job to complete\n", + " workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metics across different jobs. 
See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n", + "\n", + "![Model evaluation dashboard in AzureML studio](./text-summarization-eval-dashboard.png)\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "\n", + "metrics_df = pd.DataFrame()\n", + "for job in pipeline_jobs:\n", + " # concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + " filter = \"tags.mlflow.rootRunId='\" + job[\"job_name\"] + \"'\"\n", + " runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + " )\n", + " # get the compute_metrics runs.\n", + " # using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + " for run in runs:\n", + " # else, check if run.data.metrics.accuracy exists\n", + " if \"rouge1\" in run.data.metrics:\n", + " # get the metrics from the mlflow run\n", + " run_metric = run.data.metrics\n", + " # add the model name to the run_metric dictionary\n", + " run_metric[\"model_name\"] = job[\"model_name\"]\n", + " # convert the run_metric dictionary to a pandas dataframe\n", + " temp_df = pd.DataFrame(run_metric, index=[0])\n", + " # concat the temp_df to the metrics_df\n", + " metrics_df = pd.concat([metrics_df, temp_df], ignore_index=True)\n", + "\n", + "# move the model_name columns to the first column\n", + "cols = metrics_df.columns.tolist()\n", + "cols = cols[-1:] + cols[:-1]\n", + "metrics_df = metrics_df[cols]\n", + "metrics_df.head()" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/foundation-models/system/evaluation/summarization/eval-config.json b/sdk/python/foundation-models/system/evaluation/summarization/eval-config.json new file mode 100644 index 0000000000..c389143f96 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/summarization/eval-config.json @@ -0,0 +1,8 @@ +{ + "metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"], + "aggregator": true, + "stemmer": true, + "tokenizer_config": { + "truncation": true + } +} diff --git a/sdk/python/foundation-models/system/evaluation/summarization/text-summarization-eval-dashboard.png b/sdk/python/foundation-models/system/evaluation/summarization/text-summarization-eval-dashboard.png new file mode 100644 index 
0000000000..ff381d293d Binary files /dev/null and b/sdk/python/foundation-models/system/evaluation/summarization/text-summarization-eval-dashboard.png differ diff --git a/sdk/python/foundation-models/system/evaluation/text-generation/README.md b/sdk/python/foundation-models/system/evaluation/text-generation/README.md new file mode 100644 index 0000000000..2f67947277 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/text-generation/README.md @@ -0,0 +1,23 @@ +## Text Generation + +### List of supported keyword arguments: + +| Keyword Argument | Description | Type | Sample | +|:-----------------:|:--------------------------------------------------------------------------------------|-----------|------------------------------------------| +| metrics | List for subset of metrics to be computed. All supported metrics listed below. | list | ["bleu_1", "bleu_2", "rouge1", "rouge2"] | +| tokenizer | Tokenizer object to perform tokenization on provided input text | | | +| smoothing | Boolean flag to indicate if bleu score needs to be smoothened | boolean | false, true | +| aggregator | Boolean flag to indicate if need to aggregate rouge scores for individual data points | boolean | true, false | +| stemmer | Boolean flag to indicate whether to use Porter Stemmer for suffixes | boolean | true, false | +| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | | + +### List of supported metrics: + +* rouge1 +* rouge2 +* rougeLsum +* rougeL +* bleu_1 +* bleu_2 +* bleu_3 +* bleu_4 \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/text-generation/eval-config.json b/sdk/python/foundation-models/system/evaluation/text-generation/eval-config.json new file mode 100644 index 0000000000..8dd4358113 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/text-generation/eval-config.json @@ -0,0 +1,6 @@ +{ + "metrics": ["rouge1", "rouge2", "bleu_3", "bleu_4"], + "aggregator": true, + "stemmer": true, + "smoothing": false +} diff --git a/sdk/python/foundation-models/system/evaluation/text-generation/text-generation-eval-dashboard.png b/sdk/python/foundation-models/system/evaluation/text-generation/text-generation-eval-dashboard.png new file mode 100644 index 0000000000..e9f076781f Binary files /dev/null and b/sdk/python/foundation-models/system/evaluation/text-generation/text-generation-eval-dashboard.png differ diff --git a/sdk/python/foundation-models/system/evaluation/text-generation/text-generation.ipynb b/sdk/python/foundation-models/system/evaluation/text-generation/text-generation.ipynb new file mode 100644 index 0000000000..40ba89ac87 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/text-generation/text-generation.ipynb @@ -0,0 +1,458 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Generation Evaluation\n", + "\n", + "This sample shows how use the evaluate a group of models against a given set of metrics for the `text-generation` task.\n", + "\n", + "### Evaluation dataset\n", + "The CNN / DailyMail Dataset is an English-language dataset containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail. The current version supports both extractive and abstractive summarization, though the original version was created for machine reading and comprehension and abstractive question answering. 
Reference [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail).\n", + "\n", + "### Model\n", + "The goal of evaluating models is to compare their performance on a variety of metrics. `text-generation` is a generic task type that can be used to generate text based on the context provided. As such, the models you pick to compare must be finetuned for the same scenario. Given that we have the cnn_dailymail dataset, we would like to look for models finetuned for this specific scenario. We will compare `distilgpt2`, `gpt2`, `gpt2-medium`, `gpt2-large` and `databricks-dolly-v2-12b` in this sample, which are available in the `azureml` system registry.\n", + "\n", + "If you'd like to evaluate models that are not in the system registry, you can import those models to your workspace or organization registry and then evaluate them using the approach outlined in this sample.\n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick the models to evaluate.\n", + "* Pick and explore evaluation data.\n", + "* Configure the evaluation jobs.\n", + "* Run the evaluation jobs.\n", + "* Review the evaluation metrics." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace ``, `` and `` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running the cell below. This is not an optional step if running in a new environment."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "%pip install --upgrade azure-ai-ml\n", + "%pip install --upgrade azure-identity\n", + "%pip install --upgrade datasets==2.9.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319346668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " workspace = workspace_ml_client.workspace_name\n", + " resource_group = workspace_ml_client.resource_group_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"\"\n", + " resource_group = \"\"\n", + " workspace = \"\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n", + "registry = \"azureml\"\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, registry_name=registry\n", + ")\n", + "registry_ml_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you already have a gpu cluster, mention it here. Else will create a new one with the name 'gpu-cluster-big'\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + " print(f\"GPU compute '{compute_cluster}' found.\")\n", + "except Exception as ex:\n", + " print(f\"GPU compute '{compute_cluster}' not found. Creating new one.\")\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below snippet will allow us to query number of GPU's present on the compute. We can use it to set `gpu_per_node` to ensure utilization of all GPUs in the node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpus_per_node = 1 # default value\n", + "gpu_count_found = False\n", + "ws_computes = workspace_ml_client.compute.list_sizes()\n", + "for ws_compute in ws_computes:\n", + " if ws_compute.name.lower() == compute.size.lower():\n", + " gpus_per_node = ws_compute.gpus\n", + " print(f\"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}\")\n", + "# if gpu_count_found not found, then print an error\n", + "if gpus_per_node > 0:\n", + " gpu_count_found = True\n", + "else:\n", + " gpu_count_found = False\n", + " print(f\"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick the models to evaluate\n", + "\n", + "Verify that the models selected for evaluation are available in system registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319354708 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n", + "models = [\n", + " {\"name\": \"distilgpt2\", \"version\": \"4\"},\n", + " {\"name\": \"gpt2\", \"version\": \"4\"},\n", + " {\"name\": \"gpt2-large\", \"version\": \"4\"},\n", + " {\"name\": \"gpt2-medium\", \"version\": \"4\"},\n", + " {\"name\": \"databricks-dolly-v2-12b\", \"version\": \"1\"},\n", + "]\n", + "for model in models:\n", + " model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n", + " print(model.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the test dataset for evaluation\n", + "The next few cells show basic data preparation:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n", + "* To use the entire dataset, uncomment the cells below and run." 
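+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optional sketch (an addition to this sample): the next cell streams only the first 1,000 test records to keep the run fast. If you would rather evaluate on the entire `cnn_dailymail` test split, and your compute has enough memory, you could load it without streaming as sketched below instead of running the streaming cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# Optional (sketch): uncomment to use the full test split instead of the 1,000-row streamed sample.\n", + "# If you do, skip or adapt the streaming cell below; the full split also increases evaluation runtime.\n", + "# full_test_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"test\")\n", + "# test_data_df = full_test_data.to_pandas()"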
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "hf_test_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"test\", streaming=True)\n", + "\n", + "test_data_df = pd.DataFrame(hf_test_data.take(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df[\"input_string\"] = test_data_df[\"article\"].apply(lambda x: x[:100])\n", + "test_data_df[\"ground_truth\"] = test_data_df[\"article\"]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_data = \"./small-test.jsonl\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.to_json(test_data, lines=True, orient=\"records\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.read_json(test_data, lines=True).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the evaluation jobs using the model and data as inputs\n", + "\n", + "Create the job that uses the `model_evaluation_pipeline` component. We will submit one job per model.\n", + "\n", + "Note that the metrics that the evaluation jobs need to calculate are specified in the [eval_config.json](./eval_config.json) file. We calculate `rouge1`, `rouge2`, `bleu_3` and `bleu_4` in this sample.\n", + "\n", + "All supported evaluation configurations for `text-generation` can be found in [README](./README.md)." 
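+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optional pre-submission check (an addition to this sample): the cell below re-reads the JSONL file written above and verifies it contains the `input_string` and `ground_truth` columns that are wired into `input_column_names` and `label_column_name` in the pipeline definition that follows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: confirm the serialized test data has the columns the evaluation pipeline expects.\n", + "check_df = pd.read_json(test_data, lines=True)\n", + "expected_columns = {\"input_string\", \"ground_truth\"}\n", + "missing_columns = expected_columns - set(check_df.columns)\n", + "assert not missing_columns, f\"Missing expected columns: {missing_columns}\"\n", + "print(f\"Found all expected columns; {len(check_df)} rows ready for evaluation.\")"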
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"model_evaluation_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def evaluation_pipeline(mlflow_model):\n", + " evaluation_job = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry or a model from the workspace\n", + " # mlflow_model = Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{mlflow_model_path}\"),\n", + " mlflow_model=mlflow_model,\n", + " # test data\n", + " test_data=Input(type=AssetTypes.URI_FILE, path=test_data),\n", + " # The following parameters map to the dataset fields\n", + " input_column_names=\"input_string\",\n", + " label_column_name=\"ground_truth\",\n", + " # Evaluation settings\n", + " task=\"text-generation\",\n", + " # config file containing the details of evaluation metrics to calculate\n", + " evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n", + " # config cluster/device job is running on\n", + " # set device to GPU/CPU on basis if GPU count was found\n", + " device=\"gpu\" if gpu_count_found else \"cpu\",\n", + " )\n", + " return {\"evaluation_result\": evaluation_job.outputs.evaluation_result}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the jobs, passing the model as a parameter to the pipeline created in the above step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# submit the pipeline job for each model that we want to evaluate\n", + "# you could consider submitting the pipeline jobs in parallel, provided your cluster has multiple nodes\n", + "pipeline_jobs = []\n", + "\n", + "experiment_name = \"text-generation-evaluation\"\n", + "\n", + "for model in models:\n", + " model_object = registry_ml_client.models.get(\n", + " model[\"name\"], version=model[\"version\"]\n", + " )\n", + " pipeline_object = evaluation_pipeline(\n", + " mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n", + " )\n", + " # don't reuse cached results from previous jobs\n", + " pipeline_object.settings.force_rerun = True\n", + " pipeline_object.settings.default_compute = compute_cluster\n", + " pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + " )\n", + " # add model['name'] and pipeline_job.name as key value pairs to a dictionary\n", + " pipeline_jobs.append({\"model_name\": model[\"name\"], \"job_name\": pipeline_job.name})\n", + " # wait for the pipeline job to complete\n", + " workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metics across different jobs. 
See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n", + "\n", + "![Model evaluation dashboard in AzureML studio](./text-generation-eval-dashboard.png)\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "\n", + "metrics_df = pd.DataFrame()\n", + "for job in pipeline_jobs:\n", + " # concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + " filter = \"tags.mlflow.rootRunId='\" + job[\"job_name\"] + \"'\"\n", + " runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + " )\n", + " # get the compute_metrics runs.\n", + " # using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + " for run in runs:\n", + " # else, check if run.data.metrics.accuracy exists\n", + " if \"exact_match\" in run.data.metrics:\n", + " # get the metrics from the mlflow run\n", + " run_metric = run.data.metrics\n", + " # add the model name to the run_metric dictionary\n", + " run_metric[\"model_name\"] = job[\"model_name\"]\n", + " # convert the run_metric dictionary to a pandas dataframe\n", + " temp_df = pd.DataFrame(run_metric, index=[0])\n", + " # concat the temp_df to the metrics_df\n", + " metrics_df = pd.concat([metrics_df, temp_df], ignore_index=True)\n", + "\n", + "# move the model_name columns to the first column\n", + "cols = metrics_df.columns.tolist()\n", + "cols = cols[-1:] + cols[:-1]\n", + "metrics_df = metrics_df[cols]\n", + "metrics_df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/sdk/python/foundation-models/system/evaluation/token-classification/README.md b/sdk/python/foundation-models/system/evaluation/token-classification/README.md new file mode 100644 index 0000000000..6f82df11eb --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/token-classification/README.md @@ -0,0 +1,22 @@ +## Named Entity Recognition + +### List of supported keyword arguments: + +| Keyword Argument | Description | Type | Sample | +|:------------------------:|:-------------------------------------------------------------------------------|-----------|---------------------------------------------------------------| +| metrics | List for subset of metrics to be computed. All supported metrics listed below. 
| list | ["accuracy", "f1_score_macro", "f1_score_micro"] | +| labels_list | List for supported labels for tokens | list | ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "B-MISC", "I-MISC"] | +| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | | + +### List of supported metrics: + +* f1_score_macro +* precision_score_weighted +* precision_score_macro +* f1_score_weighted +* precision_score_micro +* recall_score_weighted +* f1_score_micro +* accuracy +* recall_score_micro +* recall_score_macro \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/token-classification/eval-config.json b/sdk/python/foundation-models/system/evaluation/token-classification/eval-config.json new file mode 100644 index 0000000000..360dd40365 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/token-classification/eval-config.json @@ -0,0 +1,3 @@ +{ + "metrics": ["accuracy", "f1_score_macro", "f1_score_micro"] +} \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/token-classification/news-articles-entity-recognition.ipynb b/sdk/python/foundation-models/system/evaluation/token-classification/news-articles-entity-recognition.ipynb new file mode 100644 index 0000000000..23e6c78d4a --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/token-classification/news-articles-entity-recognition.ipynb @@ -0,0 +1,503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Named Entity Recognition Evaluation\n", + "\n", + "This sample shows how use the evaluate a group of models against a given set of metrics for the `text-named-entity-recognition` task. \n", + "\n", + "### Evaluation dataset\n", + "The CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on a separate line and there is an empty line after each sentence. The first item on each line is a word, the second a part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags and the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only if two phrases of the same type immediately follow each other, the first word of the second phrase will have tag B-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2 tagging scheme, whereas the original dataset uses IOB1. Reference [CoNLL-2003](https://huggingface.co/datasets/conll2003).\n", + "\n", + "### Model\n", + "The goal of evaluating models is to compare their performance on a variety of metrics. `text-named-entity-recognition` is generic task type that can be used for scenarios to recognise named entities such as persons, locations, organizations, etc. As such, the models you pick to compare must be finetuned for same scenario. Given that we have the CoNLL-2003 dataset, we would like to look for models finetuned for this specific scenario. 
We will review `jean-baptiste-camembert-ner` in this sample, which is available in the `azureml` system registry.\n", + "\n", + "If you'd like to evaluate models that are not in the system registry, you can import those models to your workspace or organization registry and then evaluate them using the approach outlined in this sample.\n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick the models to evaluate.\n", + "* Pick and explore evaluate data.\n", + "* Configure the evaluation jobs.\n", + "* Run the evaluation jobs.\n", + "* Review the evaluation metrics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace ``, `` and `` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running below cell. This is not an optional step if running in a new environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "%pip install --upgrade azure-ai-ml\n", + "%pip install --upgrade azure-identity\n", + "%pip install --upgrade datasets==2.9.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319346668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " workspace = workspace_ml_client.workspace_name\n", + " resource_group = workspace_ml_client.resource_group_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"\"\n", + " resource_group = \"\"\n", + " workspace = \"\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n", + "registry = \"azureml\"\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, registry_name=registry\n", + ")\n", + "registry_ml_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you already have a gpu cluster, mention it here. Else will create a new one with the name 'gpu-cluster-big'\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + " print(f\"GPU compute '{compute_cluster}' found.\")\n", + "except Exception as ex:\n", + " print(f\"GPU compute '{compute_cluster}' not found. Creating new one.\")\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below snippet will allow us to query number of GPU's present on the compute. We can use it to set `gpu_per_node` to ensure utilization of all GPUs in the node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpus_per_node = 1 # default value\n", + "gpu_count_found = False\n", + "ws_computes = workspace_ml_client.compute.list_sizes()\n", + "for ws_compute in ws_computes:\n", + " if ws_compute.name.lower() == compute.size.lower():\n", + " gpus_per_node = ws_compute.gpus\n", + " print(f\"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}\")\n", + "# if gpu_count_found not found, then print an error\n", + "if gpus_per_node > 0:\n", + " gpu_count_found = True\n", + "else:\n", + " gpu_count_found = False\n", + " print(f\"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick the models to evaluate\n", + "\n", + "Verify that the models selected for evaluation are available in system registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319354708 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n", + "models = [\n", + " {\"name\": \"Jean-Baptiste-camembert-ner\", \"version\": \"4\"},\n", + "]\n", + "for model in models:\n", + " model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n", + " print(model.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the test dataset for evaluation\n", + "The next few cells show basic data preparation:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n", + "* To use the entire dataset, uncomment the cells below and run." 
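+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optional alternative sketch (an addition to this sample): the cells below hard-code the CoNLL-2003 tag mapping copied from the dataset card. If a non-streaming load is acceptable (it downloads the split), the same id-to-tag mapping can typically be read from the dataset's own `ClassLabel` feature instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# Optional (sketch): derive the tag mapping from the dataset features rather than hard-coding it.\n", + "# Uncomment to try it; this loads the split without streaming, so it downloads the full test set.\n", + "# conll_test = load_dataset(\"conll2003\", split=\"test\")\n", + "# tag_names = conll_test.features[\"ner_tags\"].feature.names  # e.g. ['O', 'B-PER', 'I-PER', ...]\n", + "# print(dict(enumerate(tag_names)))"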
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "hf_test_data = load_dataset(\"conll2003\", split=\"test\", streaming=True)\n", + "\n", + "test_data_df = pd.DataFrame(hf_test_data.take(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Picked from https://huggingface.co/datasets/conll2003\n", + "label_dict = {\n", + " \"O\": 0,\n", + " \"B-PER\": 1,\n", + " \"I-PER\": 2,\n", + " \"B-ORG\": 3,\n", + " \"I-ORG\": 4,\n", + " \"B-LOC\": 5,\n", + " \"I-LOC\": 6,\n", + " \"B-MISC\": 7,\n", + " \"I-MISC\": 8,\n", + "}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "label_reverse_dict = {value: key for key, value in label_dict.items()}\n", + "label_reverse_dict" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df[\"input_string\"] = test_data_df[\"tokens\"].apply(lambda x: \" \".join(x))\n", + "test_data_df[\"ner_tags_str\"] = test_data_df[\"ner_tags\"].apply(\n", + " lambda x: str([label_reverse_dict[tag] for tag in x])\n", + ")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_data = \"./small-test.jsonl\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.to_json(test_data, lines=True, orient=\"records\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.read_json(test_data, lines=True).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the evaluation jobs using the model and data as inputs\n", + "\n", + "Create the job that uses the `model_evaluation_pipeline` component. We will submit one job per model.\n", + "\n", + "Note that the metrics that the evaluation jobs need to calculate are specified in the [eval_config.json](./eval_config.json) file. We calculate `accuracy`, `f1_score_macro` and `f1_score_micro` in this sample.\n", + "\n", + "All supported evaluation configurations for `text-named-entity-recognition` can be found in [README](./README.md)." 
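+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optional pre-submission check (an addition to this sample): the cell below parses the `ner_tags_str` column back into lists and confirms every tag is one of the labels defined in `label_dict` above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# Optional: verify the serialized tag strings only contain known CoNLL-2003 labels.\n", + "allowed_labels = set(label_dict.keys())\n", + "unknown_tags = test_data_df[\"ner_tags_str\"].apply(\n", + "    lambda s: set(ast.literal_eval(s)) - allowed_labels\n", + ")\n", + "assert unknown_tags.apply(len).eq(0).all(), \"Found tags outside the expected label set.\"\n", + "print(\"All tags fall within the expected CoNLL-2003 label set.\")"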
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"model_evaluation_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def evaluation_pipeline(mlflow_model):\n", + " evaluation_job = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry or a model from the workspace\n", + " # mlflow_model = Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{mlflow_model_path}\"),\n", + " mlflow_model=mlflow_model,\n", + " # test data\n", + " test_data=Input(type=AssetTypes.URI_FILE, path=test_data),\n", + " # The following parameters map to the dataset fields\n", + " input_column_names=\"input_string\",\n", + " label_column_name=\"ner_tags_str\",\n", + " # Evaluation settings\n", + " task=\"text-named-entity-recognition\",\n", + " # config file containing the details of evaluation metrics to calculate\n", + " evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n", + " # config cluster/device job is running on\n", + " # set device to GPU/CPU on basis if GPU count was found\n", + " device=\"gpu\" if gpu_count_found else \"cpu\",\n", + " )\n", + " return {\"evaluation_result\": evaluation_job.outputs.evaluation_result}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the jobs, passing the model as a parameter to the pipeline created in the above step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# submit the pipeline job for each model that we want to evaluate\n", + "# you could consider submitting the pipeline jobs in parallel, provided your cluster has multiple nodes\n", + "pipeline_jobs = []\n", + "\n", + "experiment_name = \"text-named-entity-recognition-evaluation\"\n", + "\n", + "for model in models:\n", + " model_object = registry_ml_client.models.get(\n", + " model[\"name\"], version=model[\"version\"]\n", + " )\n", + " pipeline_object = evaluation_pipeline(\n", + " mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n", + " )\n", + " # don't reuse cached results from previous jobs\n", + " pipeline_object.settings.force_rerun = True\n", + " pipeline_object.settings.default_compute = compute_cluster\n", + " pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n", + " pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + " )\n", + " # add model['name'] and pipeline_job.name as key value pairs to a dictionary\n", + " pipeline_jobs.append({\"model_name\": model[\"name\"], \"job_name\": pipeline_job.name})\n", + " # wait for the pipeline job to complete\n", + " workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metics across different jobs. 
See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n", + "\n", + "![Model evaluation dashboard in AzureML studio](./text-named-entity-recognition-eval-dashboard.png)\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "\n", + "metrics_df = pd.DataFrame()\n", + "for job in pipeline_jobs:\n", + " # concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + " filter = \"tags.mlflow.rootRunId='\" + job[\"job_name\"] + \"'\"\n", + " runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + " )\n", + " # get the compute_metrics runs.\n", + " # using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + " for run in runs:\n", + " # else, check if run.data.metrics.accuracy exists\n", + " if \"accuracy\" in run.data.metrics:\n", + " # get the metrics from the mlflow run\n", + " run_metric = run.data.metrics\n", + " # add the model name to the run_metric dictionary\n", + " run_metric[\"model_name\"] = job[\"model_name\"]\n", + " # convert the run_metric dictionary to a pandas dataframe\n", + " temp_df = pd.DataFrame(run_metric, index=[0])\n", + " # concat the temp_df to the metrics_df\n", + " metrics_df = pd.concat([metrics_df, temp_df], ignore_index=True)\n", + "\n", + "# move the model_name columns to the first column\n", + "cols = metrics_df.columns.tolist()\n", + "cols = cols[-1:] + cols[:-1]\n", + "metrics_df = metrics_df[cols]\n", + "metrics_df.head()" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/foundation-models/system/evaluation/token-classification/text-named-entity-recognition-eval-dashboard.png b/sdk/python/foundation-models/system/evaluation/token-classification/text-named-entity-recognition-eval-dashboard.png new file mode 100644 index 0000000000..088d148de6 Binary files /dev/null and b/sdk/python/foundation-models/system/evaluation/token-classification/text-named-entity-recognition-eval-dashboard.png differ diff --git a/sdk/python/foundation-models/system/evaluation/translation/README.md b/sdk/python/foundation-models/system/evaluation/translation/README.md new file mode 100644 index 0000000000..6c6c0383e8 --- /dev/null +++ 
b/sdk/python/foundation-models/system/evaluation/translation/README.md @@ -0,0 +1,17 @@ +## Translation + +### List of supported keyword arguments: + +| Keyword Argument | Description | Type | Sample | +|:-----------------:|:-------------------------------------------------------------------------------|-----------|------------------------------------------| +| metrics | List for subset of metrics to be computed. All supported metrics listed below. | list | ["bleu_1", "bleu_2", "bleu_3", "bleu_4"] | +| tokenizer | Tokenizer object to perform tokenization on provided input text | | | +| smoothing | Boolean flag to indicate if bleu score needs to be smoothened | boolean | false, true | +| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | | + +### List of supported metrics: + +* bleu_1 +* bleu_2 +* bleu_3 +* bleu_4 \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/translation/eval-config.json b/sdk/python/foundation-models/system/evaluation/translation/eval-config.json new file mode 100644 index 0000000000..213d60cd45 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/translation/eval-config.json @@ -0,0 +1,4 @@ +{ + "metrics": ["bleu_1", "bleu_2", "bleu_3", "bleu_4"], + "smoothing": false +} \ No newline at end of file diff --git a/sdk/python/foundation-models/system/evaluation/translation/text-translation-eval-dashboard.png b/sdk/python/foundation-models/system/evaluation/translation/text-translation-eval-dashboard.png new file mode 100644 index 0000000000..24a97420d0 Binary files /dev/null and b/sdk/python/foundation-models/system/evaluation/translation/text-translation-eval-dashboard.png differ diff --git a/sdk/python/foundation-models/system/evaluation/translation/translation-romanian-to-english.ipynb b/sdk/python/foundation-models/system/evaluation/translation/translation-romanian-to-english.ipynb new file mode 100644 index 0000000000..e5d42ab321 --- /dev/null +++ b/sdk/python/foundation-models/system/evaluation/translation/translation-romanian-to-english.ipynb @@ -0,0 +1,473 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Translation Evaluation - Translation between Romanian and English\n", + "\n", + "This sample shows how use the evaluate a group of models against a given set of metrics for the `text-translation` task. \n", + "\n", + "### Evaluation dataset\n", + "Translation dataset based on the data from statmt.org. Versions exist for different years using a combination of data sources. The base wmt allows you to create a custom dataset by choosing your own data/language pair. A copy of the [wmt16/ro-en](https://huggingface.co/datasets/wmt16/viewer/ro-en) dataset is available in the [wmt16_ro-en](./wmt16_ro-en) folder.\n", + "\n", + "### Model\n", + "The goal of evaluating models is to compare their performance on a variety of metrics. `text-translation` is generic task type that can be used for translation between two languages. As such, the models you pick to compare must be finetuned for same scenario. Given that we have the WMT16-RO-EN dataset, we would like to look for models finetuned for this specific scenario. 
We will compare `t5-base`, `t5-small` and `t5-large` in this sample, which are available in the `azureml` system registry.\n", + "\n", + "If you'd like to evaluate models that are not in the system registry, you can import those models to your workspace or organization registry and then evaluate them using the approach outlined in this sample. Review the sample notebook for [importing models](../../import/import-model-from-huggingface.ipynb). \n", + "\n", + "### Outline\n", + "* Setup pre-requisites such as compute.\n", + "* Pick the models to evaluate.\n", + "* Pick and explore evaluate data.\n", + "* Configure the evaluation jobs.\n", + "* Run the evaluation jobs.\n", + "* Review the evaluation metrics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup pre-requisites\n", + "* Install dependencies\n", + "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace ``, `` and `` below.\n", + "* Connect to `azureml` system registry\n", + "* Set an optional experiment name\n", + "* Check or create compute. A single GPU node can have multiple GPU cards. For example, in one node of `Standard_ND40rs_v2` there are 8 NVIDIA V100 GPUs while in `Standard_NC12s_v3`, there are 2 NVIDIA V100 GPUs. Refer to the [docs](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) for this information. The number of GPU cards per node is set in the param `gpus_per_node` below. Setting this value correctly will ensure utilization of all GPUs in the node. The recommended GPU compute SKUs can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series) and [here](https://learn.microsoft.com/en-us/azure/virtual-machines/ndv2-series)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies by running below cell. This is not an optional step if running in a new environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "%pip install --upgrade azure-ai-ml\n", + "%pip install --upgrade azure-identity\n", + "%pip install --upgrade datasets==2.9.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319346668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml.entities import AmlCompute\n", + "import time\n", + "\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " workspace = workspace_ml_client.workspace_name\n", + " resource_group = workspace_ml_client.resource_group_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"\"\n", + " resource_group = \"\"\n", + " workspace = \"\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace\n", + " )\n", + "\n", + "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n", + "registry = \"azureml\"\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, registry_name=registry\n", + ")\n", + "registry_ml_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you already have a gpu cluster, mention it here. Else will create a new one with the name 'gpu-cluster-big'\n", + "compute_cluster = \"gpu-cluster-big\"\n", + "try:\n", + " compute = workspace_ml_client.compute.get(compute_cluster)\n", + " print(f\"GPU compute '{compute_cluster}' found.\")\n", + "except Exception as ex:\n", + " print(f\"GPU compute '{compute_cluster}' not found. Creating new one.\")\n", + " compute = AmlCompute(\n", + " name=compute_cluster,\n", + " size=\"Standard_ND40rs_v2\",\n", + " max_instances=2, # For multi node training set this to an integer value more than 1\n", + " )\n", + " workspace_ml_client.compute.begin_create_or_update(compute).wait()\n", + "\n", + "# generating a unique timestamp that can be used for names and versions that need to be unique\n", + "timestamp = str(int(time.time()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below snippet will allow us to query number of GPU's present on the compute. We can use it to set `gpu_per_node` to ensure utilization of all GPUs in the node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is the number of GPUs in a single node of the selected 'vm_size' compute.\n", + "# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.\n", + "# Setting this to more than the number of GPUs will result in an error.\n", + "gpus_per_node = 1 # default value\n", + "gpu_count_found = False\n", + "ws_computes = workspace_ml_client.compute.list_sizes()\n", + "for ws_compute in ws_computes:\n", + " if ws_compute.name.lower() == compute.size.lower():\n", + " gpus_per_node = ws_compute.gpus\n", + " print(f\"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}\")\n", + "# if gpu_count_found not found, then print an error\n", + "if gpus_per_node > 0:\n", + " gpu_count_found = True\n", + "else:\n", + " gpu_count_found = False\n", + " print(f\"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Pick the models to evaluate\n", + "\n", + "Verify that the models selected for evaluation are available in system registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "gather": { + "logged": 1679319354708 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n", + "models = [\n", + " {\"name\": \"t5-base\", \"version\": \"4\"},\n", + " {\"name\": \"t5-large\", \"version\": \"4\"},\n", + " {\"name\": \"t5-small\", \"version\": \"4\"},\n", + "]\n", + "for model in models:\n", + " model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n", + " print(model.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick the test dataset for evaluation\n", + "A copy of the wmt16/ro-en is available in the [wmt16/ro-en](./wmt16_ro-en/) folder. The next few cells show basic data preparation:\n", + "* Visualize some data rows\n", + "* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n", + "* To use the entire dataset, uncomment the cells below and run." 
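+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optional sketch (an addition to this sample): the next cell streams only the first 1,000 pairs to keep the run fast. The wmt16 ro-en test split is comparatively small, so if a full download is acceptable you could load it without streaming as sketched below instead of running the streaming cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# Optional (sketch): uncomment to evaluate on the full ro-en test split instead of the streamed sample.\n", + "# full_test_data = load_dataset(\"wmt16\", \"ro-en\", split=\"test\")\n", + "# test_data_df = full_test_data.to_pandas()\n", + "# print(len(test_data_df))"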
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "hf_test_data = load_dataset(\"wmt16\", \"ro-en\", split=\"test\", streaming=True)\n", + "\n", + "test_data_df = pd.DataFrame(hf_test_data.take(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df[\"input_string\"] = test_data_df[\"translation\"].apply(lambda x: x[\"en\"])\n", + "test_data_df[\"ro\"] = test_data_df[\"translation\"].apply(lambda x: x[\"ro\"])" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data = \"./small-test.jsonl\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "test_data_df.to_json(test_data, lines=True, orient=\"records\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.read_json(test_data, lines=True).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit the evaluation jobs using the model and data as inputs\n", + "\n", + "Create the job that uses the `model_evaluation_pipeline` component. We will submit one job per model.\n", + "\n", + "Note that the metrics that the evaluation jobs need to calculate are specified in the [eval_config.json](./eval_config.json) file. We calculate `bleu_1`, `bleu_2`, `bleu_3` and `bleu_4` in this sample.\n", + "\n", + "All supported evaluation configurations for `text-translation` can be found in [README](./README.md)." 
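+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optional pre-submission check (an addition to this sample): the cell below loads `./eval-config.json` and confirms the requested metrics are among the bleu variants listed as supported for `text-translation` in the README." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Optional: confirm the requested metrics are supported for the text-translation task.\n", + "supported_metrics = {\"bleu_1\", \"bleu_2\", \"bleu_3\", \"bleu_4\"}\n", + "with open(\"./eval-config.json\") as f:\n", + "    translation_eval_config = json.load(f)\n", + "unsupported_metrics = set(translation_eval_config[\"metrics\"]) - supported_metrics\n", + "assert not unsupported_metrics, f\"Unsupported metrics requested: {unsupported_metrics}\"\n", + "print(translation_eval_config)"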
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "# fetch the pipeline component\n", + "pipeline_component_func = registry_ml_client.components.get(\n", + " name=\"model_evaluation_pipeline\", label=\"latest\"\n", + ")\n", + "\n", + "\n", + "# define the pipeline job\n", + "@pipeline()\n", + "def evaluation_pipeline(mlflow_model):\n", + " evaluation_job = pipeline_component_func(\n", + " # specify the foundation model available in the azureml system registry or a model from the workspace\n", + " # mlflow_model = Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{mlflow_model_path}\"),\n", + " mlflow_model=mlflow_model,\n", + " # test data\n", + " test_data=Input(type=AssetTypes.URI_FILE, path=test_data),\n", + " # The following parameters map to the dataset fields\n", + " input_column_names=\"input_string\",\n", + " label_column_name=\"ro\",\n", + " # Evaluation settings\n", + " task=\"text-translation\",\n", + " # config file containing the details of evaluation metrics to calculate\n", + " evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n", + " # config cluster/device job is running on\n", + " # set device to GPU/CPU on basis if GPU count was found\n", + " device=\"gpu\" if gpu_count_found else \"cpu\",\n", + " )\n", + " return {\"evaluation_result\": evaluation_job.outputs.evaluation_result}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the jobs, passing the model as a parameter to the pipeline created in the above step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# submit the pipeline job for each model that we want to evaluate\n", + "# you could consider submitting the pipeline jobs in parallel, provided your cluster has multiple nodes\n", + "pipeline_jobs = []\n", + "\n", + "experiment_name = \"text-translation-evaluation\"\n", + "\n", + "for model in models:\n", + " model_object = registry_ml_client.models.get(\n", + " model[\"name\"], version=model[\"version\"]\n", + " )\n", + " pipeline_object = evaluation_pipeline(\n", + " mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n", + " )\n", + " # don't reuse cached results from previous jobs\n", + " pipeline_object.settings.force_rerun = True\n", + " pipeline_object.settings.default_compute = compute_cluster\n", + " pipeline_object.display_name = f\"eval-{model['name']}-{timestamp}\"\n", + " pipeline_job = workspace_ml_client.jobs.create_or_update(\n", + " pipeline_object, experiment_name=experiment_name\n", + " )\n", + " # add model['name'] and pipeline_job.name as key value pairs to a dictionary\n", + " pipeline_jobs.append({\"model_name\": model[\"name\"], \"job_name\": pipeline_job.name})\n", + " # wait for the pipeline job to complete\n", + " workspace_ml_client.jobs.stream(pipeline_job.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Review evaluation metrics\n", + "Viewing the job in AzureML studio is the best way to analyze logs, metrics and outputs of jobs. You can create custom charts and compare metics across different jobs. 
See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?tabs=interactive#view-jobsruns-information-in-the-studio to learn more.\n", + "\n", + "![Model evaluation dashboard in AzureML studio](./text-translation-eval-dashboard.png)\n", + "\n", + "However, we may need to access and review metrics programmatically for which we will use MLflow, which is the recommended client for logging and querying metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow, json\n", + "\n", + "mlflow_tracking_uri = workspace_ml_client.workspaces.get(\n", + " workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "mlflow.set_tracking_uri(mlflow_tracking_uri)\n", + "\n", + "metrics_df = pd.DataFrame()\n", + "for job in pipeline_jobs:\n", + " # concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + " filter = \"tags.mlflow.rootRunId='\" + job[\"job_name\"] + \"'\"\n", + " runs = mlflow.search_runs(\n", + " experiment_names=[experiment_name], filter_string=filter, output_format=\"list\"\n", + " )\n", + " # get the compute_metrics runs.\n", + " # using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + " for run in runs:\n", + " # else, check if run.data.metrics.accuracy exists\n", + " if \"bleu_1\" in run.data.metrics:\n", + " # get the metrics from the mlflow run\n", + " run_metric = run.data.metrics\n", + " # add the model name to the run_metric dictionary\n", + " run_metric[\"model_name\"] = job[\"model_name\"]\n", + " # convert the run_metric dictionary to a pandas dataframe\n", + " temp_df = pd.DataFrame(run_metric, index=[0])\n", + " # concat the temp_df to the metrics_df\n", + " metrics_df = pd.concat([metrics_df, temp_df], ignore_index=True)\n", + "\n", + "# move the model_name columns to the first column\n", + "cols = metrics_df.columns.tolist()\n", + "cols = cols[-1:] + cols[:-1]\n", + "metrics_df = metrics_df[cols]\n", + "metrics_df.head()" + ] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}