Commit b7e0b18

Model Evaluation sample notebooks - Fetch and Use latest model version (#2352)

* Model Evaluation sample notebooks - Fetch and Use latest model version

* Model Evaluation sample notebooks - Fix Fill Mask task notebooks

* Model Evaluation sample notebooks - Fixing black runs

* Model Evaluation sample notebooks - Fixing Fill Mask notebooks

* Model Evaluation sample notebooks - Temporarily removing camembert-base for Fill Mask task

---------

Co-authored-by: Sarthak Singhal <sarsinghal@microsoft.com>
sarthaks95 and Sarthak Singhal committed Jun 8, 2023
1 parent 2eee6de commit b7e0b18
Showing 9 changed files with 307 additions and 166 deletions.
@@ -1,8 +1,4 @@
 {
   "metrics": ["perplexities"],
-  "model_id": "gpt2",
-  "add_start_token": true,
-  "tokenizer_config": {
-    "truncation": true
-  }
+  "add_start_token": true
 }
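For context, the surviving keys mirror the Hugging Face `evaluate` perplexity metric. A standalone sketch of the computation this config drives, assuming the `evaluate` and `transformers` packages are installed (not part of this diff):

    import evaluate

    # load the perplexity metric; in the pipeline the model comes from the job,
    # so model_id is hard-coded here only for illustration
    perplexity = evaluate.load("perplexity", module_type="metric")
    results = perplexity.compute(
        predictions=["The quick brown fox jumps over the lazy dog."],
        model_id="gpt2",
        add_start_token=True,  # prepend BOS so the first real token is scored
    )
    print(results["mean_perplexity"])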
@@ -203,112 +203,87 @@
"outputs": [],
"source": [
"# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n",
"models = [\n",
" {\"name\": \"bert-base-cased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"bert-base-uncased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"bert-large-cased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"bert-large-uncased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"camembert-base\", \"version\": \"4\", \"mask\": \"<mask>\"},\n",
" {\"name\": \"distilbert-base-cased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"distilbert-base-uncased\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"distilroberta-base\", \"version\": \"4\", \"mask\": \"<mask>\"},\n",
" {\"name\": \"microsoft-deberta-base\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"microsoft-deberta-large\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"microsoft-deberta-xlarge\", \"version\": \"4\", \"mask\": \"[MASK]\"},\n",
" {\"name\": \"roberta-base\", \"version\": \"4\", \"mask\": \"<mask>\"},\n",
" {\"name\": \"roberta-large\", \"version\": \"4\", \"mask\": \"<mask>\"},\n",
"]\n",
"for model in models:\n",
" model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n",
" print(model.id)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Pick the test dataset for evaluation\n",
"The next few cells show basic data preparation:\n",
"* Visualize some data rows\n",
"* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n",
"* To use the entire dataset, uncomment the cells below and run."
"model_details = [\n",
" {\"name\": \"bert-base-cased\"},\n",
" {\"name\": \"bert-base-uncased\"},\n",
" {\"name\": \"bert-large-cased\"},\n",
" {\"name\": \"bert-large-uncased\"},\n",
" # {\"name\": \"camembert-base\"},\n",
" {\"name\": \"distilbert-base-cased\"},\n",
" {\"name\": \"distilbert-base-uncased\"},\n",
" {\"name\": \"distilroberta-base\"},\n",
" {\"name\": \"microsoft-deberta-base\", \"pretrained\": \"microsoft/deberta-base\"},\n",
" {\"name\": \"microsoft-deberta-large\", \"pretrained\": \"microsoft/deberta-large\"},\n",
" {\"name\": \"microsoft-deberta-xlarge\", \"pretrained\": \"microsoft/deberta-xlarge\"},\n",
" {\"name\": \"roberta-base\"},\n",
" {\"name\": \"roberta-large\"},\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"models = []\n",
"\n",
"hf_test_data = load_dataset(\n",
" \"rcds/wikipedia-for-mask-filling\", \"original_512\", split=\"train\", streaming=True\n",
")\n",
"\n",
"test_data_df = pd.DataFrame(hf_test_data.take(1000))"
]
},
{
"cell_type": "code",
"execution_count": null,
"for model in model_details:\n",
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test_data_df[\"input_string\"] = test_data_df[\"texts\"]\n",
"test_data_df[\"title\"] = test_data_df[\"masks\"].apply(\n",
" lambda x: x[0] if len(x) > 0 else \"\"\n",
")"
]
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test_data_mask_2_df = test_data_df\n",
"test_data_mask_1_df = pd.DataFrame(test_data_df)\n",
"test_data_mask_1_df[\"input_string\"] = test_data_mask_1_df[\"input_string\"].apply(\n",
" lambda x: x.replace(\"<mask>\", \"[MASK]\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"models"
],
"metadata": {
"collapsed": false
},
"outputs": [],
}
},
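The new cells above replace pinned versions with a list-and-take-first lookup: `list(registry_ml_client.models.list(name=...))[0]` assumes the registry returns the newest version first. A slightly more defensive sketch, assuming `azure-ai-ml`'s MLClient and numeric version strings:

    from azure.ai.ml import MLClient

    def get_latest_model_version(ml_client: MLClient, model_name: str):
        # enumerate every registered version instead of trusting list order
        versions = [m.version for m in ml_client.models.list(name=model_name)]
        latest = max(versions, key=int)  # assumes versions like "4", "5", ...
        return ml_client.models.get(model_name, version=latest)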
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"test_data_mask_1_df.head()"
"### 3. Pick the test dataset for evaluation\n",
"The next few cells show basic data preparation:\n",
"* Visualize some data rows\n",
"* We want this sample to run quickly, so we use a smaller dataset containing 10% of the original.\n",
"* To use the entire dataset, uncomment the cells below and run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test_data_mask_2_df.head()"
]
"%pip install transformers\n",
"%pip install torch"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_data_mask_1 = \"./small-test-[MASK].jsonl\" # [MASK]\n",
"test_data_mask_2 = \"./small-test-mask.jsonl\" # <mask>"
"from datasets import load_dataset\n",
"\n",
"hf_test_data = load_dataset(\n",
" \"rcds/wikipedia-for-mask-filling\", \"original_512\", split=\"train\", streaming=True\n",
")\n",
"\n",
"test_data_df = pd.DataFrame(hf_test_data.take(1000))"
]
},
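Because the dataset is opened with `streaming=True`, nothing is downloaded up front and `take(1000)` materializes only the first thousand records. A quick peek at one record (a sketch, not in the notebook) shows the fields the cells below depend on:

    sample = next(iter(hf_test_data))
    print(sample.keys())  # expected to include "texts" and "masks"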
{
@@ -319,31 +294,35 @@
},
"outputs": [],
"source": [
"test_data_mask_1_df.to_json(test_data_mask_1, lines=True, orient=\"records\")\n",
"test_data_mask_2_df.to_json(test_data_mask_2, lines=True, orient=\"records\")"
"test_data_df[\"title\"] = test_data_df[\"masks\"].apply(\n",
" lambda x: x[0] if len(x) > 0 else \"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"pd.read_json(test_data_mask_1, lines=True).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"from transformers import AutoTokenizer\n",
"\n",
"for model in models:\n",
" tokenizer = AutoTokenizer.from_pretrained(model.get(\"pretrained\", model[\"name\"]))\n",
" test_data_df[\"input_string\"] = test_data_df[\"texts\"].apply(\n",
" lambda x: tokenizer.decode(\n",
" tokenizer.encode(\n",
" x.replace(\"<mask>\", tokenizer.mask_token),\n",
" max_length=512,\n",
" truncation=True,\n",
" )[:500]\n",
" )\n",
" )\n",
" test_data_file_name = \"small-test-{}.jsonl\".format(model[\"name\"])\n",
" test_data_df.to_json(test_data_file_name, lines=True, orient=\"records\")"
],
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pd.read_json(test_data_mask_2, lines=True).head()"
]
}
},
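The per-model loop above works because each tokenizer exposes its own mask token; swapping the corpus's literal `<mask>` for `tokenizer.mask_token` lets one dataset serve BERT-style and RoBERTa-style models alike, and the encode/decode round-trip caps each example at the model's context length. A small illustration, assuming `transformers` is installed:

    from transformers import AutoTokenizer

    for name in ("bert-base-uncased", "roberta-base"):
        tok = AutoTokenizer.from_pretrained(name)
        print(name, "->", tok.mask_token)  # prints [MASK] and <mask>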
{
"attachments": {},
@@ -375,8 +354,8 @@
" name=\"model_evaluation_pipeline\", label=\"latest\"\n",
")\n",
"\n",
"with open(\"./eval-config.json\") as f:\n",
" evaluation_config_params = json.dumps(json.load(f))\n",
"# with open(\"./eval-config.json\") as f:\n",
"# evaluation_config_params = json.dumps(json.load(f))\n",
"\n",
"\n",
"# define the pipeline job\n",
@@ -394,8 +373,8 @@
" # Evaluation settings\n",
" task=\"fill-mask\",\n",
" # config file containing the details of evaluation metrics to calculate\n",
" # evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n",
" evaluation_config_params=evaluation_config_params,\n",
" evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n",
" # evaluation_config_params=evaluation_config_params,\n",
" # config cluster/device job is running on\n",
" # set device to GPU/CPU on basis if GPU count was found\n",
" device=\"gpu\" if gpu_count_found else \"cpu\",\n",
@@ -429,12 +408,10 @@
" model_object = registry_ml_client.models.get(\n",
" model[\"name\"], version=model[\"version\"]\n",
" )\n",
" if model[\"mask\"] == \"[MASK]\":\n",
" test_data = Input(type=AssetTypes.URI_FILE, path=test_data_mask_1)\n",
" else:\n",
" test_data = Input(type=AssetTypes.URI_FILE, path=test_data_mask_2)\n",
" pipeline_object = evaluation_pipeline(\n",
" test_data=test_data,\n",
" test_data=Input(\n",
" type=AssetTypes.URI_FILE, path=\"small-test-{}.jsonl\".format(model[\"name\"])\n",
" ),\n",
" mlflow_model=Input(type=AssetTypes.MLFLOW_MODEL, path=f\"{model_object.id}\"),\n",
" )\n",
" # don't reuse cached results from previous jobs\n",
@@ -206,17 +206,41 @@
"outputs": [],
"source": [
"# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n",
"models = [\n",
" {\"name\": \"deepset-minilm-uncased-squad2\", \"version\": \"4\"},\n",
" {\"name\": \"deepset-roberta-base-squad2\", \"version\": \"4\"},\n",
" {\"name\": \"distilbert-base-cased-distilled-squad\", \"version\": \"4\"},\n",
" {\"name\": \"distilbert-base-uncased-distilled-squad\", \"version\": \"4\"},\n",
"]\n",
"for model in models:\n",
" model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n",
" print(model.id)"
"model_details = [\n",
" {\"name\": \"deepset-minilm-uncased-squad2\"},\n",
" {\"name\": \"deepset-roberta-base-squad2\"},\n",
" {\"name\": \"distilbert-base-cased-distilled-squad\"},\n",
" {\"name\": \"distilbert-base-uncased-distilled-squad\"},\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"models = []\n",
"\n",
"for model in model_details:\n",
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
},
{
"attachments": {},
"cell_type": "markdown",
@@ -201,15 +201,39 @@
"outputs": [],
"source": [
"# need to specify model versions until the bug to support fetching the latest version using latest label is fixed\n",
"models = [\n",
" {\"name\": \"facebook-bart-large-cnn\", \"version\": \"4\"},\n",
" {\"name\": \"sshleifer-distilbart-cnn-12-6\", \"version\": \"4\"},\n",
"]\n",
"for model in models:\n",
" model = registry_ml_client.models.get(model[\"name\"], version=model[\"version\"])\n",
" print(model.id)"
"model_details = [\n",
" {\"name\": \"facebook-bart-large-cnn\"},\n",
" {\"name\": \"sshleifer-distilbart-cnn-12-6\"},\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"models = []\n",
"\n",
"for model in model_details:\n",
" reg_model = list(registry_ml_client.models.list(name=model[\"name\"]))[0]\n",
" print(reg_model.id)\n",
" models.append({**model, \"version\": reg_model.version})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"models"
],
"metadata": {
"collapsed": false
}
},
{
"attachments": {},
"cell_type": "markdown",
@@ -319,8 +343,8 @@
" name=\"model_evaluation_pipeline\", label=\"latest\"\n",
")\n",
"\n",
"with open(\"./eval-config.json\") as f:\n",
" evaluation_config_params = json.dumps(json.load(f))\n",
"# with open(\"./eval-config.json\") as f:\n",
"# evaluation_config_params = json.dumps(json.load(f))\n",
"\n",
"\n",
"# define the pipeline job\n",
@@ -338,8 +362,8 @@
" # Evaluation settings\n",
" task=\"text-summarization\",\n",
" # config file containing the details of evaluation metrics to calculate\n",
" # evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n",
" evaluation_config_params=evaluation_config_params,\n",
" evaluation_config=Input(type=AssetTypes.URI_FILE, path=\"./eval-config.json\"),\n",
" # evaluation_config_params=evaluation_config_params,\n",
" # config cluster/device job is running on\n",
" # set device to GPU/CPU on basis if GPU count was found\n",
" device=\"gpu\" if gpu_count_found else \"cpu\",\n",