Commit

resolving conflicts between training args and optimization args (#3193)
* resolving conflicts between training args and optimization args

* Changing the log text

* black formatting

* adding the pipeline name to the notebook
jpmann committed May 15, 2024
1 parent df0bfcc · commit 8a5c1c0
Showing 1 changed file with 71 additions and 12 deletions.
@@ -377,24 +377,84 @@
"metadata": {},
"outputs": [],
"source": [
"# Training parameters\n",
"# Default training parameters\n",
"training_parameters = dict(\n",
" num_train_epochs=3,\n",
" per_device_train_batch_size=1,\n",
" per_device_eval_batch_size=1,\n",
" learning_rate=5e-6,\n",
" lr_scheduler_type=\"cosine\",\n",
")\n",
"print(f\"The following training parameters are enabled - {training_parameters}\")\n",
"# Default optimization parameters\n",
"optimization_parameters = dict(\n",
" apply_lora=\"true\",\n",
" apply_deepspeed=\"true\",\n",
" deepspeed_stage=2,\n",
")\n",
"# Let's construct finetuning parameters using training and optimization paramters.\n",
"finetune_parameters = {**training_parameters, **optimization_parameters}\n",
"\n",
"# Optimization parameters - As these parameters are packaged with the model itself, lets retrieve those parameters\n",
"# Each model finetuning works best with certain finetuning parameters which are packed with model as `model_specific_defaults`.\n",
"# Let's override the finetune_parameters in case the model has some custom defaults.\n",
"if \"model_specific_defaults\" in foundation_model.tags:\n",
" optimization_parameters = ast.literal_eval(\n",
" foundation_model.tags[\"model_specific_defaults\"]\n",
" ) # convert string to python dict\n",
"else:\n",
" optimization_parameters = dict(apply_lora=\"true\", apply_deepspeed=\"true\")\n",
"print(f\"The following optimizations are enabled - {optimization_parameters}\")"
" print(\"Warning! Model specific defaults exist. The defaults could be overridden.\")\n",
" finetune_parameters.update(\n",
" ast.literal_eval( # convert string to python dict\n",
" foundation_model.tags[\"model_specific_defaults\"]\n",
" )\n",
" )\n",
"print(\n",
" f\"The following finetune parameters are going to be set for the run: {finetune_parameters}\"\n",
")"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# Set the pipeline display name for distinguishing different runs from the name\n",
+"def get_pipeline_display_name():\n",
+"    batch_size = (\n",
+"        int(finetune_parameters.get(\"per_device_train_batch_size\", 1))\n",
+"        * int(finetune_parameters.get(\"gradient_accumulation_steps\", 1))\n",
+"        * int(gpus_per_node)\n",
+"        * int(finetune_parameters.get(\"num_nodes_finetune\", 1))\n",
+"    )\n",
+"    scheduler = finetune_parameters.get(\"lr_scheduler_type\", \"linear\")\n",
+"    deepspeed = finetune_parameters.get(\"apply_deepspeed\", \"false\")\n",
+"    ds_stage = finetune_parameters.get(\"deepspeed_stage\", \"2\")\n",
+"    if deepspeed == \"true\":\n",
+"        ds_string = f\"ds{ds_stage}\"\n",
+"    else:\n",
+"        ds_string = \"nods\"\n",
+"    lora = finetune_parameters.get(\"apply_lora\", \"false\")\n",
+"    if lora == \"true\":\n",
+"        lora_string = \"lora\"\n",
+"    else:\n",
+"        lora_string = \"nolora\"\n",
+"    save_limit = finetune_parameters.get(\"save_total_limit\", -1)\n",
+"    seq_len = finetune_parameters.get(\"max_seq_length\", -1)\n",
+"    return (\n",
+"        model_name\n",
+"        + \"-\"\n",
+"        + \"ultrachat\"\n",
+"        + \"-\"\n",
+"        + f\"bs{batch_size}\"\n",
+"        + \"-\"\n",
+"        + f\"{scheduler}\"\n",
+"        + \"-\"\n",
+"        + ds_string\n",
+"        + \"-\"\n",
+"        + lora_string\n",
+"        + f\"-save_limit{save_limit}\"\n",
+"        + f\"-seqlen{seq_len}\"\n",
+"    )\n",
+"\n",
+"\n",
+"pipeline_display_name = get_pipeline_display_name()\n",
+"print(f\"Display name used for the run: {pipeline_display_name}\")"
+]
+},
{
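The first rewritten cell above folds the training and optimization defaults into a single `finetune_parameters` dict and then lets any `model_specific_defaults` packaged with the model override them. Below is a minimal standalone sketch of that precedence logic; the `FakeFoundationModel` class and its tag value are hypothetical stand-ins for the registry model object the notebook actually fetches:

import ast

# Hypothetical stand-in: the real notebook reads `tags` from a model object
# fetched from the AzureML registry; tag values arrive as strings.
class FakeFoundationModel:
    tags = {"model_specific_defaults": "{'apply_deepspeed': 'false', 'learning_rate': 2e-5}"}

foundation_model = FakeFoundationModel()

training_parameters = dict(
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
)
optimization_parameters = dict(apply_lora="true", apply_deepspeed="true", deepspeed_stage=2)

# Merge the two default dicts; the right-hand dict wins on any duplicate key.
finetune_parameters = {**training_parameters, **optimization_parameters}

# Model-packaged defaults take precedence over everything set above.
if "model_specific_defaults" in foundation_model.tags:
    finetune_parameters.update(
        ast.literal_eval(foundation_model.tags["model_specific_defaults"])
    )

print(finetune_parameters["apply_deepspeed"])  # false -> the packaged default won
print(finetune_parameters["learning_rate"])    # 2e-05 -> replaced the 5e-6 user default

Because everything funnels through one dict, a later consumer never sees two competing values for the same argument.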
@@ -413,7 +473,7 @@
"\n",
"\n",
"# define the pipeline job\n",
"@pipeline()\n",
"@pipeline(name=pipeline_display_name)\n",
"def create_pipeline():\n",
" chat_completion_pipeline = pipeline_component_func(\n",
" mlflow_model_path=foundation_model.id,\n",
@@ -430,8 +490,7 @@
" ),\n",
" # Training settings\n",
" number_of_gpu_to_use_finetuning=gpus_per_node, # set to the number of GPUs available in the compute\n",
" **training_parameters,\n",
" **optimization_parameters\n",
" **finetune_parameters\n",
" )\n",
" return {\n",
" # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model\n",
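The final hunk is the heart of the fix: the component call now unpacks one merged `finetune_parameters` dict instead of unpacking `training_parameters` and `optimization_parameters` separately. If those two dicts ever share a key (which the old code allowed once model-specific defaults were loaded into `optimization_parameters`), double unpacking fails outright, while merging first resolves the duplicate by overwrite. A small sketch with a hypothetical component function standing in for the real pipeline component:

def component(**kwargs):
    # Hypothetical stand-in for the chat-completion pipeline component.
    return kwargs

training_parameters = {"learning_rate": 5e-6, "num_train_epochs": 3}
# Suppose model-specific defaults pushed a training key into the optimization dict,
# which is what the old code path permitted.
optimization_parameters = {"apply_lora": "true", "learning_rate": 2e-5}

try:
    component(**training_parameters, **optimization_parameters)
except TypeError as err:
    print(err)  # e.g. "got multiple values for keyword argument 'learning_rate'"

# Merging first resolves the clash deterministically: the right-hand value wins.
finetune_parameters = {**training_parameters, **optimization_parameters}
print(component(**finetune_parameters)["learning_rate"])  # 2e-05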
