From 673c0052da628fb3b05c503e9a6b275bfd4ca1f9 Mon Sep 17 00:00:00 2001
From: Gaurav Rajguru
Date: Tue, 7 May 2024 18:44:15 +0530
Subject: [PATCH] CLI and notebook examples for text to image diffusion models
 (#3154)

* CLI and notebook example for dreambooth finetuning

* notebook update pipeline

* notebook

---------

Co-authored-by: gaurav
Co-authored-by: grajguru
---
 .../finetune/text-to-image/base64_to_jpeg.py  |  51 ++++++
 .../system/finetune/text-to-image/deploy.yaml |  10 ++
 .../diffusers-dreambooth-dog-text-to-image.sh | 167 ++++++++++++++++++
 ...iffusers-dreambooth-dog-text-to-image.yaml | 118 +++++++++++++
 .../finetune/text-to-image/prepare_data.py    |  54 ++++++
 ...ffusers-dreambooth-dog-text-to-image.ipynb |  52 ++++--
 6 files changed, 439 insertions(+), 13 deletions(-)
 create mode 100644 cli/foundation-models/system/finetune/text-to-image/base64_to_jpeg.py
 create mode 100644 cli/foundation-models/system/finetune/text-to-image/deploy.yaml
 create mode 100644 cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.sh
 create mode 100644 cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.yaml
 create mode 100644 cli/foundation-models/system/finetune/text-to-image/prepare_data.py

diff --git a/cli/foundation-models/system/finetune/text-to-image/base64_to_jpeg.py b/cli/foundation-models/system/finetune/text-to-image/base64_to_jpeg.py
new file mode 100644
index 0000000000..69bc49be73
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-to-image/base64_to_jpeg.py
@@ -0,0 +1,51 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Read base64-encoded images from the endpoint's JSON response file and save them as JPEG files
+
+import argparse
+import re
+import json
+import io
+import base64
+from PIL import Image
+
+
+INPUT_PROMPT_COLUMN = "prompt"
+OUTPUT_IMAGE_COLUMN = "generated_image"
+
+
+def base64_str_to_image(response_file: str):
+    """
+    Read the response file from the online endpoint, extract the base64-encoded images and save them as image files.
+
+    :param response_file: Path to the JSON file containing the response received from the endpoint.
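+        The file is expected to contain the output of `az ml online-endpoint invoke ... -o json`,
+        as written by the accompanying CLI script.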
+    :type response_file: str
+    :return: None
+    """
+    with open(response_file) as f:
+        json_str = json.load(f)
+
+    # The invoke output is a JSON-encoded string, so decode it again to get the list of results.
+    json_obj = json.loads(json_str)
+    for i, obj in enumerate(json_obj):
+        generated_image = obj[OUTPUT_IMAGE_COLUMN]
+        img = Image.open(io.BytesIO(base64.b64decode(generated_image)))
+        text_prompt = ""
+        if INPUT_PROMPT_COLUMN in obj:
+            text_prompt = obj[INPUT_PROMPT_COLUMN].strip()
+        text_prompt = f"Img_{i}_" + re.sub(r"[^a-zA-Z0-9 ]+", "", text_prompt)
+        text_prompt = text_prompt[:50]
+        img.save(text_prompt + ".jpg", "JPEG")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--response_file",
+        type=str,
+        default="generated_image.json",
+        help="File containing the image response from the endpoint.",
+    )
+    args, unknown = parser.parse_known_args()
+
+    base64_str_to_image(args.response_file)
diff --git a/cli/foundation-models/system/finetune/text-to-image/deploy.yaml b/cli/foundation-models/system/finetune/text-to-image/deploy.yaml
new file mode 100644
index 0000000000..35f241a58f
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-to-image/deploy.yaml
@@ -0,0 +1,10 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+instance_type: Standard_NC6s_v3
+instance_count: 1
+liveness_probe:
+  initial_delay: 180
+  period: 180
+  failure_threshold: 49
+  timeout: 299
+request_settings:
+  request_timeout_ms: 90000
diff --git a/cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.sh b/cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.sh
new file mode 100644
index 0000000000..896607f734
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+set -x
+
+# script inputs
+registry_name="azureml"
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+
+cluster_name="sample-finetune-cluster-gpu"
+
+# If the above compute cluster does not exist, create it with the following vm size
+cluster_sku="Standard_NC6s_v3"
+
+# This is the number of GPUs in a single node of the selected 'vm_size' compute.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
+gpus_per_node=1
+# This is the number of nodes in the cluster
+instance_count=1
+
+# HuggingFace model
+huggingface_model_name="runwayml/stable-diffusion-v1-5"
+# This is the foundation model for finetuning from the azureml system registry
+aml_registry_model_name="runwayml-stable-diffusion-v1-5"
+model_label="latest"
+
+version=$(date +%s)
+finetuned_huggingface_model_name="runwayml-stable-diffusion-v1-5-dog-text-to-image"
+huggingface_endpoint_name="text-to-image-dog-$version"
+deployment_name="text2img-dog-mlflow-deploy"
+deployment_sku="Standard_NC6s_v3"
+request_file="request.json"
+response_file="generated_image.json"
+
+# finetuning job parameters
+finetuning_pipeline_component="diffusers_text_to_image_dreambooth_pipeline"
+# Training settings
+process_count_per_instance=$gpus_per_node # set to the number of GPUs available in the compute
+instance_count=$instance_count
+
+# 1. Install dependencies
+pip install azure-ai-ml==1.8.0
+pip install azure-identity==1.13.0
+
+# 2. Set up prerequisites
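+# Authentication is assumed to have been done already: run `az login` (or
+# `az login --use-device-code` on a headless machine) and, if needed, install
+# the CLI extension with `az extension add --name ml` before running this script.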
+az account set -s $subscription_id
+workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
+
+# Check if the compute cluster $cluster_name exists, else create it
+if az ml compute show --name $cluster_name $workspace_info
+then
+    echo "Compute cluster $cluster_name already exists"
+else
+    echo "Creating compute cluster $cluster_name"
+    az ml compute create --name $cluster_name --type amlcompute --min-instances 0 --max-instances $instance_count --size $cluster_sku $workspace_info || {
+        echo "Failed to create compute cluster $cluster_name"
+        exit 1
+    }
+fi
+
+# Check if the finetuning pipeline component exists
+if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
+then
+    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
+    exit 1
+fi
+
+# 3. Check if the model exists in the registry
+# need to confirm model show command works for registries outside the tenant (aka system registry)
+if ! az ml model show --name $aml_registry_model_name --label $model_label --registry-name $registry_name
+then
+    echo "Model $aml_registry_model_name:$model_label does not exist in registry $registry_name"
+    exit 1
+fi
+
+# Get the latest model version
+model_version=$(az ml model show --name $aml_registry_model_name --label $model_label --registry-name $registry_name --query version --output tsv)
+
+# 4. Prepare data
+# Download the dog-example dataset via the HuggingFace datasets-server API
+dataset_dir="dog-example"
+dataset_url="https://datasets-server.huggingface.co/rows?dataset=diffusers%2Fdog-example&config=default&split=train&offset=0&length=100"
+python prepare_data.py --url $dataset_url --dataset_dir $dataset_dir
+
+# Check if the training data exists
+if [ ! -d $dataset_dir ]; then
+    echo "Training data directory $dataset_dir does not exist"
+    exit 1
+fi
+
+# 5. Submit the finetuning job using the pipeline YAML for a stable diffusion model
+
+# # If you want to use a HuggingFace model, specify inputs.model_name instead of inputs.mlflow_model_path.path, like below
+# inputs.model_name=$huggingface_model_name
+
+parent_job_name=$( az ml job create --file "./diffusers-dreambooth-dog-text-to-image.yaml" $workspace_info --query name -o tsv \
+--set jobs.huggingface_diffusers_model_finetune_job.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$aml_registry_model_name/versions/$model_version" \
+inputs.compute_model_import=$cluster_name inputs.process_count_per_instance=$process_count_per_instance \
+inputs.instance_data_dir.path=$dataset_dir \
+inputs.instance_count=$instance_count inputs.compute_finetune=$cluster_name) || {
+    echo "Failed to submit finetuning job"
+    exit 1
+}
+
+az ml job stream --name $parent_job_name $workspace_info || {
+    echo "job stream failed"; exit 1;
+}
+
+# 6. Create a model in the workspace from the train job output for the fine-tuned HuggingFace model
+az ml model create --name $finetuned_huggingface_model_name --version $version --type mlflow_model \
+    --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
+    echo "model create in workspace failed"; exit 1;
+}
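+# Optional sanity check (a sketch; not part of the original flow): confirm the
+# fine-tuned model is visible in the workspace before deploying it.
+# az ml model show --name $finetuned_huggingface_model_name --version $version $workspace_info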
+# 7. Deploy the fine-tuned HuggingFace model to an online endpoint
+# Create online endpoint
+az ml online-endpoint create --name $huggingface_endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# Deploy model from registry to endpoint in workspace
+az ml online-deployment create --file ./deploy.yaml --name=$deployment_name $workspace_info --set \
+    endpoint_name=$huggingface_endpoint_name model=azureml:$finetuned_huggingface_model_name:$version \
+    instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+az ml online-endpoint update $workspace_info --name=$huggingface_endpoint_name --traffic="$deployment_name=100" || {
+    echo "Failed to set all traffic to the new deployment"
+    exit 1
+}
+
+# 8. Try a sample scoring request on the deployed fine-tuned model
+
+read -r -d '' request_json << EOM
+{
+    "input_data": {"columns": ["prompt"], "index": [0], "data": ["a photo of sks dog in a bucket"]},
+    "params": {
+        "height": 512,
+        "width": 512,
+        "num_inference_steps": 50,
+        "guidance_scale": 7.5,
+        "negative_prompt": ["blurry; three legs"],
+        "num_images_per_prompt": 2
+    }
+}
+EOM
+
+echo "$request_json" > $request_file
+
+az ml online-endpoint invoke --name $huggingface_endpoint_name --request-file $request_file $workspace_info -o json > $response_file || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+python base64_to_jpeg.py --response_file $response_file
+
+
+# 9. Delete the endpoint
+az ml online-endpoint delete --name $huggingface_endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
+
+# 10. Delete the request data file
+rm $request_file
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.yaml b/cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.yaml
new file mode 100644
index 0000000000..5974189877
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.yaml
@@ -0,0 +1,118 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: AzureML-Train-Finetune-MultiModal-TextToImage-DreamBooth-Samples
+
+inputs:
+  # # Model - specify the foundation model available in the azureml system registry
+  mlflow_model_path:
+    path: azureml://registries/azureml/models/stabilityai-stable-diffusion-2-1/labels/latest
+    type: mlflow_model
+  # model_name: stabilityai/stable-diffusion-2-1
+
+  # # compute
+  compute_model_import: sample-model-import-cluster
+  compute_finetune: sample-finetune-cluster-gpu
+  process_count_per_instance: 1
+  instance_count: 1
+
+  # # Dataset
+  instance_data_dir:
+    path: ./dog-example
+    type: uri_folder
+
+outputs:
+  # Map the output of the fine-tuning job to the output of the pipeline job so that we can easily register the fine-tuned model. Registering the model is required to deploy it to an online or batch endpoint.
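+  # The CLI script reads this pipeline output as azureml://jobs/<job_name>/outputs/trained_model
+  # when registering the fine-tuned model in the workspace.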
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+  continue_on_step_failure: false
+  default_compute: azureml:sample-finetune-cluster-gpu
+
+jobs:
+  huggingface_diffusers_model_finetune_job:
+    type: pipeline
+    component: azureml:diffusers_text_to_image_dreambooth_pipeline:latest
+    inputs:
+      # Compute
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      instance_data_dir: ${{parent.inputs.instance_data_dir}}
+
+      process_count_per_instance: ${{parent.inputs.process_count_per_instance}}
+      instance_count: ${{parent.inputs.instance_count}}
+
+      # Model import args
+      download_from_source: False # True for downloading a model directly from HuggingFace
+      model_family: HuggingFaceImage
+      # # Specify model_name instead of mlflow_model if you want to use a model from the HuggingFace hub
+      mlflow_model: ${{parent.inputs.mlflow_model_path}}
+      # model_name: ${{parent.inputs.model_name}}
+
+      # # Instance prompt
+      task_name: stable-diffusion-text-to-image
+      instance_prompt: "\"A photo of a sks dog\""
+      resolution: 512
+
+      # # Prior preservation loss
+      with_prior_preservation: True
+      # Class prompt - a prompt without the unique identifier. It is used to generate "class images" for prior preservation.
+      class_prompt: "\"a photo of dog\"" # Note that the inner double quotes must be escaped.
+      num_class_images: 100 # Number of images to generate with the class prompt for prior preservation.
+      # class_data_dir: None # Specify the datastore URI of an existing uri_folder containing class images, if you have one; the training job will generate any additional images so that num_class_images are present in class_data_dir during training.
+      prior_generation_precision: fp32
+      prior_loss_weight: 1.0
+      sample_batch_size: 2 # Number of class images to generate in each batch.
+
+      # # LoRA parameters
+      # # LoRA reduces the number of trainable parameters by learning pairs of rank-decomposition matrices while freezing the original weights. This vastly reduces the storage requirement for large models adapted to specific tasks and enables efficient task-switching during deployment, all without introducing inference latency. LoRA also outperforms several other adaptation methods, including adapter, prefix-tuning, and fine-tuning.
+      apply_lora: True
+      # lora_alpha: 128
+      # lora_r: 16
+      # lora_dropout: 0.0
+      # tokenizer_max_length: 77
+
+      # # Text Encoder
+      pre_compute_text_embeddings: True
+      train_text_encoder: False
+      # text_encoder_type: CLIPTextModel
+      # text_encoder_name: openai/clip-vit-base-patch32 # HuggingFace id of the text encoder.
+      # text_encoder_use_attention_mask: False
+
+      # # UNet related
+      # class_labels_conditioning: timesteps
+
+      # # Noise Scheduler
+      noise_scheduler_name: DDPMScheduler # Optional; the default comes from the base model. If the following scheduler-related parameters are not provided, they are taken from the model's scheduler config.
+      # noise_scheduler_num_train_timesteps: 1000
+      # noise_scheduler_variance_type: fixed_small
+      # noise_scheduler_prediction_type: epsilon
+      # noise_scheduler_timestep_spacing: leading
+      # extra_noise_scheduler_args: "clip_sample_range=1.0; clip_sample=True" # Optional additional arguments passed to the noise scheduler. The arguments should be semicolon-separated key-value pairs, enclosed in double quotes.
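+      # # Note: as in the diffusers DreamBooth trainer, this scheduler only controls how noise is
+      # # added during training; inference uses the deployed model's own scheduler configuration.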
+      # offset_noise: False
+
+      # # Training related
+      num_validation_images: 3 # Number of images to generate using instance_prompt. Images are stored in the output/checkpoint-* directories. Note that this will increase the training time.
+      number_of_workers: 3
+      number_of_epochs: 15
+      max_steps: -1
+      training_batch_size: 3
+      auto_find_batch_size: False
+      learning_rate: 1e-4 # A lower learning rate is recommended when fine-tuning without LoRA.
+      # learning_rate_scheduler: warmup_linear
+      # warmup_steps: 0
+      # optimizer: adamw_hf
+      # weight_decay: 0.0
+      # gradient_accumulation_step: 1
+      # max_grad_norm: 1.0
+      precision: 32
+      random_seed: 42
+      logging_strategy: epoch
+      # logging_steps: 500 # Number of update steps between two logs if logging_strategy='steps'.
+      save_total_limit: -1 # If you face issues related to disk space, you can limit the number of checkpoints saved.
+      save_as_mlflow_model: True
+
+    outputs:
+      mlflow_model_folder: ${{parent.outputs.trained_model}}
\ No newline at end of file
diff --git a/cli/foundation-models/system/finetune/text-to-image/prepare_data.py b/cli/foundation-models/system/finetune/text-to-image/prepare_data.py
new file mode 100644
index 0000000000..fe718a6cef
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-to-image/prepare_data.py
@@ -0,0 +1,54 @@
+import argparse
+import json
+import requests
+import os
+from typing import List, Dict
+
+
+def get_data(url: str) -> List[Dict]:
+    """Send a GET request to the specified URL and parse the JSON content of the response.
+
+    :param url: The URL to send the GET request to.
+    :type url: str
+    :return: The "rows" field of the parsed data.
+    :rtype: List[Dict]
+    """
+    response = requests.get(url)
+    response.raise_for_status()
+    data = json.loads(response.content)
+    return data["rows"]
+
+
+def download_images(data: List[Dict], dataset_dir: str) -> None:
+    """Create a directory for the images and download each image into it.
+
+    :param data: The parsed data.
+    :type data: List[Dict]
+    :param dataset_dir: The directory to save the images to.
+    :type dataset_dir: str
+    """
+    # Create a directory for the images
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    # Iterate over the parsed data and download each image
+    for i, item in enumerate(data):
+        image_url = item["row"]["image"]["src"]
+        image_response = requests.get(image_url)
+
+        # Check if the request was successful
+        image_response.raise_for_status()
+
+        # Write the image data to a file
+        with open(os.path.join(dataset_dir, f"image_{i}.jpg"), "wb") as f:
+            f.write(image_response.content)
+
+
+if __name__ == "__main__":
+    """
+    Parse command-line arguments for the URL and directory name, and pass them to the
+    get_data() and download_images() functions.
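+
+    Example invocation (assumed; it mirrors how the accompanying CLI script calls this file):
+        python prepare_data.py --url "<datasets-server rows URL>" --dataset_dir dog-example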
+ """ + parser = argparse.ArgumentParser(description='Download images from a dataset.') + parser.add_argument('--url', required=True, help='URL of the dataset.') + parser.add_argument('--dataset_dir', required=True, help='Directory to save the images.') + args = parser.parse_args() + + data = get_data(args.url) + download_images(data, args.dataset_dir) diff --git a/sdk/python/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.ipynb b/sdk/python/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.ipynb index 3871c1cf87..1d0015df48 100644 --- a/sdk/python/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.ipynb +++ b/sdk/python/foundation-models/system/finetune/text-to-image/diffusers-dreambooth-dog-text-to-image.ipynb @@ -146,7 +146,7 @@ "source": [ "### 3. Pick a foundation model to fine tune\n", "\n", - "We will use the `stabilityai-stable-diffusion-2-1` model in this notebook. If you need to fine tune a model that is available on HuggingFace, but not available in `azureml` system registry, you can either register the model and use the registered model or provide huggingface model_id in the `model_name` parameter to instruct the components to pull the model directly from HuggingFace.\n", + "We will use the `runwayml-stable-diffusion-v1-5` model in this notebook. If you need to fine tune a model that is available on HuggingFace, but not available in `azureml` system registry, you can either register the model and use the registered model or provide huggingface model_id in the `model_name` parameter to instruct the components to pull the model directly from HuggingFace.\n", "\n", "Currently following models are supported:\n", "\n", @@ -155,7 +155,6 @@ "| [runwayml-stable-diffusion-v1-5](https://ml.azure.com/registries/azureml/models/runwayml-stable-diffusion-v1-5/version/8) | azureml registry |\n", "| [stabilityai-stable-diffusion-2-1](https://ml.azure.com/registries/azureml/models/stabilityai-stable-diffusion-2-1/version/8) | azureml registry |\n", "| [compvis-stable-diffusion-v1-4](https://ml.azure.com/registries/azureml/models/compvis-stable-diffusion-v1-4/version/8) | azureml registry |\n", - "| [deci-decidiffusion-v1-0](https://ml.azure.com/registries/azureml/models/deci-decidiffusion-v1-0/version/4) | azureml registry |\n", "| [Text to Image models from Huggingface's Transformer library](https://huggingface.co/models?pipeline_tag=text-to-image&library=transformers)| HuggingFace |" ] }, @@ -165,9 +164,9 @@ "metadata": {}, "outputs": [], "source": [ - "huggingface_model_name = \"stabilityai/stable-diffusion-2-1\"\n", + "huggingface_model_name = \"runwayml/stable-diffusion-v1-5\"\n", "\n", - "aml_registry_model_name = \"stabilityai-stable-diffusion-2-1\"\n", + "aml_registry_model_name = \"runwayml-stable-diffusion-v1-5\"\n", "foundation_models = registry_ml_client.models.list(aml_registry_model_name)\n", "foundation_model = max(foundation_models, key=lambda x: int(x.version))\n", "print(\n", @@ -199,11 +198,33 @@ "metadata": {}, "outputs": [], "source": [ + "import json\n", + "import requests\n", "import os\n", "\n", + "url = \"https://datasets-server.huggingface.co/rows?dataset=diffusers%2Fdog-example&config=default&split=train&offset=0&length=100\"\n", "dataset_dir = \"dog-example\"\n", - "if os.path.isdir(dataset_dir) == False:\n", - " !git clone https://huggingface.co/datasets/diffusers/dog-example" + "\n", + "response = requests.get(url)\n", + "response.raise_for_status()\n", + 
"\n", + "# Parse the JSON content\n", + "data = json.loads(response.content)\n", + "data = data[\"rows\"]\n", + "# Create a directory for the images\n", + "os.makedirs(dataset_dir, exist_ok=True)\n", + "\n", + "# Iterate over the parsed data and download each image\n", + "for i, item in enumerate(data):\n", + " image_url = item[\"row\"][\"image\"]['src']\n", + " image_response = requests.get(image_url)\n", + "\n", + " # Check if the request was successful\n", + " image_response.raise_for_status()\n", + "\n", + " # Write the image data to a file\n", + " with open(os.path.join(dataset_dir, f'image_{i}.jpeg'), 'wb') as f:\n", + " f.write(image_response.content)\n" ] }, { @@ -375,6 +396,16 @@ "}" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "process_count_per_instance = 1 # Number of gpus to be used per node for finetuning, should be equal to number of gpu per node in the compute SKU used for finetune\n", + "instance_count = 1 # Number of nodes to be used for finetuning (used for distributed training)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -420,6 +451,8 @@ " compute_model_import=cluster_name,\n", " compute_finetune=cluster_name,\n", " instance_data_dir=Input(type=AssetTypes.URI_FOLDER, path=instance_data_uri_folder.path),\n", + " process_count_per_instance=process_count_per_instance,\n", + " instance_count=instance_count,\n", " **pipeline_component_args,\n", " )\n", " return {\n", @@ -806,13 +839,6 @@ "source": [ "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {