In [1]:
# Imports

import io
import time
import torch

from diffusers import DiffusionPipeline
from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipeline.to("cuda")

Loading pipeline components...: 100%|██████████| 7/7 [00:07<00:00,  1.01s/it]


StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.31.0",
  "_name_or_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [3]:
# Load the existing dataset from Hugging Face (or create a new one)
dataset_name = "DiegoP-S/DatasetSynthesis"
dataset = load_dataset(dataset_name)
existing_data = {key: list(dataset["train"][key]) for key in dataset["train"].features.keys()}



In [None]:
samples_per_class = 100
batch_size = 20
name = "fruit"
classes = {"Apple", "Banana", "Orange"}
classes_complete = 0
start_time = time.time()

for class_name in classes:
    images_complete = 0
    class_start_time = time.time()
    for i in range(samples_per_class):
        

        # Generate the image using the pipeline
        print(class_name)
        generated_image = pipeline(f"{class_name} {name}").images[0]

        # Convert the image to a binary format (e.g., PNG)
        img_byte_arr = io.BytesIO()
        generated_image.save(img_byte_arr, format='PNG')
        img_byte_arr = img_byte_arr.getvalue()

        # You can now upload the binary data to Hugging Face Dataset
        new_data = {
            "file_name": [f"{class_name}_generated_image_{i}.png"],
            "image": [img_byte_arr]
        }

        print(new_data["file_name"])

        # Append the new data
        for key in new_data.keys():
            existing_data[key].extend(new_data[key])

        images_complete += 1

        if i % batch_size == 1:
            # Create a new dataset with the updated data
            updated_dataset = Dataset.from_dict(existing_data)

            # Push the updated dataset to Hugging Face
            updated_dataset.push_to_hub(dataset_name)
            print(f"Images complete in class {class_name} ({classes_complete + 1} / {len(classes)}): {images_complete}/{samples_per_class}. Estimated time remaining in class: {samples_per_class / (i) * (time.time() - class_start_time) - (time.time() - class_start_time)}s")
    classes_complete += 1
    print(f"Classes complete: {classes_complete}/{len(classes)}. Estimated time remaining: {len(classes) / classes_complete * (time.time() - start_time) - (time.time() - start_time)}s")

Apple


100%|██████████| 50/50 [00:23<00:00,  2.14it/s]


['Apple_generated_image_0.png']
Apple


 70%|███████   | 35/50 [00:16<00:06,  2.19it/s]

In [None]:
for key in existing_data.keys():
    print(existing_data['file_name'])

['apple.png', 'apple.png', 'apple2.png', 'apple_generated_image.png', 'apple_generated_image_1.png', 'TEST.png', 'Apple_generated_image_0.png', 'Banana_generated_image_0.png', 'Orange_generated_image_0.png']
['apple.png', 'apple.png', 'apple2.png', 'apple_generated_image.png', 'apple_generated_image_1.png', 'TEST.png', 'Apple_generated_image_0.png', 'Banana_generated_image_0.png', 'Orange_generated_image_0.png']


In [None]:
# Create a new dataset with the updated data
updated_dataset = Dataset.from_dict(existing_data)



# Push the updated dataset to Hugging Face
updated_dataset.push_to_hub(dataset_name)
print(f"Dataset updated: https://huggingface.co/datasets/{dataset_name}")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 166.64ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


Dataset updated: https://huggingface.co/datasets/DiegoP-S/DatasetSynthesis
