In [None]:
# Imports

import io
import time
import torch

from diffusers import DiffusionPipeline
from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# Load the Stable Diffusion v1.5 pipeline with float16 weights.
pipeline = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)
# Move the pipeline to the CUDA device; left as the last expression so the
# cell still displays the pipeline configuration.
pipeline.to("cuda")

Loading pipeline components...: 100%|██████████| 7/7 [00:07<00:00,  1.07s/it]


StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.31.0",
  "_name_or_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [None]:
# Load the existing dataset from the Hugging Face Hub.
# NOTE(review): nothing in this cell creates a new dataset, so the repo
# presumably has to exist already for load_dataset to succeed — confirm.
dataset_name = "DiegoP-S/DatasetSynthesis"
dataset = load_dataset(dataset_name)



In [8]:
# List the keys of the loaded dataset; the generation cell looks classes up
# by these keys when checking for already-generated images.
print(dataset.keys())

dict_keys(['Bell_Pepper', 'Soybean', 'Apple', 'Pomegranate', 'Chilli_Pepper', 'Ginger', 'Watermelon', 'Cabbage', 'Sweet_Potato', 'Capsicum', 'Kiwi', 'Corn', 'Garlic', 'Potato', 'Beetroot', 'Jalapeno', 'Orange', 'Carrot', 'Pineapple', 'Sweetcorn', 'Tomato', 'Peas', 'Banana', 'Cauliflower', 'Mango', 'Paprika'])


In [None]:
# Generation settings for this run.
samples_per_class = 1  # images to generate for each class
batch_size = 1  # push to the Hub after this many newly generated images
group = "test/elephant"  # value stored in the "group" column of every new row
# classes = {"Apple", "Banana", "Beetroot", "Bell Pepper", "Cabbage", "Capsicum", "Carrot", "Cauliflower", "Chilli Pepper", "Corn", "Garlic", "Ginger", "Jalapeno", "Kiwi", "Mango", "Orange", "Paprika", "Peas", "Pineapple", "Pomegranate", "Potato", "Soybean", "Sweet Potato", "Sweetcorn", "Tomato", "Watermelon", "Pear", "Grapes", "Cucumber", "Onion", "Lemon", "Spinach", "Turnip", "Eggplant"}
classes = {"Elephant"}


def _push_split(split_data):
    """Build a DatasetDict from accumulated column data and push it to the Hub.

    Args:
        split_data: Mapping of split name to a dict of column lists
            ("image", "group", "file_name").

    Returns:
        The DatasetDict that was pushed to `dataset_name`.
    """
    pushed = DatasetDict(
        {name: Dataset.from_dict(columns) for name, columns in split_data.items()}
    )
    pushed.push_to_hub(dataset_name)
    return pushed


for class_name in classes:
    # The keys of the loaded dataset use underscores where the class names use
    # spaces ("Bell_Pepper" vs "Bell Pepper"), so normalise the name before
    # looking the class up and before publishing under it.
    split_name = class_name.replace(" ", "_")
    split = {split_name: {"image": [], "group": [], "file_name": []}}

    # Read the existing file names once per class instead of once per sample.
    # NOTE(review): file names are still built from the un-normalised class
    # name; confirm this matches how existing rows were named.
    if split_name in dataset.keys():
        existing_file_names = set(dataset[split_name]["file_name"])
    else:
        existing_file_names = set()

    unpushed = 0  # images generated since the last push
    for i in range(samples_per_class):
        file_name = f"{class_name}_generated_image_{i}.png"

        # Skip images that are already in the dataset
        if file_name in existing_file_names:
            continue

        # Generate the image using the pipeline (the class name is the prompt)
        print(class_name)
        generated_image = pipeline(f"{class_name}").images[0]

        # Serialise the image to PNG bytes
        img_byte_arr = io.BytesIO()
        generated_image.save(img_byte_arr, format='PNG')
        img_byte_arr = img_byte_arr.getvalue()

        # Record the new row
        split[split_name]["file_name"].append(file_name)
        split[split_name]["group"].append(group)
        split[split_name]["image"].append(img_byte_arr)
        unpushed += 1

        # Push once a full batch of new images has accumulated. The previous
        # check (`i % batch_size == 1`) could never be true for batch_size == 1
        # and counted skipped indices as well as generated images.
        # NOTE(review): each push contains only the images generated in this
        # run for this class; confirm whether push_to_hub replaces the rows
        # already stored for that split and, if so, merge them in first.
        if unpushed >= batch_size:
            dataset_updated = _push_split(split)
            unpushed = 0

    # Push whatever is left over from a final, partially filled batch.
    if unpushed:
        dataset_updated = _push_split(split)

Doing Onion_generated_image_0.png
Doing Lemon_generated_image_0.png
Doing Cucumber_generated_image_0.png
Doing Spinach_generated_image_0.png
Doing Jalapeno_generated_image_22.png
Doing Sweet Potato_generated_image_0.png
Doing Turnip_generated_image_0.png
Doing Bell Pepper_generated_image_0.png
Doing Chilli Pepper_generated_image_0.png
Doing Eggplant_generated_image_0.png
Doing Pear_generated_image_0.png
Doing Grapes_generated_image_0.png


In [None]:
# Build a single Dataset from the column data held in `existing_data`.
# NOTE(review): `existing_data` is not assigned in any cell visible in this
# notebook — confirm where it is defined; on a fresh kernel this line would
# presumably raise NameError.
updated_dataset = Dataset.from_dict(existing_data)


# Push the updated dataset to the Hugging Face repo named by `dataset_name`
# and print the URL of the dataset page.
updated_dataset.push_to_hub(dataset_name)
print(f"Dataset updated: https://huggingface.co/datasets/{dataset_name}")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 90.90ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]


Dataset updated: https://huggingface.co/datasets/DiegoP-S/DatasetSynthesis
