# Setup -> Curate Image -> Tag Image 까지는 필수

# 📊 Dataset Maker by Hollowstrawberry

This is based on the work of [Kohya-ss](https://github.com/kohya-ss/sd-scripts) and [Linaqruf](https://colab.research.google.com/github/Linaqruf/kohya-trainer/blob/main/kohya-LoRA-dreambooth.ipynb). Thank you!

### ⭕ Disclaimer
The purpose of this document is to research bleeding-edge technologies in the field of machine learning.  
Please read and follow the [Google Colab guidelines](https://research.google.com/colaboratory/faq.html) and its [Terms of Service](https://research.google.com/colaboratory/tos_v3.html).

| |GitHub|🇬🇧 English|🇪🇸 Spanish|
|:--|:-:|:-:|:-:|
| 🏠 **Homepage** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/hollowstrawberry/kohya-colab) | | |
| 📊 **Dataset Maker** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/hollowstrawberry/kohya-colab/blob/main/Dataset_Maker.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Dataset_Maker.ipynb) | [![Abrir en Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge-spanish.svg)](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Spanish_Dataset_Maker.ipynb) |
| ⭐ **Lora Trainer** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer.ipynb) | [![Abrir en Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge-spanish.svg)](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Spanish_Lora_Trainer.ipynb) |
| 🌟 **XL Lora Trainer** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer_XL.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer_XL.ipynb) |  |
| 🌟 **Legacy XL Trainer** | [![GitHub](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/github.svg)](https://github.com/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer_XL_Legacy.ipynb) | [![Open in Colab](https://raw.githubusercontent.com/hollowstrawberry/kohya-colab/main/assets/colab-badge.svg)](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer_XL_Legacy.ipynb) |  |

In [None]:
import os
from IPython import get_ipython
from IPython.display import display, Markdown

COLAB = True

# Colab ships its own output-clearing helper; elsewhere fall back to IPython's.
if COLAB:
  from google.colab.output import clear as clear_output
else:
  from IPython.display import clear_output

#@title ## 🚩 Start Here

#@markdown ### 1️⃣ Setup
#@markdown This cell will load some requirements and create the necessary folders in your Google Drive. <p>
#@markdown Your project name can't contain spaces but it can contain a single / to make a subfolder in your dataset.
project_name = "Pastel_Dreamscape" #@param {type:"string"}
project_name = project_name.strip()
#@markdown The folder structure doesn't matter and is purely for comfort. Make sure to always pick the same one. I like organizing by project.
folder_structure = "Organize by project (MyDrive/Loras/project_name/dataset)" #@param ["Organize by category (MyDrive/lora_training/datasets/project_name)", "Organize by project (MyDrive/Loras/project_name/dataset)"]

# Reject empty names, names containing a forbidden character, more than one "/",
# and a leading/trailing "/" (which would produce an empty base or subfolder).
if (not project_name
    or any(c in project_name for c in " .()\"'\\")
    or project_name.count("/") > 1
    or project_name.startswith("/")
    or project_name.endswith("/")):
  print("Please write a valid project_name.")
else:
  if COLAB and not os.path.exists('/content/drive'):
    from google.colab import drive
    print("📂 Connecting to Google Drive...")
    drive.mount('/content/drive')

  # "base/sub" splits into project folder and dataset subfolder; a plain name
  # is used for both.
  if "/" in project_name:
    project_base, project_subfolder = project_name.rsplit("/", 1)
  else:
    project_base = project_subfolder = project_name

  # expanduser: os.makedirs does not expand "~", so the raw string would
  # create a literal "~" directory under the current working directory.
  root_dir = "/content" if COLAB else os.path.expanduser("~/Loras")
  deps_dir = os.path.join(root_dir, "deps")

  if "/Loras" in folder_structure:
    # Organize by project: <main>/<project>/dataset[/<subfolder>]
    main_dir      = os.path.join(root_dir, "drive/MyDrive/Loras") if COLAB else root_dir
    config_folder = os.path.join(main_dir, project_base)
    images_folder = os.path.join(main_dir, project_base, "dataset")
    if "/" in project_name:
      images_folder = os.path.join(images_folder, project_subfolder)
  else:
    # Organize by category: <main>/config/<project> and <main>/datasets/<project>
    main_dir      = os.path.join(root_dir, "drive/MyDrive/lora_training") if COLAB else root_dir
    config_folder = os.path.join(main_dir, "config", project_name)
    images_folder = os.path.join(main_dir, "datasets", project_name)

  for required_dir in [main_dir, deps_dir, images_folder, config_folder]:
    os.makedirs(required_dir, exist_ok=True)

  print(f"✅ Project {project_name} is ready!")
  # The later cells check for this flag before running.
  step1_installed_flag = True


📂 Connecting to Google Drive...
Mounted at /content/drive
✅ Project Pastel_Dreamscape is ready!


In [None]:
# Step 3 (part 1): read the form parameters, install the dependencies once per
# session, and import the libraries used by the curation logic below.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

#@markdown ### 3️⃣ Curate your images
#@markdown We will find duplicate images with the FiftyOne AI and delete them. <p>
#@markdown This is how similar 2 images must be to be marked for deletion. I recommend 0.97 to 0.99:
similarity_threshold = 0.985 #@param {type:"number"}
#@markdown You can choose between only deleting the duplicates, or additionally opening an interactive area below this cell that lets you visualize all your images and manually mark with `delete` to the ones you don't like. <p>
#@markdown If the interactive area appears blank for over a minute, try enabling cookies and removing tracking protection for the Google Colab website, as they may break it.
#@markdown Regardless, you can save your changes by sending Enter in the input box above the interactive area.<p>
action = "Delete duplicates" #@param ["Delete duplicates","Mark duplicates and open interactive area","Open interactive area"]
#@markdown To open the interactive area in a new tab INSTEAD of below, you need an ngrok account.
open_in_new_tab = False #@param {type:"boolean"}
ngrok_token = "" #@param {type:"string"}


os.chdir(root_dir)
model_name = "clip-vit-base32-torch"
supported_types = (".png", ".jpg", ".jpeg")
# Counts every directory entry, not only images; non-image files are rejected later.
img_count = len(os.listdir(images_folder))
# Embedding batch size: at most 250, and never more than the number of files.
batch_size = min(250, img_count)

# Install only once per session; the flag is set after a successful install.
if "step3_installed_flag" not in globals():
  print("🏭 Installing dependencies...\n")
  !pip -q install fiftyone ftfy pyngrok
  !pip -q install fiftyone-db-ubuntu2204
  # `_exit_code` holds the status of the most recent `!` command, so only the
  # second pip invocation is actually checked here.
  if not get_ipython().__dict__['user_ns']['_exit_code']:
    clear_output()
    step3_installed_flag = True
  else:
    print("❌ Error installing dependencies, attempting to continue anyway...")

# Set before fiftyone is imported; switched to "1" later when the app is opened.
# NOTE(review): presumably this keeps fiftyone from starting its server early - confirm.
os.environ["FIFTYONE_SERVER"] = "0"
import numpy as np
import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F
from sklearn.metrics.pairwise import cosine_similarity
from pyngrok import ngrok, conf
from portpicker import pick_unused_port

non_images = [f for f in os.listdir(images_folder) if not f.lower().endswith(supported_types)]
if non_images:
  print(f"💥 Error: Found non-image file {non_images[0]} - This program doesn't allow it. Sorry! Use the Extras at the bottom to clean the folder.")
elif img_count == 0:
  print(f"💥 Error: No images found in {images_folder}")
else:
  print("\n💿 Analyzing dataset...\n")
  dataset = fo.Dataset.from_dir(images_folder, dataset_type=fo.types.ImageDirectory)
  if "duplicates" in action:
    model = foz.load_zoo_model(model_name)
    embeddings = dataset.compute_embeddings(model, batch_size=batch_size)

    batch_embeddings = np.array_split(embeddings, batch_size)
    similarity_matrices = []
    max_size_x = max(array.shape[0] for array in batch_embeddings)
    max_size_y = max(array.shape[1] for array in batch_embeddings)

    for i, batch_embedding in enumerate(batch_embeddings):
      similarity = cosine_similarity(batch_embedding)
      #Pad 0 for np.concatenate
      padded_array = np.zeros((max_size_x, max_size_y))
      padded_array[0:similarity.shape[0], 0:similarity.shape[1]] = similarity
      similarity_matrices.append(padded_array)

    similarity_matrix = np.concatenate(similarity_matrices, axis=0)
    similarity_matrix = similarity_matrix[0:embeddings.shape[0], 0:embeddings.shape[0]]

    similarity_matrix = cosine_similarity(embeddings)
    similarity_matrix -= np.identity(len(similarity_matrix))

    dataset.match(F("max_similarity") > similarity_threshold)
    dataset.tags = ["delete", "has_duplicates"]

    id_map = [s.id for s in dataset.select_fields(["id"])]
    samples_to_remove = set()
    samples_to_keep = set()

    for idx, sample in enumerate(dataset):
      if sample.id not in samples_to_remove:
        # Keep the first instance of two duplicates
        samples_to_keep.add(sample.id)

        dup_idxs = np.where(similarity_matrix[idx] > similarity_threshold)[0]
        for dup in dup_idxs:
            # We kept the first instance so remove all other duplicates
            samples_to_remove.add(id_map[dup])

        if len(dup_idxs) > 0:
            sample.tags.append("has_duplicates")
            sample.save()
      else:
        sample.tags.append("delete")
        sample.save()

    sidebar_groups = fo.DatasetAppConfig.default_sidebar_groups(dataset)
    for group in sidebar_groups[1:]:
      group.expanded = False
    dataset.app_config.sidebar_groups = sidebar_groups
    dataset.save()

  if "interactive" in action:
    clear_output()
    os.environ["FIFTYONE_SERVER"] = "1"
    port = pick_unused_port()
    session = fo.launch_app(dataset, port=port, auto=not open_in_new_tab)
    if open_in_new_tab:
      conf.get_default().auth_token = ngrok_token
      public_url = ngrok.connect(port).public_url
      print(f"🟢 Session open at {public_url}")

    print("❗ Wait a minute for the session to load. If it doesn't, read above.")
    print("❗ When it's ready, you'll see a grid of your images.")
    print("❗ On the left side enable \"sample tags\" to visualize the images marked for deletion.")
    print("❗ You can mark your own images with the \"delete\" label by selecting them and pressing the tag icon at the top.")
    input("⭕ When you're done, enter something here to save your changes: ")

    print("💾 Saving...")

  marked = [s for s in dataset if "delete" in s.tags]
  dataset.delete_samples(marked)
  previous_folder = images_folder[:images_folder.rfind("/")]
  dataset.export(export_dir=os.path.join(images_folder, project_subfolder), dataset_type=fo.types.ImageDirectory)

  temp_suffix = "_temp"
  !mv {images_folder} {images_folder}{temp_suffix}
  !mv {images_folder}{temp_suffix}/{project_subfolder} {images_folder}
  !rm -r {images_folder}{temp_suffix}

  if "interactive" in action:
    session.refresh()
    fo.close_app()
    clear_output()

  print(f"\n✅ Removed {len(marked)} images from dataset. You now have {len(os.listdir(images_folder))} images.")



💿 Analyzing dataset...

 100% |███████████████████| 25/25 [17.5ms elapsed, 0s remaining, 1.4K samples/s]      


INFO:eta.core.utils: 100% |███████████████████| 25/25 [17.5ms elapsed, 0s remaining, 1.4K samples/s]      


Downloading model from 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt'...


INFO:fiftyone.core.models:Downloading model from 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt'...


 100% |██████|    2.6Gb/2.6Gb [19.8s elapsed, 0s remaining, 122.4Mb/s]      


INFO:eta.core.utils: 100% |██████|    2.6Gb/2.6Gb [19.8s elapsed, 0s remaining, 122.4Mb/s]      


Downloading CLIP tokenizer...


INFO:fiftyone.utils.clip.zoo:Downloading CLIP tokenizer...


 100% |█████|   10.4Mb/10.4Mb [65.4ms elapsed, 0s remaining, 158.2Mb/s]     


INFO:eta.core.utils: 100% |█████|   10.4Mb/10.4Mb [65.4ms elapsed, 0s remaining, 158.2Mb/s]     


 100% |███████████████████| 25/25 [8.6s elapsed, 0s remaining, 2.9 samples/s] 


INFO:eta.core.utils: 100% |███████████████████| 25/25 [8.6s elapsed, 0s remaining, 2.9 samples/s] 


 100% |███████████████████| 25/25 [455.0ms elapsed, 0s remaining, 54.9 samples/s]      


INFO:eta.core.utils: 100% |███████████████████| 25/25 [455.0ms elapsed, 0s remaining, 54.9 samples/s]      



✅ Removed 0 images from dataset. You now have 25 images.


In [None]:
# Step 4 (part 1): read the tagging parameters and, once per session, install
# kohya-ss/sd-scripts into its own virtual environment.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

#@markdown ### 4️⃣ Tag your images
#@markdown We will be using AI to automatically tag your images, specifically [Waifu Diffusion](https://huggingface.co/SmilingWolf/wd-eva02-large-tagger-v3) in the case of anime and [BLIP](https://huggingface.co/spaces/Salesforce/BLIP) in the case of photos. <p>
#@markdown Giving tags/captions to your images allows for much better training. This process takes 5 minutes to install and 5 more minutes to tag a thousand images. It goes through all subfolders if you have any. <p>
method = "Photo captions" #@param ["Anime tags", "Photo captions"]
#@markdown **Anime:** Using both taggers will be more accurate than one or the other. Lower threshold will yield more tags, try 0.25 for concepts and 0.50 for styles. You should include character names if you're not training a character.
tagger = "Both" #@param ["Both","SmilingWolf/wd-eva02-large-tagger-v3","SmilingWolf/wd-vit-large-tagger-v3"]
tag_threshold = 0.25 #@param {type:"slider", min:0.0, max:1.0, step:0.01}
blacklist_tags = "virtual youtuber, parody, style parody, official alternate costume, official alternate hairstyle, official alternate hair length, alternate costume, alternate hairstyle, alternate hair length, alternate hair color" #@param {type:"string"}
include_character_names = False #@param {type:"boolean"}
#@markdown **Photos:** The minimum and maximum length of tokens/words in each caption.
caption_min = 10 #@param {type:"number"}
caption_max = 75 #@param {type:"number"}

# When character names are excluded the threshold is set to 1.1.
# NOTE(review): presumably no tagger confidence can exceed 1.0, which would
# make 1.1 disable character tags entirely - confirm against the tagger script.
character_threshold = tag_threshold if include_character_names else 1.1
# Normalise the blacklist to "a,b,c" and wrap it in double quotes so the shell
# passes it to --undesired_tags as a single argument.
undesired_tags = '"' + ','.join([t.strip() for t in blacklist_tags.split(",") if t.strip()]) + '"'

kohya_dir = "/content/kohya"
venv_python = os.path.join(kohya_dir, "venv/bin/python")
venv_pip = os.path.join(kohya_dir, "venv/bin/pip")

# Install only once per session. Unlike step 3, the flag is set without
# checking the exit codes of the commands.
if "step4_installed_flag" not in globals():
  print("\n🏭 Installing dependencies...\n")
  !apt install -y python3.10-venv -qq
  !git clone https://github.com/kohya-ss/sd-scripts {kohya_dir}
  os.chdir(kohya_dir)
  # Pin sd-scripts to a fixed commit.
  !git reset --hard e89653975ddf429cdf0c0fd268da0a5a3e8dba1f
  !python3.10 -m venv venv
  !{venv_pip} install -r requirements.txt
  !{venv_pip} install fairscale==0.4.13 timm==0.6.12
  !{venv_pip} install onnx onnxruntime-gpu==1.20.1 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
  !{venv_pip} uninstall -y rich
  step4_installed_flag = True

print("\n🚶‍♂️ Launching program...\n")
os.chdir(kohya_dir)

if "Anime" in method:
  tagger_models = [
    "SmilingWolf/wd-eva02-large-tagger-v3",
    "SmilingWolf/wd-vit-large-tagger-v3"
  ] if tagger == "Both" else [tagger]

  for i, tagger_model in enumerate(tagger_models):
    append_tags = "--append_tags" if i > 0 else ""
    !{venv_python} finetune/tag_images_by_wd14_tagger.py \
      {images_folder} \
      --repo_id={tagger_model} \
      --general_threshold={tag_threshold} \
      --character_threshold={character_threshold} \
      --batch_size=8 \
      --max_data_loader_n_workers=2 \
      --caption_extension=.txt \
      --undesired_tags {undesired_tags} \
      --onnx --recursive --remove_underscore {append_tags}

  if not get_ipython().__dict__['user_ns']['_exit_code']:
    # Count tags
    from collections import Counter
    text_files = []
    for root, dirs, files in os.walk(images_folder):
      for file in files:
        if file.lower().endswith(".txt"):
          text_files.append(os.path.join(root, file))
    top_tags = Counter()
    for file in text_files:
      with open(file, 'r') as f:
        tags = [t.strip() for t in f.read().split(",")]
      top_tags.update(tags)

    clear_output()
    print(f"📊 Tagging complete. Here are the top 50 tags in your dataset:")
    print("\n".join(f"{k} ({v})" for k, v in top_tags.most_common(50)))

else:
  !{venv_python} finetune/make_captions.py \
    {images_folder} \
    --beam_search \
    --max_data_loader_n_workers=2 \
    --batch_size=8 \
    --min_length={caption_min} \
    --max_length={caption_max} \
    --caption_extension=.txt \
    --recursive

  if not get_ipython().__dict__['user_ns']['_exit_code']:
    import os
    import random
    from IPython.display import clear_output

    # ✅ caption 파일 경로 수집
    caption_files = []
    for root, dirs, files in os.walk(images_folder):
      for file in files:
        if file.lower().endswith(".txt"):
          caption_files.append(os.path.join(root, file))

    # ✅ 10개 샘플 출력
    sample = []
    for txt_path in random.sample(caption_files, min(10, len(caption_files))):
      with open(txt_path, 'r') as f:
        sample.append(f.read())

    clear_output()
    print(f"📊 Captioning complete. Here are {len(sample)} example captions from your dataset:")
    print("\n".join(sample))

os.chdir(root_dir)

📊 Captioning complete. Here are 10 example captions from your dataset:
a person is standing on a beach with a surfboard

a purple sky with clouds and a pink sky

a field of lavender with the sun setting in the background

a beach with a pink sky and a few people

a field with tall grass and a foggy sky

a couple of people on a beach with a surfboard

a person walking on a beach with a surfboard

a boat is sitting on the frozen water

a field of purple flowers next to a lake

a mountain range with a pink sky in the background



In [None]:
# Step 5 (part 1): form parameters for the tag-curation logic further down in
# this cell. All tag fields are comma-separated strings.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

# @markdown ### 5️⃣ Curate your tags - Optional for Keyword
# @markdown Modify your dataset's tags. You can run this cell multiple times with different parameters. <p>

#@markdown Put an activation tag at the start of every text file. This is useful to make learning better and activate your Lora easier. Set `keep_tokens` to 1 when training.<p>
#@markdown Common tags that are removed such as hair color, etc. will be "absorbed" by your activation tag.
global_activation_tag = "" #@param {type:"string"}
remove_tags = "" #@param {type:"string"}
#@markdown &nbsp;

#@markdown In this advanced section, you can search text files containing matching tags, and replace them with less/more/different tags. If you select the checkbox below, any extra tags will be put at the start of the file, letting you assign different activation tags to different parts of your dataset. Still, you may want a more advanced tool for this.
search_tags = "" #@param {type:"string"}
replace_with = "" #@param {type:"string"}
# "OR": a file matches if it has any search tag; "AND": only if it has all of them.
search_mode = "OR" #@param ["OR", "AND"]
new_becomes_activation_tag = False #@param {type:"boolean"}
#@markdown These may be useful sometimes. Will remove existing activation tags, be careful.
sort_alphabetically = False #@param {type:"boolean"}
remove_duplicates = False #@param {type:"boolean"}

def split_tags(tagstr):
  """Split a comma-separated tag string into a list of trimmed, non-empty tags."""
  trimmed = (piece.strip() for piece in tagstr.split(","))
  return [tag for tag in trimmed if tag]

# Step 5 (part 2): rewrite every .txt tag file under the dataset folder
# according to the form parameters (remove, search/replace, activation tags).
activation_tag_list = split_tags(global_activation_tag)
remove_tags_list = split_tags(remove_tags)
search_tags_list = split_tags(search_tags)
replace_with_list = split_tags(replace_with)
# Replacement tags that were not among the searched ones are "new" tags...
replace_new_list = [t for t in replace_with_list if t not in search_tags_list]

# ...and the rest are searched tags that must be kept (re-added after removal).
replace_with_list = [t for t in replace_with_list if t not in replace_new_list]
# Tags are inserted at index 0 one by one, so reverse them first to keep the
# order the user typed.
replace_new_list.reverse()
activation_tag_list.reverse()

remove_count = 0
replace_count = 0

text_files = []
for root, dirs, files in os.walk(images_folder):
  for file in files:
    if file.lower().endswith(".txt"):
      text_files.append(os.path.join(root, file))

for txt in text_files:

  # `txt` is already a complete path built from os.walk; joining it with
  # images_folder again is redundant and breaks for relative dataset paths.
  with open(txt, 'r') as f:
    tags = [s.strip() for s in f.read().split(",") if s.strip()]

  if remove_duplicates:
    # dict.fromkeys keeps the first occurrence of each tag in its original
    # position; a plain set() would scramble the tag order.
    tags = list(dict.fromkeys(tags))
  if sort_alphabetically:
    tags.sort()

  for rem in remove_tags_list:
    if rem in tags:
      remove_count += 1
      tags.remove(rem)

  matches_all = "AND" in search_mode and all(r in tags for r in search_tags_list)
  matches_any = "OR" in search_mode and any(r in tags for r in search_tags_list)
  if matches_all or matches_any:
    replace_count += 1
    for rem in search_tags_list:
      if rem in tags:
        tags.remove(rem)
    for add in replace_with_list:
      if add not in tags:
        tags.append(add)
    for new in replace_new_list:
      if new_becomes_activation_tag:
        # Move (or add) the new tag to the very front of the file.
        if new in tags:
          tags.remove(new)
        tags.insert(0, new)
      else:
        if new not in tags:
          tags.append(new)

  # Global activation tags always end up first, ahead of any per-file ones.
  for act in activation_tag_list:
    if act in tags:
      tags.remove(act)
    tags.insert(0, act)

  with open(txt, 'w') as f:
    f.write(", ".join(tags))

if global_activation_tag:
  print(f"\n📎 Applied new activation tag(s): {', '.join(activation_tag_list)}")
if remove_tags:
  print(f"\n🚮 Removed {remove_count} tags.")
if search_tags:
  print(f"\n💫 Replaced in {replace_count} files.")
print("\n✅ Done! Check your updated tags in the Extras below.")


In [None]:
#@markdown ### 6️⃣ Ready
#@markdown You should be ready to [train your Lora](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer.ipynb)!

from IPython.display import Markdown, display

# Render a clickable heading that links to the Lora trainer notebook.
trainer_link = "### 🦀 [Click here to open the Lora trainer](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer.ipynb)"
display(Markdown(trainer_link))


### 🦀 [Click here to open the Lora trainer](https://colab.research.google.com/github/hollowstrawberry/kohya-colab/blob/main/Lora_Trainer.ipynb)

## *️⃣ Extras

In [None]:
# Extra: print the most frequent tags across every .txt file in the dataset.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

#@markdown ### 📈 Analyze Tags
#@markdown Perhaps you need another look at your dataset.
show_top_tags = 50 #@param {type:"number"}

text_files = []
for root, dirs, files in os.walk(images_folder):
  for file in files:
    if file.lower().endswith(".txt"):
      text_files.append(os.path.join(root, file))

from collections import Counter
top_tags = Counter()
for file in text_files:
  with open(file, 'r') as f:
    # Ignore empty entries (empty files, trailing commas) so that a blank
    # "tag" never appears in the ranking.
    tags = [t.strip() for t in f.read().split(",") if t.strip()]
  top_tags.update(tags)

# A "number" form field may hold a float, which most_common() rejects.
top_n = int(show_top_tags)
print(f"📊 Top {top_n} tags:")
for k, v in top_tags.most_common(top_n):
  print(f"{k} ({v})")

In [None]:
#@markdown ### 📂 Unzip dataset
#@markdown It's much slower to upload individual files to your Drive, so you may want to upload a zip if you have your dataset in your computer.
zip = "/content/drive/MyDrive/Loras/example.zip" #@param {type:"string"}
extract_to = "/content/drive/MyDrive/Loras/example/dataset" #@param {type:"string"}

import os
import zipfile

# Mount Google Drive first when it is not available yet.
drive_is_mounted = os.path.exists('/content/drive')
if not drive_is_mounted:
  from google.colab import drive
  print("📂 Connecting to Google Drive...")
  drive.mount('/content/drive')

# Make sure the destination exists, then unpack the whole archive into it.
os.makedirs(extract_to, exist_ok=True)

with zipfile.ZipFile(zip, 'r') as archive:
  archive.extractall(extract_to)

print("✅ Done")


In [None]:
#@markdown ### 🔢 Count datasets
#@markdown Google Drive makes it impossible to count the files in a folder, so this will show you the file counts in all folders and subfolders.
folder = "/content/drive/MyDrive/Loras" #@param {type:"string"}

import os

# Import google.colab only when a mount is actually needed, like the other cells.
if not os.path.exists('/content/drive'):
  from google.colab import drive
  print("📂 Connecting to Google Drive...\n")
  drive.mount('/content/drive')

# Maps a folder path (relative to the parent of `folder`) to its summary line.
# Only folders that contain at least one image are recorded.
tree = {}
# NOTE(review): entries are matched against bare directory names, which never
# contain "/", so "/output" cannot match anything - confirm the intended rule.
exclude = ("_logs", "/output")
for root, dirs, files in os.walk(folder, topdown=True):
  # Prune in place so os.walk does not descend into excluded directories.
  dirs[:] = [d for d in dirs if all(ex not in d for ex in exclude)]
  images = len([f for f in files if f.lower().endswith((".png", ".jpg", ".jpeg"))])
  if not images:
    continue
  captions = len([f for f in files if f.lower().endswith(".txt")])
  others = len(files) - images - captions
  path = root[folder.rfind("/")+1:]
  tree[path] = f"{images:>4} images | {captions:>4} captions |"
  if others:
    tree[path] += f" {others:>4} other files"

if tree:
  # Pad to the longest printed path so the columns line up.
  pad = max(len(k) for k in tree)
  print("\n".join(f"📁{k.ljust(pad)} | {v}" for k, v in tree.items()))
else:
  # Missing folder or no images anywhere: report it instead of crashing on max().
  print(f"💥 Error: No images found in {folder}")


In [None]:
# Extra: convert every image in the dataset folder (and its subfolders) to
# .jpeg, halving the dimensions of jpegs larger than 2,000,000 bytes.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

from PIL import Image
import os
# Lift Pillow's pixel-count limit so very large images can still be opened.
Image.MAX_IMAGE_PIXELS = None

#@markdown ### 🖼️ Reduce dataset filesize
#@markdown This will convert all images in the project folder to jpeg, reducing filesize without affecting quality too much. This can also solve some errors.
location = images_folder

# Full paths are used instead of os.chdir so the notebook's working directory
# is left untouched when this cell finishes.
for current_dir in [entry[0] for entry in os.walk(location)]:
    converted = False
    for file_name in os.listdir(current_dir):
        file_path = os.path.join(current_dir, file_name)
        try:
            # Convert png to jpeg (extension checks are case-insensitive, like
            # everywhere else in this notebook)
            if file_name.lower().endswith(".png"):
                if not converted:
                    print(f"Converting {current_dir}")
                    converted = True
                new_path = os.path.splitext(file_path)[0] + ".jpeg"
                # Close the source file before deleting it.
                with Image.open(file_path) as source:
                    rgb_image = source.convert("RGB")
                rgb_image.save(new_path, quality=95)
                os.remove(file_path)
                file_path = new_path
                file_name = os.path.basename(new_path)
            # Resize large jpegs
            if file_name.lower().endswith((".jpeg", ".jpg")) and os.path.getsize(file_path) > 2000000:
                if not converted:
                    print(f"Converting {current_dir}")
                    converted = True
                with Image.open(file_path) as source:
                    halved = source.resize((source.width // 2, source.height // 2))
                halved.save(file_path, quality=95)
            # Rename jpg to jpeg
            if file_name.lower().endswith(".jpg"):
                if not converted:
                    print(f"Converting {current_dir}")
                    # Previously missing: a folder with only renames never
                    # printed its closing "Converted" line.
                    converted = True
                os.rename(file_path, os.path.splitext(file_path)[0] + ".jpeg")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"An error occurred while processing {file_name}: {e}")
    if converted:
        print(f"Converted {current_dir}")


In [None]:
# Extra: delete every file under the dataset folder whose name does not end in
# .png, .jpg or .jpeg (case-insensitive). This includes .txt tag/caption files.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")

#@markdown ### 🚮 Clean folder
#@markdown Careful! Deletes all non-image files in the project folder.

# `find` recurses into subfolders; `-type f` restricts the deletion to regular
# files, so the directories themselves are kept.
!find {images_folder} -type f ! \( -iname '*.png' -o -iname '*.jpg' -o -iname '*.jpeg' \) -delete
