In [35]:
import os
import sys

from google.colab import drive
drive.mount('/content/drive')

gdrive_path = "/content/drive/MyDrive/AutobioLoraFinetuning"
local_link_path = "/content/AutobioLoraFinetuning"

if not os.path.exists(local_link_path):
    os.symlink(gdrive_path, local_link_path)

sys.path.append(local_link_path)

print(f"Symlink created and path added: {local_link_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Symlink created and path added: /content/AutobioLoraFinetuning


In [36]:
import os
from PIL import Image
from pathlib import Path
from transformers import pipeline

# parameters
TARGET_WIDTH = 512
TARGET_HEIGHT = 512
DATA_DIR = "AutobioLoraFinetuning"
IMAGES_DIR = os.path.join(DATA_DIR, "images")
PROMPTS_DIR = os.path.join(DATA_DIR, "prompts")
os.makedirs(PROMPTS_DIR, exist_ok=True)
DATASET_NAME = "dataset.csv"
SUPPORTED_IMG_TYPES = (".png", ".jpg", ".jpeg", ".webp")
LABELING_MODEL = "Salesforce/blip-image-captioning-base"
HF_USER_NAME = "3ilo"
HF_DATASET_NAME = "test-style-dataset"

In [37]:
# TODO: replace. Have noticed this includes stylistic info -- not desirable in this case.
captioner = pipeline("image-to-text", model=LABELING_MODEL)

Device set to use cpu


In [39]:
# process images
# !!! TODO: add special token in the labels.
for img_file in os.listdir(IMAGES_DIR):
    if not img_file.lower().endswith(SUPPORTED_IMG_TYPES):
        continue

    img_path = os.path.join(IMAGES_DIR, img_file)
    prompt_path = os.path.join(PROMPTS_DIR, os.path.splitext(img_file)[0] + ".txt")

    # resize if needed
    img = Image.open(img_path)
    if img.size != (TARGET_WIDTH, TARGET_HEIGHT):
        img_resized = img.resize((TARGET_WIDTH, TARGET_HEIGHT), Image.LANCZOS)
        img_resized.save(img_path)
        print(f"Resized {img_file} to {TARGET_WIDTH}x{TARGET_HEIGHT}")

    # generate prompt if needed
    if not os.path.exists(prompt_path):
        prompt_text = captioner(img_path)[0]['generated_text']
        with open(prompt_path, "w", encoding="utf-8") as f:
            f.write(prompt_text)
        print(f"Generated prompt for {img_file}: {prompt_text}")


Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_running_1.png: a woman in a dress and shoes
Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_running_3.png: a drawing of a woman with long hair
Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_sleeping_1.png: a drawing of a woman sleeping on a bed
Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_sleeping_2.png: a drawing of a woman sleeping on a bed
Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_playing_1.png: a bunch of sketches for a hair salon
Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_laughing_3.png: a woman laughing and holding her hand up
Generated prompt for best_quality,_clean,_sketch_stylized_illustration_girl_that_is_drinking_3.png: a woman with long hair and a white dress
Generated prompt for best_quality,_clean,

FileNotFoundError: [Errno 2] No such file or directory: 'AutobioLoraFinetuning/images/Copy of best_quality,_clean,_sketch_stylized_illustration_dragon_that_is_jumping_3.png'

In [40]:
# little bit of post-processing
import os

target_dir = f"{DATA_DIR}/prompts"

# phrases to remove from prompts
phrases_to_remove = [
    "a drawing of",
    "a black and white",
    "drawing of",
]

def clean_file_contents(directory, phrases):
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        new_content = content
        for phrase in phrases:
            new_content = new_content.replace(phrase, "")

        new_content = " ".join(new_content.split())

        if new_content != content:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(new_content)
            print(f"Updated: {filename}")

clean_file_contents(target_dir, phrases_to_remove)


Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_running_3.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_sitting_2.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_sitting_3.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_sleeping_3.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_playing_1.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_playing_3.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_hiding_3.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_dog_that_is_chasing_2.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_cat_that_is_running_2.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_cat_that_is_eating_3.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_cat_that_is_hiding_2.txt
Updated: best_quality,_clean,_sketch_stylized_illustration_cat_that

In [46]:
import os
import csv

images_dir = os.path.join(DATA_DIR, "images")
prompts_dir = os.path.join(DATA_DIR, "prompts")
csv_file = os.path.join(DATA_DIR, DATASET_NAME)

image_files = sorted([f for f in os.listdir(images_dir) if f.lower().endswith(SUPPORTED_IMG_TYPES)])

rows = []

# combine image and prompt files into csv
for img_file in image_files:
    prompt_file = os.path.splitext(img_file)[0] + ".txt"
    prompt_path = os.path.join(prompts_dir, prompt_file)

    if not os.path.exists(prompt_path):
        print(f"Warning: No prompt file found for {img_file}, skipping.")
        continue

    with open(prompt_path, "r", encoding="utf-8") as f:
        prompt_text = f.read().strip()

    img_rel_path = os.path.join(images_dir, img_file)

    rows.append([img_rel_path, prompt_text])

# write csv
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["image", "prompt"])
    writer.writerows(rows)

print(f"CSV successfully created at {csv_file} with {len(rows)} entries.")


CSV successfully created at AutobioLoraFinetuning/dataset.csv with 63 entries.


In [47]:
# need to log into HF to publish dataset
!hf auth login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The tok

In [48]:
# create dataset
from datasets import Dataset, DatasetDict, Features, Value, Image

dataset_path = os.path.join(dataset_dir, DATASET_NAME)
dataset = Dataset.from_csv(dataset_path)
dataset = dataset.cast_column("image", Image())

# optional
dataset = dataset.train_test_split(test_size=0.1)

dataset.push_to_hub(f"{HF_USER_NAME}/{HF_DATASET_NAME}")

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         |  524kB / 25.9MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  98%|#########7| 3.15MB / 3.22MB            

CommitInfo(commit_url='https://huggingface.co/datasets/3ilo/test-style-dataset/commit/24333d6db0ffbdf648531afeb091174d997dc5f3', commit_message='Upload dataset', commit_description='', oid='24333d6db0ffbdf648531afeb091174d997dc5f3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/3ilo/test-style-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='3ilo/test-style-dataset'), pr_revision=None, pr_num=None)