1. Install Dependencies

In [1]:
# CLIP dependencies
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

# Stable Diffusion dependencies
! pip install diffusers

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-37l1dzes
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-37l1dzes
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

2. Imports

In [2]:
import os
import re
from abc import ABC, abstractmethod
from getpass import getpass
from typing import Union, List, Optional

from google.colab import drive

from PIL import Image

import numpy as np
import torch
from torchvision import transforms

from diffusers import DiffusionPipeline, StableDiffusionPipeline, StableDiffusion3Pipeline, FluxPipeline

import clip

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

3. Connect to Google Drive

In [3]:
# Mount Google Drive (always forcing a fresh mount), then make the top-level
# "My Drive" folder the working directory so relative paths resolve there.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(f"{DRIVE_MOUNT_POINT}/My Drive")

Mounted at /content/drive


4. Test Stable Diffusion Model

In [4]:
class ModelLoadingError(Exception):
    """Raised when a model cannot be loaded."""

class InferenceError(Exception):
    """Raised when running inference with a model fails."""


In [5]:
class BaseModel(ABC):
    """Common interface for every model wrapper in this notebook.

    Concrete subclasses must implement both `load_model` and `inference`;
    the class cannot be instantiated until they do.
    """

    @abstractmethod
    def load_model(self):
        """Load the open-weights model, or open an API connection to a closed-source model."""

    @abstractmethod
    def inference(
        self,
        inputs: Union[List[str], torch.Tensor],
        captions: Optional[List[str]] = None,
    ) -> Union[torch.Tensor, List[float]]:
        """Run the model on one batch of inputs, optionally paired with captions.

        Args:
            inputs (Union[List[str], torch.Tensor]): Either a batch of text prompts
                or a batch of images.
            captions (Optional[List[str]]): Text captions that accompany the inputs;
                used by reward models. Defaults to None.

        Returns:
            Union[torch.Tensor, List[float]]: The batch of model outputs, or a list
            of reward scores.
        """

In [6]:
class HPSv1Model(BaseModel):
    """Reward model: CLIP ViT-L/14 whose weights are replaced by an HPSv1 checkpoint."""

    def __init__(self, model_path: str):
        """
        Args:
            model_path (str): Path to the HPSv1 model checkpoint.

        Raises:
            ModelLoadingError: If the CLIP model or the checkpoint cannot be loaded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_path = model_path
        self.load_model()

    def load_model(self):
        """
        Load CLIP ViT-L/14 onto `self.device` and overwrite its weights with the
        checkpoint's 'state_dict' entry.

        Raises:
            ModelLoadingError: If the checkpoint file is missing, lacks a 'state_dict'
                entry, or any other error occurs while loading.
        """
        try:
            self.model, self.preprocess = clip.load("ViT-L/14", device=self.device)

            # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime.
            # NOTE(review): torch.load unpickles the file; only load checkpoints from a
            # trusted source (or pass weights_only=True if the checkpoint supports it).
            checkpoint = torch.load(self.model_path, map_location=self.device)

            if "state_dict" not in checkpoint:
                raise ModelLoadingError("Checkpoint does not contain 'state_dict'.")

            self.model.load_state_dict(checkpoint["state_dict"])

        except ModelLoadingError:
            # Already the right type and message; do not re-wrap it below.
            raise
        except FileNotFoundError as e:
            raise ModelLoadingError(f"Model checkpoint not found at '{self.model_path}'.") from e
        except Exception as e:
            raise ModelLoadingError(f"Error loading model: {e}") from e

    def inference(self, inputs: torch.Tensor, captions: List[str]) -> List[float]:
        """
        Runs inference on a batch of images and corresponding captions.

        Args:
            inputs (torch.Tensor): Batch of images; the first dimension is the batch size.
            captions (List[str]): One caption per image, in the same order.

        Returns:
            List[float]: One reward score per (image, caption) pair.

        Raises:
            TypeError: If `inputs` is not a tensor or `captions` is not a list of strings.
            ValueError: If the number of images and captions differ.
            InferenceError: If encoding or scoring fails.
        """
        if not isinstance(inputs, torch.Tensor):
            raise TypeError("Expected 'inputs' to be of type torch.Tensor (i.e. images).")
        if not isinstance(captions, list) or not all(isinstance(c, str) for c in captions):
            raise TypeError("Expected 'captions' to be a list of strings.")
        if inputs.shape[0] != len(captions):
            raise ValueError("Number of 'inputs' and 'captions' must match.")

        # Nothing to score for an empty batch.
        if not captions:
            return []

        try:
            with torch.no_grad():
                image_features = self.model.encode_image(inputs.to(self.device))
                text_tokens = clip.tokenize(captions).to(self.device)
                text_features = self.model.encode_text(text_tokens)

                # L2-normalise so the dot product below is a cosine similarity.
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                text_features = text_features / text_features.norm(dim=-1, keepdim=True)

                # The diagonal holds the similarity of each image with its own caption.
                # Convert cosine similarity scores to percentages as in the original paper
                similarity_scores = (image_features @ text_features.T).diag() * 100

            return similarity_scores.tolist()
        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e

In [7]:
class StableDiffusionModel(BaseModel):
    """Text-to-image wrapper around a Stable Diffusion 1.x / 2.x / 3.x diffusers pipeline."""

    def __init__(self, model_path: str, offload_to_cpu: bool = False, resolution: int = 512, **kwargs):
        """
        Args:
            model_path (str): Path to the Stable Diffusion model.
                              Must include 'stable-diffusion-1', 'stable-diffusion-2', or 'stable-diffusion-3' after '<repo-owner>/'
                              for simplicity.
            offload_to_cpu (bool): If True (and a GPU is available), enable model CPU offload
                              instead of moving the whole pipeline to the GPU.
            resolution (int): Height and width, in pixels, of the generated images.
            **kwargs: Extra keyword arguments forwarded to `from_pretrained`
                              (e.g. `torch_dtype` to override the default precision).

        Raises:
            ModelLoadingError: If the model version cannot be determined or loading fails.
        """
        self.seed = 42

        self.model_path = model_path
        self.offload_to_cpu = offload_to_cpu
        self.resolution = resolution
        self.kwargs = kwargs

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_model()

    def load_model(self):
        """
        Pick the pipeline class from the last path component of `model_path` and load it.

        Raises:
            ModelLoadingError: If the path does not identify a supported version, if memory
                runs out while loading, or if loading fails for any other reason.
        """
        # rstrip so a trailing slash on a local path does not yield an empty tag.
        version_tag = self.model_path.rstrip("/").split("/")[-1].lower()

        if re.search(r'(stable-diffusion-1|v-?1)', version_tag):
            pipeline_class = StableDiffusionPipeline
        elif re.search(r'(stable-diffusion-2|v-?2)', version_tag):
            pipeline_class = DiffusionPipeline
        elif re.search(r'(stable-diffusion-3|v-?3)', version_tag):
            pipeline_class = StableDiffusion3Pipeline
        else:
            raise ModelLoadingError(
                "Model path must contain one of: 'stable-diffusion-1', 'stable-diffusion-2', or 'stable-diffusion-3'."
            )

        # Default to float16 on GPU and float32 on CPU (half precision is poorly supported
        # on CPU). If your GPU supports torch.bfloat16, pass torch_dtype=torch.bfloat16
        # through kwargs; caller-supplied kwargs take precedence over this default.
        default_dtype = torch.float16 if self.device == "cuda" else torch.float32
        load_kwargs = {"torch_dtype": default_dtype, **self.kwargs}

        pipeline = None
        try:
            pipeline = pipeline_class.from_pretrained(self.model_path, **load_kwargs)
            if self.offload_to_cpu and self.device == "cuda":
                # Offload manages device placement itself; moving the whole pipeline to
                # the GPU first would require the very memory offloading is meant to save.
                pipeline.enable_model_cpu_offload()
            else:
                pipeline = pipeline.to(self.device)
            self.diffusion_pipeline = pipeline

        except (MemoryError, torch.cuda.OutOfMemoryError) as e:
            # CUDA out-of-memory is reported as torch.cuda.OutOfMemoryError, not MemoryError.
            pipeline = None
            if hasattr(self, "diffusion_pipeline"):
                del self.diffusion_pipeline
            torch.cuda.empty_cache()
            raise ModelLoadingError(f"Memory error occurred while loading the model. Consider using a smaller model: {e}") from e
        except Exception as e:
            raise ModelLoadingError(f"Failed to load Stable Diffusion model: {e}") from e

    def inference(
        self, inputs: List[str], captions: Optional[List[str]] = None
    ) -> List["Image.Image"]:
        """
        Runs inference on a batch of prompts.

        Args:
            inputs (List[str]): Text prompts, one image is generated per prompt.
            captions (Optional[List[str]]): Unused; accepted for interface compatibility.

        Returns:
            The pipeline's `.images` output, one image per prompt (PIL images with the
            pipeline's default output type).

        Raises:
            TypeError: If `inputs` is not a list of strings.
            InferenceError: If the pipeline call fails.
        """
        if not isinstance(inputs, list) or not all(isinstance(c, str) for c in inputs):
            raise TypeError("Expected 'inputs' to be a list of strings.")

        # Nothing to generate for an empty batch.
        if not inputs:
            return []

        try:
            # Create one generator per prompt to ensure reproducibility.
            # NOTE(review): every generator uses the same seed, so identical prompts in one
            # batch are expected to produce identical images -- confirm this is intended.
            generators = [
                torch.Generator(self.device).manual_seed(self.seed) for _ in range(len(inputs))
            ]
            images = self.diffusion_pipeline(
                prompt=inputs, generator=generators,
                height=self.resolution, width=self.resolution  # use 1:1 aspect ratio
            ).images
            return images

        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e

In [8]:
# Never store access tokens in the notebook: read HF_TOKEN from the environment and
# prompt for it (without echoing) only when it is not already set.
# Any token that was ever hardcoded in this cell must be considered leaked and revoked.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [9]:
class FluxModel(BaseModel):
    """Text-to-image wrapper around a diffusers `FluxPipeline`."""

    def __init__(self, model_path: str, offload_to_cpu: bool = False, resolution: int = 512, **kwargs):
        """
        Args:
            model_path (str): Path to the Flux.1-[dev, schnell] model checkpoint.
            offload_to_cpu (bool): If True (and a GPU is available), enable model CPU offload
                instead of moving the whole pipeline to the GPU.
            resolution (int): Height and width, in pixels, of the generated images.
            **kwargs: Extra keyword arguments forwarded to `FluxPipeline.from_pretrained`
                (e.g. `torch_dtype` to override the default precision).

        Raises:
            ModelLoadingError: If loading the pipeline fails.
        """
        self.seed = 42

        self.model_path = model_path
        self.offload_to_cpu = offload_to_cpu
        self.resolution = resolution
        self.kwargs = kwargs

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_model()

    def load_model(self):
        """
        Load the Flux pipeline from `model_path`.

        Raises:
            ModelLoadingError: If memory runs out while loading, or loading fails for any
                other reason.
        """
        # Default to float16 on GPU and float32 on CPU (half precision is poorly supported
        # on CPU). If your GPU supports torch.bfloat16, pass torch_dtype=torch.bfloat16
        # through kwargs; caller-supplied kwargs take precedence over this default.
        default_dtype = torch.float16 if self.device == "cuda" else torch.float32
        load_kwargs = {"torch_dtype": default_dtype, **self.kwargs}

        pipeline = None
        try:
            pipeline = FluxPipeline.from_pretrained(self.model_path, **load_kwargs)
            if self.offload_to_cpu and self.device == "cuda":
                # Offload manages device placement itself; moving the whole pipeline to
                # the GPU first would require the very memory offloading is meant to save.
                pipeline.enable_model_cpu_offload()
            else:
                pipeline = pipeline.to(self.device)
            self.diffusion_pipeline = pipeline

        except (MemoryError, torch.cuda.OutOfMemoryError) as e:
            # CUDA out-of-memory is reported as torch.cuda.OutOfMemoryError, not MemoryError.
            pipeline = None
            if hasattr(self, "diffusion_pipeline"):
                del self.diffusion_pipeline
            torch.cuda.empty_cache()
            raise ModelLoadingError(f"Memory error occurred while loading the model. Consider using a smaller model: {e}") from e
        except Exception as e:
            raise ModelLoadingError(f"Failed to load Flux model: {e}") from e

    def inference(
        self, inputs: List[str], captions: Optional[List[str]] = None
    ) -> List["Image.Image"]:
        """
        Runs inference on a batch of prompts.

        Args:
            inputs (List[str]): Text prompts, one image is generated per prompt.
            captions (Optional[List[str]]): Unused; accepted for interface compatibility.

        Returns:
            The pipeline's `.images` output, one image per prompt (PIL images with the
            pipeline's default output type).

        Raises:
            TypeError: If `inputs` is not a list of strings.
            InferenceError: If the pipeline call fails.
        """
        if not isinstance(inputs, list) or not all(isinstance(c, str) for c in inputs):
            raise TypeError("Expected 'inputs' to be a list of strings.")

        # Nothing to generate for an empty batch.
        if not inputs:
            return []

        try:
            # Create one generator per prompt to ensure reproducibility.
            # NOTE(review): every generator uses the same seed, so identical prompts in one
            # batch are expected to produce identical images -- confirm this is intended.
            generators = [
                torch.Generator(self.device).manual_seed(self.seed) for _ in range(len(inputs))
            ]
            images = self.diffusion_pipeline(
                prompt=inputs, generator=generators,
                height=self.resolution, width=self.resolution  # use 1:1 aspect ratio
            ).images
            return images

        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e

In [None]:
# Generate a small batch from one prompt with FLUX.1-schnell and show the results inline.
# FluxModel also accepts offload_to_cpu=True, which enables the pipeline's model CPU offload.
# NOTE(review): FluxModel seeds every generator with the same value, so repeating the
# same prompt is expected to yield identical images -- confirm this is what is wanted.
NUM_IMAGES = 5
PROMPT = "a photograph of an astronaut riding a horse"

model = FluxModel(model_path="black-forest-labs/FLUX.1-schnell")
images = model.inference(inputs=[PROMPT] * NUM_IMAGES)
for image in images:
    display(image)

model_index.json:   0%|          | 0.00/536 [00:00<?, ?B/s]

Fetching 23 files:   0%|          | 0/23 [00:00<?, ?it/s]

text_encoder%2Fconfig.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

scheduler%2Fscheduler_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

text_encoder_2%2Fconfig.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.53G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

tokenizer%2Fmerges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

(…)encoder_2%2Fmodel.safetensors.index.json:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

tokenizer%2Ftokenizer_config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

tokenizer%2Fspecial_tokens_map.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

tokenizer%2Fvocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer_2%2Fspecial_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

tokenizer_2%2Ftokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer_2%2Ftokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

transformer%2Fconfig.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

(…)pytorch_model-00001-of-00003.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

(…)pytorch_model-00002-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

(…)pytorch_model-00003-of-00003.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

(…)ion_pytorch_model.safetensors.index.json:   0%|          | 0.00/121k [00:00<?, ?B/s]

vae%2Fconfig.json:   0%|          | 0.00/774 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


## NOTE: Colab's memory limits (both CPU and GPU) prevented me from loading the FLUX.1 family of models.