1. Install Dependencies

In [1]:
# HPS dependencies
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git
! pip install hpsv2

# Stable Diffusion dependencies
! pip install diffusers

# Adversarial attack dependencies
! pip install torchattacks

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-diohcdjm
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-diohcdjm
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
!mkdir -p clip && wget https://github.com/openai/CLIP/raw/main/clip/bpe_simple_vocab_16e6.txt.gz -P /usr/local/lib/python3.11/dist-packages/hpsv2/src/open_clip

--2025-03-09 04:59:18--  https://github.com/openai/CLIP/raw/main/clip/bpe_simple_vocab_16e6.txt.gz
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/openai/CLIP/main/clip/bpe_simple_vocab_16e6.txt.gz [following]
--2025-03-09 04:59:18--  https://raw.githubusercontent.com/openai/CLIP/main/clip/bpe_simple_vocab_16e6.txt.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1356917 (1.3M) [application/octet-stream]
Saving to: ‘/usr/local/lib/python3.11/dist-packages/hpsv2/src/open_clip/bpe_simple_vocab_16e6.txt.gz.12’


2025-03-09 04:59:19 (57.4 MB/s) - ‘/usr/local/lib/python3.11/dist-packages/hpsv2/src/

2. Imports

In [3]:
import os
import re
import gc
import ast
from datetime import datetime
import random
import argparse
from tqdm import tqdm
from google.colab import drive
from collections import OrderedDict

from abc import ABC, abstractmethod
from typing import Union, List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from diffusers import DiffusionPipeline, StableDiffusionPipeline, StableDiffusion3Pipeline

import clip
import hpsv2
from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
import PIL
from PIL import Image

from torchattacks.attack import Attack, wrapper_method



3. Connect to Google Drive

In [4]:
drive.mount("/content/drive",force_remount=True)
os.chdir("/content/drive/My Drive")

Mounted at /content/drive


4. Model Code

In [5]:
class ModelLoadingError(Exception):
    """Exception raised when there is an error loading the model."""
    pass

class InferenceError(Exception):
    """Exception raised when an error occurs during inference."""
    pass

In [6]:
class BaseModel(ABC):
    @abstractmethod
    def load_model(self):
        """
        Load the open-weights model or make an API connection to the closed-source model.
        """
        pass

    @abstractmethod
    def inference(
        self, inputs: Union[List[str], torch.Tensor], captions: Optional[List[str]] = None
    ) -> Union[torch.Tensor, List[float]]:
        """
        Run inference on a batch of inputs with optional captions.

        Args:
            inputs (Union[List[str], torch.Tensor]): A batch of text prompts or a batch of images.
            captions (Optional[List[str]]): Optional text captions associated with the inputs for reward models.

        Returns:
            Union[torch.Tensor, List[float]]: A batch of model outputs or a list of reward scores.
        """
        pass

In [7]:
class HPSv1Model(BaseModel):
    def __init__(self, model_path: str):
        """
        Args:
            model_path (str): Path to the HPSv1 model checkpoint.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_path = model_path
        self.load_model()

    def load_model(self):
        try:
            self.model, self.preprocess_function = clip.load("ViT-L/14", device=self.device)
            checkpoint = torch.load(self.model_path)

            if "state_dict" not in checkpoint:
                raise ModelLoadingError("Checkpoint does not contain 'state_dict'.")

            self.model.load_state_dict(checkpoint["state_dict"])
            self.tokenizer = clip.tokenize
            self.model.eval()

        except FileNotFoundError as e:
            raise ModelLoadingError(f"Model checkpoint not found at '{self.model_path}'.") from e
        except Exception as e:
            raise ModelLoadingError(f"Error loading model: {e}") from e

    def inference(self, inputs: torch.Tensor, captions: Union[List[str], torch.Tensor]) -> List[float]:
        """
        Runs inference on a batch of images and corresponding captions.
        Returns a batch of reward scores.
        """
        if not isinstance(inputs, torch.Tensor):
            raise TypeError("Expected 'inputs' to be of type torch.Tensor (i.e. images).")
        if not (isinstance(captions, torch.Tensor) or (isinstance(captions, list) and all(isinstance(c, str) for c in captions))):
            raise TypeError("Expected 'captions' to be either a torch.Tensor or a list of strings.")
        if inputs.shape[0] != len(captions):
            raise ValueError("Number of 'inputs' and 'captions' must match.")

        try:
            with torch.no_grad():
                image_features = self.model.encode_image(inputs.to(self.device))

                if not isinstance(captions, torch.Tensor):
                    text_tokens = self.tokenizer(captions).to(self.device)
                else:
                    text_tokens = captions.to(self.device)
                text_features = self.model.encode_text(text_tokens)

                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                text_features = text_features / text_features.norm(dim=-1, keepdim=True)

                # Convert cosine similarity scores to percentages as in the original paper
                similarity_scores = (image_features @ text_features.T).diag() * 100
            return similarity_scores.tolist()
        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e

    def inference_with_grad(self, inputs: torch.Tensor, captions: List[str]) -> List[float]:
        """
        Runs inference on a batch of images and corresponding captions.
        Returns a batch of reward scores.
        """
        if not isinstance(inputs, torch.Tensor):
            raise TypeError("Expected 'inputs' to be of type torch.Tensor (i.e. images).")
        if not isinstance(captions, list) or not all(isinstance(c, str) for c in captions):
            raise TypeError("Expected 'captions' to be a list of strings.")
        if inputs.shape[0] != len(captions):
            raise ValueError("Number of 'inputs' and 'captions' must match.")

        try:
            text_tokens = clip.tokenize(captions).to(self.device)
            image_features, text_features = self.model(inputs, text_tokens)
            return (image_features @ text_features.T).diag() * 100
        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e

In [8]:
class HPSv2Model(BaseModel):
    def __init__(self, model_path: str):
        """
        Args:
            model_path (str): Path to the HPSv2 model checkpoint.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_path = model_path
        self.load_model()

    def load_model(self):
        try:
            self.model, _, self.preprocess_function = create_model_and_transforms(
                "ViT-H-14",
                "laion2B-s32B-b79K",
                precision="amp",
                device=self.device,
                jit=False,
                force_quick_gelu=False,
                force_custom_text=False,
                force_patch_dropout=False,
                force_image_size=None,
                pretrained_image=False,
                image_mean=None,
                image_std=None,
                light_augmentation=True,
                aug_cfg={},
                output_dict=True,
                with_score_predictor=False,
                with_region_predictor=False
            )

            checkpoint = torch.load(self.model_path)
            if "state_dict" not in checkpoint:
                raise ModelLoadingError("Checkpoint does not contain 'state_dict'.")

            self.model.load_state_dict(checkpoint["state_dict"])
            self.tokenizer = get_tokenizer("ViT-H-14")
            self.model.eval()

        except FileNotFoundError as e:
            raise ModelLoadingError(f"Model checkpoint not found at '{self.model_path}'.") from e
        except Exception as e:
            raise ModelLoadingError(f"Error loading model: {e}") from e

    def inference(self, inputs: torch.Tensor, captions: Union[List[str], torch.Tensor]) -> List[float]:
        """
        Runs inference on a batch of images and corresponding captions.
        Returns a batch of reward scores.
        """
        if not isinstance(inputs, torch.Tensor):
            raise TypeError("Expected 'inputs' to be a list of PIL.Image objects.")
        if not (isinstance(captions, torch.Tensor) or (isinstance(captions, list) and all(isinstance(c, str) for c in captions))):
            raise TypeError("Expected 'captions' to be either a torch.Tensor or a list of strings.")
        if len(inputs) != len(captions):
            raise ValueError("Number of 'inputs' and 'captions' must match.")

        try:
            with torch.no_grad():
                if not isinstance(captions, torch.Tensor):
                    text_tokens = self.tokenizer(captions).to(self.device)
                else:
                    text_tokens = captions.to(self.device)
                inputs = inputs.to(self.device)

                with torch.cuda.amp.autocast():
                    outputs = self.model(inputs, text_tokens)
                    image_features, text_features = outputs["image_features"], outputs["text_features"]
                    similarity_scores = (image_features @ text_features.T).diag() * 100
                return similarity_scores.tolist()

        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e


    def inference_with_grad(self, inputs: torch.Tensor, captions: List[str]) -> List[float]:
        """
        Runs inference on a batch of images and corresponding captions.
        Returns a batch of reward scores.
        """
        if not isinstance(inputs, torch.Tensor):
            raise TypeError("Expected 'inputs' to be a list of PIL.Image objects.")
        if not isinstance(captions, list) or not all(isinstance(c, str) for c in captions):
            raise TypeError("Expected 'captions' to be a list of strings.")
        if len(inputs) != len(captions):
            raise ValueError("Number of 'inputs' and 'captions' must match.")

        try:
            text_tokens = self.tokenizer(captions).to(self.device)
            inputs = inputs.to(self.device)

            with torch.cuda.amp.autocast():
                outputs = self.model(inputs, text_tokens)
                image_features, text_features = outputs["image_features"], outputs["text_features"]
                return (image_features @ text_features.T).diag() * 100

        except Exception as e:
            raise InferenceError(f"Inference failed: {e}") from e

In [9]:
class BaseDiffusionModel(BaseModel):
    def __init__(self, model_path: str, offload_to_cpu: bool = False, resolution: int = None, **kwargs):
        """
        Args:
            model_path (str): Path or repository ID of the diffusion model checkpoint.
        """
        self.seed = 42

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_path = model_path
        self.offload_to_cpu = offload_to_cpu
        self.resolution = resolution
        self.kwargs = kwargs

        self.diffusion_pipeline = self._get_diffusion_pipeline()
        self.load_model()

    def _get_diffusion_pipeline(self):
        """ Subclasses should override this to return the correct pipeline. """
        return DiffusionPipeline

    def load_model(self):
        try:
            self.model = self.diffusion_pipeline.from_pretrained(
                self.model_path,
                **self.kwargs
            ).to(self.device)
            if self.offload_to_cpu:
                self.model.enable_model_cpu_offload()

        except MemoryError as e:
            if hasattr(self, "model"):
                del self.model
                torch.cuda.empty_cache()
            raise ModelLoadingError(f"Memory error occurred while loading the model. Consider using a smaller model: {e}")
        except FileNotFoundError as e:
            raise ModelLoadingError(f"Model checkpoint not found at '{self.model_path}'.") from e
        except Exception as e:
            raise ModelLoadingError(f"Failed to load diffusion model: {e}") from e

    def inference(
        self, inputs: List[str], captions: Optional[List[str]] = None
    ):
        """
        Runs inference on a batch of prompts.
        Returns a batch of images corresponding to the prompts.
        """
        if not isinstance(inputs, list) or not all(isinstance(c, str) for c in inputs):
            raise TypeError("Expected 'inputs' to be a list of strings.")

        try:
            # Create one generator per prompt to ensure reproducibility
            generators = [
                torch.Generator(self.device).manual_seed(self.seed) for _ in range(len(inputs))
            ]
            if self.resolution:
                images = self.model(
                    prompt=inputs, generator=generators,
                    height=self.resolution, width=self.resolution # use 1:1 aspect ratio
                ).images
                return images
            else:
                images = self.model(
                    prompt=inputs, generator=generators,
                ).images
                return images

        except Exception as e:
            raise InferenceError(f"Inference failed: {e}")

In [10]:
class StableDiffusionModel(BaseDiffusionModel):
    def __init__(self, model_path: str, offload_to_cpu: bool = False, resolution: int = None, **kwargs):
        """
        Note:
            model_path (str): Path to the Stable Diffusion model.
                              Must include 'stable-diffusion-1', 'stable-diffusion-2', or 'stable-diffusion-3' after '<repo-owner>/'
                              for simplicity.
        """

        # Load the model with float16 precision.
        # If your GPU supports torch.bfloat16 for lower memory usage with similar precision to FP32,
        # consider switching the torch_dtype accordingly.
        if "torch_dtype" not in kwargs:
            kwargs["torch_dtype"] = torch.float16
        super().__init__(model_path, offload_to_cpu, resolution, **kwargs)

    def _get_diffusion_pipeline(self):
        version_tag = self.model_path.split("/")[-1].lower()

        if re.search(r'(stable-diffusion-?(v-?|v)?1(?:-\d+)?)(.*)?$', version_tag):
            return StableDiffusionPipeline
        elif re.search(r'(stable-diffusion-?(v-?|v)?2(?:-\d+)?)(.*)?$', version_tag):
            return DiffusionPipeline
        elif re.search(r'(stable-diffusion-?(v-?|v)?3(?:-\d+)?)(.*)?$', version_tag):
            return StableDiffusion3Pipeline
        else:
            raise ValueError(
                "Model path must match 'stable-diffusion-1', 'stable-diffusion-v1', 'stable-diffusion-v-1', "
                "'stable-diffusion-2', 'stable-diffusion-v2', etc."
            )

In [11]:
class ModelFactory:
    @staticmethod
    def create_model(
        model_type: str, model_path: str,
        **kwargs,
    ) -> BaseModel:
        """
        Creates and returns an instance of a model subclass based on the model_type.

        Args:
            model_type (str): The type of model to create. Supported values are:
                - "hpsv1": For HPSv1 reward models.
                - "hpsv2": For HPSv2 reward models.
                - "sd": For stable diffusion text-to-image models.
            model_path (str): The path or repository ID of the model checkpoint.

        Returns:
            BaseModel: An instance of the requested model.

        Raises:
            ValueError: If an unsupported model_type is provided.
        """
        if model_type == "hpsv1":
            return HPSv1Model(model_path)
        elif model_type == "hpsv2":
            return HPSv2Model(model_path)
        elif model_type == "sd":
            return StableDiffusionModel(model_path, **kwargs)
        else:
            raise ValueError("Unsupported model type. Use 'sd' for stable diffusion models or 'hps' for HPS models.")

5. Dataset Code

In [12]:
class DatasetFormatError(Exception):
    """Raised when the dataset format is incorrect."""
    pass

class DatasetLoadingError(Exception):
    """Raised when the dataset fails to load properly."""
    pass

In [13]:
class BasePromptDataset(Dataset, ABC):
    def __init__(self):
        try:
            self.data = self.load_dataset()
        except Exception as e:
            raise DatasetLoadingError(f"Failed to load dataset: {e}")

        if not isinstance(self.data, dict):
            raise DatasetFormatError(f"Expected 'load_dataset()' to return a dictionary, got '{type(self.data)}'.")

        for key, prompts in self.data.items():
            if not isinstance(prompts, list) or not all(isinstance(p, str) for p in prompts):
                raise DatasetFormatError(f"Expected a list of strings for category '{key}', but got '{type(prompts)}'")

        # Precompute samples with round-robin ordering
        self.samples = self._create_round_robin_samples()

    @abstractmethod
    def load_dataset(self) -> Dict[str, List[str]]:
        """To be implemented by subclasses."""
        pass

    def _create_round_robin_samples(self) -> List[Dict[str, str]]:
        """Ensure fair round-robin interleaving of prompts from all categories."""
        samples = []
        categories = list(self.data.keys())
        category_prompts = [self.data[cat] for cat in categories]

        if not categories or all(len(prompts) == 0 for prompts in category_prompts):
            raise DatasetFormatError("Dataset is empty or contains only empty categories.")

        max_length = max(len(prompts) for prompts in category_prompts)

        # Round-robin interleaving
        for i in range(max_length):
            for cat_idx, category in enumerate(categories):
                prompts = category_prompts[cat_idx]
                if len(prompts) > 0:
                    prompt = prompts[i % len(prompts)]  # Cycle back for shorter lists
                    samples.append({"category": category, "prompt": prompt})

        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

    def num_categories(self) -> int:
        """Returns the number of unique categories in the dataset."""
        return len(self.data)

In [14]:
class HPSV2PromptDataset(BasePromptDataset):
    def load_dataset(self) -> Dict[str, List[str]]:
        all_prompts = hpsv2.benchmark_prompts("all")
        return dict(all_prompts.items())

In [15]:
class DrawBenchPromptDataset(BasePromptDataset):
    def load_dataset(self) -> Dict[str, List[str]]:
        df = pd.read_csv("drawbench_data.csv")
        return df.groupby("Category")["Prompts"].apply(list).to_dict()

In [16]:
class ImagePromptDataset(Dataset):
    def __init__(
            self,
            image_list: List[PIL.Image], prompt_list: List[Tuple[str, str]],
            image_transform_function: callable, text_tokenizer_function: callable = None
        ):
        """
        Args:
            image_list (List[PIL.Image]): List of PIL images.
            prompt_list (List[Tuple[str, str]]): List of (category, prompt) tuples.
            image_transform_function (callable): Function to transform PIL images.
            text_tokenizer_function (callable): Function to tokenize text prompts.
        """
        if len(image_list) == 0 or len(prompt_list) == 0:
            raise DatasetFormatError("Both image_list and prompt_list must be non-empty.")
        if len(image_list) != len(prompt_list):
            raise DatasetFormatError("Images and prompts must have the same length.")

        self.images = image_list
        self.prompts = prompt_list  # List of (category, prompt)
        self.image_transform_function = image_transform_function
        self.text_tokenizer_function = text_tokenizer_function

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.image_transform_function(self.images[idx])
        _, prompt = self.prompts[idx]
        if self.text_tokenizer_function is None:
            tokens = prompt
        else:
            tokens = self.text_tokenizer_function(prompt)

        return image, tokens

In [17]:
class RoundRobinSampler(torch.utils.data.Sampler):
    def __init__(self, dataset: BasePromptDataset):
        self.dataset = dataset
        self.indices = self._generate_indices()

    def _generate_indices(self):
        """
        Assume dataset.data has equal length lists per category.

        For each category, create a shuffled list of indices corresponding to that category's samples.
        Since BasePromptDataset precomputes samples in round-robin order, we need to map from category + position
        to the flat sample index.

        In our round-robin samples, the ordering is:
        index 0: category1, index 1: category2, ..., index N: category1

        Let K = number of categories,
        Then the sample index for category j at position i is: i*K + j.
        """
        categories = list(self.dataset.data.keys())
        num_per_category = len(next(iter(self.dataset.data.values())))
        K = len(categories)

        category_indices = {}
        for j, cat in enumerate(categories):
            indices = [i * K + j for i in range(num_per_category)]
            random.shuffle(indices)
            category_indices[cat] = indices

        ordered_indices = []
        for i in range(num_per_category):
            for cat in categories:
                ordered_indices.append(category_indices[cat][i])
        return ordered_indices

    def __iter__(self):
        return iter(self.indices)

    def __len__(self):
        return len(self.indices)

In [18]:
class DatasetFactory:
    @staticmethod
    def create_dataset(
        dataset_type: str,
        **kwargs,
    ) -> Union[BasePromptDataset, ImagePromptDataset]:

        if dataset_type == "drawbench":
            return DrawBenchPromptDataset()
        elif dataset_type == "hps":
            return HPSV2PromptDataset()
        elif dataset_type == "imageandprompt":
            return ImagePromptDataset(**kwargs)
        else:
            raise ValueError(f"Unknown dataset type: '{dataset_type}'.")

6. Attack Code

In [19]:
class BaseAttack(Attack):
    """
    Small modifications to the torchattack's Attack class
    to work with reward models
    """
    def __init__(self, name, model):
        """
        Initializes internal attack state.

        Arguments:
            name (str): name of attack.
            model (BaseModel): model to attack.
        """

        self.attack = name
        self._attacks = OrderedDict()

        self.set_model(model)
        ################################################
        # MODIFICATION
        # Set device using torch.cuda instead of
        # model.parameters().device
        ################################################
        try:
              self.device = "cuda" if torch.cuda.is_available() else "cpu"
        ################################################
        except Exception:
            self.device = None
            print("Failed to set device automatically, please try set_device() manual.")

        # Controls attack mode.
        self.attack_mode = "default"
        self.supported_mode = ["default"]
        self.targeted = False
        self._target_map_function = None

        # Controls when normalization is used.
        self.normalization_used = None
        self._normalization_applied = None
        if self.model.__class__.__name__ == "RobModel":
            self._set_rmodel_normalization_used(model)

        # Controls model mode during attack.
        self._model_training = False
        self._batchnorm_training = False
        self._dropout_training = False

    @wrapper_method
    def _change_model_mode(self, given_training):
        ################################################
        # MODIFICATION
        # do not iterate over model parameters
        # as we use pipelines for inference
        ################################################
        pass
        # if self._model_training:
        #     self.model.train()
        #     for _, m in self.model.named_modules():
        #         if not self._batchnorm_training:
        #             if "BatchNorm" in m.__class__.__name__:
        #                 m = m.eval()
        #         if not self._dropout_training:
        #             if "Dropout" in m.__class__.__name__:
        #                 m = m.eval()
        # else:
        #     self.model.eval()

    @wrapper_method
    def _recover_model_mode(self, given_training):
        ################################################
        # MODIFICATION
        # do not execute model.train()
        # as we use pipelines for inference
        ################################################
        if given_training:
            pass
            # self.model.train()

    def __call__(self, inputs, labels=None, *args, **kwargs):
      # given_training = self.model.training
      # self._change_model_mode(given_training)

      if self._normalization_applied is True:
          inputs = self.inverse_normalize(inputs)
          self._set_normalization_applied(False)

          adv_inputs = self.forward(inputs, labels, *args, **kwargs)
          # adv_inputs = self.to_type(adv_inputs, self.return_type)

          adv_inputs = self.normalize(adv_inputs)
          self._set_normalization_applied(True)
      else:
          adv_inputs = self.forward(inputs, labels, *args, **kwargs)
          # adv_inputs = self.to_type(adv_inputs, self.return_type)

      # self._recover_model_mode(given_training)

      return adv_inputs

In [20]:
class GNRewardModel(BaseAttack):
    """
    Gaussian Noise attack for reward models.

    Arguments:
        model (BaseModel): reward model to attack.
        std (float): standard deviation of the Gaussian noise (Default: 0.1).
    """

    def __init__(self, model, std=0.1):
        super().__init__("GNReward", model)
        self.std = std
        self.supported_mode = ["default"]

    def forward(self, images, labels):
        """
        Overridden forward method for attacking a reward model.
        """

        dataset = ImagePromptDataset(
            image_list=images, prompt_list=labels,
            image_transform_function=self.model.preprocess_function,
            text_tokenizer_function=self.model.tokenizer
        )

        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=len(images), shuffle=False
        )

        images, _ = next(iter(dataloader))
        images = images.clone().detach().to(self.device)
        adv_images = images + self.std * torch.randn_like(images)
        adv_images = torch.clamp(adv_images, min=0, max=1).detach()
        return adv_images


In [21]:
class FGSMRewardModel(BaseAttack):
    """
    FGSM for reward models.

    Instead of using cross-entropy, this attack uses a custom loss:
    Loss = -reward, so that the adversary minimizes the reward score.

    Distance Measure: Linf

    Arguments:
        model (BaseModel): reward model to attack.
        eps (float): maximum perturbation. (Default: 8/255)
        batch_size (int): batch size for processing images via DataLoader.
    """
    def __init__(self, model, eps=8/255, batch_size=1):
        super().__init__("FGSMRewardModel", model)
        self.eps = eps
        self.batch_size = batch_size
        self.supported_mode = ["default"]

    def forward(self, images, labels):
        """
        Overridden forward method for attacking a reward model using a DataLoader.
        """
        dataset = ImagePromptDataset(
            image_list=images, prompt_list=labels,
            image_transform_function=self.model.preprocess_function,
            text_tokenizer_function=None
        )
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)
        adv_images_list = []

        for images, labels in loader:
            images = images.clone().detach().to(self.device)
            images.requires_grad = True

            reward = self.model.inference_with_grad(images, list(labels))
            loss = -reward.mean()
            grad = torch.autograd.grad(loss, images, retain_graph=False, create_graph=False)[0]
            adv_batch = images + self.eps * grad.sign()
            adv_batch = torch.clamp(adv_batch, 0, 1).detach()
            adv_images_list.append(adv_batch)

        adv_images = torch.cat(adv_images_list, dim=0)
        return adv_images

In [22]:
class PGDRewardModel(BaseAttack):
    """
    PGD for reward models using global loss averaging over the entire dataset.

    Instead of using cross-entropy loss, this attack uses a custom loss:
    Loss = -reward, so that the adversary minimizes the reward score.

    The entire dataset is loaded into memory, and during each PGD step we iterate
    over mini-batches of the current adversarial images by simple slicing.

    Distance Measure: Linf

    Arguments:
        model (nn.Module): reward model to attack.
        eps (float): maximum perturbation (Default: 8/255).
        alpha (float): step size (Default: 2/255).
        steps (int): number of PGD steps (Default: 10).
        random_start (bool): if True, initializes adversarial examples with a random perturbation.
        batch_size (int): mini-batch size for computing the global loss (Default: 8).
    """
    def __init__(self, model, eps=8/255, alpha=2/255, steps=10, random_start=True, batch_size=8):
        super().__init__("PGDRewardModel", model)
        self.eps = eps
        self.alpha = alpha
        self.steps = steps
        self.random_start = random_start
        self.batch_size = batch_size
        self.supported_mode = ["default"]

    def forward(self, images, labels):
        """
        Overridden forward method for attacking a reward model.
        """
        dataset = ImagePromptDataset(
            image_list=images, prompt_list=labels,
            image_transform_function=self.model.preprocess_function,
            text_tokenizer_function=None
        )
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)
        images_list = []
        prompts_list = []

        for imgs, prompts in loader:
            images_list.append(imgs)
            prompts_list.extend(prompts)

        all_images = torch.cat(images_list, dim=0).to(self.device)
        adv_images = all_images.clone().detach()
        num_images = all_images.shape[0]

        if self.random_start:
            adv_images = adv_images + torch.empty_like(adv_images).uniform_(-self.eps, self.eps)
            adv_images = torch.clamp(adv_images, 0, 1).detach()

        for _ in range(self.steps):
            global_grad = torch.zeros_like(adv_images)
            total_samples = 0

            for i in range(0, num_images, self.batch_size):
                batch_images = adv_images[i: i + self.batch_size]
                batch_prompts = prompts_list[i: i + self.batch_size]

                batch_images.requires_grad_()

                reward = self.model.inference_with_grad(batch_images, batch_prompts)
                loss = -reward.sum() / num_images

                grad = torch.autograd.grad(loss, batch_images)[0]
                global_grad[i: i + self.batch_size] = grad

                total_samples += batch_images.shape[0]

                clear_cuda_memory_and_force_gc(force=True)

            adv_images = adv_images.detach() + self.alpha * global_grad.sign()
            delta = torch.clamp(adv_images - all_images, min=-self.eps, max=self.eps)
            adv_images = torch.clamp(all_images + delta, 0, 1).detach()
        return adv_images

In [23]:
class SPSARewardModel(BaseAttack):
    """
    SPSA attack for reward models.

    Distance Measure : L_inf

    Arguments:
        model (nn.Module): Reward model to attack. It should take an image tensor and output a scalar reward.
        eps (float): maximum perturbation. (Default: 0.3)
        delta (float): smoothing parameter for gradient approximation. (Default: 0.01)
        lr (float): learning rate for the optimizer. (Default: 0.01)
        nb_iter (int): number of attack iterations. (Default: 1)
        nb_sample (int): number of samples for SPSA gradient approximation. (Default: 128)
        max_batch_size (int): maximum batch size for gradient estimation. (Default: 64)
        batch_size (int): mini-batch size for processing inputs from the dataset. (Default: 8)
    """
    def __init__(self, model, eps=0.3, delta=0.01, lr=0.01, nb_iter=1, nb_sample=128, max_batch_size=64, batch_size=8):
        super().__init__("SPSARewardModel", model)
        self.eps = eps
        self.delta = delta
        self.lr = lr
        self.nb_iter = nb_iter
        self.nb_sample = nb_sample
        self.max_batch_size = max_batch_size
        self.dataset_batch_size = batch_size
        self.supported_mode = ["default"]

    def forward(self, images, labels):
        dataset = ImagePromptDataset(
            image_list=images, prompt_list=labels,
            image_transform_function=self.model.preprocess_function,
            text_tokenizer_function=self.model.tokenizer
        )
        dataloader = DataLoader(dataset, batch_size=self.dataset_batch_size, shuffle=False)

        adv_images_list = []
        for images, labels in dataloader:
            images = images.clone().detach().to(self.device)
            labels = labels.clone().detach().to(self.device)
            adv = self.spsa_perturb(images, labels)
            adv_images_list.append(adv)
        return torch.cat(adv_images_list, dim=0)

    def loss(self, images, labels):
        reward = self.model.inference(images, labels)
        reward = torch.tensor(reward, device=images.device)
        return -reward.mean()

    def linf_clamp_(self, dx, x, eps):
        dx_clamped = torch.clamp(dx, min=-eps, max=eps)
        x_adv = torch.clamp(x + dx_clamped, min=0, max=1)
        # In-place update for proper optimizer tracking.
        dx += x_adv - x - dx
        return dx

    def _get_batch_sizes(self, n, max_batch_size):
        batches = [max_batch_size for _ in range(n // max_batch_size)]
        if n % max_batch_size > 0:
            batches.append(n % max_batch_size)
        return batches

    @torch.no_grad()
    def spsa_grad(self, images, labels, delta, nb_sample, max_batch_size):
        # images shape: (B, C, H, W)
        grad = torch.zeros_like(images)
        B = images.shape[0]

        images = images.unsqueeze(1)   # (B, 1, C, H, W)
        labels = labels.unsqueeze(1)   # (B, 1, P)

        images = images.expand(B, max_batch_size, *images.shape[2:]).contiguous()  # (B, max_batch_size, C, H, W)
        labels = labels.expand(B, max_batch_size, *labels.shape[2:]).contiguous()

        v = torch.empty_like(images[:, :, :1, ...])  # (B, max_batch_size, 1, H, W)
        for current_batch in self._get_batch_sizes(nb_sample, max_batch_size):
            x_batch = images[:, :current_batch].contiguous()  # (B, current_batch, C, H, W)
            y_batch = labels[:, :current_batch].contiguous()    # (B, current_batch, P)
            v_batch = v[:, :current_batch]
            v_batch.bernoulli_().mul_(2.0).sub_(1.0)
            v_batch_exp = v_batch.expand_as(x_batch).contiguous()  # (B, current_batch, C, H, W)

            B_curr, bs, C, H, W = x_batch.shape
            x_batch_reshaped = x_batch.view(B_curr * bs, C, H, W)
            y_batch_reshaped = y_batch.view(B_curr * bs, -1)
            v_batch_reshaped = v_batch_exp.view(B_curr * bs, C, H, W)

            df = self.loss(x_batch_reshaped + delta * v_batch_reshaped, y_batch_reshaped) \
                 - self.loss(x_batch_reshaped - delta * v_batch_reshaped, y_batch_reshaped)
            df = df.view(-1, *([1] * (v_batch_reshaped.dim()-1)))
            grad_batch = (df / (2.0 * delta)) * v_batch_reshaped # equivalent to original code as each element of v_batch_reshaped is +-1
            grad_batch = grad_batch.view(B_curr, bs, C, H, W)
            grad += grad_batch.sum(dim=1)

        grad /= nb_sample
        return grad

    def spsa_perturb(self, x, y):
        dx = torch.zeros_like(x)
        dx.grad = torch.zeros_like(dx)
        optimizer = torch.optim.Adam([dx], lr=self.lr)
        for _ in range(self.nb_iter):
            optimizer.zero_grad()
            dx.grad = self.spsa_grad(x + dx, y, self.delta, self.nb_sample, self.max_batch_size)
            optimizer.step()
            dx = self.linf_clamp_(dx, x, self.eps)
        x_adv = x + dx
        return x_adv

In [24]:
class AttackFactory:
    @staticmethod
    def create_attack(
        attack_type: str,
        model: BaseModel,
        **kwargs,
    ) -> BaseAttack:
        if attack_type == "gn":
            return GNRewardModel(model)
        elif attack_type == "fgsm":
            return FGSMRewardModel(model, **kwargs)
        elif attack_type == "pgd":
            return PGDRewardModel(model, **kwargs)
        elif attack_type == "spsa":
            return SPSARewardModel(model, **kwargs)
        else:
            raise ValueError("Unsupported attack type.")

7. Define Arguments and Utils

In [25]:
def check_reward_model(value):
    valid_versions = ["v1.0", "v2.0"]
    if value not in valid_versions:
        raise argparse.ArgumentTypeError(
            "reward_model_name must be one of: 'v1.0', 'v2.0'.")
    return value

def check_dataset_name(value):
    if value not in ['hps', 'drawbench']:
        raise argparse.ArgumentTypeError(
            "dataset_name must be either 'hps' or 'drawbench'.")
    return value

def check_attack_name(value):
    if value not in ['gn', 'fgsm', 'pgd', 'spsa']:
        raise argparse.ArgumentTypeError(
            "attack_name must be one of: 'gn', 'fgsm', 'pgd', 'spsa'.")
    return value

def parse_attack_args():
    parser = argparse.ArgumentParser(
        description="Argument partser for attack process."
    )

    # Models group
    models = parser.add_argument_group("models")
    models.add_argument("--reward_model_name", type=check_reward_model, required=True,
                        help="HPS reward model version: v1.0, v2.0")
    models.add_argument("--reward_threshold", type=float, default=15.0,
                        help="Minimum reward score for attack (default: 15.0)")

    # Datasets group
    datasets = parser.add_argument_group("datasets")
    datasets.add_argument("--dataset_name", type=check_dataset_name, required=True,
                        help="Dataset for generating preliminary images: 'hps' or 'drawbench'")
    datasets.add_argument("--num_samples_per_category", type=int, default=None,
                        help="Number of text prompts per category (default: 5 for hps, 2 for drawbench)")

    # Attack group
    attack = parser.add_argument_group("attack")
    attack.add_argument("--attack_name", type=check_attack_name, required=True,
                        help="Name of the perturbation attack: gn, fgsm, pgd, or spsa")

    # Misc group
    misc = parser.add_argument_group("misc")
    misc.add_argument("--saved_images_path", type=str, required=True,
                        help="Path where base images and prompts are stored")
    misc.add_argument("--attack_batch_size", type=int, default=8,
                        help="Batch size for PGD & FGSM attack (default: 8)")
    misc.add_argument("--no_save_image_results", dest="save_image_results", action="store_false",
                        help="Do not store adversarial images, prompts, and reward scores")

    args = parser.parse_args()
    if args.num_samples_per_category is None:
        if args.dataset_name == "hps":
            args.num_samples_per_category = 5
        else:  # drawbench
            args.num_samples_per_category = 2
    return args

In [26]:
def compute_reward_statistics(top_k_prompts, adv_rewards):
    """
    Compute reward statistics using both the original rewards from top_k_prompts
    and the adversarial rewards in adv_rewards.

    Parameters:
        top_k_prompts (list): List of tuples (category, prompt, original_reward, image)
        adv_rewards (list): List of adversarial rewards corresponding to each prompt.
    """
    if not top_k_prompts or not adv_rewards:
        return {
            "average_original": 0.0,
            "average_adversarial": 0.0,
            "per_category_original": {},
            "per_category_adversarial": {}
        }

    original_rewards = [entry[2] for entry in top_k_prompts]
    avg_original = sum(original_rewards) / len(original_rewards)
    avg_adv = sum(adv_rewards) / len(adv_rewards)

    per_category_orig = {}
    per_category_adv = {}

    for (cat, _, orig_reward, _), adv_reward in zip(top_k_prompts, adv_rewards):
        per_category_orig.setdefault(cat, []).append(orig_reward)
        per_category_adv.setdefault(cat, []).append(adv_reward)

    per_category_orig_avg = {cat: sum(scores) / len(scores) for cat, scores in per_category_orig.items()}
    per_category_adv_avg = {cat: sum(scores) / len(scores) for cat, scores in per_category_adv.items()}

    return {
        "average_original": avg_original,
        "average_adversarial": avg_adv,
        "per_category_original": per_category_orig_avg,
        "per_category_adversarial": per_category_adv_avg
    }

In [27]:
def clear_cuda_memory_and_force_gc(force: bool = False):
    """
    Clears the CUDA memory cache and forces garbage collection if the allocated memory
    exceeds a certain threshold or if explicitly forced.

    Args:
        force (bool): If True, CUDA cache will be cleared and garbage collection
                      will be forced regardless of the memory threshold.
    """

    memory_allocated = torch.cuda.max_memory_reserved()
    memory_total = torch.cuda.get_device_properties("cuda").total_memory

    memory_threshold = memory_total * 0.7
    if memory_allocated > memory_threshold or force:
        torch.cuda.empty_cache()
        gc.collect()

In [28]:
def numerical_key(filename):
    # Extract the number from filename like "image_1.png"
    match = re.search(r'\d+', filename)
    return int(match.group()) if match else -1

In [29]:
class SampledDataset(Dataset):
    def __init__(self, prompts, images=None, transforms=None):
        self.data = [{"category": c, "prompt": p} for c, p in zip(prompts["category"], prompts["prompt"])]
        self.images = images
        self.transforms = transforms
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.images is not None:
            return self.transforms(self.images[idx]), self.data[idx]
        return self.data[idx]

8. Run Adversarial Attack

In [30]:
reward_model = ModelFactory.create_model(
    model_type="hpsv1",
    model_path="hpc.pt" #HPS_v2_compressed
)

  checkpoint = torch.load(self.model_path)


In [31]:
to_pil = T.ToPILImage()

In [32]:
def run_attack_rank_model(run_attack_args):
    print(run_attack_args)

    image_directory = run_attack_args.saved_images_path
    prompts_file = os.path.join(image_directory, "prompts.txt")
    prompts = {"category": [], "prompt": []}

    with open(prompts_file, "r") as pf:
        for line in pf:
            content = line.split(": ", 1)[1].strip()
            if content.startswith("(") and content.endswith(")"):
                category, prompt = ast.literal_eval(content)
                prompts["category"].append(category)
                prompts["prompt"].append(prompt)

    image_files = sorted([f for f in os.listdir(image_directory) if f.endswith(".png")], key=numerical_key)
    original_images = [Image.open(os.path.join(image_directory, img_file)) for img_file in image_files]

    dataset = SampledDataset(
        prompts=prompts, images=original_images,
        transforms=reward_model.preprocess_function
    )
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    all_results = []  # will store tuples: (category, prompt, reward_score)
    total_batches = len(dataloader)
    pbar = tqdm(total=total_batches, desc="Ranking images and prompts")

    for batch in dataloader:
        images, prompts = batch
        categories  = prompts["category"]
        prompt_texts = prompts["prompt"]

        reward_scores = reward_model.inference(inputs=images, captions=prompt_texts)
        for cat, pr, score, image in zip(categories, prompt_texts, reward_scores, images):
            all_results.append((cat, pr, score, to_pil(image.cpu())))
        pbar.update(1)
    pbar.close()

    filtered_results = [entry for entry in all_results if entry[2] >= run_attack_args.reward_threshold]
    ranked_results = sorted(filtered_results, key=lambda x: x[2], reverse=True)

    k = run_attack_args.num_samples_per_category * 4 if run_attack_args.dataset_name == "hps" else run_attack_args.num_samples_per_category * 11
    top_k_prompts = ranked_results[:k]

    print("Top prompts:")
    for idx, (cat, pr, score, _) in enumerate(top_k_prompts):
        print(f"Image {idx}: ({cat}, {pr}) with score {score}")
    return top_k_prompts, reward_model

def run_attack_reward_model(run_attack_args, top_k_prompts, reward_model):
    attack = AttackFactory.create_attack(
        attack_type=run_attack_args.attack_name,
        model=reward_model,
        batch_size=run_attack_args.attack_batch_size
    )

    prompts = [(cat, pr) for cat, pr, _, _ in top_k_prompts]
    prompts_only = [pr for _, pr, _, _ in top_k_prompts]
    original_images = [image for _, _, _, image in top_k_prompts]
    original_rewards = [score for _, _, score, _ in top_k_prompts]

    adv_images = attack(
        inputs=original_images,
        labels=prompts,
    )
    adv_rewards = reward_model.inference(inputs=adv_images, captions=prompts_only)

    if run_attack_args.save_image_results:
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        parts = run_attack_args.saved_images_path.split(os.sep)
        output_dir = f"outputs/{parts[1]}/{run_attack_args.dataset_name}_adversarial/{run_attack_args.attack_name}/{run_attack_args.reward_model_name}/{timestamp}/"
        os.makedirs(output_dir, exist_ok=True)

        prompts_file_path = os.path.join(output_dir, "prompts.txt")
        with open(prompts_file_path, "w") as pf:
            for idx, (pr, orig_r, adv_r, adv_img) in enumerate(zip(prompts, original_rewards, adv_rewards, adv_images)):
                image_filename = os.path.join(output_dir, f"image_{idx}.png")
                pil_img = to_pil(adv_img.cpu())
                pil_img.save(image_filename)
                pf.write(f"Image {idx}: ({repr(pr[0])}, {repr(pr[1])}, {orig_r}, {adv_r})\n")

    stats = compute_reward_statistics(top_k_prompts, adv_rewards)
    print("\n" + "=" * 40)
    print("Overall Reward Statistics:")
    print(f"  Original: {stats['average_original']} | Adversarial: {stats['average_adversarial']}\n")

    print("Per-Category Comparison:")
    all_categories = set(stats["per_category_original"].keys()).union(stats["per_category_adversarial"].keys())
    for cat in all_categories:
        orig = stats["per_category_original"].get(cat, 0)
        adv = stats["per_category_adversarial"].get(cat, 0)
        print(f"  {cat}: Original = {orig} | Adversarial = {adv}")
    print("=" * 40)

    clear_cuda_memory_and_force_gc(force=True)

In [33]:
import argparse
import sys

sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--reward_model_name", "v1.0",
    "--dataset_name", "hps",
    "--attack_name", "gn",
    "--saved_images_path", "outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37",
]

args = parse_attack_args()
top_k_prompts, reward_model = run_attack_rank_model(args)

Namespace(reward_model_name='v1.0', reward_threshold=15.0, dataset_name='hps', num_samples_per_category=5, attack_name='gn', saved_images_path='outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37', attack_batch_size=8, save_image_results=True)


Ranking images and prompts: 100%|██████████| 5/5 [00:02<00:00,  2.23it/s]

Top prompts:
Image 0: (concept-art, A giant burning pineapple illuminates the forest and mountain backdrop in this cinematic concept art for a video game.) with score 23.40625
Image 1: (concept-art, A close-up portrait of a beautiful girl with an autumn leaves headdress and melting wax.) with score 23.28125
Image 2: (concept-art, A dragon attacking burning medieval hobbit homes in a picturesque landscape with a waterfall and bridge.) with score 22.875
Image 3: (anime, A white-haired girl in a pink sweater looks out a window in her bedroom.) with score 22.84375
Image 4: (paintings, A head-on centered symmetrical portrait of Elisha Cuthbert as a holy paladin, wearing steel armour and with blonde hair, depicted in a highly detailed digital painting with dramatic lighting, in the style of Artgerm and Anna Podedworna.) with score 22.8125
Image 5: (paintings, Museum painting of a mouse stealing cheese artwork.) with score 22.453125
Image 6: (paintings, A painting of a Persian cat dressed as 




8.1 Gaussian Noise Attack

In [34]:
import argparse
import sys

sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--reward_model_name", "v1.0",
    "--dataset_name", "hps",
    "--attack_name", "gn",
    "--saved_images_path", "outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37",
]

args = parse_attack_args()
run_attack_reward_model(args, top_k_prompts, reward_model)


Overall Reward Statistics:
  Original: 21.928125 | Adversarial: 19.5265625

Per-Category Comparison:
  anime: Original = 21.61875 | Adversarial = 20.028125
  photo: Original = 21.3125 | Adversarial = 19.682291666666668
  concept-art: Original = 22.65 | Adversarial = 18.771875
  paintings: Original = 22.3359375 | Adversarial = 19.609375


8.2 FGSM Attack (Aggregate reward score over batches)

In [35]:
sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--reward_model_name", "v1.0",
    "--dataset_name", "hps",
    "--attack_name", "fgsm",
    "--attack_batch_size", "8",
    "--saved_images_path", "outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37",
]

args = parse_attack_args()
run_attack_reward_model(args, top_k_prompts, reward_model)


Overall Reward Statistics:
  Original: 21.928125 | Adversarial: 19.4671875

Per-Category Comparison:
  anime: Original = 21.61875 | Adversarial = 20.065625
  photo: Original = 21.3125 | Adversarial = 19.653645833333332
  concept-art: Original = 22.65 | Adversarial = 18.5375
  paintings: Original = 22.3359375 | Adversarial = 19.6015625


8.3 FGSM Attack (Update each image individually)

In [36]:
sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--reward_model_name", "v1.0",
    "--dataset_name", "hps",
    "--attack_name", "fgsm",
    "--attack_batch_size", "1",
    "--saved_images_path", "outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37",
]

args = parse_attack_args()
run_attack_reward_model(args, top_k_prompts, reward_model)


Overall Reward Statistics:
  Original: 21.928125 | Adversarial: 19.4671875

Per-Category Comparison:
  anime: Original = 21.61875 | Adversarial = 20.065625
  photo: Original = 21.3125 | Adversarial = 19.653645833333332
  concept-art: Original = 22.65 | Adversarial = 18.5375
  paintings: Original = 22.3359375 | Adversarial = 19.6015625


8.4 PGD Attack

In [37]:
clear_cuda_memory_and_force_gc()

In [38]:
sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--reward_model_name", "v1.0",
    "--dataset_name", "hps",
    "--attack_name", "pgd",
    "--attack_batch_size", "2",
    "--saved_images_path", "outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37",
]

args = parse_attack_args()
run_attack_reward_model(args, top_k_prompts, reward_model)


Overall Reward Statistics:
  Original: 21.928125 | Adversarial: 16.597265625

Per-Category Comparison:
  anime: Original = 21.61875 | Adversarial = 16.553125
  photo: Original = 21.3125 | Adversarial = 17.256510416666668
  concept-art: Original = 22.65 | Adversarial = 15.44375
  paintings: Original = 22.3359375 | Adversarial = 17.10546875


8.5 SPSA Attack

In [39]:
sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--reward_model_name", "v1.0",
    "--dataset_name", "hps",
    "--attack_name", "spsa",
    "--saved_images_path", "outputs/stable-diffusion-2-1-base/hps/2025-03-08-04-07-37",
]

args = parse_attack_args()
run_attack_reward_model(args, top_k_prompts, reward_model)


Overall Reward Statistics:
  Original: 21.928125 | Adversarial: 19.47109375

Per-Category Comparison:
  anime: Original = 21.61875 | Adversarial = 20.071875
  photo: Original = 21.3125 | Adversarial = 19.651041666666668
  concept-art: Original = 22.65 | Adversarial = 18.5625
  paintings: Original = 22.3359375 | Adversarial = 19.5859375
