# Combining Multiple Models

## Image to Text + Text to Image

In [None]:
# import PIL
import torch
from PIL import Image, ImageOps
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import StableDiffusionPipeline

models

https://huggingface.co/Salesforce/blip-image-captioning-base

https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5

In [None]:
# Hugging Face Hub ids of the two checkpoints used in this section.
image_to_text_model = "Salesforce/blip-image-captioning-base"
# "runwayml/stable-diffusion-v1-5" is the deprecated id of this checkpoint;
# use the mirror repository that the model link above points to.
text_to_image_model = "stable-diffusion-v1-5/stable-diffusion-v1-5"

- `BlipProcessor` prepares images for the model

- `BlipForConditionalGeneration` generates text descriptions for images

- `.to("mps")` moves the model to the Apple Metal Performance Shaders (MPS) device for GPU acceleration on macOS
- can use "cuda" alternatively on PC

In [None]:
# Processor that prepares images for the BLIP captioning model.
processor = BlipProcessor.from_pretrained(image_to_text_model)
# Model that generates the text description of an image.
blip_model = BlipForConditionalGeneration.from_pretrained(image_to_text_model)
# Run on the Apple Metal (MPS) device; "cuda" is the alternative on a PC.
blip_model.to("mps")

In [None]:
# Build the Stable Diffusion text-to-image pipeline in half precision,
# with the safety checker switched off.
pipe = StableDiffusionPipeline.from_pretrained(
    text_to_image_model,
    torch_dtype=torch.float16,
    safety_checker=None,
)
# Run the pipeline on the Apple Metal (MPS) device.
pipe.to("mps")

load image (right side up and at most of size 512x512)

In [None]:
file_path = "./photo.jpg"


def load_image(file_path_, max_size=(512, 512)):
    """Read an image, apply its EXIF orientation, and shrink it to ``max_size``.

    Args:
        file_path_: Path of the image file to open.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``.

    Returns:
        The image converted to RGB mode.
    """
    upright_rgb = ImageOps.exif_transpose(Image.open(file_path_)).convert("RGB")
    # thumbnail() modifies the image in place, so its return value is not used.
    upright_rgb.thumbnail(max_size)
    return upright_rgb


image = load_image(file_path)

- prepare image for blip model by converting it into a pytorch tensor format

- `blip_model.generate` generates image caption

  - `num_beams` controls the number of beams used for beam search, a decoding strategy to improve output quality

- `processor.decode` decodes the generated output to a human-readable string

In [None]:
# Step 1: turn the image into PyTorch tensors and place them on the MPS device.
inputs = processor(image, return_tensors="pt").to("mps")

# Step 2: generate caption tokens using beam search with 20 beams.
outputs = blip_model.generate(num_beams=20, **inputs)

# Step 3: decode the first generated sequence into readable text.
description = processor.decode(
    outputs[0],
    skip_special_tokens=True,
)

print(f"Description: {description}")

- `num_inference_steps=25` controls number of steps in the diffusion process (higher values may improve quality but increase processing time)

- `guidance_scale=20` determines how strongly the text description steers the generated image (higher values make the output match the description more closely, usually at some cost in image quality)

In [None]:
# Embed the BLIP caption in a Disney-style portrait prompt.
modified_description = f"A Disney-style portrait of {description} with large, expressive eyes and a whimsical smile."

# Run the text-to-image pipeline on that prompt and keep the first image.
generated_image = pipe(
    modified_description,
    num_inference_steps=25,
    guidance_scale=20,
).images[0]

In [None]:
# Write the generated image next to the notebook.
output_path = "./output_imagev2.jpg"
generated_image.save(output_path)

## Image to Text + Image to Image

In [None]:
# import PIL
import torch
from PIL import Image, ImageOps
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
from transformers import BlipProcessor, BlipForConditionalGeneration

models

https://huggingface.co/Salesforce/blip-image-captioning-base

https://huggingface.co/timbrooks/instruct-pix2pix

In [None]:
# Hugging Face Hub ids for this section (see the model links above):
# BLIP produces the caption, InstructPix2Pix edits the photo.
image_to_text_model = "Salesforce/blip-image-captioning-base"
image_to_image_model = "timbrooks/instruct-pix2pix"

- `BlipProcessor` prepares images for the model

- `BlipForConditionalGeneration` generates text descriptions for images

- `.to("cpu")` moves the model to cpu

- can use `"mps"` for GPU acceleration on macOS
- can use `"cuda"` alternatively on PC

In [None]:
# Load the image-to-text (captioning) model and its processor.
# The processor prepares images for the model.
processor = BlipProcessor.from_pretrained(image_to_text_model)
blip_model = BlipForConditionalGeneration.from_pretrained(image_to_text_model)
# Keep the model on the CPU; "mps" (macOS) or "cuda" (PC) can be used instead.
blip_model.to("cpu")

`EulerAncestralDiscreteScheduler`: A scheduler is used to guide the image generation process by determining how noise is added and removed during the diffusion steps

Starts with a noisy, random image and gradually reduces the noise step by step

As the noise is removed, model refines image based on prompt

In [None]:
# Load the InstructPix2Pix image-to-image pipeline.
# float32 is used because the pipeline runs on the CPU here: diffusers warns that
# pipelines loaded in float16 cannot reliably run on "cpu". Switch back to
# torch.float16 only together with a GPU device ("mps" or "cuda").
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    image_to_image_model,
    torch_dtype=torch.float32,
    safety_checker=None,
)
pipe.to("cpu")
# Replace the default scheduler with the Euler-ancestral one, reusing its config.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

load image (right side up and at most of size 512x512)

In [None]:
file_path = "photo.jpg"


def load_image(file_path_, max_size=(512, 512)):
    """Return the image at ``file_path_`` upright, in RGB, and bounded by ``max_size``.

    Args:
        file_path_: Path of the image file to open.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``.

    Returns:
        The RGB image after EXIF transposition and in-place thumbnailing.
    """
    opened = Image.open(file_path_)
    rgb = ImageOps.exif_transpose(opened).convert("RGB")
    # thumbnail() works in place; the resized image is `rgb` itself.
    rgb.thumbnail(max_size)
    return rgb


image = load_image(file_path)

- prepare image for blip model by converting it into a pytorch tensor format

- `blip_model.generate` generates image caption

  - `num_beams` controls the number of beams used for beam search, a decoding strategy to improve output quality

- `processor.decode` decodes the generated output to a human-readable string

In [None]:
 # getting description from image
# Convert the image into PyTorch tensors on the CPU.
inputs = processor(image, return_tensors="pt").to("cpu")

# Beam search with 20 beams produces the caption token ids.
outputs = blip_model.generate(num_beams=20, **inputs)

# Decode the first sequence into a human-readable caption.
description = processor.decode(
    outputs[0],
    skip_special_tokens=True,
)

print(f"Description: {description}")

- `num_inference_steps=25` controls number of steps in the diffusion process (higher values may improve quality but increase processing time)

- `image_guidance_scale=1` determines how closely the output should stay to the original image (higher values keep the result closer to the original; lower values give the prompt more freedom)

In [None]:
# Turn the BLIP caption into an edit instruction for InstructPix2Pix.
modified_description = f"Create a Disney-style portrait of {description} with large, expressive eyes and a whimsical smile."

# Edit the loaded photo according to that instruction.
images = pipe(
    modified_description,
    image=image,
    num_inference_steps=25,
    image_guidance_scale=1,
).images

In [None]:
# Save the first image returned by the pipeline.
output_path = "./output_image.jpg"
images[0].save(output_path)

# Inpainting

In [None]:
import cv2
import numpy as np

# Load the image the mask will be drawn on.
image = cv2.imread('inpaint.png')
if image is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError("Could not read 'inpaint.png'")
original_image = image.copy()  # Untouched copy restored by the 'r' key
mask = np.zeros(image.shape[:2], dtype=np.uint8)  # Mask initialized as black
drawing = False  # True while the left mouse button is held down
points = []  # Vertices of the outline currently being drawn


def draw_polygon(event, x, y, flags, param):
    """Mouse callback that traces a freehand outline and fills it into ``mask``.

    Left button down starts a new outline, mouse movement while the button is
    held extends it (drawn in green on ``image``), and left button up closes
    the outline and fills its interior with 255 in ``mask``.

    Args:
        event: OpenCV mouse event code.
        x: Cursor x coordinate reported with the event.
        y: Cursor y coordinate reported with the event.
        flags: Unused; part of the OpenCV callback signature.
        param: Unused; part of the OpenCV callback signature.
    """
    global drawing, points, mask

    if event == cv2.EVENT_LBUTTONDOWN:  # Start a new outline
        drawing = True
        points = [(x, y)]

    elif event == cv2.EVENT_MOUSEMOVE:
        if drawing:
            points.append((x, y))
            cv2.line(image, points[-2], points[-1], (0, 255, 0), 2)  # Green segment

    elif event == cv2.EVENT_LBUTTONUP:
        if not drawing:
            # A release without a matching press (e.g. the button was pressed
            # outside the window, or the drawing was reset mid-stroke) must not
            # re-fill the mask from stale points.
            return
        drawing = False
        points.append((x, y))

        # Close the shape by joining the last point to the first.
        cv2.line(image, points[-1], points[0], (0, 255, 0), 2)

        # Fill the inside of the drawn shape with white on the mask.
        points_array = np.array(points, dtype=np.int32)
        cv2.fillPoly(mask, [points_array], 255)


# Set up window and bind mouse callback
cv2.namedWindow('Draw Mask')
cv2.setMouseCallback('Draw Mask', draw_polygon)

try:
    while True:
        cv2.imshow('Draw Mask', image)

        # Display the masked result in real-time
        masked_result = np.where(mask[:, :, None] == 255, 255, 0).astype(np.uint8)
        cv2.imshow('Masked Result', masked_result)

        # Keep only the low byte so the comparison with ord(...) also works on
        # platforms where waitKey sets higher bits.
        key = cv2.waitKey(1) & 0xFF

        if key == ord('r'):  # Press 'r' to reset the drawing
            image = original_image.copy()
            mask = np.zeros(image.shape[:2], dtype=np.uint8)
            points = []
            drawing = False

        elif key == ord('q'):  # Press 'q' to quit and save
            break
finally:
    # Close the windows even if the cell is interrupted.
    cv2.destroyAllWindows()

# Save the final mask
cv2.imwrite('inpaint_mask.png', mask)

In [None]:
import torch
from diffusers import AutoPipelineForInpainting
# NOTE: this `load_image` (from diffusers) replaces any `load_image` helper
# defined earlier in the kernel.
from diffusers.utils import load_image, make_image_grid

# Load the inpainting model with half-precision for efficiency
pipeline = AutoPipelineForInpainting.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
)

# Load the base image and mask (white pixels define areas to modify)
init_image = load_image("inpaint.png")
mask_image = load_image("inpaint_mask.png")

# Offload model layers to CPU when not in use to save memory
pipeline.enable_model_cpu_offload()

# Set up random generator for reproducible results
generator = torch.Generator("cuda").manual_seed(92)

# Run the inpainting model with the given prompt.
# The seeded generator must be passed explicitly; otherwise the pipeline draws
# its own random noise and the fixed seed has no effect.
prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k"
image = pipeline(
    prompt=prompt,
    image=init_image,
    mask_image=mask_image,
    generator=generator,
).images[0]

# Display the resulting image in a grid
make_image_grid([image], rows=1, cols=1)


1. **Model Selection and Loading:**
`AutoPipelineForInpainting` loads a pre-trained inpainting model. In this case, it’s the `Kandinsky-2-2 Decoder` model, suitable for creating detailed, AI-assisted modifications to images.
Setting `torch_dtype` to `torch.float16` helps optimize memory usage by loading the model in half-precision.
2. **Image and Mask Preparation:**
`init_image` is the original image we want to inpaint or modify, and `mask_image` is the guide mask image. Areas marked in white in the mask image are eligible for modification based on the prompt.

3. **Generator for Reproducibility:**
By passing a `torch.Generator` with a fixed seed to the pipeline (`generator=generator`), we ensure that every run with the same seed produces identical results.

4. **Running the Inpainting Process:**
The pipeline generates an output image based on the input prompt and the provided mask, blending the prompt’s style into the inpainted areas of init_image.

5. **Image Display:**
make_image_grid creates a grid layout to visualize the output image(s). Here it’s a 1x1 grid, displaying only the generated result.

# Image-to-image

In [None]:
import torch
from PIL import Image, ImageOps
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
from transformers import BlipProcessor, BlipForConditionalGeneration

```StableDiffusionInstructPix2PixPipeline``` -> This is a pipeline for the model ***InstructPix2Pix*** which takes an image and modifies it based on a specified prompt

In [None]:
# Load the InstructPix2Pix model.
model_id = "timbrooks/instruct-pix2pix"
# float32 is used because the pipeline runs on the CPU here: diffusers warns that
# pipelines loaded in float16 cannot reliably run on "cpu". Switch back to
# torch.float16 only together with a GPU device ("mps" or "cuda").
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    safety_checker=None,
)
pipe.to("cpu")
# Replace the default scheduler with the Euler-ancestral one, reusing its config.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

loads and edits the image as needed

In [None]:
file_path = "photo.jpg"


def load_image(file_path_, max_size=(512, 512)):
    """Load a photo for editing: EXIF-upright, RGB, thumbnailed to ``max_size``.

    Args:
        file_path_: Path of the image file to open.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``.

    Returns:
        The prepared RGB image.
    """
    photo = ImageOps.exif_transpose(Image.open(file_path_))
    photo = photo.convert("RGB")
    photo.thumbnail(max_size)  # in-place resize; the return value is not used
    return photo


image = load_image(file_path)

Prompts for human or other clearly identifiable subjects

In [None]:
# Candidate prompts for human (or otherwise clearly identifiable) subjects.
# One assignment must stay active: the generation cell below reads `prompt`,
# and with every line commented out it fails with a NameError on a fresh kernel.
#prompt = "Modify subject into a western style character"
#prompt = "Modify the subject to resemble a gigachad, strong jawline, defined cheekbones, ensure he remains recognizable. Keep the background unchanged and the overall expression natural."
prompt = "Modify subject as a disney movie character style, large expressive eyes, whimsical smile, preserve the original face shape, preserve clothes"
#prompt = "Modify the subject to resemble a comic book character, preserve face shape, preserve background"
#prompt = "as a comic"
#prompt = "Modify the subject to resemble an anime style character, preserve face shape"
#prompt = "as an anime"
#prompt = "as an astronaut"

Prompts for general picture style / Non human photos

In [None]:
# Alternative prompts for general picture styles / non-human photos.
# Uncomment one of them to use it as `prompt` in the generation cell.
#prompt = "make the subject a cartoon like character"
#prompt = "add red flowers in the background"
#prompt = "make the subject a dark brown, keep mouth and tongue unchanged"
#prompt = "make the picture a van gogh style painting"
#prompt = "as a basquiat style painting"
#prompt = "as an anime"

negative prompt

In [None]:
# Negative prompt: characteristics the edited image should avoid,
# assembled from its comma-separated parts.
neg = ", ".join(
    [
        "distortions",
        "unrecognizable features unnatural characteristics",
        "bad anatomy",
        "subject not recognizable",
    ]
)

In [None]:
output_path = "./output_image2.jpg"

# Edit the photo according to `prompt`, steering away from `neg`.
images = pipe(
    prompt,
    negative_prompt=neg,
    image=image,
    num_inference_steps=25,
    guidance_scale=5.5,
    image_guidance_scale=1.5,
).images

# Keep the first result.
images[0].save(output_path)


```prompt``` -> the prompt that gives instructions to the model

```negative_prompt = neg``` -> negative prompt specified with undesired characteristics in the resulting image

```image = image``` -> image we want to modify

```num_inference_steps = 25``` -> number of **denoising steps** (more steps generally give a higher-quality output image but take longer)

```guidance_scale = 5.5``` -> how closely followed the prompt we specified will be (higher value = prompt will be followed more closely, lower value = model has more "freedom")

```image_guidance_scale= 1.5``` -> how similar the output image should be to the original image (higher values lead to more similar outputs)
