In [1]:
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from PIL import Image
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
import cv2
from insightface.app import FaceAnalysis
from insightface.utils import face_align
import subprocess

# Initialize models and face analysis
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

v2 = False
base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = "ip-adapter-faceid-plus_sd15.bin" if not v2 else "ip-adapter-faceid-plusv2_sd15.bin"
device = "cuda"

noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

# Load IP-Adapter
ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)

# Function to generate text prompts using Mistral
def generate_text_with_mistral(prompt):
    try:
        structured_prompt = (
            f"{prompt}\n\n"
            "Please summarize this story in exactly 4 concise and coherent sentences. "
            "Do not include any additional text."
        )
        
        command = ["ollama", "run", "mistral-nemo"]
        result = subprocess.run(
            command, input=structured_prompt,
            capture_output=True,
            text=True
        )
        
        if result.returncode != 0:
            raise Exception(f"Error generating text: {result.stderr}")
        
        output = result.stdout.strip().replace('\n', ' ')
        sentences = [s.strip() for s in output.replace(';', '.').split('. ') if s.strip()]

        summarized_sentences = sentences[:4]
        return summarized_sentences

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Function to generate images based on the story and image
def generate_images_from_story(image_file, story_text):
    # Load and process the face image
    image = cv2.imread(image_file)
    faces = app.get(image)

    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
    face_image = face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)

    # Step 1: Generate 4 prompts from the story
    summarized_prompts = generate_text_with_mistral(story_text)
    
    if not summarized_prompts:
        return [None, None, None, None]

    negative_prompt = "multiple hands, deformed fingers, monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
    
    # Step 2: Generate 4 images based on the prompts
    generated_images = []
    for i, prompt in enumerate(summarized_prompts):
        images = ip_model.generate(
            prompt=prompt,
            negative_prompt=negative_prompt,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=1.0,
            num_samples=1,
            width=512,
            height=768,
            num_inference_steps=35,
            seed=2023,
            guidance_scale=8
        )
        # Append generated image
        for img in images:
            generated_images.append(img)
    
    return generated_images

# Gradio interface function
def gradio_interface(image, story):
    generated_images = generate_images_from_story(image, story)
    
    return generated_images

# Gradio app setup
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Image(type="filepath"), gr.Textbox(lines=10, placeholder="Enter your story here...")],
    outputs=[gr.Image(label=f"Generated Image {i+1}") for i in range(4)],
    title="Story-to-Image Generator with Face Integration",
    description="Upload an image and enter a story. The app will generate images based on the story and integrate the face from the uploaded image."
)

# Launch the Gradio app
gr_interface.launch()


  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

Loading pipeline components...: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
  deprecate("LoRALinearLayer", "1.0.0", deprecation_message)
  state_dict = torch.load(self.ip_ckpt, map_location="cpu")


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


# Working Trial

In [1]:
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from PIL import Image
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
import cv2
from insightface.app import FaceAnalysis
from insightface.utils import face_align
import subprocess
import os

# Initialize models and face analysis
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

v2 = False
base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = "ip-adapter-faceid-plus_sd15.bin" if not v2 else "ip-adapter-faceid-plusv2_sd15.bin"
device = "cuda"

noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

# Load IP-Adapter
ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)

# Function to generate text prompts using Mistral
def generate_text_with_mistral(prompt):
    try:
        structured_prompt = (
            f"{prompt}\n\n"
            "Please summarize and break this story about a scientist flow-wise in exactly 4 concise and coherent sentences. Each sentence should have a maximum of 12 words and must be scientific and in simple but scientific words. These sentences must be prompts for stable diffusion to generate images."
            "Do not include any additional text."
        )
        
        command = ["ollama", "run", "mistral-nemo"]
        result = subprocess.run(
            command, input=structured_prompt,
            capture_output=True,
            text=True
        )
        
        if result.returncode != 0:
            raise Exception(f"Error generating text: {result.stderr}")
        
        output = result.stdout.strip().replace('\n', ' ')
        sentences = [s.strip() for s in output.replace(';', '.').split('. ') if s.strip()]

        summarized_sentences = sentences[:4]
        return summarized_sentences

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Function to generate images based on the story and image
def generate_images_from_story(image_file, story_text):
    # Load and process the face image
    image = cv2.imread(image_file)
    faces = app.get(image)

    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
    face_image = face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)

    # Step 1: Generate 4 prompts from the story
    summarized_prompts = generate_text_with_mistral(story_text)
    
    if not summarized_prompts:
        return [None, None, None, None]

    negative_prompt = "(multiple faces), (muliple people), multiple hands, deformed fingers, monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
    
    # Step 2: Generate 4 images based on the prompts and save them to file paths
    generated_images_paths = []
    for i, prompt in enumerate(summarized_prompts):
        images = ip_model.generate(
            prompt=prompt,
            negative_prompt=negative_prompt,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=1.0,
            num_samples=1,
            width=512,
            height=768,
            num_inference_steps=35,
            seed=2023,
            guidance_scale=8
        )
        
        # Save the generated images to files and return the file paths
        for j, img in enumerate(images):
            img_path = f"generated_image_{i}_{j}.png"
            img.save(img_path)
            generated_images_paths.append(img_path)
    
    return generated_images_paths

# Gradio interface function
def gradio_interface(image, story):
    generated_image_paths = generate_images_from_story(image, story)
    return generated_image_paths

# Gradio app setup
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Image(type="filepath"), gr.Textbox(lines=10, placeholder="Enter your story here...")],
    outputs=[gr.Image(label=f"Generated Image {i+1}") for i in range(4)],
    title="Story-to-Image Generator with Face Integration",
    description="Upload an image and enter a story. The app will generate images based on the story and integrate the face from the uploaded image."
)

# Launch the Gradio app
gr_interface.launch()


  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

Loading pipeline components...: 100%|██████████| 5/5 [00:05<00:00,  1.13s/it]
  deprecate("LoRALinearLayer", "1.0.0", deprecation_message)
  state_dict = torch.load(self.ip_ckpt, map_location="cpu")


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
Exception in thread Thread-12 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\Edjon\AppData\Local\Programs\Python\Python312\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "C:\Users\Edjon\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Edjon\AppData\Local\Programs\Python\Python312\Lib\threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Edjon\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1599, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "c:\Users\Edjon\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError:

In [None]:
from PIL import Image, ImageDraw, ImageFont
import subprocess
import matplotlib.pyplot as plt
import textwrap
# def embedding(image_paths, prompt):
def embed_text_as_image_novel(image_path, text):
    # Open the image
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    # Set up the font and size
    try:
        font = ImageFont.truetype("arial.ttf", 16)  # You can customize the font and size
    except IOError:
        font = ImageFont.load_default()

    # Wrap the text to fit the image width
    max_width = image.width - 20  # Leave some padding
    wrapped_text = textwrap.fill(text, width=40)  # Adjust width as needed

    # Calculate text size using textbbox
    text_bbox = draw.textbbox((0, 0), wrapped_text, font=font)
    text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]

    # Calculate text position at the bottom of the image
    padding = 10
    text_position = (10, image.height - text_height - padding)  # Adjust padding if needed

    # Draw a rectangle behind the text for better visibility (optional)
    rectangle_bbox = [text_position[0] - 5, text_position[1] - 5, 
                    text_position[0] + text_width + 5, text_position[1] + text_height + 5]
    draw.rectangle(rectangle_bbox, fill="black")

    # Draw the text on the image
    draw.text(text_position, wrapped_text, font=font, fill="white")

    # Save the image with the embedded text
    output_path = "output_" + image_path.split('/')[-1]  # Prepend 'output_' to the filename
    image.save(output_path)
    return output_path

def display_image(image_path):
    # Open the image using PIL
    image = Image.open(image_path)

    # Display the image using matplotlib
    plt.figure(figsize=(8, 6))
    plt.imshow(image)
    plt.axis('off')  # Hide the axis
    plt.show()

def generate_text_with_mistral(prompt, image_path):
    try:
        # Call the CLI command, adjust as per the actual command structure
        command = ["ollama", "run", "mistral-nemo", image_path]
        result = subprocess.run(
            command, input=prompt,
            capture_output=True,
            text=True
        )
        
        # Check for errors
        if result.returncode != 0:
            raise Exception(f"Error generating text: {result.stderr}")
        
        return result.stdout.strip()

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def process_images_sequentially(image_paths, prompt):
    comic_data = []
    
# for i, image_path in enumerate(image_paths):
    # Generate the dialogue for each image sequentially using the same prompt
    generated_text = generate_text_with_mistral(prompt, image_paths)
    
    # Store the image and its corresponding dialogue in the dictionary
    
    
    # Embed the dialogue into the image
    output_image_path = embed_text_as_image_novel(image_paths, generated_text)
    comic_data.append(output_image_path)
    print(f"Generated Text for {image_paths}: {generated_text}")
    print(f"Output Image Path: {output_image_path}")

    # Display the image
    display_image(output_image_path)

        # return comic_data
    

# Example usage
desc = "Scientist Tom makes a significant discovery in biology lab."

prompt = f"I want you to follow the scientific description {desc} for the image i shall provide. I want you to Generate JUST ONE small dialogue WITH THE PERSON SPEAKING STORY WISE for image and no other text."

process_images_sequentially("output.png", prompt)


# Final App Trial (Still In Progress, but almost done)

In [1]:
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from PIL import Image, ImageDraw, ImageFont
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
import cv2
from insightface.app import FaceAnalysis
from insightface.utils import face_align
import subprocess
import os
import textwrap
import matplotlib.pyplot as plt

torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

v2 = False
base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = "ip-adapter-faceid-plus_sd15.bin" if not v2 else "ip-adapter-faceid-plusv2_sd15.bin"
device = "cuda"
story_summary = None
noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

# Load IP-Adapter
ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)

# First Mistral call for generating story prompts
def generate_story_prompts(story_text):
    try:
        # Structured prompt for Mistral to summarize the story into 4 sequential prompts
        structured_prompt = (
            f"{story_text}\n\n"
            "Please summarize and break this story about a scientist flow-wise in exactly 4 concise and coherent sentences. Each sentence should have a maximum of 15 words and must be scientific and in simple words."
            "Do not include any additional text."
        )

        # Call Mistral Nemo to summarize the story into 4 prompts
        command = ["ollama", "run", "mistral-nemo"]
        result = subprocess.run(
            command, input=structured_prompt,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            raise Exception(f"Error generating story prompts: {result.stderr}")

        output = result.stdout.strip()
        story_summary = output
        sentences = [s.strip() for s in output.replace(';', '.').split('. ') if s.strip()]
        return sentences[:4]

    except Exception as e:
        print(f"An error occurred while generating story prompts: {str(e)}")
        return None

# Second Mistral call for generating dialogue for each image
def generate_comic_dialogue(prompt, image_path, story_text):
    try:
        # Structured prompt for Mistral to generate dialogue based on the image and story
        structured_prompt = (
            f"Story: {story_text}\n\n"
            f"Part: {prompt}\n\n"
            "Generate JUST ONE short scientific dialogue for the person in this image, following the story and part flow. ONLY THE PERSON PRESENT IN IMAGE MUST HAVE DIALOGUE. No other text."
        )

        command = ["ollama", "run", "mistral-nemo", image_path]
        result = subprocess.run(
            command, input=structured_prompt,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            raise Exception(f"Error generating dialogue: {result.stderr}")

        return result.stdout.strip()

    except Exception as e:
        print(f"An error occurred while generating comic dialogue: {str(e)}")
        return None

# Embed text into image
def embed_text_as_image_novel(image_path, text):
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    max_width = image.width - 20
    wrapped_text = textwrap.fill(text, width=40)

    text_bbox = draw.textbbox((0, 0), wrapped_text, font=font)
    text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]

    padding = 10
    text_position = (10, image.height - text_height - padding)

    rectangle_bbox = [text_position[0] - 5, text_position[1] - 5, 
                    text_position[0] + text_width + 5, text_position[1] + text_height + 5]
    draw.rectangle(rectangle_bbox, fill="black")
    draw.text(text_position, wrapped_text, font=font, fill="white")

    output_path = "output_" + image_path.split('/')[-1]
    image.save(output_path)
    return output_path

# Process each image to generate dialogue and embed it
def process_images_sequentially(image_paths, prompts, story_text):
    comic_data = []

    for i, image_path in enumerate(image_paths):
        # Call the second Mistral function to generate dialogue for each image
        generated_text = generate_comic_dialogue(prompts[i], image_path, story_text)
        output_image_path = embed_text_as_image_novel(image_path, generated_text)
        comic_data.append(output_image_path)
        print(f"Processed image {i+1}/{len(image_paths)}: Dialogue: {generated_text}")

    return comic_data

# Create comic storyboard from processed images
def create_comic_storyboard(image_paths, output_path, grid_size=(3, 2), padding=10, background_color=(255, 255, 255)):
    images = [Image.open(image_path).convert("RGB") for image_path in image_paths]

    image_width, image_height = images[0].size
    total_width = grid_size[1] * image_width + (grid_size[1] - 1) * padding
    total_height = grid_size[0] * image_height + (grid_size[0] - 1) * padding

    storyboard = Image.new('RGB', (total_width, total_height), color=background_color)

    for index, image in enumerate(images):
        row = index // grid_size[1]
        col = index % grid_size[1]
        x_offset = col * (image_width + padding)
        y_offset = row * (image_height + padding)
        storyboard.paste(image, (x_offset, y_offset))

    storyboard.save(output_path)
    return output_path

# Function to generate images based on the story and image
def generate_images_from_story(image_file, story_text):
    image = cv2.imread(image_file)
    faces = app.get(image)

    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
    face_image = face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)

    # First, generate 4 story prompts using Mistral
    summarized_prompts = generate_story_prompts(story_text)

    if not summarized_prompts:
        return [None, None, None, None]

    negative_prompt = "multiple faces, multiple hands, deformed fingers, monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
    
    # Generate images based on the prompts
    generated_images_paths = []
    for i, prompt in enumerate(summarized_prompts):
        images = ip_model.generate(
            prompt=prompt,
            negative_prompt=negative_prompt,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=1.0,
            num_samples=1,
            width=512,
            height=768,
            num_inference_steps=35,
            seed=2023,
            guidance_scale=8
        )
        
        for j, img in enumerate(images):
            img_path = f"generated_image_{i}_{j}.png"
            img.save(img_path)
            generated_images_paths.append(img_path)
    
    return generated_images_paths, summarized_prompts

# Gradio interface function
def gradio_interface(image, story):
    generated_image_paths, prompts = generate_images_from_story(image, story)

    # Process each image with dialogue embedding
    processed_images = process_images_sequentially(generated_image_paths, prompts, story_summary)

    # Create comic storyboard
    storyboard_path = create_comic_storyboard(processed_images, "comic_storyboard.png", grid_size=(2, 2))
    
    return storyboard_path

# Gradio app setup
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Image(type="filepath"), gr.Textbox(lines=10, placeholder="Enter your story here...")],
    outputs=gr.Image(label="Generated Comic Storyboard"),
    title="Story-to-Image Comic Generator",
    description="Upload an image and enter a story. The app will generate comic images based on the story, integrate the face from the uploaded image, and create a comic storyboard."
)

# Launch the Gradio app
gr_interface.launch()


  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

Loading pipeline components...: 100%|██████████| 5/5 [00:06<00:00,  1.32s/it]
  deprecate("LoRALinearLayer", "1.0.0", deprecation_message)
  state_dict = torch.load(self.ip_ckpt, map_location="cpu")


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [1]:
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from PIL import Image, ImageDraw, ImageFont
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
import cv2
from insightface.app import FaceAnalysis
from insightface.utils import face_align
import subprocess
import os
import textwrap
import gc  # Garbage collection for memory management

# Initialize models and face analysis
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

v2 = False
base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = "ip-adapter-faceid-plus_sd15.bin" if not v2 else "ip-adapter-faceid-plusv2_sd15.bin"
device = "cuda"

noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

# Load IP-Adapter
ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)

# First Mistral call for generating story prompts
def generate_story_prompts(story_text):
    try:
        # Structured prompt for Mistral to summarize the story into 4 sequential prompts
        structured_prompt = (
            f"{story_text}\n\n"
            "Please summarize and break this story about a scientist flow-wise in exactly 4 concise and coherent sentences. Each sentence should have a maximum of 15 words and must be scientific and in simple words."
            "Do not include any additional text."
        )

        # Call Mistral Nemo to summarize the story into 4 prompts
        command = ["ollama", "run", "mistral-nemo"]
        result = subprocess.run(
            command, input=structured_prompt,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            raise Exception(f"Error generating story prompts: {result.stderr}")

        output = result.stdout.strip()
        sentences = [s.strip() for s in output.replace(';', '.').split('. ') if s.strip()]
        return sentences[:4]

    except Exception as e:
        print(f"An error occurred while generating story prompts: {str(e)}")
        return None

# Second Mistral call for generating dialogue for each image
def generate_comic_dialogue(prompt, image_path):
    try:
        # Structured prompt for Mistral to generate dialogue based on the image and story
        structured_prompt = (
            f"{prompt}\n\n"
            "Generate JUST ONE short scientific dialogue for the person in this image, following the story flow. ONLY THE PERSON PRESENT IN IMAGE MUST HAVE DIALOGUE. No other text."
        )

        command = ["ollama", "run", "mistral-nemo", image_path]
        result = subprocess.run(
            command, input=structured_prompt,
            capture_output=True,
            text=True,
            timeout=60  # Timeout to avoid hanging
        )

        if result.returncode != 0:
            raise Exception(f"Error generating dialogue: {result.stderr}")

        return result.stdout.strip()

    except Exception as e:
        print(f"An error occurred while generating comic dialogue: {str(e)}")
        return None

# Embed text into image
def embed_text_as_image_novel(image_path, text):
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    max_width = image.width - 20
    wrapped_text = textwrap.fill(text, width=40)

    text_bbox = draw.textbbox((0, 0), wrapped_text, font=font)
    text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]

    padding = 10
    text_position = (10, image.height - text_height - padding)

    rectangle_bbox = [text_position[0] - 5, text_position[1] - 5, 
                    text_position[0] + text_width + 5, text_position[1] + text_height + 5]
    draw.rectangle(rectangle_bbox, fill="black")
    draw.text(text_position, wrapped_text, font=font, fill="white")

    output_path = "output_" + image_path.split('/')[-1]
    image.save(output_path)
    return output_path

# Function to generate images based on the story and image
def generate_images_from_story(image_file, story_text):
    image = cv2.imread(image_file)
    faces = app.get(image)

    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
    face_image = face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)

    # First, generate 4 story prompts using Mistral
    summarized_prompts = generate_story_prompts(story_text)

    if not summarized_prompts:
        return [None, None, None, None]

    negative_prompt = "multiple faces, multiple hands, deformed fingers, monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
    
    # Generate images based on the prompts
    generated_images_paths = []
    for i, prompt in enumerate(summarized_prompts):
        images = ip_model.generate(
            prompt=prompt,
            negative_prompt=negative_prompt,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=1.0,
            num_samples=1,
            width=512,
            height=768,
            num_inference_steps=35,
            seed=2023,
            guidance_scale=8
        )
        
        # Save the generated images to files and return the file paths
        for j, img in enumerate(images):
            img_path = f"generated_image_{i}_{j}.png"
            img.save(img_path)
            generated_images_paths.append(img_path)
    
    return generated_images_paths, summarized_prompts

# Process each image to generate dialogue and embed it
def process_images_sequentially(image_paths, prompts):
    comic_data = []

    for i, image_path in enumerate(image_paths):
        # Call the second Mistral function to generate dialogue for each image
        generated_text = generate_comic_dialogue(prompts[i], image_path)
        output_image_path = embed_text_as_image_novel(image_path, generated_text)
        comic_data.append(output_image_path)

        # Release resources and free up memory for each image after processing
        del generated_text
        del image_path
        gc.collect()

    return comic_data

# Create comic storyboard from processed images
def create_comic_storyboard(image_paths, output_path, grid_size=(2, 2), padding=10, background_color=(255, 255, 255)):
    images = [Image.open(image_path).convert("RGB") for image_path in image_paths]

    image_width, image_height = images[0].size
    total_width = grid_size[1] * image_width + (grid_size[1] - 1) * padding
    total_height = grid_size[0] * image_height + (grid_size[0] - 1) * padding

    storyboard = Image.new('RGB', (total_width, total_height), color=background_color)

    for index, image in enumerate(images):
        row = index // grid_size[1]
        col = index % grid_size[1]
        x_offset = col * (image_width + padding)
        y_offset = row * (image_height + padding)
        storyboard.paste(image, (x_offset, y_offset))

    storyboard.save(output_path)
    return output_path

# Gradio interface function
def gradio_interface(image, story):
    generated_image_paths, prompts = generate_images_from_story(image, story)

    # Process each image with dialogue embedding
    processed_images = process_images_sequentially(generated_image_paths, prompts)

    # Create comic storyboard
    storyboard_path = create_comic_storyboard(processed_images, "comic_storyboard.png", grid_size=(2, 2))
    
    return storyboard_path

# Gradio app setup
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Image(type="filepath"), gr.Textbox(lines=10, placeholder="Enter your story here...")],
    outputs=gr.Image(label="Generated Comic Storyboard"),
    title="Story-to-Image Comic Generator",
    description="Upload an image and enter a story. The app will generate comic images based on the story, integrate the face from the uploaded image, and create a comic storyboard."
)

# Launch the Gradio app
gr_interface.launch()


  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\Edjon/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.

Loading pipeline components...:  40%|████      | 2/5 [00:00<00:01,  2.28it/s]