In [None]:
import re
import subprocess

import cv2
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from insightface.app import FaceAnalysis
from insightface.utils import face_align
from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
from PIL import Image

# Release cached, currently unused GPU memory held by PyTorch and enable
# cuDNN's benchmark mode before any model is loaded.
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

# Face analysis and preparation
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

file_name = "example.jpg"  # Image input
image = cv2.imread(filename=file_name)
if image is None:
    # cv2.imread does not raise on a missing or unreadable file; it returns
    # None, which would otherwise surface later as an obscure error.
    raise FileNotFoundError(f"Could not read input image: {file_name!r}")

faces = app.get(image)
if len(faces) == 0:
    # Everything below indexes faces[0]; fail with a clear message instead
    # of an IndexError when the detector finds nothing.
    raise ValueError(f"No face detected in input image: {file_name!r}")

# Only the first face returned by the detector is used as the identity source.
faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
face_image = face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)  # Crop and prepare the face image

# Load models and pipeline
# `v2` chooses which FaceID-Plus adapter checkpoint is loaded and is also
# forwarded as the `shortcut` flag when images are generated.
v2 = False
device = "cuda"

base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
if v2:
    ip_ckpt = "ip-adapter-faceid-plusv2_sd15.bin"
else:
    ip_ckpt = "ip-adapter-faceid-plus_sd15.bin"

# DDIM sampler handed to the Stable Diffusion pipeline below.
noise_scheduler = DDIMScheduler(
    beta_schedule="scaled_linear",
    beta_start=0.00085,
    beta_end=0.012,
    num_train_timesteps=1000,
    steps_offset=1,
    clip_sample=False,
    set_alpha_to_one=False,
)

# The base checkpoint is a "noVAE" variant, so the VAE is loaded separately
# and cast to float16 to match the pipeline's dtype.
vae = AutoencoderKL.from_pretrained(vae_model_path)
vae = vae.to(dtype=torch.float16)

# No safety checker or feature extractor is attached to the pipeline.
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    vae=vae,
    scheduler=noise_scheduler,
    torch_dtype=torch.float16,
    safety_checker=None,
    feature_extractor=None,
)

# Load IP-Adapter
ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)

# Helper: turn raw model output into a clean list of sentences
def _split_into_sentences(text, limit):
    """Split free-form model output into at most ``limit`` sentences.

    Whitespace (including newlines) is collapsed, the text is split after
    ``.``, ``!``, ``?`` or ``;`` when followed by whitespace, list markers
    such as ``1.``, ``2)``, ``-`` or ``*`` are removed, and trailing
    periods/semicolons are stripped so every returned sentence has the same
    shape. Abbreviations such as "Dr. Smith" are still split; this is a
    heuristic, not a full sentence tokenizer.
    """
    flattened = " ".join(text.split())
    sentences = []
    for piece in re.split(r"(?<=[.!?;])\s+", flattened):
        # Drop a leading list marker; a bare marker ("1.") becomes empty.
        piece = re.sub(r"^(?:\d+[.)]|[-*\u2022])(?:\s+|$)", "", piece)
        piece = piece.rstrip(".;").strip()
        if piece:
            sentences.append(piece)
    return sentences[:limit]


# Mistral Nemo: Generate 4 prompts
def generate_text_with_mistral(prompt, model="mistral-nemo", num_sentences=4, timeout=None):
    """Summarize ``prompt`` into short sentences using a local Ollama model.

    Args:
        prompt: Story text to summarize.
        model: Ollama model name passed to ``ollama run``.
        num_sentences: Number of sentences requested from the model and the
            maximum number returned.
        timeout: Optional number of seconds to wait for the CLI; ``None``
            waits indefinitely.

    Returns:
        A list of up to ``num_sentences`` sentences (possibly empty when the
        model prints nothing), or ``None`` when the CLI could not be run,
        timed out, or exited with a non-zero status. The error is printed
        rather than raised.
    """
    # Structure the prompt to ensure exactly the requested number of sentences
    structured_prompt = (
        f"{prompt}\n\n"
        f"Please summarize this story in exactly {num_sentences} concise and coherent sentences. "
        "Do not include any additional text."
    )

    try:
        # Decode as UTF-8 explicitly: with text=True alone the locale
        # encoding is used, which can garble or reject non-ASCII output
        # (notably on Windows).
        result = subprocess.run(
            ["ollama", "run", model],
            input=structured_prompt,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )

        # Check for errors
        if result.returncode != 0:
            raise RuntimeError(f"Error generating text: {result.stderr}")
    except (OSError, subprocess.SubprocessError, RuntimeError) as e:
        # OSError covers a missing `ollama` executable; SubprocessError
        # covers timeouts. Best-effort contract: report and return None.
        print(f"An error occurred: {str(e)}")
        return None

    return _split_into_sentences(result.stdout, num_sentences)

# Main function to integrate text generation and image generation
def generate_images_from_story(story_text):
    """Summarize a story into prompts and render one image set per prompt.

    The story is condensed by ``generate_text_with_mistral``; each resulting
    sentence is then used as the text prompt for the module-level
    ``ip_model``, conditioned on the module-level ``face_image`` and
    ``faceid_embeds``. Every generated image is shown and written to
    ``output_<prompt index>_<sample index>_out.png`` in the working
    directory. Returns ``None``; when no prompts are produced a message is
    printed and nothing is generated.
    """
    # Step 1: Generate the summarized prompts
    scene_prompts = generate_text_with_mistral(story_text)
    if not scene_prompts:
        print("No prompts were generated.")
        return

    negative_prompt = "multiple hands, deformed fingers, monochrome, lowres, bad anatomy, worst quality, low quality, blurry"

    # Step 2: Generate images for each prompt
    for scene_idx, scene_prompt in enumerate(scene_prompts):
        print(f"Generating image for prompt {scene_idx + 1}: {scene_prompt}")

        generation_kwargs = dict(
            prompt=scene_prompt,
            negative_prompt=negative_prompt,
            face_image=face_image,
            faceid_embeds=faceid_embeds,
            shortcut=v2,
            s_scale=1.0,
            num_samples=1,
            width=512,
            height=768,
            num_inference_steps=35,
            seed=2023 + scene_idx,  # a different seed for every prompt
            guidance_scale=8,
        )
        rendered = ip_model.generate(**generation_kwargs)

        for sample_idx, picture in enumerate(rendered):
            picture.show()
            picture.save(f"output_{scene_idx}_{sample_idx}_out.png")  # Save each generated image


# Example usage
# Sample story passed to generate_images_from_story below. The text is sent
# to the summarizer as-is, including its leading and trailing newlines.
story_text = """
Tom spent many hours in the lab, carefully working until he made an important discovery in biology. 
He shared his findings at a big conference, explaining his work to scientists from around the world. 
For his hard work, Tom received a special award, recognizing the impact of his discovery. 
He also wrote a book to share his research with more people in an easy-to-understand way. 
Tom enjoys helping others, so he mentored students, guiding them in their own research and encouraging them to explore new ideas in biology.
"""

# Generate images from the story
# Runs the whole flow: summarize the story, then generate, show and save
# the images (files are written to the current working directory).
generate_images_from_story(story_text)


In [1]:
%pip install gradio

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting importlib-resources<7.0,>=1.3 (from gradio)
  Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp312-none-win_amd64.whl.metadata (51 kB)
Collecting pydub (from gradio)
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.10-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Dow