In [2]:
import os
import base64
import json
import re
import time
import requests
from openai import OpenAI
from PIL import Image
from io import BytesIO
from config import OPENAI_API_KEY

## GPT-4o-mini + DALL·E 3 Style Transfer Pipeline Base
**User Input:**

+ The user uploads an **image** and provides a **prompt** describing what changes should be made to the image.

**Step 1 — Caption Generation (via GPT-4o-mini):**

+ The uploaded image and the user's prompt are sent to the **GPT-4o-mini** model.
+ *GPT-4o-mini* analyzes the visual content of the image and generates a detailed, DALL·E-friendly caption.
+ This caption includes visual attributes such as colors, drawing style, food category, background, and layout.

**Step 2 — Image Generation (via DALL·E 3):**

+ The original user prompt and the generated caption are merged into a final prompt.
+ This combined prompt is sent to DALL·E 3, along with the original image style as reference.
+ DALL·E 3 generates a new image that preserves the style of the original but incorporates the requested modifications.

**Output:**

+ A newly generated image that reflects both the original style and the desired changes from the prompt.
+ The output is saved and logged along with the image paths and generated caption.

In [7]:
# Initialize the OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)
image_extensions = (".jpg", ".jpeg", ".png")
all_results = []

# Prompt template to instruct GPT for image captioning
prompt_template = """
You are a image captioning assistant helping to generate training captions for a dataset of foods images.
You will receive a food image and a user prompt describing how it should be redesigned.

Task:
Generate a **detailed caption** describing the image. The caption should:
- Include visual details like color, pattern, drawing style, background if visible.
- Analyze the "category"(the general type of food depicted) and describe its key features.
- Be a single sentence, fluent, and descriptive.
- These captions will be used as input for the DALL·E model. Generate captions suitable as prompts for the DALL·E model.

Output caption example:
"A red "category" and a sliced "category" with seeds, in flat cartoon vector style, soft lighting, vivid colors, no background"
"Three "category" with cashew nuts and green leaves on a branch, in colorful flat vector cartoon style, no background"

Use this format:
{{
  "category": "<your generated food category here>",
  "caption": "<your generated caption here>",
  "image_path": "<actual image path>",
  "user_prompt": "<actual user prompt>",
  "generated_image_path":"<your generated image path>"
}}
"""

def download_and_save_image(image_url: str, save_path: str) -> None:
    """
    Downloads an image from a given URL and saves it to the specified path.
    """
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    img.save(save_path)

def output_filename(name: str) -> str:
    """
    Formats a given name into a valid filename.
    """
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', '_', name)
    return name.strip().lower()

# Load the input JSON file containing image paths and user prompts
with open("test_inputs.json", "r", encoding="utf-8") as f:
    prompt_items = json.load(f)

# Iterate through each item (image + user prompt)
for item in prompt_items:
    image_path = item["filename"]
    user_prompt = item["user_prompt"]

    if not image_path.lower().endswith(image_extensions):
        print(f"Skipping {image_path} - unsupported extension")
        continue

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    print(f"Processing: {image_path}")

    # Read and encode image as base64
    with open(image_path, "rb") as img_file:
        base64_image = base64.b64encode(img_file.read()).decode("utf-8")
    
    # Format the prompt for GPT
    formatted_prompt = prompt_template.format(user_prompt=user_prompt, base64_image=base64_image)

    try:
        # Send image and prompt to GPT for caption generation
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": formatted_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=150,
            temperature=0
        )

        raw_output = response.choices[0].message.content
        print("Caption output:", raw_output)

        json_match = re.search(r'\{.*\}', raw_output, re.DOTALL)
        if not json_match:
            raise ValueError("JSON response not found.")

        parsed = json.loads(json_match.group(0))
        

        caption = parsed["caption"]         
        dalle_prompt = user_prompt 
        
        # Create a full prompt for DALL·E using both the caption and user instruction.
        full_prompt = f"{dalle_prompt}, render in the exact same illustration style and details as described: {caption}"

        # Request DALL·E to generate a new image
        print(f"Generating new image from combined prompt: {full_prompt}")
        dalle_response = client.images.generate(
            model="dall-e-3",
            prompt=full_prompt,
            n=1,
            size="1024x1024"
        )

        # Wait to avoid rate limit issues
        time.sleep(5) 

        generated_image_url = dalle_response.data[0].url
        print(f"Generated image URL: {generated_image_url}")
        safe_prompt = output_filename(user_prompt)
        
        # Download and save the generated image
        os.makedirs("outputs/base_prompt", exist_ok=True)
        generated_image_path = f"outputs/base_prompt/{safe_prompt}_generated.png"
        download_and_save_image(generated_image_url, generated_image_path)
        print(f"Saved generated image to {generated_image_path}")

        parsed["generated_image_path"] = generated_image_path
        parsed["image_path"] = image_path


        all_results.append(parsed)

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        time.sleep(2)

print("\n--- ALL DONE ---")
# Save all results to a JSON file
with open("outputs/base_prompt/foods_fruit_captions_and_generated.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)


Processing: test_images/01_Apple.png
Caption output: {
  "category": "apple",
  "caption": "A shiny red apple with a vibrant green leaf beside a freshly sliced half revealing its creamy white flesh and small brown seeds, illustrated in a flat cartoon vector style with soft lighting and no background.",
  "image_path": "<actual image path>",
  "user_prompt": "<actual user prompt>",
  "generated_image_path": "<your generated image path>"
}
Generating new image from combined prompt: A soft red tomato with seeds, in flat cartoon vector style, no background, render in the exact same illustration style and details as described: A shiny red apple with a vibrant green leaf beside a freshly sliced half revealing its creamy white flesh and small brown seeds, illustrated in a flat cartoon vector style with soft lighting and no background.
Generated image URL: https://oaidalleapiprodscus.blob.core.windows.net/private/org-9DP1r18q6yLXvWOlkzlZ3Ess/user-FraXfwg3c0aTUJi4hrjaVv1s/img-OLpx2Ze6MxW9HKEiO4

## GPT-4o-mini + DALL·E 3 Style Transfer Pipeline Base (Enhanced)

**User Input:**

The user uploads an image and provides a prompt describing what changes should be made to the image.

**Step 1 — Caption Generation and Visual Analysis (via GPT-4o-mini):**

+ The uploaded image and the user's prompt are sent to the GPT-4o-mini model.
+ GPT-4o-mini analyzes the visual content of the image and generates a detailed, DALL·E-friendly caption.
+ Additionally, it extracts key visual features including:
    + Background description (e.g., plain, textured, outdoor, indoor, no background),
    + Style details (e.g., flat vector, watercolor, cartoonish, realistic),
    + Quantity, number of objects visible in the image.

All these extracted details (caption, background, drawing_style, quantity) are structured and returned together.

**Step 2 — Image Generation (via DALL·E 3):**

+ The original user prompt and the full set of extracted **visual features (caption, background, drawing_style, quantity)** are merged into a comprehensive final prompt.
+ This combined prompt is sent to DALL·E 3, along with the original image style as reference.

DALL·E 3 generates a new image that preserves the style and visual composition of the original while incorporating the user’s requested modifications and respecting the extracted background, style, and quantity information.

**Output:**

+ A newly generated image that reflects the original style, background, quantity, and the desired changes from the prompt.
+ The output is saved and logged along with the image paths, generated caption, and all extracted visual features.

In [6]:
# Initialize the OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)
image_extensions = (".jpg", ".jpeg", ".png")
all_results = []

# Prompt template to instruct GPT for image captioning
prompt_template ="""
You are a image captioning assistant helping to generate training captions for a dataset of foods images.
You will receive a food image and a user prompt{prompt} describing how it should be redesigned.

🔹 Important:
Always prioritize the user's input when generating the caption. If the user mentions a category, style, quantity, or any specific detail, **you must strictly follow the user's input**. User instructions take precedence over visual content.

Your Task as follow:

When generating caption first, you MUST take the user prompt into account. If the user input mentions a category, the category specified by the user should take precedence.
Generate a JSON object that contains:
- "caption": Generate a **detailed caption** describing the image. A fluent, descriptive sentence describing the image, including color, drawing style, lighting, quantity, and background.These captions will be used as input for the DALL·E model. Generate captions suitable as prompts for the DALL·E model.
- "drawing_style": a short phrase describing the illustration style (e.g., flat cartoon vector, watercolor, realistic, 3D render, line art, etc.)
- "quantity": an integer described in the user prompt.
- "background": describe if the background is visible, or say "no background".
After generating the initial caption, you MUST check if the category inferred from the image differs from the category mentioned in the user prompt.
If there is a conflict, revise the caption to match the category provided by the user. The user's specified category should override your prediction.

Output caption example:
"A red apple and a sliced apple with seeds, in flat cartoon vector style, soft lighting, vivid colors, no background"
"Three cashew apple with cashew nuts and green leaves on a branch, in colorful flat vector cartoon style, no background"

Use this format:
{{
  "caption": "<a descriptive sentence for DALL·E>",
  "drawing_style": "<short phrase>",
  "quantity": <integer>,
  "background": "<description or 'no background'>",
  "image_path": "<actual image path>",
  "user_prompt": "<actual prompt>",
  "generated_image_path": "<your generated image path>"
}}
"""

def download_and_save_image(image_url: str, save_path: str) -> None:
    """
    Downloads an image from a given URL and saves it to the specified path.
    """
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    img.save(save_path)

def output_filename(name: str) -> str:
    """
    Formats a given name into a valid filename.
    """
    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', '_', name)
    return name.strip().lower()

# Load the input JSON file containing image paths and user prompts
with open("test_inputs.json", "r", encoding="utf-8") as f:
    prompt_items = json.load(f)

# Iterate through each item (image + user prompt)
for item in prompt_items:
    image_path = item["filename"]
    user_prompt = item["user_prompt"]

    if not image_path.lower().endswith(image_extensions):
        print(f"Skipping {image_path} - unsupported extension")
        continue

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    print(f"Processing: {image_path}")

    with open(image_path, "rb") as img_file:
        base64_image = base64.b64encode(img_file.read()).decode("utf-8")

    formatted_prompt = prompt_template.format(prompt=user_prompt, base64_image=base64_image)

    try:
        # Send image and prompt to GPT for caption generation
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": formatted_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=150,
            temperature=0
        )

        raw_output = response.choices[0].message.content
        print("Caption output:", raw_output)

        json_match = re.search(r'\{.*\}', raw_output, re.DOTALL)
        if not json_match:
            raise ValueError("JSON response not found.")

        parsed = json.loads(json_match.group(0))
        

        # Parsed necessary information
        caption = parsed["caption"]         
        dalle_prompt = user_prompt 
        quantity = parsed["quantity"]
        drawing_style = parsed["drawing_style"]
        background = parsed["background"]


        # Create a full prompt for DALL·E using the caption,quantity,drawing_style  and user instruction.
        full_prompt = (
            f"{dalle_prompt}. Strictly draw {quantity} "
            f"in {drawing_style} style. Ensure all of them follow this description: {caption}."
        )

        # Request DALL·E to generate a new image
        print(f"Generating new image from combined prompt: {full_prompt}")
        dalle_response = client.images.generate(
            model="dall-e-3",
            prompt=full_prompt,
            n=1,
            size="1024x1024"
        )

        # to avoid rate limit issues
        time.sleep(5) 

        generated_image_url = dalle_response.data[0].url
        print(f"Generated image URL: {generated_image_url}")
        safe_prompt = output_filename(user_prompt)

        # saved the generated images
        os.makedirs("outputs/base_prompt_improved", exist_ok=True)
        generated_image_path = f"outputs/base_prompt_improved/{safe_prompt}_generated.png"
        download_and_save_image(generated_image_url, generated_image_path)
        print(f"Saved generated image to {generated_image_path}")

        parsed["generated_image_path"] = generated_image_path
        parsed["image_path"] = image_path
        parsed["user_prompt"] = user_prompt


        all_results.append(parsed)

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        time.sleep(2)

print("\n--- ALL DONE ---")

# saved the json file
with open("outputs/base_prompt_improved/foods_fruit_captions_and_generated.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)


Processing: test_images/01_Apple.png
Caption output: {
  "caption": "A shiny red apple and a sliced apple revealing its seeds, illustrated in flat cartoon vector style with vibrant colors and soft lighting, no background.",
  "drawing_style": "flat cartoon vector",
  "quantity": 2,
  "background": "no background",
  "image_path": "<actual image path>",
  "user_prompt": "A soft red tomato with seeds, in flat cartoon vector style, no background",
  "generated_image_path": "<your generated image path>"
}
Generating new image from combined prompt: A soft red tomato with seeds, in flat cartoon vector style, no background. Strictly draw 2 in flat cartoon vector style. Ensure all of them follow this description: A shiny red apple and a sliced apple revealing its seeds, illustrated in flat cartoon vector style with vibrant colors and soft lighting, no background..
Generated image URL: https://oaidalleapiprodscus.blob.core.windows.net/private/org-9DP1r18q6yLXvWOlkzlZ3Ess/user-FraXfwg3c0aTUJi4hr