In [1]:
import re
from io import BytesIO
from urllib.parse import quote

import requests
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

# -------------------------------
# Configuration
# Hugging Face Hub checkpoint ids. Each id is named once so the processor /
# tokenizer and the model weights loaded below can never drift apart.
BLIP_MODEL = "Salesforce/blip-image-captioning-base"
FLAN_T5_MODEL = "google/flan-t5-large"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# Load Models
# NOTE: these statements run at module import time, before main() is called.
print("🔧 Loading models...")
# BLIP captioning model and its matching image processor.
blip_processor = BlipProcessor.from_pretrained(BLIP_MODEL)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL).to(DEVICE)
# FLAN-T5 tokenizer and model used by generate_recipe_summary().
tokenizer = T5Tokenizer.from_pretrained(FLAN_T5_MODEL)
t5_model = T5ForConditionalGeneration.from_pretrained(FLAN_T5_MODEL).to(DEVICE)

# -------------------------------
# Functions

def generate_recipe_summary(dish_title, num_steps=3):
    """Ask the FLAN-T5 model how to cook a dish and format the answer as steps.

    Args:
        dish_title: Free-text name of the dish to summarise.
        num_steps: Number of steps requested in the prompt and the maximum
            number of steps kept from the answer. Defaults to 3.

    Returns:
        A string with one ``Step N: ...`` line per step, joined by newlines.
        It holds fewer than ``num_steps`` lines (possibly none) when the
        model's answer contains fewer sentences.
    """
    input_text = f"Give me a {num_steps}-step summary of how to cook {dish_title}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(DEVICE)
    output = t5_model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    # Break the answer into sentences on newlines and periods, but leave a
    # period that sits between two digits alone so decimal quantities such as
    # "1.5 cups" are not cut into two bogus steps.
    fragments = re.split(r"\n|(?<!\d)\.|\.(?!\d)", summary)
    steps = [fragment.strip() for fragment in fragments if fragment.strip()]

    return "\n".join(
        f"Step {number}: {step}"
        for number, step in enumerate(steps[:num_steps], start=1)
    )

def fetch_image_from_url(url, timeout=10):
    """Download an image over HTTP(S) and return it as an RGB PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Defaults to 10.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
        OSError: If the downloaded bytes cannot be opened as an image.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail here on HTTP errors instead of handing an error page to PIL, which
    # would report a confusing "cannot identify image file" message.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert("RGB")
    return image

# -------------------------------
# Main
def main():
    """Interactive entry point: fetch an image, summarise a recipe, print links.

    Prompts for an image URL and a rough dish title, downloads the image and
    saves it as ``temp_image.jpg``, prints the step summary generated for the
    title, and finally prints search links for checking the recipe manually.
    Returns early, after printing a message, when either input is empty or
    the image cannot be downloaded or opened.
    """
    image_url = input("🔗 Enter the image URL (e.g., from Google Images): ").strip()
    noisy_title = input("📝 Enter a noisy/rough dish title you believe fits this image: ").strip()

    # Bail out before the download and the model call when there is nothing
    # to work with.
    if not image_url or not noisy_title:
        print("❌ Both an image URL and a dish title are required.")
        return

    try:
        print("🖼️ Downloading and processing image...")
        image = fetch_image_from_url(image_url)
        image.save("temp_image.jpg")  # BLIP can use this later if needed
    except Exception as e:
        # Top-level boundary: report any download/decoding/saving failure and stop.
        print(f"❌ Failed to download or open the image: {e}")
        return

    print("🧠 Generating formatted 3-step summary using your model...")
    formatted_summary = generate_recipe_summary(noisy_title)

    print("\n📋 --- RECIPE SUMMARY ---")
    print(f"🤖 Summary for '{noisy_title}':\n{formatted_summary}")

    # Percent-encode the whole title (safe="" also escapes "/") so characters
    # such as "&", "?", "#" or accents cannot break the generated URLs.
    # Spaces still become "%20".
    encoded_title = quote(noisy_title, safe="")

    print("\n🔎 To verify manually, you can search:")
    print(f"🔗 https://www.food.com/search/{encoded_title}")
    print(f"🔗 https://www.allrecipes.com/search/results/?wt={encoded_title}")
    print(f"🔗 https://www.tasty.co/search?q={encoded_title}")

if __name__ == "__main__":
    main()


🔧 Loading models...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

🔗 Enter the image URL (e.g., from Google Images): https://img.sndimg.com/food/image/upload/f_auto,c_fill,q_80,w_621,fl_progressive,h_349/v1/img/recipes/54/75/2/wc0parMsS9OTwRMHrOch_0S9A6332.jpg
📝 Enter a noisy/rough dish title you believe fits this image: Turkey Meatloaf
🖼️ Downloading and processing image...
🧠 Generating formatted 3-step summary using your model...

📋 --- RECIPE SUMMARY ---
🤖 Summary for 'Turkey Meatloaf':
Step 1: Preheat the oven to 350 degrees F (180 degrees C)
Step 2: In a large bowl, mix the ground turkey with the bread crumbs
Step 3: Add the bread crumbs to the meat mixture and mix well

🔎 To verify manually, you can search:
🔗 https://www.food.com/search/Turkey%20Meatloaf
🔗 https://www.allrecipes.com/search/results/?wt=Turkey%20Meatloaf
🔗 https://www.tasty.co/search?q=Turkey%20Meatloaf
