In [13]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-v4zi1wr4
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-v4zi1wr4
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... done


In [15]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be reached by path.
drive.mount('/content/drive')

# Directory (note the trailing slash) holding the input images; image file names are
# appended to it by plain string concatenation where the pipeline is invoked.
base_path = "/content/drive/My Drive/Final Project DL/images/"

Mounted at /content/drive


In [19]:
import torch
import clip
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torchvision import transforms

# Step 1: Load CLIP Model (for image feature extraction)
def load_clip_model(device='cpu', backbone='ViT-B/16'):
    """Load a CLIP model together with its image preprocessing callable.

    Args:
        device: Device forwarded to ``clip.load`` (default ``'cpu'``).
        backbone: CLIP architecture name forwarded to ``clip.load``
            (default ``'ViT-B/16'``).

    Returns:
        A ``(model, preprocess)`` tuple as produced by ``clip.load``.
    """
    model, image_transform = clip.load(backbone, device=device)
    return model, image_transform

# Step 2: Process Image to Extract Features using CLIP
def process_image_with_clip(image_path, clip_model, preprocess, device='cpu'):
    """Encode a single image file into a CLIP image-feature tensor.

    Args:
        image_path: Path of the image file to open with PIL.
        clip_model: Object exposing ``encode_image`` (e.g. the model returned
            by ``clip.load``).
        preprocess: Callable that turns the opened PIL image into a tensor
            (e.g. the transform returned by ``clip.load``).
        device: Device the preprocessed tensor is moved to before encoding;
            presumably this should match the device ``clip_model`` lives on.

    Returns:
        The result of ``clip_model.encode_image`` for a batch containing just
        this image, computed with gradient tracking disabled.
    """
    from PIL import Image

    # PIL opens files lazily, so the preprocessing (which reads the pixel data)
    # must happen while the file is still open. The context manager then
    # releases the file handle deterministically instead of leaving it to the
    # garbage collector.
    with Image.open(image_path) as image:
        # unsqueeze(0) adds the leading batch dimension before moving to device.
        image_input = preprocess(image).unsqueeze(0).to(device)

    # Feature extraction only: no gradients are needed.
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
    return image_features

# Step 3: Load T5 Model (for text generation)
def load_t5_model(device='cpu', model_name='t5-base'):
    """Load a pretrained T5 tokenizer and conditional-generation model.

    Args:
        device: Device the model is moved to (default ``'cpu'``).
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the tokenizer and the model. Defaults to ``'t5-base'``, the
            checkpoint that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model already on ``device``.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    # Tokenizer and model are loaded from the same checkpoint name so their
    # vocabularies stay in sync.
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    return tokenizer, model

# Step 4: Generate Dish Name and Ingredients using T5
def generate_description(image_features, tokenizer, model, device='cpu',
                         prompt="Generate dish name and ingredients based on the image.",
                         max_length=50):
    """Generate text with T5 from a text prompt.

    NOTE(review): ``image_features`` is accepted for interface compatibility
    but is NOT used. The text is produced from ``prompt`` alone, so the result
    does not depend on the image. Feeding the CLIP features to T5 would need a
    mapping from the feature tensor into T5's input, which this function does
    not implement.

    Args:
        image_features: Image features from the CLIP step (currently ignored).
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Model exposing ``generate``.
        device: Device the tokenized prompt is moved to; presumably this should
            match the device ``model`` lives on.
        prompt: Text prompt given to the model. Defaults to the static prompt
            that was previously hard-coded.
        max_length: ``max_length`` forwarded to ``model.generate``
            (default 50, the previously hard-coded value).

    Returns:
        The first generated sequence decoded to a string, special tokens
        skipped.
    """
    # Tokenize the prompt and place the ids on the requested device.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(input_ids, max_length=max_length)

    # Only one prompt was submitted, so decode the first (and only) sequence.
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Step 5: Putting it all together
def main(image_path, device='cpu'):
    """Run the full pipeline for one image: load models, encode, generate.

    Args:
        image_path: Path of the image to describe.
        device: Device used for both models and all tensors (default ``'cpu'``).

    Returns:
        The text returned by ``generate_description``.
    """
    # Both models are loaded on every call, CLIP first and then T5.
    vision_model, image_transform = load_clip_model(device)
    t5_tokenizer, t5_model = load_t5_model(device)

    # Image -> CLIP features -> generated text.
    features = process_image_with_clip(image_path, vision_model, image_transform, device)
    return generate_description(features, t5_tokenizer, t5_model, device)

# Example usage
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Describe one sample image from the mounted images directory.
image_path = base_path + "bolognese.jpg"
description = main(image_path, device=device)
print(description)


dish name and ingredients based on the image.
