In [1]:
# Install CLIP into the running kernel's environment (%pip rather than !pip),
# pinned to the commit this notebook was originally run against so that a
# later re-run resolves exactly the same code.
%pip install git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-uedgpnzc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-uedgpnzc
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... done
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... done
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=c13d1a1b575a8896b37b0db5422d21d78e2afdcc28c0cb7f2d4d95661438969b
  Stored in directory: /tmp/pip-ephem-wheel-cache-dtipbvtc/wheels/da/2b/4c/d6691fa9597aac8bb

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install torchvision



In [3]:
# Colab helper used to attach Google Drive to this runtime's filesystem.
from google.colab import drive
# Mount Drive under /content/drive.
drive.mount('/content/drive')

# Image folder on the mounted Drive.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
base_path = "/content/drive/My Drive/Final Project DL/images/"

Mounted at /content/drive


In [8]:
import torch
import clip
from torchvision.datasets import Food101
from torchvision import transforms
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity

# Load CLIP Model
def load_clip_model(device='cpu', backbone='ViT-B/16'):
    """Load a CLIP model together with its image preprocessing callable.

    Args:
        device: Device handed to ``clip.load`` for placing the model.
        backbone: Model name handed to ``clip.load``.

    Returns:
        A ``(model, preprocess)`` tuple, exactly as unpacked from ``clip.load``.
    """
    model, image_transform = clip.load(backbone, device=device)
    return model, image_transform

# Process Image with CLIP
def process_image_with_clip(image_path, clip_model, preprocess, device='cpu'):
    """Encode one image file into CLIP image features.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        clip_model: Model exposing ``encode_image``.
        preprocess: Callable that turns the opened PIL image into a tensor;
            a leading batch dimension is added to its result.
        device: Device the image tensor is moved to before encoding.

    Returns:
        Whatever ``clip_model.encode_image`` returns for the one-image batch,
        computed with gradient tracking disabled.
    """
    from PIL import Image

    # Image.open is lazy and keeps the underlying file open. Using it as a
    # context manager guarantees the handle is released once `preprocess`
    # has consumed the pixels, even if preprocessing raises.
    with Image.open(image_path) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
    return image_features

# Load Food-101 Dataset
def load_food101_dataset(batch_size=1, root='/content/food101', device='cpu'):
    """Create the Food-101 training split and a shuffling ``DataLoader`` over it.

    Each image is resized to 224x224, converted to a tensor, and normalized
    with mean 0.5 / std 0.5 per channel. The dataset is constructed with
    ``download=True``.

    Args:
        batch_size: Batch size given to the ``DataLoader``.
        root: Root directory given to ``Food101``.
        device: Accepted for call-site compatibility; not used by this function.

    Returns:
        A ``(dataset, data_loader)`` tuple.
    """
    resize = transforms.Resize((224, 224))
    to_tensor = transforms.ToTensor()
    # NOTE(review): mean = std = 0.5 differs from the statistics used by the
    # `preprocess` callable returned by clip.load — confirm before feeding
    # these tensors to the CLIP image encoder.
    normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    pipeline = transforms.Compose([resize, to_tensor, normalize])

    food101_train = Food101(root=root, split='train', transform=pipeline, download=True)
    loader = DataLoader(food101_train, batch_size=batch_size, shuffle=True)
    return food101_train, loader

# Match Image with Food Classes using CLIP
def predict_food_class(image_features, clip_model, dataset, device='cpu'):
    """Pick the class name whose CLIP text embedding best matches each image.

    Args:
        image_features: Image embeddings with one row per image (2-D, as
            required by ``cosine_similarity``).
        clip_model: Model exposing ``encode_text``.
        dataset: Object exposing ``classes``, the candidate class names.
        device: Device the tokenized class names are moved to.

    Returns:
        The best-matching class name as a ``str`` when ``image_features`` has
        a single row (the original behavior); otherwise a list containing one
        class name per row.
    """
    class_names = list(dataset.classes)

    # clip.tokenize accepts a list of strings and returns one token row per
    # string, so a single call replaces tokenizing each name and concatenating.
    # NOTE(review): names are tokenized verbatim (e.g. with underscores);
    # a natural-language prompt may match better — confirm before changing.
    text_inputs = clip.tokenize(class_names).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        text_features = clip_model.encode_text(text_inputs)

    # Rows index images, columns index classes.
    similarity = cosine_similarity(image_features.cpu().numpy(), text_features.cpu().numpy())

    # Take the argmax per row. An argmax over the flattened matrix is only a
    # valid class index when there is exactly one image row.
    best_per_image = similarity.argmax(axis=1)
    predicted = [class_names[int(idx)] for idx in best_per_image]
    return predicted[0] if len(predicted) == 1 else predicted

# Main Function to Process an Image and Predict the Food Name
def main(image_path, clip_model, preprocess, dataset, device='cpu'):
    """Predict the food class for the image stored at ``image_path``.

    Runs ``process_image_with_clip`` to obtain image features, then hands
    them to ``predict_food_class`` and returns that function's result.

    Args:
        image_path: Location of the image to classify.
        clip_model: CLIP model forwarded to both steps.
        preprocess: Image preprocessing callable forwarded to the encoding step.
        dataset: Dataset forwarded to the prediction step.
        device: Device forwarded to both steps.
    """
    features = process_image_with_clip(image_path, clip_model, preprocess, device)
    return predict_food_class(features, clip_model, dataset, device)


In [9]:
# Example usage: classify a single Food-101 image with CLIP.

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model and preprocessing first, then the dataset that supplies the class names.
clip_model, preprocess = load_clip_model(device=device)
dataset, data_loader = load_food101_dataset(batch_size=1, root='/content/food101', device=device)

# Sample image from the Food-101 tree under /content/food101; replace with your own path.
image_path = '/content/food101/food-101/images/baby_back_ribs/1005066.jpg'
predicted_food_name = main(image_path, clip_model, preprocess, dataset, device=device)

print(f"Predicted Food Name: {predicted_food_name}")

Predicted Food Name: baby_back_ribs
