In [3]:
# Install dependencies with %pip (not !pip) so packages land in the environment
# of the running kernel rather than whatever `pip` happens to be first on PATH.
%pip install torch torchvision
# Pin CLIP to the exact commit this notebook was run against so a later
# Restart & Run All installs the same code.
%pip install git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-hnj04wr_
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-hnj04wr_
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... done


In [4]:
import clip
import torch
from PIL import Image

# Load the CLIP "ViT-B/32" model together with its matching image preprocessor.
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)


In [9]:
# Load the image, run CLIP's preprocessing on it, add a leading batch
# dimension, and move the result to the model's device.
# Image.open is lazy and keeps the file open; the context manager guarantees
# the handle is closed as soon as preprocessing has read the pixel data.
with Image.open("/content/imag3.jpg") as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)


In [14]:
# Candidate captions to compare against the image.
text = ["A dog playing with a blue ball"]

# Convert the captions to token tensors in a single call and move them to the
# same device as the model.
text_inputs = clip.tokenize(text).to(device)

# Inference only: skip autograd bookkeeping while encoding and comparing.
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_inputs)

    # Scale both embeddings to unit length so that their dot product is the
    # cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = (image_features @ text_features.T).squeeze(0)

# Report the best-matching caption together with its similarity score.
values, indices = similarity.topk(1)
print(f"تصویر به بهترین توصیف: {text[indices[0]]} با شباهت {values[0]:.4f}")


تصویر به بهترین توصیف: A dog playing with a blue ball با شباهت 0.3229


In [None]:
# Alternative prompt: "A brown dog chasing a blue ball"

In [15]:
# @title بارگذاری مدل و پردازش تصویر
import clip
import torch
from PIL import Image
import os

# Step 2: Load a more powerful CLIP model.
# "ViT-L/14" is a larger checkpoint than the "ViT-B/32" used earlier in this
# notebook and is expected to match images and text more accurately.
# The model runs on the GPU ("cuda") when available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14", device=device)
print("Model ViT-L/14 loaded successfully.")

# Step 3: Load and preprocess the image.
# The image path is specified here; make sure the file exists.
# 'preprocess' is the transform returned by clip.load above; it standardizes
# the image (resizing, normalization) for the CLIP model.
image_path = "/content/imag3.jpg"

if not os.path.exists(image_path):
    # Tell the user (in Persian) that the file is missing and must be uploaded.
    print(f"خطا: فایل تصویر در مسیر '{image_path}' یافت نشد.")
    print("لطفاً تصویر خود را آپلود کرده و مسیر آن را به درستی وارد کنید.")
else:
    # Image.open is lazy and keeps the file open; the context manager closes
    # the handle once preprocessing has read the pixel data.
    with Image.open(image_path) as pil_image:
        # unsqueeze(0) adds a leading batch dimension before moving to the device.
        image = preprocess(pil_image).unsqueeze(0).to(device)
    print("Image loaded and preprocessed successfully.")

    # Step 4: Define a more diverse and descriptive set of text prompts.
    # Several varied descriptions give the model real alternatives to choose
    # between; some deliberately wrong ones are included as a sanity check.
    text_descriptions = [
        "A photo of a brown dog playing with a blue ball in the grass", # precise description
        "A golden retriever chasing a ball outdoors", # close description
        "A pet enjoying a sunny day", # generic description
        "There is a dog and a ball in the image", # object-based description
        "A cat sleeping on a red sofa", # completely unrelated description
        "A landscape view of mountains at sunset" # another unrelated description
    ]

    # Step 5: Tokenize the text descriptions.
    # 'clip.tokenize' converts the text strings into numerical tokens that the
    # model can understand.
    text_inputs = clip.tokenize(text_descriptions).to(device)

    # Step 6: Generate features and calculate similarity.
    # Gradient tracking is disabled because this is inference only.
    with torch.no_grad():
        # Encode the image and every text prompt into feature vectors.
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_inputs)

        # Normalize to unit length so the dot product below is cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Scale the cosine similarities by 100 and apply softmax over the prompts.
        # The resulting probabilities are relative to this candidate list only:
        # they always sum to 1, even if none of the prompts fits the image.
        similarity_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # Step 7: Display the results.
    # Show up to the 3 most likely descriptions. topk raises if asked for more
    # entries than exist, so cap k at the number of prompts.
    top_k = min(3, len(text_descriptions))
    values, indices = similarity_probs[0].topk(top_k)

    print("\n--- نتایج تطابق تصویر با توصیف‌ها ---")
    print("مدل، تصویر شما را با توصیف‌های زیر مقایسه کرده و این نتایج را برگردانده است:\n")

    for i, (value, index) in enumerate(zip(values, indices)):
        print(f"#{i+1}: \"{text_descriptions[index]}\"")
        print(f"   - احتمال تطابق: {value.item() * 100:.2f}%")
        print("-" * 20)


100%|███████████████████████████████████████| 890M/890M [00:57<00:00, 16.2MiB/s]


Model ViT-L/14 loaded successfully.
Image loaded and preprocessed successfully.

--- نتایج تطابق تصویر با توصیف‌ها ---
مدل، تصویر شما را با توصیف‌های زیر مقایسه کرده و این نتایج را برگردانده است:

#1: "A photo of a brown dog playing with a blue ball in the grass"
   - احتمال تطابق: 71.69%
--------------------
#2: "A golden retriever chasing a ball outdoors"
   - احتمال تطابق: 27.24%
--------------------
#3: "There is a dog and a ball in the image"
   - احتمال تطابق: 0.95%
--------------------
