# **CLIP Zero-shot experiments and results**

## **Using [OpenAI CLIP](https://github.com/openai/CLIP) ViT-B/32 model**

In [None]:
! conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

/bin/bash: line 1: conda: command not found
Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m963.4 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-hpzaa9q4
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-hpzaa9q4
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda

In [None]:
import os
import clip
import torch
from PIL import Image

In [None]:
# Run on the GPU when CUDA is usable, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP model; `preprocess` is the callable applied to images before encoding
model, preprocess = clip.load('ViT-B/32', device=device)



100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 98.0MiB/s]


## **Experiment 1: Basic classification to distinguish a herbarium plant sample among broad, varied classes**

In [None]:
# Path to the single specimen image under test (a local file; nothing is downloaded here)
cifar100 = "sample.jpg"

# Prepare the inputs
image, class_id = cifar100, "plant"
classes = ["flower", "rock", "herb", "man", "tree", "paper"]
# Open the image in a context manager so the file handle is closed once it has been preprocessed
with Image.open(image) as img:
    image_input = preprocess(img).unsqueeze(0).to(device)
# One text prompt per candidate class, tokenized and stacked into a single batch
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in classes]).to(device)



In [None]:
# Embed the image and the text prompts; gradient tracking is switched off for this step
with torch.no_grad():
    image_features, text_features = (
        model.encode_image(image_input),
        model.encode_text(text_inputs),
    )


In [None]:
# Scale each embedding to unit L2 norm (in place)
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# Softmax over the scaled image-text dot products, then keep the 5 highest-scoring labels
similarity = torch.softmax((100.0 * image_features) @ text_features.t(), dim=-1)
values, indices = torch.topk(similarity[0], 5)

In [None]:
# Show each selected label with its probability as a percentage, best match first
print("\nTop predictions:\n")
for score, label_idx in zip(values.tolist(), indices.tolist()):
    print(f"{classes[label_idx]:>16s}: {100 * score:.2f}%")


Top predictions:

            herb: 68.75%
            tree: 23.03%
          flower: 5.47%
           paper: 2.08%
             man: 0.61%


**Observation: CLIP performs very well when the class labels are highly varied and the samples have clearly distinguishing features.**

## **Experiment 2: Test with actual species names and a different specimen**

In [None]:
# Path to the specimen image under test (a local file; nothing is downloaded here)
cifar100 = "sample.jpg"

# Prepare the inputs
image = cifar100
classes = ["Elymus hystrix", "Carex crinita", "Embelia pacifica", "Odontites luteus", "Phyllanthus urinaria", "Malvella lepidota"]
# Open the image in a context manager so the file handle is closed once it has been preprocessed
with Image.open(image) as img:
    image_input = preprocess(img).unsqueeze(0).to(device)
# One text prompt per candidate species, tokenized and stacked into a single batch
text_inputs = torch.cat([clip.tokenize(f"a photo of plant species {c}") for c in classes]).to(device)



In [None]:
# Embed the image and the text prompts; gradient tracking is switched off for this step
with torch.no_grad():
    image_features, text_features = (
        model.encode_image(image_input),
        model.encode_text(text_inputs),
    )


In [None]:
# Scale each embedding to unit L2 norm (in place)
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# Softmax over the scaled image-text dot products, then keep the 5 highest-scoring labels
similarity = torch.softmax((100.0 * image_features) @ text_features.t(), dim=-1)
values, indices = torch.topk(similarity[0], 5)

In [None]:
# Show each selected label with its probability as a percentage, best match first
print("\nTop predictions:\n")
for score, label_idx in zip(values.tolist(), indices.tolist()):
    print(f"{classes[label_idx]:}: {100 * score:.2f}%")


Top predictions:

Odontites luteus: 71.04%
Malvella lepidota: 13.15%
Embelia pacifica: 11.97%
Phyllanthus urinaria: 3.71%
Elymus hystrix: 0.10%


**Observation: CLIP performs inconsistently depending on the features of the sample, and its varying level of correctness makes it unsuitable for evaluation.**

## **Experiment 3 : Test with 10 images**

Get the ground-truth taxon labels and use them as the classes.

In [None]:
import pandas as pd
import os

In [None]:
path = "drive/MyDrive/testing/"

# One row per test image; the ID is the file name without its ".jpg" extension.
image_ids = [os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith(".jpg")]
gt = pd.DataFrame({'ID': image_ids})

# Parse the "ID: taxon" lines of the ground-truth file.
taxon_by_id = {}   # image ID -> taxon, used to look labels up by ID rather than by position
all_taxa = []      # every taxon in the file, in file order (may contain repeats)
with open(path + "taxon_gt.txt", 'r') as tf:
    for line in tf:
        image_id, sep, taxon = line.rstrip('\n').partition(":")
        if not sep:
            # Blank or malformed line: there is no "ID: taxon" pair to read
            continue
        taxon = taxon.lstrip()
        taxon_by_id[image_id] = taxon
        all_taxa.append(taxon)

missing = [i for i in image_ids if i not in taxon_by_id]
if missing:
    raise KeyError(f"no taxon listed in taxon_gt.txt for image IDs: {missing}")

# `t` follows the row order of gt['ID'] (os.listdir order, which is arbitrary), not the
# line order of the text file, so a positional `gt['Taxon'] = t` pairs each ID with its own taxon.
t = [taxon_by_id[i] for i in image_ids]

# Candidate labels for zero-shot classification: each distinct taxon once, in first-seen
# order. Repeated labels would split the softmax mass and take up several top-k slots.
classes = list(dict.fromkeys(all_taxa))


In [None]:
# Add the taxon labels as a new column; values are matched to rows by position
gt = gt.assign(Taxon=t)

In [None]:
# Display the ground-truth table built above (image ID and taxon per row)
gt

Unnamed: 0,ID,Taxon
0,1317840733,Cordia sulcata
1,1318027385,Tigridia pavonia
2,1212567865,Sidalcea asprella
3,1317726996,Hohenbergia antillana
4,437160969,Cuscuta gronovii
5,1318182025,Cyathea squamipes
6,1318293083,Croton californicus
7,1318212360,Clermontia persicifolia
8,1317746297,Elymus hystrix
9,1317278320,Myrtopsis pomaderridifolia


Function for zero-shot classification of a single image:

In [None]:
def CLIP_zero_shot(image, top_k=10):
  """Rank the candidate classes for one image with zero-shot CLIP and print the ranking.

  Reads the notebook globals `path`, `classes`, `model`, `preprocess` and `device`.

  Args:
    image: Image ID, i.e. the file name under `path` without the ".jpg" extension.
    top_k: Maximum number of labels to return (default 10). It is clamped to the
      number of classes, because `topk` raises when asked for more entries
      than the tensor holds.

  Returns:
    List of class names ordered from the highest to the lowest similarity score.
  """
  # Context manager closes the image file as soon as it has been turned into a tensor
  with Image.open(path+image+".jpg") as img:
    image_input = preprocess(img).unsqueeze(0).to(device)
  text_inputs = torch.cat([clip.tokenize(f"a photo of plant species {c}") for c in classes]).to(device)

  # Calculate features
  with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

  # Normalise to unit length, then softmax over the scaled image-text dot products
  image_features /= image_features.norm(dim=-1, keepdim=True)
  text_features /= text_features.norm(dim=-1, keepdim=True)
  similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

  # Never request more labels than there are classes
  k = min(top_k, len(classes))
  values, indices = similarity[0].topk(k)

  # Print the result
  pred = []
  print("\nTop predictions:\n")
  for value, index in zip(values, indices):
    print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")
    pred.append(classes[index])

  return pred

In [None]:
# Map every ground-truth image ID to the ranked label list returned for it
preds = {}
for image_id in gt['ID']:
    preds[image_id] = CLIP_zero_shot(image_id)


Top predictions:

Muhlenbergia richardsonis: 3.26%
  Elymus hystrix: 2.13%
 Juncus interior: 1.91%
Isochilus linearis: 1.86%
    Neuropoa fax: 1.72%
  Carex leptalea: 1.66%
Andropogon floridanus: 1.61%
Bromus hordeaceus: 1.51%
Bromus commutatus: 1.42%
Triticum turgidum: 1.40%

Top predictions:

Lawsonia inermis: 1.90%
Sceptridium dissectum: 1.51%
   Crateva magna: 1.41%
Betula alleghaniensis: 1.33%
Diplazium sylvaticum: 1.19%
Asplenium platyneuron: 1.12%
Ribes hudsonianum: 1.08%
Rubus allegheniensis: 0.75%
Plagiomnium medium: 0.73%
Rhizomnium glabrescens: 0.70%

Top predictions:

Triticum turgidum: 4.38%
Sceptridium dissectum: 2.20%
   Crateva magna: 1.85%
Lawsonia inermis: 1.32%
Calamagrostis canadensis: 1.18%
Bromus commutatus: 1.12%
Neodolichomitra yunnanensis: 1.02%
Eriochloa acuminata: 0.89%
Bromus hordeaceus: 0.79%
Aristida nemorivaga: 0.77%

Top predictions:

   Crateva magna: 3.38%
Diplazium sylvaticum: 1.70%
Plagiomnium medium: 1.50%
Philodendron sagittifolium: 1.43%
Justicia

In [None]:
def evaluate(gt, pred):
  """Print the share of images whose true taxon appears among their predicted labels.

  Args:
    gt: DataFrame with an 'ID' column and a 'Taxon' column (one row per image).
    pred: Dict mapping an image ID to the collection of labels predicted for it.

  Prints:
    "Accuracy : <value>%", where <value> is the percentage of entries in `pred`
    whose ground-truth taxon is contained in the predicted labels. With no
    predictions at all the accuracy is reported as 0.0 instead of dividing by zero.

  Raises:
    ValueError: if `gt` lists an ID more than once, or if a predicted ID has no
      row in `gt`.
  """
  if len(pred) == 0:
    print("Accuracy : 0.0%")
    return

  if not gt['ID'].is_unique:
    raise ValueError("ground-truth table contains duplicate image IDs")

  # Build the ID -> taxon lookup once instead of scanning the frame for every prediction
  truth = dict(zip(gt['ID'], gt['Taxon']))

  hits = 0
  for image_id, labels in pred.items():
    if image_id not in truth:
      raise ValueError(f"no ground-truth taxon for image ID {image_id!r}")
    if truth[image_id] in labels:
      hits += 1

  acc = hits / len(pred) * 100
  print(f"Accuracy : {acc}%")

In [None]:
# Report how often the ground-truth taxon appears among each image's predicted labels
evaluate(gt, preds)

Accuracy : 0.0%
