Image Classification using Hugging Face

In [74]:
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import requests
import matplotlib.pyplot as plt
image = Image.open('photo.jpg')
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

inputs = processor(images=image, return_tensors="pt")

print(inputs)
pixel_values = inputs['pixel_values']
outputs = model(pixel_values)
logits = outputs.logits

print(logits.shape)

# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])

{'pixel_values': tensor([[[[ 0.9059,  0.9059,  0.9137,  ...,  0.9451,  0.9451,  0.9451],
          [ 0.9216,  0.9137,  0.9137,  ...,  0.9529,  0.9529,  0.9451],
          [ 0.9137,  0.9137,  0.9137,  ...,  0.9529,  0.9451,  0.9451],
          ...,
          [ 0.8824,  0.8824,  0.8902,  ..., -0.3647, -0.6392, -0.6863],
          [ 0.8824,  0.8902,  0.8902,  ...,  0.3569, -0.0824, -0.5608],
          [ 0.8745,  0.8745,  0.8902,  ...,  0.4588,  0.4039,  0.1765]],

         [[ 0.9451,  0.9451,  0.9373,  ...,  0.9608,  0.9608,  0.9608],
          [ 0.9373,  0.9373,  0.9373,  ...,  0.9608,  0.9529,  0.9608],
          [ 0.9373,  0.9373,  0.9373,  ...,  0.9608,  0.9608,  0.9608],
          ...,
          [ 0.9137,  0.9137,  0.9137,  ..., -0.3490, -0.6314, -0.6784],
          [ 0.9059,  0.9216,  0.9216,  ...,  0.3725, -0.0667, -0.5608],
          [ 0.9137,  0.9137,  0.9137,  ...,  0.4745,  0.4275,  0.1922]],

         [[ 0.9216,  0.9216,  0.9294,  ...,  0.9373,  0.9373,  0.9373],
          [ 0

Zero Shot Image Classification

In [75]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

image = Image.open('photo.jpg')

inputs = processor(text=["remote", "radio","phone"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
print(probs)

tensor([[0.0906, 0.0120, 0.8975]], grad_fn=<SoftmaxBackward0>)


Zero Shot Object Detection

In [76]:
import requests
from PIL import Image
import numpy as np
import torch
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")


image = Image.open('Image.jpg')
texts = [["a photo of a cat", "a photo of a dog","phone","flower","alien"]]
inputs = processor(text=texts, images=image, return_tensors="pt")

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Note: boxes need to be visualized on the padded, unnormalized image
# hence we'll set the target image sizes (height, width) based on that

def get_preprocessed_image(pixel_values):
    pixel_values = pixel_values.squeeze().numpy()
    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
    unnormalized_image = Image.fromarray(unnormalized_image)
    return unnormalized_image

unnormalized_image = get_preprocessed_image(inputs.pixel_values)

target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.2, target_sizes=target_sizes
)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")


Detected flower with confidence 0.222 at location [1.59, 274.93, 411.15, 967.19]
