## Before you start

Let's make sure that we have access to a GPU. We can use the `nvidia-smi` command to do that. If there are any problems, navigate to `Edit` -> `Notebook settings` -> `Hardware accelerator`, set it to `GPU`, and then click `Save`.

In [None]:
# Shell out to nvidia-smi to confirm that this runtime has access to a GPU.
!nvidia-smi

In [None]:
import os
# Remember the directory the notebook was started in; later cells build every
# path (repository checkout, weights, data) relative to HOME.
HOME = os.getcwd()
print(HOME)

## Install Grounding DINO 🦕

In [None]:
%cd {HOME}
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd {HOME}/GroundingDINO
%pip install -e .

In [None]:
# Location of the Swin-T model config inside the cloned repository, plus a
# quick report of whether the file is actually there.
CONFIG_PATH = os.path.join(
    HOME,
    "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
)
config_found = os.path.isfile(CONFIG_PATH)
print(CONFIG_PATH, "; exist:", config_found)

## Download Grounding DINO Weights 🏋️

In [None]:
%cd {HOME}
!mkdir {HOME}\weights
%cd {HOME}/weights
WEIGHTS_FILE = os.path.join(HOME, "weights", 'groundingdino_swint_ogc.pth')

import urllib.request

!wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth -o groundingdino_swint_ogc.pth

urllib.request.urlretrieve('https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth', WEIGHTS_FILE)

In [None]:
# Resolve the checkpoint path under {HOME}/weights and report whether the
# file is present on disk.
WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)
weights_found = os.path.isfile(WEIGHTS_PATH)
print(WEIGHTS_PATH, "; exist:", weights_found)

## Load Grounding DINO Model

In [None]:
# Work from inside the repository checkout before importing its helpers.
%cd {HOME}/GroundingDINO

# Inference helpers used by the rest of the notebook.
from groundingdino.util.inference import load_model, load_image, predict, annotate
# Hand load_model the config file and checkpoint paths set up in earlier cells.
model = load_model(CONFIG_PATH, WEIGHTS_PATH)

## Object Detection with Grounding DINO

In [None]:
import supervision as sv
import matplotlib.pyplot as plt
import torch
import cv2
import time

In [None]:
# Image to run detection on; it is looked up in {HOME}/data.
IMAGE_NAME = "HL_microwave_close.jpg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

# load_image returns two objects: `image_source` is later passed to annotate()
# and `image` is later passed to predict().
image_source, image = load_image(IMAGE_PATH)

# Keep the source size for converting boxes to pixel coordinates later
# (assumes axis 0 is height and axis 1 is width - confirm against load_image).
img_h, img_w = image_source.shape[0], image_source.shape[1]

In [None]:
begin = time.time()
TEXT_PROMPT = "button"
BOX_TRESHOLD = 0.2
TEXT_TRESHOLD = 0.2

# Tensor of found boxes (with confidence above box_threshold)
# Tensor of logits for text phrases
# List[str] of phrases from prompt found corresponding to boxes (with confidence above text_threshold)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
# print(phrases)

# Get box coordinates
scale_fct = torch.Tensor([img_w, img_h, img_w, img_h])
boxes = boxes * scale_fct

if(boxes.numel() == 0):
   print('No objects detected.')
# Output details for each detection:
for result in zip(boxes, logits, phrases):
    print(f'{result[2]}: confidence {result[1]}, box {result[0].tolist()}')
# print(boxes)
# print(logits)
# print(phrases)

for box in boxes:
  # Draw blue circle as center of each box (0, 0) is top-left of image
  annotated_frame = cv2.circle(annotated_frame, (int(box[0]), int(box[1])), 10, (255, 0, 0), -1)

%matplotlib inline
# sv.plot_image(annotated_frame, (16, 16))
annotated_frame = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(16, 16))
plt.imshow(annotated_frame)
plt.axis('off')
plt.show()
print(f'{time.time() - begin} sec')
