In [None]:
from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem; the paths used in the
# later cells (/content/drive/MyDrive/...) live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import io

import requests
import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Zero-shot object detection demo using the Hugging Face checkpoint of
# Grounding DINO: download one image, detect the objects named in `text`,
# draw their boxes and save the result to Drive.

# Load the model and processor
model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

# Load the image
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear exception instead of
# an obscure "cannot identify image file" failure inside PIL.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
# Decode from the fully downloaded bytes and normalise to RGB so that drawing a
# coloured outline and saving as JPEG work whatever the source image mode is.
image = Image.open(io.BytesIO(response.content)).convert("RGB")

# Text queries
text = "a cat. a remote control."

# Prepare inputs
inputs = processor(images=image, text=text, return_tensors="pt").to(device)

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)

# Process results
# PIL reports size as (width, height); reversing it passes (height, width).
results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    target_sizes=[image.size[::-1]]
)

# Draw bounding boxes on the image
draw = ImageDraw.Draw(image)
for box in results[0]["boxes"]:
    box = box.tolist()
    draw.rectangle(box, outline="red", width=3)

# Display the image with bounding boxes
image.show()

# Save the image to a specified folder
save_path = "/content/drive/MyDrive/LLM/predicted.jpg"
image.save(save_path)

print(f"Image saved to {save_path}")


Image saved to /content/drive/MyDrive/LLM/predicted.jpg


In [None]:
# Switch the notebook's working directory to the project folder on the mounted
# Drive; the GroundingDINO repository is cloned into this folder further down.
%cd /content/drive/MyDrive/LLM

/content/drive/MyDrive/LLM


In [None]:
import locale

# Show which text encoding the runtime reports as preferred before it is
# overridden in the next cell.
preferred_encoding = locale.getpreferredencoding()
print(preferred_encoding)

ANSI_X3.4-1968


In [None]:
import locale


def getpreferredencoding(do_setlocale = True):
    """Always report UTF-8 as the preferred encoding.

    ``do_setlocale`` is accepted only so the signature stays compatible with
    ``locale.getpreferredencoding``; its value is ignored.
    """
    return "UTF-8"


# Monkey-patch the standard-library function so that any code asking `locale`
# for the preferred encoding is told UTF-8 from now on.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!git clone https://github.com/IDEA-Research/GroundingDINO.git

Cloning into 'GroundingDINO'...
remote: Enumerating objects: 463, done.[K
remote: Counting objects: 100% (232/232), done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 463 (delta 170), reused 146 (delta 131), pack-reused 231 (from 1)[K
Receiving objects: 100% (463/463), 12.87 MiB | 17.76 MiB/s, done.
Resolving deltas: 100% (241/241), done.


In [None]:
%cd GroundingDINO/

/content/drive/MyDrive/LLM/GroundingDINO


In [None]:
!pip install -e .

Obtaining file:///content/drive/MyDrive/LLM/GroundingDINO
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting addict (from groundingdino==0.1.0)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting yapf (from groundingdino==0.1.0)
  Downloading yapf-0.40.2-py3-none-any.whl.metadata (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting timm (from groundingdino==0.1.0)
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting supervision>=0.22.0 (from groundingdino==0.1.0)
  Downloading supervision-0.23.0-py3-none-any.whl.metadata (14 kB)
Downloading supervision-0.23.0-py3-none-any.whl (151 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.5/151.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading addict-

In [None]:
!mkdir weights
%cd weights
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
%cd ..


mkdir: cannot create directory ‘weights’: File exists
/content/drive/MyDrive/LLM/GroundingDINO/weights
/content/drive/MyDrive/LLM/GroundingDINO


In [None]:
# Sanity check: the inference cells below use paths relative to the
# GroundingDINO checkout (config and weights), so confirm the working directory.
!pwd

/content/drive/MyDrive/LLM/GroundingDINO


In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

# Single-image detection with the original GroundingDINO package: look for the
# objects named in TEXT_PROMPT, draw them on the image and save the result.
# The config and checkpoint paths are relative, so the working directory must
# be the GroundingDINO checkout.
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
IMAGE_PATH = "/content/drive/MyDrive/LLM/weee.jpeg"
TEXT_PROMPT = "person . book ."
# Spelled THRESHOLD (not TRESHOLD) to match the constants used by the later cells.
BOX_THRESHOLD = 0.35
TEXT_THRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
# Left as the cell's last expression so the notebook displays imwrite's
# success flag (True when the file was written).
cv2.imwrite("/content/drive/MyDrive/LLM/1Classi.jpg", annotated_frame)



final text_encoder_type: bert-base-uncased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



True

In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

# Detect several object classes in one image, save the annotated picture and
# print every detection with its bounding box.

# Load the model (paths are relative to the GroundingDINO checkout)
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")

# Image path and text prompt
IMAGE_PATH = "/content/drive/MyDrive/LLM/IMG_20211101_192955_031.jpg"
TEXT_PROMPT = "person . shoes . head . shirt ."
BOX_THRESHOLD = 0.35
TEXT_THRESHOLD = 0.25

# Load the image
image_source, image = load_image(IMAGE_PATH)

# Predict the bounding boxes and phrases
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD
)

# Annotate the image with the predicted boxes and labels
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

# Save the annotated image
cv2.imwrite("/content/drive/MyDrive/LLM/1Domi.jpg", annotated_frame)

# Format the results: labels with their corresponding coordinates.
# `predict` returns each box as normalized (center_x, center_y, width, height),
# not as corners, so unpacking it directly into x_min/y_min/x_max/y_max
# mislabels the values (it can even yield "x_max" < "x_min"). Convert explicitly
# to corner form; the values stay normalized to the 0-1 range - multiply x by
# the image width and y by the image height to obtain pixels.
detected_objects = []
for phrase, box in zip(phrases, boxes):
    center_x, center_y, width, height = box[:4].tolist()
    coordinates = (
        round(center_x - width / 2, 4),   # x_min
        round(center_y - height / 2, 4),  # y_min
        round(center_x + width / 2, 4),   # x_max
        round(center_y + height / 2, 4),  # y_max
    )
    detected_objects.append([phrase, coordinates])

# Print the detected objects with coordinates as (x_min, y_min, x_max, y_max)
for obj in detected_objects:
    print(f"{obj[0]}: {obj[1]}")


final text_encoder_type: bert-base-uncased
person: (tensor(0.4707), tensor(0.5510), tensor(0.4854), tensor(0.8575))
shoes: (tensor(0.5345), tensor(0.9240), tensor(0.1583), tensor(0.1110))
shoes: (tensor(0.3367), tensor(0.9323), tensor(0.2165), tensor(0.0714))
head: (tensor(0.5917), tensor(0.1821), tensor(0.1338), tensor(0.1181))
shirt: (tensor(0.5501), tensor(0.4129), tensor(0.3250), tensor(0.3353))


In [None]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2
import os

# Batch version of the single-image cell: run detection on every image in
# INPUT_DIR, save an annotated copy and write one label file per image.

# Load the model (paths are relative to the GroundingDINO checkout)
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")

# Define input and output directories
INPUT_DIR = "/content/drive/MyDrive/LLM/images"
ANNOTATED_IMAGES_DIR = "/content/drive/MyDrive/LLM/annotated_images"
LABELS_DIR = "/content/drive/MyDrive/LLM/labels"

# Only files with these extensions are treated as images; anything else found
# in INPUT_DIR is skipped instead of crashing the whole run inside load_image.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp")

# Create output directories if they do not exist
os.makedirs(ANNOTATED_IMAGES_DIR, exist_ok=True)
os.makedirs(LABELS_DIR, exist_ok=True)

# Text prompt and thresholds
TEXT_PROMPT = "person . shoes . head . shirt ."
BOX_THRESHOLD = 0.35
TEXT_THRESHOLD = 0.25

# Process each image in the input directory (sorted for a reproducible order)
for image_name in sorted(os.listdir(INPUT_DIR)):
    # Construct the full image path
    image_path = os.path.join(INPUT_DIR, image_name)

    # Skip sub-directories and non-image files
    if not os.path.isfile(image_path) or not image_name.lower().endswith(IMAGE_EXTENSIONS):
        print(f"Skipping non-image entry: {image_name}")
        continue

    # Load the image
    image_source, image = load_image(image_path)

    # Predict the bounding boxes and phrases
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=TEXT_PROMPT,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD
    )

    # Annotate the image with the predicted boxes and labels
    annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

    # Save the annotated image; cv2.imwrite reports failure through its return
    # value rather than by raising, so check it explicitly.
    annotated_image_path = os.path.join(ANNOTATED_IMAGES_DIR, image_name)
    if not cv2.imwrite(annotated_image_path, annotated_frame):
        print(f"Warning: could not write {annotated_image_path}")

    # Prepare the file path to save coordinates
    label_filename = os.path.splitext(image_name)[0] + ".txt"
    label_output_path = os.path.join(LABELS_DIR, label_filename)

    # Save the coordinates to a text file, one detection per line:
    #   <phrase> <center_x> <center_y> <width> <height>
    # The numbers are written exactly as `predict` returns them, i.e. normalized
    # center/size values - NOT corner coordinates. Corner form is
    # (cx - w/2, cy - h/2, cx + w/2, cy + h/2).
    # An explicit encoding avoids depending on the runtime's locale default.
    with open(label_output_path, 'w', encoding="utf-8") as f:
        for phrase, box in zip(phrases, boxes):
            # Extract and convert tensor values to float
            center_x, center_y, width, height = box[:4].tolist()
            # Write the label and coordinates to the file
            f.write(f"{phrase} {center_x} {center_y} {width} {height}\n")

print("Processing complete. Annotated images and labels have been saved.")


final text_encoder_type: bert-base-uncased
