In [7]:
import os
import sys
from pathlib import Path
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image

try:
    sys.path.append("./GroundingDINO")
    from groundingdino.util.inference import load_model, predict
except Exception as e:
    print("GroundingDINO imports failed:", e)

try:
    sys.path.append("./segment-anything")
    from segment_anything import sam_model_registry, SamPredictor
except Exception as e:
    print("Segment Anything imports failed:", e)

def groundingdino_predict(image_pil: Image.Image, text_prompt: str, model, device='cpu', box_threshold=0.3):
    image = np.array(image_pil)[:, :, ::-1].copy()
    try:
        boxes, scores, labels = predict(model=model, image=image, caption=text_prompt, box_threshold=box_threshold)
        return boxes, scores, labels
    except Exception as e:
        print('GroundingDINO predict failed:', e)
        return [], [], []

def boxes_to_sam_prompts(boxes, image_shape):
    h,w = image_shape[:2]
    boxes_norm, points, point_labels = [], [], []
    for b in boxes:
        x1,y1,x2,y2 = b
        boxes_norm.append([x1,y1,x2,y2])
        cx, cy = (x1+x2)/2, (y1+y2)/2
        points.append([cx, cy])
        point_labels.append(1)
    return np.array(boxes_norm), np.array(points), np.array(point_labels)

import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

def show_image_with_masks(image_path, masks, boxes=None, labels=None):
    image = cv2.imread(image_path)[:, :, ::-1]
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    for i, mask in enumerate(masks):
        colored_mask = np.zeros_like(image)
        colored_mask[mask] = [0, 255, 0]
        plt.imshow(np.dstack((colored_mask[:, :, 1], mask.astype(float) * 0.5)))
        if boxes is not None:
            x0, y0, x1, y1 = boxes[i].astype(int)
            plt.gca().add_patch(plt.Rectangle((x0, y0), x1 - x0, y1 - y0,
                                              edgecolor='red', facecolor='none', lw=2))
            if labels is not None:
                plt.text(x0, y0, labels[i], color='white', backgroundcolor='red')
    plt.axis("off")
    plt.show()



def run_text_driven_segmentation(
    image_path,
    text_prompt,
    groundingdino_weights="./weights/groundingdino.pth",
    sam_weights="./weights/sam_vit_h.pth",
    device="cuda"
):

    try:
        from groundingdino.util.inference import load_model, load_image, predict
        config_path = "./GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
        dino_model = load_model(config_path, groundingdino_weights, device=device)

        image_source, image_tensor = load_image(image_path)
        image_tensor = image_tensor.to(device)
    except Exception as e:
        print("GroundingDINO load failed:", e)
        return [], [], [], []


    try:
        boxes, scores, labels = predict(
            model=dino_model,
            image=image_tensor,
            caption=text_prompt,
            box_threshold=0.3,
            text_threshold=0.25
        )
    except Exception as e:
        print("GroundingDINO predict failed:", e)
        boxes, scores, labels = [], [], []

    if boxes is None or len(boxes) == 0:
        print("No boxes detected by GroundingDINO. Try adjusting thresholds or use CLIPSeg fallback.")
        return [], [], [], []


    try:
        from segment_anything import sam_model_registry, SamPredictor
        sam = sam_model_registry["vit_h"](checkpoint=sam_weights)
        sam.to(device=device)
        predictor = SamPredictor(sam)
    except Exception as e:
        print("SAM predictor not available:", e)
        return [], [], [], []


    image = cv2.imread(image_path)[:, :, ::-1]
    predictor.set_image(image)

    masks = []
    for box in boxes:
        box = np.array(box).astype(int)
        mask, _, _ = predictor.predict(box=box[None, :], multimask_output=False)
        masks.append(mask[0])


    show_image_with_masks(image_path, masks, boxes, labels)
    return masks, boxes, scores, labels

from google.colab import files
uploaded = files.upload()
image_path = list(uploaded.keys())[0]
print("Loaded image:", image_path)

text_prompt = input("Enter the object you want to segment: ")
print("Prompt:", text_prompt)

masks, boxes, scores, labels = run_text_driven_segmentation(
    image_path=image_path,
    text_prompt=text_prompt,
    groundingdino_weights="./weights/groundingdino.pth",
    sam_weights="./weights/sam_vit_h.pth",
    device="cuda"
)




Saving Bhavya_photo.jpg to Bhavya_photo (2).jpg
Loaded image: Bhavya_photo (2).jpg
Enter the object you want to segment: coffee
Prompt: coffee
final text_encoder_type: bert-base-uncased
GroundingDINO load failed: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx




In [2]:
# Install GroundingDINO + Segment Anything
!apt-get -y update && apt-get -y install git-lfs ffmpeg

# Clone GroundingDINO
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd GroundingDINO
!pip install -r requirements.txt
!python setup.py build develop
%cd ..

# Clone SAM
!git clone https://github.com/facebookresearch/segment-anything.git
%cd segment-anything
!pip install -e .
%cd ..

# Extra deps
!pip install transformers timm opencv-python matplotlib pillow scikit-image
!pip install git+https://github.com/openai/CLIP.git@main


Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,045 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,

In [3]:
!mkdir -p ./weights
!wget -O ./weights/groundingdino.pth https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth


--2025-10-01 08:32:36--  https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth
Resolving huggingface.co (huggingface.co)... 3.165.160.59, 3.165.160.12, 3.165.160.61, ...
Connecting to huggingface.co (huggingface.co)|3.165.160.59|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/642134fe0c65c54470dfb65f/a2c3c96ba5839ee787cfa06362c8501ea67f9f0455606e66f7fef07acf7e8c43?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251001%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251001T083236Z&X-Amz-Expires=3600&X-Amz-Signature=1e471e65c5af2fb0f388b28cf00582966ed43a0920576dc157236f38da84dc80&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27groundingdino_swint_ogc.pth%3B+filename%3D%22groundingdino_swint_ogc.pth%22%3B&x-id=GetObject&Expires=1759311156&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdG

In [4]:
!wget -O ./weights/sam_vit_h.pth https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth


--2025-10-01 08:32:55--  https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.14, 3.163.189.96, 3.163.189.108, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2564550879 (2.4G) [binary/octet-stream]
Saving to: ‘./weights/sam_vit_h.pth’


2025-10-01 08:33:08 (179 MB/s) - ‘./weights/sam_vit_h.pth’ saved [2564550879/2564550879]

