In [1]:
from groundingdino.util.inference import load_model, load_image, predict
import pandas as pd

# --- Select the sample to process -------------------------------------------
data_path = "/root/kl/unsloth_vlm_full_dataset_4o/gpt-4o_2.csv"
data = pd.read_csv(data_path, encoding='latin1')

count = 0  # index of the row in `data` that this notebook run processes
# The "Answers" text carries "Reference Object: <name>" / "Target Object: <name>"
# lines; keep only the text after the label, up to the end of that line.
reference_object_name = data["Answers"][count].split("Reference Object: ")[1].split("\n")[0].strip()
target_object_name = data["Answers"][count].split("Target Object: ")[1].split("\n")[0].strip()
image_path_name = data["image_paths"][count].strip()

# --- GroundingDINO configuration --------------------------------------------
CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
DEVICE = "cpu"
IMAGE_PATH = image_path_name
TEXT_PROMPT_Ref_Obj = reference_object_name
TEXT_PROMPT_Tar_Obj = target_object_name
BOX_THRESHOLD = 0.35
TEXT_THRESHOLD = 0.25

# `image_source` is the original RGB image array, `image` the transformed
# tensor that is fed to the model.
image_source, image = load_image(IMAGE_PATH)
# Pass the device explicitly: load_model otherwise falls back to its own
# default, which does not match DEVICE on a CPU-only machine.
model = load_model(CONFIG_PATH, CHECKPOINT_PATH, device=DEVICE)


def _detect_best_box(caption):
    """Run GroundingDINO for ``caption`` and pick the most confident box.

    Returns:
        (boxes, scores, matched_phrases, best_box) where ``best_box`` is the
        highest-scoring box as a plain list, unpacked downstream as
        (cx, cy, w, h).

    Raises:
        ValueError: if no detection passes BOX_THRESHOLD / TEXT_THRESHOLD.
    """
    boxes, scores, matched_phrases = predict(
        model=model,
        image=image,
        caption=caption,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        device=DEVICE,
    )
    if len(boxes) == 0:
        raise ValueError(
            f"GroundingDINO found no box for prompt {caption!r} in {IMAGE_PATH!r} "
            f"(box_threshold={BOX_THRESHOLD}, text_threshold={TEXT_THRESHOLD})"
        )
    # The boxes that survive thresholding are not ordered by confidence, so
    # choose the best-scoring one instead of blindly taking index 0.
    best_index = int(scores.argmax())
    return boxes, scores, matched_phrases, boxes[best_index].tolist()


Ref_Obj_Bbox, logits, phrases, Ref_Obj_Bbox_list = _detect_best_box(TEXT_PROMPT_Ref_Obj)

Tar_Obj_Bbox, logits, phrases, Tar_Obj_Bbox_list = _detect_best_box(TEXT_PROMPT_Tar_Obj)



final text_encoder_type: bert-base-uncased




In [28]:
# Inspect the reference object's box; downstream code unpacks it as (cx, cy, w, h).
Ref_Obj_Bbox_list

[0.3368787169456482,
 0.6371487975120544,
 0.0827253982424736,
 0.4411730468273163]

In [29]:
# Inspect the target object's box; downstream code unpacks it as (cx, cy, w, h).
Tar_Obj_Bbox_list

[0.5124080777168274,
 0.6462185382843018,
 0.10676079988479614,
 0.4224409759044647]

In [2]:
# Echo the parsed prompt names and the image path of the selected sample.
# (Two spaces after the colon reproduce print()'s default separator.)
print(f"Reference Object:  {reference_object_name}")
print(f"Target Object:  {target_object_name}")
print(f"Image Path:  {image_path_name}")

Reference Object:  Person in red
Target Object:  Person in gray
Image Path:  /root/kl/unsloth_vlm/gpt-4o-mini/1.jpg


In [13]:
def get_grid_indices(image_shape, bbox_cxcywh):
    """Return the ids of the 3x3 grid cells that a bounding box overlaps.

    Args:
        image_shape: ``(height, width)`` of the image.
        bbox_cxcywh: ``(cx, cy, w, h)`` box; each value is multiplied by the
            image width/height, i.e. it is treated as a fraction of the image.

    Returns:
        Ascending list of cell ids, numbered 1..9 row by row from the
        top-left cell.
    """
    height, width = image_shape
    center_x, center_y, box_w, box_h = bbox_cxcywh

    # Box corners in pixel coordinates.
    left = (center_x - box_w / 2) * width
    top = (center_y - box_h / 2) * height
    right = (center_x + box_w / 2) * width
    bottom = (center_y + box_h / 2) * height

    # Cell boundaries, truncated to whole pixels.
    step_x = width / 3
    step_y = height / 3
    x_edges = [int(k * step_x) for k in range(4)]
    y_edges = [int(k * step_y) for k in range(4)]

    overlapped = []
    for cell in range(9):
        row, col = divmod(cell, 3)
        misses_x = right <= x_edges[col] or left >= x_edges[col + 1]
        misses_y = bottom <= y_edges[row] or top >= y_edges[row + 1]
        # A box that merely touches a cell edge does not count as overlapping.
        if not (misses_x or misses_y):
            overlapped.append(cell + 1)
    return sorted(overlapped)

# Lookup from an ascending tuple of grid cell ids to its position phrase.
# Grid layout:
# 1 | 2 | 3
# ---------
# 4 | 5 | 6
# ---------
# 7 | 8 | 9
_GRID_DESCRIPTIONS = {
    # single cell
    (1,): "左上方", (2,): "上方", (3,): "右上方",
    (4,): "左側", (5,): "中央", (6,): "右側",
    (7,): "左下方", (8,): "下方", (9,): "右下方",
    # two adjacent cells
    (1, 2): "左上方", (1, 4): "左上方",
    (2, 3): "右上方", (3, 6): "右上方",
    (4, 7): "左下方", (7, 8): "左下方",
    (6, 9): "右下方", (8, 9): "右下方",
    (2, 5): "中間偏上方",
    (4, 5): "中間偏左方",
    (5, 6): "中間偏右方",
    (5, 8): "中間偏下方",
    # full row / full column
    (1, 2, 3): "上方",
    (4, 5, 6): "中間", (2, 5, 8): "中間",
    (7, 8, 9): "下方",
    (1, 4, 7): "左側",
    (3, 6, 9): "右側",
    # 2x2 blocks
    (1, 2, 4, 5): "中間偏左上方",
    (2, 3, 5, 6): "中間偏右上方",
    (4, 5, 7, 8): "中間偏左下方",
    (5, 6, 8, 9): "中間偏右下方",
    # 2x3 / 3x2 blocks
    (1, 2, 3, 4, 5, 6): "中間偏上方",
    (4, 5, 6, 7, 8, 9): "中間偏下方",
    (1, 2, 4, 5, 7, 8): "中間偏左側",
    (2, 3, 5, 6, 8, 9): "中間偏右側",
}


def grid_to_description(grids):
    """Translate a set of 3x3 grid cell ids into a Chinese position phrase.

    Args:
        grids: iterable of cell ids (1..9, numbered row by row from the
            top-left). Order does not matter.

    Returns:
        The position phrase for the combination, or "位置不明" when the input
        is empty or is not one of the recognised combinations. The function
        always returns a string, so callers can safely concatenate the result.
    """
    unknown = "位置不明"
    if grids is None or len(grids) == 0:
        return unknown
    # Any nine-cell input means the whole image is covered.
    if len(grids) == 9:
        return "中間"
    try:
        key = tuple(sorted(grids))
    except TypeError:
        # Elements that cannot be ordered cannot be valid cell ids.
        return unknown
    return _GRID_DESCRIPTIONS.get(key, unknown)

# Use the original image array for the (height, width) pair: `image` is the
# transformed model-input tensor, whose leading dimensions are not the image
# height and width.
image_hw = image_source.shape[:2]
ref_grids = get_grid_indices(image_hw, Ref_Obj_Bbox_list)
tar_grids = get_grid_indices(image_hw, Tar_Obj_Bbox_list)

# f-strings keep the output format and do not raise if a description is missing.
print(f"Reference Object - {reference_object_name}: {grid_to_description(ref_grids)}")
print(f"Target Object - {target_object_name}: {grid_to_description(tar_grids)}")


Reference Object - Person in red: 中間偏左下方
Target Object - Person in gray: 中間偏下方


# Position Algorithm

In [19]:
def position_algorithm(image_shape=None, ref_bbox=None, tar_bbox=None):
    """Describe where the reference and target objects sit in the 3x3 grid.

    Args:
        image_shape: ``(height, width)`` of the image. Defaults to the shape
            of the notebook-level ``image_source`` array.
        ref_bbox: ``(cx, cy, w, h)`` box of the reference object. Defaults to
            the notebook-level ``Ref_Obj_Bbox_list``.
        tar_bbox: ``(cx, cy, w, h)`` box of the target object. Defaults to
            the notebook-level ``Tar_Obj_Bbox_list``.

    Returns:
        ``(reference_object_position, target_object_position)`` as produced by
        ``grid_to_description``.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    # `image_source` is used rather than `image`: `image` is the model-input
    # tensor and is rebound to a different object later in the notebook.
    if image_shape is None:
        image_shape = image_source.shape[:2]
    if ref_bbox is None:
        ref_bbox = Ref_Obj_Bbox_list
    if tar_bbox is None:
        tar_bbox = Tar_Obj_Bbox_list

    ref_grids = get_grid_indices(image_shape, ref_bbox)
    tar_grids = get_grid_indices(image_shape, tar_bbox)
    reference_object_position = grid_to_description(ref_grids)
    target_object_position = grid_to_description(tar_grids)
    return reference_object_position, target_object_position
    

# def direction_algorithm(image_shape, bbox_cxcywh):
    

In [None]:
# Display the grid-based position phrases for the current sample.
position_algorithm() # returns (reference_object_position, target_object_position)

('中間偏左下方', '中間偏下方')

# SAM + Direction Algorithm

In [None]:
# The depth map is stored next to the image: same path with ".jpg" swapped for ".csv".
image_path_name  # e.g. '/root/kl/unsloth_vlm/gpt-4o-mini/1.jpg'
Depth_map = image_path_name.replace(".jpg", ".csv")

'/root/kl/unsloth_vlm/gpt-4o-mini/1.csv'

In [None]:
import numpy as np
import PIL.Image as Image
from ultralytics import SAM

# Load the RGB pixels under their own name. The global `image` is deliberately
# not rebound here: earlier cells use it as the GroundingDINO input tensor,
# and overwriting it would break them when they are re-run.
with Image.open(image_path_name) as pil_image:
    image_numpy = np.array(pil_image.convert('RGB'))

sam = SAM("sam_b.pt")

h, w, _ = image_numpy.shape


def convert_bbox(bbox, w, h):
    """Convert a (cx, cy, w, h) box given as image fractions to pixel corners.

    Args:
        bbox: ``(cx, cy, bw, bh)`` with values relative to the image size.
        w: image width in pixels.
        h: image height in pixels.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` truncated to integers.
    """
    cx, cy, bw, bh = bbox
    x_min = int((cx - bw/2) * w)
    y_min = int((cy - bh/2) * h)
    x_max = int((cx + bw/2) * w)
    y_max = int((cy + bh/2) * h)
    return [x_min, y_min, x_max, y_max]


def masked_mean_depth(depth, mask, label):
    """Average ``depth`` over the pixels where ``mask`` is non-zero.

    Raises:
        ValueError: if the shapes differ or the mask selects no pixel.
    """
    if depth.shape != mask.shape:
        raise ValueError(
            f"Depth map shape {depth.shape} does not match {label} mask shape "
            f"{mask.shape}; check how the depth CSV is read (e.g. header row)."
        )
    selected = depth[mask.astype(bool)]
    if selected.size == 0:
        raise ValueError(f"SAM returned an empty mask for the {label} object.")
    return float(selected.mean())


ref_box = convert_bbox(Ref_Obj_Bbox_list, w, h)
tar_box = convert_bbox(Tar_Obj_Bbox_list, w, h)

# Prompt SAM with each detection box to obtain a per-object mask.
ref_results = sam.predict(image_numpy, bboxes=[ref_box])
tar_results = sam.predict(image_numpy, bboxes=[tar_box])

ref_mask = (ref_results[0].masks.data[0].cpu().numpy() * 1).astype(np.uint8)
tar_mask = (tar_results[0].masks.data[0].cpu().numpy() * 1).astype(np.uint8)

print("Ref mask shape:", ref_mask.shape, "unique:", np.unique(ref_mask))
print("Tar mask shape:", tar_mask.shape, "unique:", np.unique(tar_mask))

Depth_map = pd.read_csv(image_path_name.replace('.jpg', '.csv'))
depth_values = Depth_map.to_numpy(dtype=float)

# Depth with everything outside the object zeroed (kept for inspection).
ref_mask_depth = Depth_map * ref_mask
tar_mask_depth = Depth_map * tar_mask

# Average only over the masked pixels. Averaging the zero-filled frames would
# make the result depend on the mask area, and DataFrame.mean() returns one
# value per column, which cannot be used in an `if` condition.
ref_mask_depth_mean = masked_mean_depth(depth_values, ref_mask, "reference")
tar_mask_depth_mean = masked_mean_depth(depth_values, tar_mask, "target")

thereshold = 50 #cm
if ref_mask_depth_mean + thereshold < tar_mask_depth_mean:
    print("Target object is behind the reference object.")

if tar_mask_depth_mean + thereshold < ref_mask_depth_mean:
    print("Target object is in front of the reference object.")

View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/sam_b.pt to 'sam_b.pt': 100% ━━━━━━━━━━━━ 357.7/357.7MB 1.3MB/s 4:32309s

0: 1024x1024 1 0, 1193.3ms
Speed: 96.9ms preprocess, 1193.3ms inference, 63.4ms postprocess per image at shape (1, 3, 1024, 1024)

0: 1024x1024 1 0, 97.9ms
Speed: 5.1ms preprocess, 97.9ms inference, 0.3ms postprocess per image at shape (1, 3, 1024, 1024)
Ref mask shape: (600, 800) unique: [0 1]
Tar mask shape: (600, 800) unique: [0 1]
