In [1]:
import torch
import numpy as np
import cv2
import matplotlib.pyplot as plt
import argparse
import glob
import os
import yaml
import re
import csv

from sam_run import SegmentAnythingModelRun
from depthanything import DepthAnythingRunner

xFormers not available
xFormers not available


In [2]:
input_files = "./new_images/"

parser = argparse.ArgumentParser(description='Depth Anything V2')

parser.add_argument('--encoder', type=str, default='vitl')
parser.add_argument('--input-size', type=int, default=518)

parser.add_argument('--img-path', type=str, default=input_files)
parser.add_argument('--outdir', type=str, default='./vis_depth')

parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth')

parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='save the model raw output')
parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')

args = parser.parse_args(args=[])

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
sam_runner = SegmentAnythingModelRun() 
depth_runner = DepthAnythingRunner(args)
print(device)

cuda


In [4]:
def get_coordinate(raw_img):

        coords = []
        def click_event(event, x, y, flags, params):
            # Check for left mouse click
            
            real_shape, resized_shape = params
            if event == cv2.EVENT_LBUTTONDOWN:
                real_x = int(x * (real_shape[1]/resized_shape[1]))
                real_y = int(y * (real_shape[0]/resized_shape[0]))

                coords.clear()
                coords.extend([real_x, real_y])

                print(f"Pixel Coordinates: x={real_x}, y={real_y}")
                
                cv2.circle(resized_img, (x, y), 2, (0, 255, 0), -1)
                cv2.imshow("Select Ball Center", resized_img)


        scale_percent = 20 
        width = int(raw_img.shape[1] * scale_percent / 100)
        height = int(raw_img.shape[0] * scale_percent / 100)
        dim = (width, height)

        resized_img = cv2.resize(raw_img, dim, interpolation = cv2.INTER_AREA)

        real_shape = (raw_img.shape[0], raw_img.shape[1])
        resized_shape = (resized_img.shape[0], resized_img.shape[1])
        
        while not coords:
            cv2.namedWindow("Select Ball Center", cv2.WINDOW_NORMAL)
            cv2.setMouseCallback("Select Ball Center", click_event, param=(real_shape, resized_shape))

            print("Click on the center of the ball. Press any key to confirm selection.")
            cv2.imshow("Select Ball Center", resized_img)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

            if not coords:
                print("No point selected! You must click on the ball before closing the window.")

        return coords[0], coords[1]

In [5]:
def get_depth(raw_img):
        depth = depth_runner.process_image(raw_img)

        return depth

In [6]:
def get_mask(raw_img):

        x, y = get_coordinate(raw_img)
        mask = sam_runner.process_image(raw_img, x, y)

        return mask

In [7]:
def real_depth(diameter, ball_contour, pixel_count, focal_length_in_px):
        if len(ball_contour) >= 5:
            ellipse = cv2.fitEllipse(ball_contour)
            (x, y), (d1, d2), angle = ellipse

            d_major = max(d1, d2)
            d_minor = min(d1, d2)
            
            aspect_ratio = d_minor / d_major
            
            # If difference is > 5% (adjust 0.05 as needed)
            if (1.0 - aspect_ratio) > 0.2:

                pixel_diameter = d_major 
            else:
                pixel_diameter = (d1 + d2) / 2

        else:
            _, _, w, h = cv2.boundingRect(ball_contour)
            pixel_diameter = (w + h) / 2    
        
        print(diameter)

        real_depth_ellipse = (diameter * focal_length_in_px) / pixel_diameter


        pixel_diameter = 2 * np.sqrt(pixel_count / np.pi)
        real_depth_circle = (diameter * focal_length_in_px) / pixel_diameter


        return (real_depth_ellipse, real_depth_circle)

In [8]:
def get_spatial_features(ball_contour, image_width, image_height):
    c_x, c_y = image_width / 2, image_height / 2
    
    M = cv2.moments(ball_contour)
    if M["m00"] != 0:
        u = int(M["m10"] / M["m00"])
        v = int(M["m01"] / M["m00"])
    else:
        x, y, w, h = cv2.boundingRect(ball_contour)
        u, v = x + w/2, y + h/2


    raw_dist = np.sqrt((u - c_x)**2 + (v - c_y)**2)
    
    max_dist = np.sqrt(c_x**2 + c_y**2)
    normalized_dist = raw_dist / max_dist
    
    return normalized_dist

In [None]:
def calculate_depth_and_area(raw_img, diameter):
        img_rgb = cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB)

        mask = get_mask(img_rgb)
        depth = get_depth(img_rgb)
        obj_depth = depth[]
        q1 = 

        mask_uint8 = mask.astype(np.uint8) * 255

        kernel = np.ones((7, 7), np.uint8) 
        eroded_mask = cv2.erode(mask_uint8, kernel, iterations=1)

        safe_ball_pixels = depth[eroded_mask > 0]

        final_depth_val = np.median(safe_ball_pixels)
        print(f"Final Depth Value: {final_depth_val}")

        contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        ball_contour = max(contours, key=cv2.contourArea)

        number_of_pixel = np.sum(mask)

        depth_from_pixel = real_depth(diameter, ball_contour, number_of_pixel, 3721.2)

        distance_from_center = get_spatial_features(ball_contour, raw_img.shape[0], raw_img.shape[1])

        return {"relative_depth":final_depth_val, "num_of_pixel":number_of_pixel, 
                "real_depth":depth_from_pixel, "distance_from_center":distance_from_center
                }


In [10]:
def extract_image():
        if os.path.isdir(args.img_path):
            filenames = glob.glob(os.path.join(args.img_path, '*'))
        else:
            filenames = [args.img_path]

        os.makedirs(args.outdir, exist_ok=True)

        img_dict = {}
        

        for i, filename in enumerate(filenames):
            img = cv2.imread(filename)
            if img is None:
                continue
            key = os.path.splitext(os.path.basename(filename))[0]

            diameter = {'1':0.04,'2':0.038, '3':0.025, '4':0.04}

            img_dict[key] = calculate_depth_and_area(img, diameter[key[-1]])
        
        return img_dict

In [11]:
img_dict = extract_image()

Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1690, y=1921
Final Depth Value: 0.2746066153049469
0.04
Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1806, y=2256
Final Depth Value: 0.49699637293815613
0.025
Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1160, y=2021
Final Depth Value: 0.46296119689941406
0.04
Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1590, y=2086
Final Depth Value: 0.7681509256362915
0.04
Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1365, y=1961
Final Depth Value: 0.6183984279632568
0.04
Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1520, y=1931
Final Depth Value: 0.7500551342964172
0.04
Click on the center of the ball. Press any key to confirm selection.
Pixel Coordinates: x=1500, y=1516
Final Depth Value:

In [12]:
img_dict

{'10cm4': {'relative_depth': np.float32(0.27460662),
  'num_of_pixel': np.int64(2007390),
  'real_depth': (0.09318923762168982, np.float64(0.09310479884209347)),
  'distance_from_center': np.float64(0.33448532291521144)},
 '15cm3': {'relative_depth': np.float32(0.49699637),
  'num_of_pixel': np.int64(377487),
  'real_depth': (0.1342960699899243, np.float64(0.13418901397797386)),
  'distance_from_center': np.float64(0.26079453801287)},
 '20cm4': {'relative_depth': np.float32(0.4629612),
  'num_of_pixel': np.int64(438059),
  'real_depth': (0.1992467016222336, np.float64(0.19930658167482737)),
  'distance_from_center': np.float64(0.46092687401212135)},
 '25cm1': {'relative_depth': np.float32(0.7681509),
  'num_of_pixel': np.int64(289379),
  'real_depth': (0.24457634385826935, np.float64(0.24521918152032646)),
  'distance_from_center': np.float64(0.2719592662665323)},
 '30cm1': {'relative_depth': np.float32(0.6183984),
  'num_of_pixel': np.int64(186188),
  'real_depth': (0.3050620837408946

In [25]:
new_dict = {}

for key, info in img_dict.items():
    cm_match = re.search(r'(\d+)cm', key)
    
    if cm_match:
        real_depth = float(info['real_depth'][1]) 
        
        relative_depth = float(info['relative_depth'])
        distance_from_center = float(info['distance_from_center'])
        num_of_px = float(info['num_of_pixel'])
        
        new_dict[key] = {
            'real_depth': real_depth,
            'relative_depth': relative_depth,
            'distance_from_center': distance_from_center,
            'num_of_px': num_of_px,
        }

In [33]:
real_depth = []
relative_depth = []
distance_from_center = []
num_of_px = []
for key in new_dict:
    real_depth.append(new_dict[key]["real_depth"])
    relative_depth.append(new_dict[key]["relative_depth"])
    distance_from_center.append(new_dict[key]["distance_from_center"])
    num_of_px.append(new_dict[key]["num_of_px"])

real_depth = np.array(real_depth)
relative_depth = np.array(relative_depth)
distance_from_center = np.array(distance_from_center)
num_of_px = np.array(num_of_px)

In [38]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib


# Ensure they are all arrays
X = np.column_stack((
    np.array(relative_depth), 
    np.array(distance_from_center), 
    np.array(np.log10(num_of_px))
))
y = real_depth

pipeline = Pipeline([
    ('scaler', StandardScaler()),           
    ('poly', PolynomialFeatures(degree=2)), 
    ('regressor', LinearRegression())       
])

pipeline.fit(X, y)
print(f"Final Scaled R^2: {pipeline.score(X, y):.4f}")

joblib.dump(pipeline, './regression/size_estimation_pipeline.pkl')


Final Scaled R^2: 0.9147


['./regression/size_estimation_pipeline.pkl']

In [27]:
import pandas as pd

# 1. Convert the nested dictionary to a DataFrame
# 'orient=index' makes the main keys ('10cm4') the rows
df = pd.DataFrame.from_dict(img_dict, orient='index')

# 2. (Optional) Move the ID from the index to a proper column
df.index.name = 'img_id'
df.reset_index(inplace=True)

# 3. Save to CSV
df.to_csv('depth_data.csv', index=False)