In [1]:
import torch
import torchvision
from PIL import Image
from torchvision import transforms as T
import numpy as np
import cv2

In [2]:
# Build torchvision's SSD300 detector (VGG16 backbone) from the named
# pre-trained weight set, then put it in evaluation mode for inference.
weights_name = 'SSD300_VGG16_Weights.COCO_V1'
model = torchvision.models.detection.ssd300_vgg16(weights=weights_name)
# Last expression of the cell: eval() returns the module, so its repr is shown.
model.eval()

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=

In [3]:
# Count the model's direct child modules.
# NOTE: `children()` yields only the immediate submodules of `model`, not every
# nested layer, so this is a top-level count rather than a per-layer count.
layer_count = len(list(model.children()))
print(f'Total number of layers in the SSD300 VGG16 model: {layer_count}')


Total number of layers in the SSD300 VGG16 model: 4


In [11]:
# Load the test image with PIL.
# `convert('RGB')` forces a 3-channel image: a grayscale, palette, RGBA or CMYK
# file would otherwise produce a tensor whose channel count does not match the
# 3 input channels of the backbone's first convolution.
ig = Image.open(r'kb.jpg').convert('RGB')

# Define the transformation (PIL -> Tensor) and apply it to the image.
transform = T.ToTensor()
img = transform(ig)

In [12]:
# Run the detector on a one-image batch. Gradient tracking is switched off
# because this is inference only.
batch = [img]
with torch.no_grad():
    pred = model(batch)

In [13]:
# Pull the boxes, scores and labels out of the result for the first (and only)
# image in the batch.
detections = pred[0]
bboxes = detections['boxes']
scores = detections['scores']
labels = detections['labels']

# Class names (replace this with your actual list of COCO class names).
# The drawing code looks a name up with `label - 1`, so position 0 corresponds
# to label 1, position 1 to label 2, and so on; the order must not change.
coco_names = [
    # labels 1-10
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",
    # labels 11-20
    "fire hydrant", "street sign", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep",
    # labels 21-30
    "cow", "elephant", "bear", "zebra", "giraffe",
    "hat", "backpack", "umbrella", "shoe", "eye glasses",
    # labels 31-40
    "handbag", "tie", "suitcase", "frisbee", "skis",
    "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    # labels 41-50
    "skateboard", "surfboard", "tennis racket", "bottle", "plate",
    "wine glass", "cup", "fork", "knife", "spoon",
    # labels 51-60
    "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    # labels 61-70
    "cake", "chair", "couch", "potted plant", "bed",
    "mirror", "dining table", "window", "desk", "toilet",
    # labels 71-80
    "door", "tv", "laptop", "mouse", "remote",
    "keyboard", "cell phone", "microwave", "oven", "toaster",
    # labels 81-91
    "sink", "refrigerator", "blender", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
    "hair brush",
]

In [14]:
# Number of detections whose score clears the threshold (you can adjust this).
# The drawing cell uses the first `num` detections, which presumes the model
# returns them ordered by descending score -- verify if the model is swapped.
num = int((scores > 0.10).sum())

# Read the same file with OpenCV (BGR channel order) as the canvas to draw on.
# IMREAD_IGNORE_ORIENTATION stops OpenCV from rotating the pixels according to
# the EXIF orientation tag. PIL's Image.open does not apply that tag, so without
# the flag the boxes predicted on the PIL pixels could land on a rotated canvas.
igg = cv2.imread(r'kb.jpg', cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
if igg is None:
    # cv2.imread signals a missing or undecodable file by returning None.
    raise IOError("Cannot read image 'kb.jpg'")

In [15]:
# Draw bounding boxes and labels on the image
font = cv2.FONT_HERSHEY_SIMPLEX
font_size = 0.5
line_width = 1
green = (0, 255, 0)  # BGR

# Convert the kept detections once, outside the loop, into plain Python ints
# (box coordinates are truncated to whole pixels, as before).
kept_boxes = bboxes[:num].numpy().astype('int').tolist()
kept_labels = labels[:num].numpy().tolist()

for (x1, y1, x2, y2), label_id in zip(kept_boxes, kept_labels):
    igg = cv2.rectangle(igg, (x1, y1), (x2, y2), green, line_width)

    # Labels are 1-based while the name list is 0-based.
    class_name = coco_names[label_id - 1]

    # Put the label just above the box. For boxes touching the top edge the
    # baseline is clamped so the text stays inside the image instead of being
    # drawn off-canvas.
    (_, text_h), baseline = cv2.getTextSize(class_name, font, font_size, line_width)
    text_y = max(y1 - 10, text_h + baseline)
    igg = cv2.putText(igg, class_name, (x1, text_y), font, font_size, green, line_width, cv2.LINE_AA)


In [16]:
# Show the annotated image in a native OpenCV window. waitKey(0) blocks the
# cell until a key is pressed; afterwards every OpenCV window is closed.
window_name = 'Image'
cv2.imshow(window_name, igg)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [17]:
# Run the detector on every frame of a video file and show the annotated
# frames in an OpenCV window until the video ends or 'q' is pressed.

# Open video file
cap = cv2.VideoCapture('2099536-sd_960_540_30fps.mp4')
if not cap.isOpened():
    raise IOError('Cannot open video')

font_scale = 3
font = cv2.FONT_HERSHEY_PLAIN
text_thickness = 3

# Define transformation for input image
transform = T.ToTensor()

# try/finally guarantees the capture is released and the window is closed even
# if a frame raises or the kernel is interrupted mid-video.
try:
    while True:
        ret, frame = cap.read()

        if not ret:
            # End of stream (or a read failure): stop processing.
            break

        # Convert frame (BGR) to an RGB PIL image, then to a tensor.
        # Frame-specific names are used so this loop does not overwrite the
        # still-image variables (`ig`, `img`, `bboxes`, ...) of earlier cells.
        frame_rgb = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frame_tensor = transform(frame_rgb)

        # Make predictions
        with torch.no_grad():
            prediction = model([frame_tensor])

        # Extract bounding boxes, scores, and labels
        frame_boxes = prediction[0]['boxes']
        frame_scores = prediction[0]['scores']
        frame_labels = prediction[0]['labels']

        # Threshold for detection score (you can adjust this).
        # The first `num_kept` detections are drawn, which presumes the model
        # returns them ordered by descending score -- verify if the model changes.
        num_kept = int((frame_scores > 0.55).sum())

        # Convert once per frame to plain Python ints (coordinates truncated).
        kept_boxes = frame_boxes[:num_kept].numpy().astype('int').tolist()
        kept_labels = frame_labels[:num_kept].numpy().tolist()

        # Draw bounding boxes and labels on the frame
        for (x1, y1, x2, y2), label_id in zip(kept_boxes, kept_labels):
            # Labels are 1-based while the name list is 0-based.
            label = coco_names[label_id - 1]

            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)

            # Clamp the text baseline so labels of boxes touching the top edge
            # stay inside the frame instead of being drawn off-canvas.
            (_, text_h), baseline = cv2.getTextSize(label, font, font_scale, text_thickness)
            text_y = max(y1 - 10, text_h + baseline)
            cv2.putText(frame, label, (x1, text_y), font, fontScale=font_scale, color=(0, 255, 0), thickness=text_thickness)

        # Display the frame with detection
        cv2.imshow('Object Detection', frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(2) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()