# Object Detection Model Training: Key Parameters and Configurations  : SSD_Lite_320(MobileNET_V1 / MobileNET_V2 Backbone )

## 1. Training Hyperparameters  

| **Parameter**       | **Description**                        | **Typical Values**         |
|---------------------|------------------------------------|---------------------------|
| **Batch Size**      | Number of samples per batch        | 8, 16, 32                 |
| **Learning Rate**   | Initial learning rate             | 0.001, 0.0001, 0.005      |
| **Optimizer**       | Optimization algorithm            | SGD, Adam, AdamW, RMSprop |
| **Momentum**        | Momentum for SGD optimizer        | 0.9, 0.95                 |
| **Weight Decay**    | L2 regularization term            | 0.0005, 0.001             |
| **Learning Rate Scheduler** | Schedule for adjusting learning rate | CosineAnnealingLR, ReduceLROnPlateau, MultiStepLR |
| **Epochs**         | Number of training iterations     | 50, 100, 200              |

## 2. Loss Function Parameters  

The model employs a **multi-task loss function** consisting of classification and localization losses. Key parameter to tune:

- **`alpha`**: Weighting factor between classification and localization losses (default: `1.0`).


In [None]:
import torch
import torchvision
from torchvision.io.image import decode_image
from torchvision.models.detection import (
    ssdlite320_mobilenet_v3_large,
    SSDLite320_MobileNet_V3_Large_Weights,
)
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image

In [None]:
model = ssdlite320_mobilenet_v3_large(
    weights=ssdlite320_mobilenet_v3_large, progress=True, num_classes=91
)
model.eval()
x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
predictions = model(x)
print(predictions)



[{'boxes': tensor([[  6.0037,   3.6826, 315.0123, 316.9638],
        [  5.1216,   3.1682, 315.6669, 317.4081],
        [211.2740, 165.9436, 218.6624, 180.2623],
        ...,
        [108.9881,  29.2311, 125.4374,  44.8632],
        [ 50.6379,  71.6034,  61.0590,  81.9207],
        [  6.3974,   0.0000, 314.6940, 320.0000]], grad_fn=<StackBackward0>), 'scores': tensor([0.0640, 0.0457, 0.0395, 0.0381, 0.0372, 0.0370, 0.0364, 0.0361, 0.0359,
        0.0358, 0.0358, 0.0356, 0.0356, 0.0355, 0.0353, 0.0351, 0.0351, 0.0351,
        0.0349, 0.0349, 0.0348, 0.0347, 0.0347, 0.0346, 0.0344, 0.0344, 0.0343,
        0.0343, 0.0341, 0.0341, 0.0338, 0.0336, 0.0336, 0.0335, 0.0335, 0.0335,
        0.0334, 0.0334, 0.0333, 0.0333, 0.0332, 0.0332, 0.0331, 0.0331, 0.0331,
        0.0330, 0.0329, 0.0328, 0.0328, 0.0328, 0.0327, 0.0327, 0.0326, 0.0326,
        0.0325, 0.0324, 0.0322, 0.0322, 0.0321, 0.0321, 0.0320, 0.0320, 0.0320,
        0.0320, 0.0318, 0.0317, 0.0317, 0.0317, 0.0317, 0.0317, 0.0317, 0.0317

In [None]:
weights_for_ssdlite_320 = SSDLite320_MobileNet_V3_Large_Weights
auto_transforms_ssdlite_3320 = weights_for_ssdlite_320.transforms
auto_transforms_ssdlite_3320

<property at 0x1ca5c036d90>

In [None]:
from torchinfo import summary

summary(model, input_size=(1, 3, 320, 320))

Layer (type:depth-idx)                                       Output Shape              Param #
SSD                                                          [300, 4]                  --
├─GeneralizedRCNNTransform: 1-1                              [1, 3, 320, 320]          --
├─SSDLiteFeatureExtractorMobileNet: 1-2                      [1, 128, 1, 1]            --
│    └─Sequential: 2-1                                       --                        --
│    │    └─Sequential: 3-1                                  [1, 672, 20, 20]          869,096
│    │    └─Sequential: 3-2                                  [1, 480, 10, 10]          751,416
│    └─ModuleList: 2-2                                       --                        --
│    │    └─Sequential: 3-3                                  [1, 512, 5, 5]            258,304
│    │    └─Sequential: 3-4                                  [1, 256, 3, 3]            100,480
│    │    └─Sequential: 3-5                                  [1, 256, 2, 2]

In [None]:
summary(
    model=model,
    input_size=(32, 3, 224, 224),  # make sure this is "input_size", not "input_shape"
    # col_names=["input_size"], # uncomment for smaller output
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                                Input Shape          Output Shape         Param #              Trainable
SSD (SSD)                                                              [32, 3, 224, 224]    [300, 4]             --                   True
├─GeneralizedRCNNTransform (transform)                                 [32, 3, 224, 224]    [32, 3, 320, 320]    --                   --
├─SSDLiteFeatureExtractorMobileNet (backbone)                          [32, 3, 320, 320]    [32, 128, 1, 1]      --                   True
│    └─Sequential (features)                                           --                   --                   --                   True
│    │    └─Sequential (0)                                             [32, 3, 320, 320]    [32, 672, 20, 20]    869,096              True
│    │    └─Sequential (1)                                             [32, 672, 20, 20]    [32, 480, 10, 10]    751,416              True
│    └─ModuleList (extra

Real_time inference on Laptop webcam using SSD_lite model on CUDA.

In [29]:
import cv2
import torch
import torchvision
from torchvision.transforms import functional as F
import time
from coco_classes import COCO_CLASSES

# Load the pretrained model with GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True).to(
    device
)
model.eval()


# Initialize webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise IOError("Cannot open webcam")

# Set desired frame size (model expects 320x320)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1080)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Warmup GPU
x = torch.randn(1, 3, 320, 320).to(device)
_ = model(x)

while True:
    # Read frame from webcam
    ret, frame = cap.read()
    if not ret:
        break

    # Preprocess frame
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = F.to_tensor(img).to(device)
    img = F.resize(img, (320, 320))  # Resize to model input size

    # Perform inference
    with torch.no_grad():
        start_time = time.time()
        predictions = model([img])[0]
        inference_time = time.time() - start_time

    # Filter predictions with confidence > 0.5
    mask = predictions["scores"] > 0.5
    boxes = predictions["boxes"][mask].cpu().numpy()
    labels = predictions["labels"][mask].cpu().numpy()
    scores = predictions["scores"][mask].cpu().numpy()

    # Draw predictions
    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = map(int, box)

        # Scale boxes back to original frame size
        scale_x = frame.shape[1] / 320
        scale_y = frame.shape[0] / 320
        x1 = int(x1 * scale_x)
        y1 = int(y1 * scale_y)
        x2 = int(x2 * scale_x)
        y2 = int(y2 * scale_y)

        # Draw rectangle and label
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label_text = f"{COCO_CLASSES[label]}: {score:.2f}"
        cv2.putText(
            frame,
            label_text,
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            2,
        )

    # Display FPS
    fps_text = f"FPS: {1 / inference_time:.2f}"
    cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    cv2.imshow("Real-Time Detection", frame)

    # Exit on 'q' press
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

# Cleanup
cap.release()
cv2.destroyAllWindows()

Achieving 20FPS with NVIDIA-CUDA GPU Support , very promising results.