In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/traffic-data/traffic_wala_dataset/data.yaml
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/12_mp4-10_jpg.rf.4bb699a2ec90e19cb4680ee239ae579c.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/6_mp4-31_jpg.rf.c786274ab129629ca618e454b4146f11.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/6_mp4-29_jpg.rf.e73ca25e92e590b325006f010ad4e319.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/4_mp4-8_jpg.rf.1d199d41f6c3738763c192883ad1a5b4.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/7_mp4-24_jpg.rf.ed2484fe45c32f27443cccd8bbf99361.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/2_mp4-1_jpg.rf.b52c12a365fbeb71e302f0505038959b.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/test2_mp4-21_jpg.rf.bbb3ba4e4b69e5a12822300829bf75f0.txt
/kaggle/input/traffic-data/traffic_wala_dataset/valid/labels/7_mp4-11_jpg.rf.02a513fa7473144494b7a53fc14b6560.txt
/kaggle/input/traffic-data/

In [5]:
import os
import torch
import torch.nn as nn
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torchvision.ops import box_iou, nms, sigmoid_focal_loss
from torchvision.utils import draw_bounding_boxes
from PIL import Image, ImageDraw, ImageFont
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torchvision
import math

# --- Paths 
# All splits live under a single dataset root; the per-split folders are derived from it.
DATASET_ROOT = "/kaggle/input/traffic-data/traffic_wala_dataset"
TRAIN_IMG_DIR = f"{DATASET_ROOT}/train/images"
TRAIN_LABEL_DIR = f"{DATASET_ROOT}/train/labels"
VAL_IMG_DIR = f"{DATASET_ROOT}/valid/images"
VAL_LABEL_DIR = f"{DATASET_ROOT}/valid/labels"
# Inference reads from the same folder as VAL_IMG_DIR.
TEST_IMG_DIR = f"{DATASET_ROOT}/valid/images"

# --- Utils 
def load_annotations(label_path, original_size):
    """Read a YOLO-style label file and return boxes as pixel-space corners.

    Every non-blank line is parsed as ``class cx cy w h``. The four box values
    are multiplied by the image width/height and converted from
    centre/size form to ``[x_min, y_min, x_max, y_max]``.

    Args:
        label_path: Path of the ``.txt`` label file. If the file does not
            exist, two empty lists are returned.
        original_size: ``(width, height)`` of the image the labels belong to.

    Returns:
        A ``(boxes, classes)`` tuple: ``boxes`` is a list of
        ``[x_min, y_min, x_max, y_max]`` float lists and ``classes`` is the
        list of integer class ids, in file order.

    Raises:
        ValueError: If a non-blank line does not hold exactly five numeric
            fields.
    """
    boxes = []
    classes = []
    if not os.path.exists(label_path):
        return boxes, classes

    orig_w, orig_h = original_size
    with open(label_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a stray trailing newline) carry no
                # annotation; skip them instead of failing the 5-way unpack.
                continue

            cls, cx, cy, w, h = map(float, fields)

            # Scale the normalised centre/size values to pixels.
            x_center = cx * orig_w
            y_center = cy * orig_h
            box_w = w * orig_w
            box_h = h * orig_h

            # Centre/size -> corner coordinates.
            x_min = x_center - box_w / 2
            y_min = y_center - box_h / 2
            x_max = x_center + box_w / 2
            y_max = y_center + box_h / 2

            boxes.append([x_min, y_min, x_max, y_max])
            classes.append(int(cls))
    return boxes, classes

# --- Anchor Box Generation 
class AnchorGenerator(nn.Module):
    """Pre-computes a fixed grid of anchor boxes for a given input size.

    For every stride a ``ceil(H / stride) x ceil(W / stride)`` grid of cell
    centres is laid over the input. At each centre one anchor is emitted per
    ``(scale, aspect_ratio)`` pair, with width ``scale * sqrt(ratio)`` and
    height ``scale / sqrt(ratio)``.

    Anchors are ordered stride -> row (y) -> column (x) -> scale -> ratio.

    Args:
        input_size: ``(height, width)`` of the network input in pixels.
        strides: Iterable of feature-map strides.
        scales: Iterable of base anchor sizes in pixels.
        aspect_ratios: Iterable of width/height ratios.
    """

    def __init__(self, input_size, strides, scales, aspect_ratios):
        super().__init__()
        self.input_size = input_size
        self.strides = strides
        self.scales = scales
        self.aspect_ratios = aspect_ratios

        # Registered as a non-persistent buffer so the anchors follow the
        # module through .to()/.cuda() while staying out of state_dict().
        self.register_buffer("_all_anchors", self._generate_anchors(), persistent=False)

    def _generate_anchors(self):
        """Build the ``(num_anchors, 4)`` float32 tensor of ``[xmin, ymin, xmax, ymax]`` anchors."""
        input_h, input_w = self.input_size

        # The half extents depend only on (scale, ratio), so compute them once
        # instead of once per grid cell.
        half_sizes = [
            (base_scale * math.sqrt(ratio) / 2, base_scale / math.sqrt(ratio) / 2)
            for base_scale in self.scales
            for ratio in self.aspect_ratios
        ]

        all_anchors = []
        for stride in self.strides:
            feature_h = math.ceil(input_h / stride)
            feature_w = math.ceil(input_w / stride)

            for y in range(feature_h):
                center_y = (y + 0.5) * stride
                for x in range(feature_w):
                    center_x = (x + 0.5) * stride
                    for half_w, half_h in half_sizes:
                        all_anchors.append([
                            center_x - half_w,
                            center_y - half_h,
                            center_x + half_w,
                            center_y + half_h,
                        ])

        # reshape keeps the (N, 4) layout even when no anchors were produced.
        return torch.tensor(all_anchors, dtype=torch.float32).reshape(-1, 4)

    def forward(self):
        """Return the pre-computed anchor tensor."""
        return self._all_anchors

# --- Target Assignment 
def assign_targets_to_anchors(anchors, gt_boxes, gt_labels, iou_thresholds=(0.5, 0.4)):
    """Label every anchor as positive, background or ignored for one image.

    Assignment rules, applied in this order:

    1. For each ground-truth box that overlaps at least one anchor, its
       highest-IoU anchor receives that box's label.
    2. Anchors whose best IoU is ``>= iou_thresholds[0]`` receive the label of
       their best-matching ground-truth box.
    3. Anchors whose best IoU is ``< iou_thresholds[1]`` and that are not
       already positive become background (``0``).
    4. Everything else keeps the ignore value ``-1``.

    Args:
        anchors: ``(A, 4)`` tensor of ``[xmin, ymin, xmax, ymax]`` anchors.
        gt_boxes: Tensor of ground-truth boxes in the same format; only its
            first dimension is inspected when it is empty.
        gt_labels: Long tensor of class ids, one per ground-truth box. Ids
            must be ``>= 1`` because ``0`` is reserved for background.
        iou_thresholds: ``(positive_threshold, negative_threshold)``.

    Returns:
        ``(assigned_labels, assigned_loc_targets, assigned_gt_boxes_for_pos_anchors)``:
        the per-anchor labels ``(A,)``, the per-anchor ``(tx, ty, tw, th)``
        offsets ``(A, 4)`` (zero for non-positive anchors), and the matched
        ground-truth box of each positive anchor ``(P, 4)``, ordered by
        ascending anchor index.
    """
    num_anchors = anchors.shape[0]
    num_gt = gt_boxes.shape[0]

    assigned_labels = torch.full((num_anchors,), -1, dtype=torch.long, device=anchors.device)
    assigned_loc_targets = torch.zeros((num_anchors, 4), dtype=torch.float32, device=anchors.device)
    assigned_gt_boxes_for_pos_anchors = torch.empty((0, 4), dtype=torch.float32, device=anchors.device)

    if num_gt == 0:
        # No objects in the image: every anchor is background.
        assigned_labels[:] = 0
        return assigned_labels, assigned_loc_targets, assigned_gt_boxes_for_pos_anchors

    iou_matrix = box_iou(anchors, gt_boxes)

    max_iou_per_anchor, best_gt_idx_per_anchor = iou_matrix.max(dim=1)
    max_iou_per_gt, best_anchor_idx_per_gt = iou_matrix.max(dim=0)

    # Force-match only boxes that actually overlap an anchor. For an all-zero
    # IoU column argmax returns index 0, which would otherwise turn anchor 0
    # into a positive for a box it does not touch.
    gt_has_overlap = max_iou_per_gt > 0
    assigned_labels[best_anchor_idx_per_gt[gt_has_overlap]] = gt_labels[gt_has_overlap]

    positive_mask_iou = max_iou_per_anchor >= iou_thresholds[0]
    assigned_labels[positive_mask_iou] = gt_labels[best_gt_idx_per_anchor[positive_mask_iou]]

    # Any label > 0 is a foreground class, so force-matched anchors survive
    # the background pass regardless of their class id.
    negative_mask_iou = max_iou_per_anchor < iou_thresholds[1]
    already_positive_mask = assigned_labels > 0
    assigned_labels[negative_mask_iou & ~already_positive_mask] = 0

    positive_anchor_indices = torch.where(assigned_labels > 0)[0]
    if positive_anchor_indices.numel() > 0:
        pos_anchors = anchors[positive_anchor_indices]

        # Regression targets use the highest-IoU ground-truth box of each anchor.
        assigned_gt_boxes_for_pos_anchors = gt_boxes[best_gt_idx_per_anchor[positive_anchor_indices]]

        pos_anchor_widths = pos_anchors[:, 2] - pos_anchors[:, 0]
        pos_anchor_heights = pos_anchors[:, 3] - pos_anchors[:, 1]
        pos_anchor_center_x = (pos_anchors[:, 0] + pos_anchors[:, 2]) / 2
        pos_anchor_center_y = (pos_anchors[:, 1] + pos_anchors[:, 3]) / 2

        pos_gt_widths = assigned_gt_boxes_for_pos_anchors[:, 2] - assigned_gt_boxes_for_pos_anchors[:, 0]
        pos_gt_heights = assigned_gt_boxes_for_pos_anchors[:, 3] - assigned_gt_boxes_for_pos_anchors[:, 1]
        pos_gt_center_x = (assigned_gt_boxes_for_pos_anchors[:, 0] + assigned_gt_boxes_for_pos_anchors[:, 2]) / 2
        pos_gt_center_y = (assigned_gt_boxes_for_pos_anchors[:, 1] + assigned_gt_boxes_for_pos_anchors[:, 3]) / 2

        # Centre offsets are expressed in anchor sizes, size offsets in log space.
        tx = (pos_gt_center_x - pos_anchor_center_x) / pos_anchor_widths
        ty = (pos_gt_center_y - pos_anchor_center_y) / pos_anchor_heights
        tw = torch.log(pos_gt_widths / pos_anchor_widths)
        th = torch.log(pos_gt_heights / pos_anchor_heights)

        assigned_loc_targets[positive_anchor_indices] = torch.stack([tx, ty, tw, th], dim=1)

    return assigned_labels, assigned_loc_targets, assigned_gt_boxes_for_pos_anchors

# --- Bounding Box Decoding 
def decode_boxes(anchors, reg_offsets):
    """Turn per-anchor regression offsets into corner-format boxes.

    Args:
        anchors: ``(N, 4)`` tensor of ``[xmin, ymin, xmax, ymax]`` anchors.
        reg_offsets: ``(N, 4)`` tensor of ``(tx, ty, tw, th)`` offsets; the
            centre offsets are in units of the anchor size and the size
            offsets are log-scale factors, clamped to ``[-7, 7]`` before
            exponentiation.

    Returns:
        ``(N, 4)`` tensor of decoded ``[xmin, ymin, xmax, ymax]`` boxes.
    """
    x0, y0, x1, y1 = anchors[:, 0], anchors[:, 1], anchors[:, 2], anchors[:, 3]
    w_anchor = x1 - x0
    h_anchor = y1 - y0

    # Shift the anchor centre by the predicted fraction of the anchor size.
    cx = (x0 + x1) / 2 + reg_offsets[:, 0] * w_anchor
    cy = (y0 + y1) / 2 + reg_offsets[:, 1] * h_anchor

    # Clamp the log-scale factors so exp() cannot overflow.
    half_w = w_anchor * torch.exp(torch.clamp(reg_offsets[:, 2], -7.0, 7.0)) / 2
    half_h = h_anchor * torch.exp(torch.clamp(reg_offsets[:, 3], -7.0, 7.0)) / 2

    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return torch.stack(corners, dim=1)

# --- Loss Functions 
class DIoULoss(nn.Module):
    """Distance-IoU loss between row-aligned predicted and target boxes.

    For each pair the loss is ``1 - IoU + d^2 / c^2``, where ``d`` is the
    distance between the two box centres and ``c`` is the diagonal of the
    smallest box enclosing both. The result is averaged over all pairs.
    """

    def forward(self, preds_decoded_boxes, targets_gt_boxes):
        """Compute the mean DIoU loss.

        Args:
            preds_decoded_boxes: ``(N, 4)`` predicted ``[xmin, ymin, xmax, ymax]`` boxes.
            targets_gt_boxes: ``(N, 4)`` target boxes; row ``i`` is the target
                of prediction ``i``.

        Returns:
            A scalar tensor. ``0.0`` when either input is empty, and a constant
            ``1.0`` (after printing a warning) when the shapes differ; neither
            of those two constants is attached to the autograd graph.
        """
        if targets_gt_boxes.numel() == 0 or preds_decoded_boxes.numel() == 0:
            return torch.tensor(0.0, device=preds_decoded_boxes.device)

        if preds_decoded_boxes.shape != targets_gt_boxes.shape:
            print("Warning: Shape mismatch in DIoULoss. Ensure 1-to-1 box matching for regression.")
            return torch.tensor(1.0, device=preds_decoded_boxes.device)

        p_x1, p_y1 = preds_decoded_boxes[:, 0], preds_decoded_boxes[:, 1]
        p_x2, p_y2 = preds_decoded_boxes[:, 2], preds_decoded_boxes[:, 3]
        t_x1, t_y1 = targets_gt_boxes[:, 0], targets_gt_boxes[:, 1]
        t_x2, t_y2 = targets_gt_boxes[:, 2], targets_gt_boxes[:, 3]

        # Element-wise IoU of matching rows. This avoids materialising the
        # full N x N pairwise matrix when only its diagonal is needed.
        inter_w = (torch.min(p_x2, t_x2) - torch.max(p_x1, t_x1)).clamp(min=0)
        inter_h = (torch.min(p_y2, t_y2) - torch.max(p_y1, t_y1)).clamp(min=0)
        inter = inter_w * inter_h
        area_p = (p_x2 - p_x1) * (p_y2 - p_y1)
        area_t = (t_x2 - t_x1) * (t_y2 - t_y1)
        union = area_p + area_t - inter
        # Guard the division for the case where both boxes have zero area.
        iou_val = inter / union.clamp(min=1e-6)

        # Squared distance between box centres.
        center_dist_sq = ((p_x1 + p_x2) / 2 - (t_x1 + t_x2) / 2) ** 2 \
            + ((p_y1 + p_y2) / 2 - (t_y1 + t_y2) / 2) ** 2

        # Squared diagonal of the smallest box enclosing both.
        enclose_width = torch.max(p_x2, t_x2) - torch.min(p_x1, t_x1)
        enclose_height = torch.max(p_y2, t_y2) - torch.min(p_y1, t_y1)
        c_diag_sq = enclose_width ** 2 + enclose_height ** 2

        diou_term = center_dist_sq / (c_diag_sq + 1e-6)

        diou_loss = 1 - iou_val + diou_term
        return diou_loss.mean()

# --- Model 
class SimpleRetinaNet(nn.Module):
    """Single-scale, RetinaNet-style detector on a ResNet-18 feature extractor.

    The backbone is torchvision's ImageNet-pretrained ResNet-18 with its last
    two child modules removed. A 1x1 convolution reduces the 512 channels to
    256, and two small convolutional heads predict, for each of the
    ``len(scales) * len(aspect_ratios)`` anchors per location, ``num_classes``
    logits and four box offsets.

    Args:
        num_classes: Number of logits predicted per anchor.
        input_size: ``(height, width)`` passed to the anchor generator.
    """

    def __init__(self, num_classes=1, input_size=(640, 640)):
        super().__init__()
        # `pretrained=True` is deprecated in favour of the `weights=` API.
        # Use the new API when this torchvision provides it and fall back to
        # the old call otherwise. IMAGENET1K_V1 is what `pretrained=True` loads.
        if hasattr(torchvision.models, "ResNet18_Weights"):
            backbone = torchvision.models.resnet18(
                weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
            )
        else:
            backbone = torchvision.models.resnet18(pretrained=True)
        # Drop the last two children and keep the convolutional feature extractor.
        self.backbone = nn.Sequential(*list(backbone.children())[:-2])

        self.input_size = input_size

        self.neck = nn.Conv2d(512, 256, kernel_size=1)

        # Anchor configuration: one feature level with three scales and three ratios.
        self.strides = [32]
        self.scales = [32 * 2**0, 32 * 2**(1/3), 32 * 2**(2/3)]
        self.aspect_ratios = [0.5, 1.0, 2.0]

        self.num_anchors_per_location = len(self.scales) * len(self.aspect_ratios)
        self.num_classes = num_classes

        self.cls_head = nn.Sequential(
            nn.Conv2d(256, 256, 3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Conv2d(256, self.num_anchors_per_location * self.num_classes, 1)
        )
        self.reg_head = nn.Sequential(
            nn.Conv2d(256, 256, 3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Conv2d(256, self.num_anchors_per_location * 4, 1)
        )

        # Bias the classifier so the initial sigmoid output equals prior_prob.
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        torch.nn.init.constant_(self.cls_head[-1].bias, bias_value)
        torch.nn.init.normal_(self.cls_head[-1].weight, mean=0, std=0.01)

        torch.nn.init.normal_(self.reg_head[-1].weight, mean=0, std=0.01)
        torch.nn.init.constant_(self.reg_head[-1].bias, 0)

        self.anchor_generator = AnchorGenerator(
            input_size=self.input_size,
            strides=self.strides,
            scales=self.scales,
            aspect_ratios=self.aspect_ratios
        )

    def forward(self, x):
        """Run the detector on a batch of images.

        Args:
            x: Image batch fed to the backbone.

        Returns:
            ``(cls_logits, reg_preds)`` with shapes ``(B, N, num_classes)`` and
            ``(B, N, 4)``. ``N`` enumerates feature-map rows, then columns,
            then the anchors at each location.
        """
        neck_features = self.neck(self.backbone(x))
        batch_size = neck_features.shape[0]

        cls_logits = self.cls_head(neck_features)
        reg_preds = self.reg_head(neck_features)

        # Move channels last so that flattening walks (row, column, anchor).
        cls_logits = cls_logits.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, self.num_classes)
        reg_preds = reg_preds.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4)

        return cls_logits, reg_preds

# --- Dataset 
class TrafficDataset(Dataset):
    """Detection dataset that pairs ``.jpg`` images with YOLO-style label files.

    Args:
        img_dir: Directory holding the images.
        label_dir: Directory holding one ``<image stem>.txt`` label file per image.
        transforms: Callable invoked as ``transforms(image=..., bboxes=...)``
            that returns a mapping with ``"image"`` and ``"bboxes"`` entries.
    """

    def __init__(self, img_dir, label_dir, transforms):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transforms = transforms
        # Sorted so that indices are stable between runs.
        self.image_files = sorted([f for f in os.listdir(img_dir) if f.endswith('.jpg')])

    def __len__(self):
        """Return the number of ``.jpg`` images found in ``img_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load, transform and return one sample.

        Returns:
            ``(image_tensor, (boxes_tensor, gt_class_labels))`` where
            ``boxes_tensor`` is float32 with shape ``(N, 4)`` (``(0, 4)`` when
            the image has no boxes) and ``gt_class_labels`` is a long tensor of
            ``N`` ones.
        """
        img_path = os.path.join(self.img_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, os.path.splitext(self.image_files[idx])[0] + '.txt')

        img = Image.open(img_path).convert("RGB")
        original_w, original_h = img.size

        image_np = np.array(img)

        # Scale 8-bit pixel values to [0, 1]; other dtypes are only cast.
        if image_np.dtype == np.uint8:
            image_np = image_np.astype(np.float32) / 255.0
        else:
            image_np = image_np.astype(np.float32)

        # The class ids from the file are not used: every box is labelled 1 below.
        boxes_original_coords, _class_ids = load_annotations(label_path, original_size=(original_w, original_h))

        transformed = self.transforms(image=image_np, bboxes=boxes_original_coords)
        image_tensor = transformed["image"]
        # reshape keeps the (N, 4) layout even when there are no boxes;
        # torch.tensor([]) alone would give shape (0,).
        boxes_tensor = torch.tensor(transformed["bboxes"], dtype=torch.float32).reshape(-1, 4)

        gt_class_labels = torch.ones(boxes_tensor.shape[0], dtype=torch.long)

        return image_tensor, (boxes_tensor, gt_class_labels)

# --- Transforms 
def _resize_and_tensorize():
    """Return the final two steps shared by every pipeline: resize to 640x640, then to tensor."""
    return [A.Resize(640, 640), ToTensorV2()]


def _pascal_voc_boxes():
    """Return bbox settings for corner-format boxes without label fields."""
    return A.BboxParams(format='pascal_voc', label_fields=[])


# Training: random flip and colour jitter ahead of the shared resize/tensor steps.
train_tfms = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
        *_resize_and_tensorize(),
    ],
    bbox_params=_pascal_voc_boxes(),
)

# Validation: the same resize/tensor steps, still box-aware but with no augmentation.
val_tfms = A.Compose(_resize_and_tensorize(), bbox_params=_pascal_voc_boxes())

# Inference: image-only pipeline, so no bbox parameters are given.
test_tfms = A.Compose(_resize_and_tensorize())

# --- Loaders 
def collate_fn_detection(batch):
    """Collate detection samples whose targets differ in length.

    Args:
        batch: Sequence of samples in which element ``0`` is the image tensor
            and element ``1`` is that image's target information.

    Returns:
        ``(images, gt_info)``: the images stacked along a new leading
        dimension, and the per-sample targets as a plain list in batch order.
    """
    gt_info = [sample[1] for sample in batch]
    stacked_images = torch.stack([sample[0] for sample in batch], 0)
    return stacked_images, gt_info

# Datasets and loaders: only the training loader shuffles.
train_ds = TrafficDataset(TRAIN_IMG_DIR, TRAIN_LABEL_DIR, train_tfms)
val_ds = TrafficDataset(VAL_IMG_DIR, VAL_LABEL_DIR, val_tfms)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn_detection,
)
val_loader = DataLoader(
    dataset=val_ds,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn_detection,
)

# --- Model Setup
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = SimpleRetinaNet(num_classes=1, input_size=(640, 640))
model = model.to(device)

# Keep a copy of the anchors on the compute device for training and inference.
model.anchors = model.anchor_generator().to(device)

# Start with the backbone frozen; the optimizer below still receives its parameters.
model.backbone.requires_grad_(False)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

plateau_options = dict(mode='min', factor=0.5, patience=3, verbose=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **plateau_options)

reg_criterion = DIoULoss()

# --- Training Loop
# Each batch: run the model once, then build the loss image by image.
#   * classification: focal loss (summed over anchors) on every anchor labelled
#     positive (1) or background (0); ignored anchors (-1) are left out.
#   * regression: DIoU loss between the decoded positive-anchor predictions and
#     their matched ground-truth boxes.
# Both terms are averaged over the images that contributed to them.
print("Starting Training...")
for epoch in range(20):
    model.train()
    # NOTE(review): train mode is applied to the whole model, including the
    # backbone while its parameters are frozen. Confirm that is intended.

    # `epoch` is zero-based, so the backbone is unfrozen from the 8th epoch on.
    if epoch == 7:
        print("Unfreezing backbone parameters.")
        for param in model.backbone.parameters():
            param.requires_grad = True

    total_cls_loss, total_reg_loss = 0.0, 0.0
    num_batches = 0

    for images, targets_batch_info in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        images = images.to(device)

        cls_preds_batch, reg_preds_batch = model(images)

        batch_cls_loss_tensor = torch.tensor(0.0, device=device)
        batch_reg_loss_tensor = torch.tensor(0.0, device=device)

        num_valid_samples_for_cls_loss_in_batch = 0
        num_valid_samples_for_reg_loss_in_batch = 0

        for i in range(images.shape[0]):
            gt_boxes_i, gt_labels_i = targets_batch_info[i]
            gt_boxes_i = gt_boxes_i.to(device)
            gt_labels_i = gt_labels_i.to(device)

            assigned_labels_i, _, assigned_gt_boxes_for_pos_anchors_i = assign_targets_to_anchors(
                model.anchors, gt_boxes_i, gt_labels_i
            )

            # Single-class setup: only logit channel 0 is trained.
            cls_logits_i = cls_preds_batch[i, :, 0]

            positive_mask_cls = (assigned_labels_i == 1)
            supervised_mask_cls = positive_mask_cls | (assigned_labels_i == 0)

            selected_cls_preds = cls_logits_i[supervised_mask_cls]
            selected_gt_labels = assigned_labels_i[supervised_mask_cls].float()

            if selected_cls_preds.numel() > 0:
                current_cls_loss_img = sigmoid_focal_loss(selected_cls_preds, selected_gt_labels,
                                                          reduction='sum', alpha=0.25, gamma=2.0)
                batch_cls_loss_tensor = batch_cls_loss_tensor + current_cls_loss_img
                num_valid_samples_for_cls_loss_in_batch += 1

            positive_anchor_indices = torch.where(positive_mask_cls)[0]

            if positive_anchor_indices.numel() > 0:
                reg_preds_pos = reg_preds_batch[i, positive_anchor_indices]

                decoded_pred_boxes = decode_boxes(model.anchors[positive_anchor_indices], reg_preds_pos)

                # Row-aligned with positive_anchor_indices by assign_targets_to_anchors.
                gt_boxes_pos = assigned_gt_boxes_for_pos_anchors_i

                current_reg_loss_img = reg_criterion(decoded_pred_boxes, gt_boxes_pos)
                batch_reg_loss_tensor = batch_reg_loss_tensor + current_reg_loss_img
                num_valid_samples_for_reg_loss_in_batch += 1

        # Average each term over the images that contributed to it.
        if num_valid_samples_for_cls_loss_in_batch > 0:
            batch_cls_loss_tensor = batch_cls_loss_tensor / num_valid_samples_for_cls_loss_in_batch

        if num_valid_samples_for_reg_loss_in_batch > 0:
            batch_reg_loss_tensor = batch_reg_loss_tensor / num_valid_samples_for_reg_loss_in_batch

        loss = batch_cls_loss_tensor + batch_reg_loss_tensor

        # If no image contributed, the loss is a constant with no graph and
        # backward() would raise, so the update is skipped.
        if loss.requires_grad:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_cls_loss += batch_cls_loss_tensor.item()
        total_reg_loss += batch_reg_loss_tensor.item()
        num_batches += 1

    if num_batches == 0:
        # Empty loader: there is nothing to average or to step the scheduler on.
        print(f"[Epoch {epoch+1}] No training batches were produced.")
        continue

    avg_cls_loss = total_cls_loss / num_batches
    avg_reg_loss = total_reg_loss / num_batches
    print(f"[Epoch {epoch+1}] Avg cls_loss: {avg_cls_loss:.4f}, Avg reg_loss: {avg_reg_loss:.4f}")

    # ReduceLROnPlateau only acts when it is stepped with the monitored value.
    # It is fed the epoch's mean training loss because no validation loss is
    # computed in this loop.
    scheduler.step(avg_cls_loss + avg_reg_loss)

print("\nTraining Complete!")

# --- Inference on Test Images ---
print("\n--- Starting Inference on Test Images ---")

# Put the model in evaluation mode before running inference.
model.eval()

# --- Output locations: annotated images, per-image YOLO labels, one text summary ---
OUTPUT_BASE_DIR = "./inference_outputs"
OUTPUT_IMG_DIR = os.path.join(OUTPUT_BASE_DIR, "images_with_detections")
OUTPUT_LABEL_DIR = os.path.join(OUTPUT_BASE_DIR, "labels_yolo_format")
OUTPUT_TXT_FILE = os.path.join(OUTPUT_BASE_DIR, "output.txt")

# Creating the two sub-directories also creates OUTPUT_BASE_DIR.
for output_dir in (OUTPUT_IMG_DIR, OUTPUT_LABEL_DIR):
    os.makedirs(output_dir, exist_ok=True)

print(f"Detected images will be saved to: {OUTPUT_IMG_DIR}")
print(f"Detection labels (YOLO format) will be saved to: {OUTPUT_LABEL_DIR}")
print(f"Inference summary will be saved to: {OUTPUT_TXT_FILE}")

# Inference settings shared by every image.
SCORE_THRESHOLD = 0.5      # minimum sigmoid score for a detection to be kept
NMS_IOU_THRESHOLD = 0.4    # IoU threshold passed to nms()
INFERENCE_SIZE = 640       # test_tfms resizes every image to 640x640


def _measure_label(draw, text, font):
    """Return ``(width, height)`` in pixels needed to draw ``text`` at a given origin.

    Tries ``ImageDraw.textbbox`` first, then ``ImageDraw.textsize``, and
    finally falls back to a fixed 10 px per character by 20 px estimate.
    """
    try:
        _, _, right, bottom = draw.textbbox((0, 0), text, font=font)
        # right/bottom are measured from the origin, so they cover any glyph offset.
        return right, bottom
    except (AttributeError, ValueError):
        pass
    try:
        return draw.textsize(text, font=font)
    except AttributeError:
        return len(text) * 10, 20


# Open the output.txt file in write mode
with open(OUTPUT_TXT_FILE, 'w') as output_txt:
    test_image_files = sorted([f for f in os.listdir(TEST_IMG_DIR) if f.endswith('.jpg')])

    if not test_image_files:
        print(f"No JPG images found in {TEST_IMG_DIR}. Skipping inference.")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # The font does not depend on the image, so load it once.
        try:
            font = ImageFont.truetype("arial.ttf", 20)
        except IOError:
            font = ImageFont.load_default()

        for img_file in tqdm(test_image_files, desc="Running Inference"):
            img_path = os.path.join(TEST_IMG_DIR, img_file)

            # convert() returns a loaded copy, so the file handle can be closed right away.
            with Image.open(img_path) as raw_image:
                pil_image = raw_image.convert("RGB")
            original_width, original_height = pil_image.size

            # An "RGB" image is 8 bits per channel; scale it to [0, 1] as in training.
            image_np_for_inference = np.array(pil_image).astype(np.float32) / 255.0

            transformed = test_tfms(image=image_np_for_inference)
            image_tensor = transformed["image"].unsqueeze(0).to(device)

            with torch.no_grad():
                cls_logits, reg_preds = model(image_tensor)

                pred_scores = cls_logits.squeeze(0).sigmoid()[:, 0]
                decoded_boxes = decode_boxes(model.anchors, reg_preds.squeeze(0))

                keep = pred_scores >= SCORE_THRESHOLD
                final_boxes = decoded_boxes[keep]
                final_scores = pred_scores[keep]

                if final_boxes.numel() > 0:
                    keep_indices = nms(final_boxes, final_scores, NMS_IOU_THRESHOLD)
                    final_boxes = final_boxes[keep_indices]
                    final_scores = final_scores[keep_indices]

                    # Decoded boxes can extend past the frame. Clip them so the
                    # normalised YOLO values stay within [0, 1], then drop any
                    # box that lies entirely outside the image.
                    final_boxes = final_boxes.clamp(min=0.0, max=float(INFERENCE_SIZE))
                    inside = (final_boxes[:, 2] > final_boxes[:, 0]) & (final_boxes[:, 3] > final_boxes[:, 1])
                    final_boxes = final_boxes[inside]
                    final_scores = final_scores[inside]
                else:
                    final_boxes = torch.empty((0, 4), device=device)
                    final_scores = torch.empty(0, device=device)

            # --- Prepare for saving ---
            output_image_path = os.path.join(OUTPUT_IMG_DIR, img_file)
            output_label_path = os.path.join(OUTPUT_LABEL_DIR, os.path.splitext(img_file)[0] + '.txt')

            yolo_labels = []

            if final_boxes.numel() > 0:
                # Map boxes from the 640x640 network frame back to the original image.
                scale_x = original_width / float(INFERENCE_SIZE)
                scale_y = original_height / float(INFERENCE_SIZE)

                final_boxes_orig_coords = final_boxes.clone()
                final_boxes_orig_coords[:, 0] *= scale_x
                final_boxes_orig_coords[:, 1] *= scale_y
                final_boxes_orig_coords[:, 2] *= scale_x
                final_boxes_orig_coords[:, 3] *= scale_y
                boxes_np = final_boxes_orig_coords.cpu().numpy()
                scores = final_scores.tolist()

                draw = ImageDraw.Draw(pil_image)
                label_class_id = 0  # single-class model: every detection is written as class 0

                for box, score in zip(boxes_np, scores):
                    box_int = box.astype(int)
                    draw.rectangle([(box_int[0], box_int[1]), (box_int[2], box_int[3])], outline="red", width=2)

                    text = f"Object: {score:.2f}"
                    text_width, text_height = _measure_label(draw, text, font)

                    # Put the caption above the box, or just inside it when there is no room.
                    text_position = (box_int[0], box_int[1] - text_height - 2 if box_int[1] - text_height - 2 > 0 else box_int[1] + 2)
                    draw.rectangle([text_position, (text_position[0] + text_width, text_position[1] + text_height)], fill="red")
                    draw.text(text_position, text, fill="white", font=font)

                    x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]

                    # YOLO format: centre and size, normalised by the original image size.
                    x_center_norm = ((x_min + x_max) / 2) / original_width
                    y_center_norm = ((y_min + y_max) / 2) / original_height
                    width_norm = (x_max - x_min) / original_width
                    height_norm = (y_max - y_min) / original_height

                    yolo_labels.append(f"{label_class_id} {x_center_norm:.6f} {y_center_norm:.6f} {width_norm:.6f} {height_norm:.6f}")

            # --- Lane counting and vehicle density (in the resized 640x640 frame) ---
            # A box counts towards the left lane when its centre lies in the left half.
            num_detections = len(final_boxes)
            box_centers_x = (final_boxes[:, 0] + final_boxes[:, 2]) / 2
            left_lane_count = int((box_centers_x < INFERENCE_SIZE / 2).sum())
            right_lane_count = num_detections - left_lane_count
            vehicle_density = num_detections / (INFERENCE_SIZE * INFERENCE_SIZE)

            # Write to output.txt (same layout whether or not anything was detected)
            output_txt.write(f"Image: {img_file}\n")
            output_txt.write(f"Total Detections: {num_detections}\n")
            for label in yolo_labels:
                output_txt.write(f"{label}\n")
            output_txt.write(f"Vehicles in Left Lane: {left_lane_count}\n")
            output_txt.write(f"Vehicles in Right Lane: {right_lane_count}\n")
            output_txt.write(f"Vehicle Density: {vehicle_density:.6f} vehicles per pixel\n")
            output_txt.write("-" * 50 + "\n")

            # The image and the (possibly empty) label file are always written.
            pil_image.save(output_image_path)
            with open(output_label_path, 'w') as f:
                f.writelines(line + '\n' for line in yolo_labels)

print("\nInference complete. Check the 'inference_outputs' directory for results.")



Starting Training...


Epoch 1: 100%|██████████| 134/134 [00:23<00:00,  5.67it/s]


[Epoch 1] Avg cls_loss: 9.8562, Avg reg_loss: 0.5891


Epoch 2: 100%|██████████| 134/134 [00:19<00:00,  6.71it/s]


[Epoch 2] Avg cls_loss: 7.5405, Avg reg_loss: 0.5580


Epoch 3: 100%|██████████| 134/134 [00:19<00:00,  6.91it/s]


[Epoch 3] Avg cls_loss: 6.9139, Avg reg_loss: 0.5430


Epoch 4: 100%|██████████| 134/134 [00:19<00:00,  6.72it/s]


[Epoch 4] Avg cls_loss: 6.5019, Avg reg_loss: 0.5283


Epoch 5: 100%|██████████| 134/134 [00:19<00:00,  6.92it/s]


[Epoch 5] Avg cls_loss: 6.0608, Avg reg_loss: 0.5193


Epoch 6: 100%|██████████| 134/134 [00:19<00:00,  6.74it/s]


[Epoch 6] Avg cls_loss: 5.7850, Avg reg_loss: 0.5079


Epoch 7: 100%|██████████| 134/134 [00:19<00:00,  6.84it/s]


[Epoch 7] Avg cls_loss: 5.5186, Avg reg_loss: 0.4974
Unfreezing backbone parameters.


Epoch 8: 100%|██████████| 134/134 [00:23<00:00,  5.60it/s]


[Epoch 8] Avg cls_loss: 4.6743, Avg reg_loss: 0.4964


Epoch 9: 100%|██████████| 134/134 [00:23<00:00,  5.74it/s]


[Epoch 9] Avg cls_loss: 3.4417, Avg reg_loss: 0.4740


Epoch 10: 100%|██████████| 134/134 [00:23<00:00,  5.63it/s]


[Epoch 10] Avg cls_loss: 2.8577, Avg reg_loss: 0.4547


Epoch 11: 100%|██████████| 134/134 [00:23<00:00,  5.72it/s]


[Epoch 11] Avg cls_loss: 2.4283, Avg reg_loss: 0.4411


Epoch 12: 100%|██████████| 134/134 [00:24<00:00,  5.57it/s]


[Epoch 12] Avg cls_loss: 2.1050, Avg reg_loss: 0.4344


Epoch 13: 100%|██████████| 134/134 [00:23<00:00,  5.69it/s]


[Epoch 13] Avg cls_loss: 1.8446, Avg reg_loss: 0.4208


Epoch 14: 100%|██████████| 134/134 [00:23<00:00,  5.65it/s]


[Epoch 14] Avg cls_loss: 1.5988, Avg reg_loss: 0.4127


Epoch 15: 100%|██████████| 134/134 [00:23<00:00,  5.60it/s]


[Epoch 15] Avg cls_loss: 1.4037, Avg reg_loss: 0.3965


Epoch 16: 100%|██████████| 134/134 [00:24<00:00,  5.57it/s]


[Epoch 16] Avg cls_loss: 1.2694, Avg reg_loss: 0.3893


Epoch 17: 100%|██████████| 134/134 [00:23<00:00,  5.62it/s]


[Epoch 17] Avg cls_loss: 1.1220, Avg reg_loss: 0.3884


Epoch 18: 100%|██████████| 134/134 [00:23<00:00,  5.68it/s]


[Epoch 18] Avg cls_loss: 1.0108, Avg reg_loss: 0.3764


Epoch 19: 100%|██████████| 134/134 [00:24<00:00,  5.49it/s]


[Epoch 19] Avg cls_loss: 0.9416, Avg reg_loss: 0.3675


Epoch 20: 100%|██████████| 134/134 [00:23<00:00,  5.69it/s]


[Epoch 20] Avg cls_loss: 0.7794, Avg reg_loss: 0.3618

Training Complete!

--- Starting Inference on Test Images ---
Detected images will be saved to: ./inference_outputs/images_with_detections
Detection labels (YOLO format) will be saved to: ./inference_outputs/labels_yolo_format
Inference summary will be saved to: ./inference_outputs/output.txt


Running Inference: 100%|██████████| 90/90 [00:02<00:00, 33.21it/s]


Inference complete. Check the 'inference_outputs' directory for results.



