In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import os
import cv2
import numpy as np
from PIL import Image
from torchmetrics.detection.mean_ap import MeanAveragePrecision

In [None]:


# Define the YOLOv1 network
class YOLOv1(nn.Module):
    """YOLOv1-style detector: a convolutional stack followed by two linear layers.

    The convolutional part shrinks the spatial size by a factor of 32 (one
    stride-2 convolution and four 2x2 max-pools).  The first linear layer
    expects ``1024 * S * S`` inputs, so the image side must equal ``32 * S``
    (224 for the default ``S=7``).

    Args:
        S: number of grid cells per image side.
        B: number of boxes predicted per grid cell (5 values each).
        C: number of class scores predicted per grid cell.
    """

    def __init__(self, S=7, B=2, C=1):
        super().__init__()
        self.S, self.B, self.C = S, B, C

        def conv_relu(in_ch, out_ch, **conv_kwargs):
            # One convolution followed by its ReLU, as a flat pair of modules.
            return [nn.Conv2d(in_ch, out_ch, **conv_kwargs), nn.ReLU()]

        def halve():
            # 2x2 max-pool that halves both spatial dimensions.
            return nn.MaxPool2d(kernel_size=2, stride=2)

        # The list is kept flat (no nested containers) so that module indices,
        # and therefore state_dict keys, stay "features.<n>.*".
        layers = [
            *conv_relu(3, 64, kernel_size=7, stride=2, padding=3),
            halve(),
            *conv_relu(64, 192, kernel_size=3, padding=1),
            halve(),
            *conv_relu(192, 128, kernel_size=1),
            *conv_relu(128, 256, kernel_size=3, padding=1),
            halve(),
            *conv_relu(256, 256, kernel_size=3, padding=1),
            *conv_relu(256, 512, kernel_size=3, padding=1),
            halve(),
            *conv_relu(512, 512, kernel_size=3, padding=1),
            *conv_relu(512, 1024, kernel_size=3, padding=1),
            *conv_relu(1024, 1024, kernel_size=3, padding=1),
            *conv_relu(1024, 1024, kernel_size=3, padding=1),
            nn.Flatten(),
            nn.Linear(1024 * S * S, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, S * S * (B * 5 + C)),
        ]
        self.features = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network and reshape the flat output to ``(-1, S, S, B*5 + C)``."""
        cell_depth = self.B * 5 + self.C
        flat_out = self.features(x)
        return flat_out.view(-1, self.S, self.S, cell_depth)


In [None]:
from torchsummary import summary

# Print a per-layer summary for a 3x224x224 input.
# Fall back to the CPU when no GPU is present instead of crashing on a
# hard-coded 'cuda'; torchsummary expects the device as the string
# "cuda" or "cpu".
summary_device = "cuda" if torch.cuda.is_available() else "cpu"
model = YOLOv1().to(summary_device)
summary(model, (3, 224, 224), device=summary_device)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,472
              ReLU-2         [-1, 64, 112, 112]               0
         MaxPool2d-3           [-1, 64, 56, 56]               0
            Conv2d-4          [-1, 192, 56, 56]         110,784
              ReLU-5          [-1, 192, 56, 56]               0
         MaxPool2d-6          [-1, 192, 28, 28]               0
            Conv2d-7          [-1, 128, 28, 28]          24,704
              ReLU-8          [-1, 128, 28, 28]               0
            Conv2d-9          [-1, 256, 28, 28]         295,168
             ReLU-10          [-1, 256, 28, 28]               0
        MaxPool2d-11          [-1, 256, 14, 14]               0
           Conv2d-12          [-1, 256, 14, 14]         590,080
             ReLU-13          [-1, 256, 14, 14]               0
           Conv2d-14          [-1, 512,

In [None]:

class WiderFaceDataset(Dataset):
    """Images paired with text labels, encoded as an ``S x S`` YOLOv1 grid target.

    Each image ``<name>.<ext>`` in ``img_dir`` is paired with
    ``<name>.txt`` in ``label_dir``.  Every non-blank label line must start with
    five numbers: ``class x_center y_center width height``.
    NOTE(review): the coordinates are presumably normalised to [0, 1] (YOLO text
    format); the grid arithmetic below relies on that -- confirm for your data.

    Target layout per cell, of depth ``5 * B + C``:
        [0:2]        box centre offset inside the cell, in cell units
        [2:4]        box width / height multiplied by ``S``
        [4]          objectness (1.0 where a box centre falls in the cell)
        [5:5*B]      left at zero (only the first box slot holds ground truth)
        [5*B:]       one-hot class vector

    Simplification kept from the original: when several boxes land in the
    same cell, the last one in the file wins.

    Args:
        img_dir: directory containing the images.
        label_dir: directory containing the ``.txt`` label files.
        S: number of grid cells per image side.
        transform: optional callable applied to the loaded RGB PIL image.
        B: number of box slots per cell in the target (default 2).
        C: number of classes in the one-hot class vector (default 1).
    """

    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, img_dir, label_dir, S=7, transform=None, B=2, C=1):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C
        # os.listdir order is arbitrary; sort so that indices are reproducible.
        # Extensions are matched case-insensitively so '.JPG' files are kept.
        self.img_files = sorted(
            f for f in os.listdir(img_dir)
            if f.lower().endswith(self.IMAGE_EXTENSIONS)
        )

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the ``idx``-th image file."""
        img_name = self.img_files[idx]
        img_path = os.path.join(self.img_dir, img_name)
        label_path = os.path.join(self.label_dir, os.path.splitext(img_name)[0] + ".txt")

        img = self._load_image(img_path)
        target = self._encode_target(label_path)

        if self.transform:
            img = self.transform(img)

        return img, target

    def _load_image(self, img_path):
        """Load ``img_path`` as an RGB PIL image and close the file handle."""
        with Image.open(img_path) as raw:
            # convert() reads the pixel data, so the result outlives the file.
            return raw.convert('RGB')

    def _encode_target(self, label_path):
        """Build the ``(S, S, 5*B + C)`` target tensor from one label file.

        A missing label file yields an all-zero target (image without objects).

        Raises:
            ValueError: if a line has fewer than five fields, a field is not
                numeric, or the class id is outside ``[0, C)``.
        """
        S, B, C = self.S, self.B, self.C
        class_start = 5 * B
        target = torch.zeros((S, S, class_start + C))
        if not os.path.exists(label_path):
            return target

        with open(label_path, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / trailing lines
                cls, x_center, y_center, width, height = map(float, fields[:5])

                class_idx = int(cls)
                if not 0 <= class_idx < C:
                    raise ValueError(
                        f"class id {class_idx} in {label_path} is outside [0, {C})"
                    )

                # Clamp into the grid: x == 1.0 would index cell S, and a
                # negative coordinate would silently wrap to the other side.
                grid_x = max(0, min(int(x_center * S), S - 1))
                grid_y = max(0, min(int(y_center * S), S - 1))

                # Ground truth goes into the first box slot only (simplified).
                target[grid_y, grid_x, 0] = x_center * S - grid_x
                target[grid_y, grid_x, 1] = y_center * S - grid_y
                target[grid_y, grid_x, 2] = width * S
                target[grid_y, grid_x, 3] = height * S
                target[grid_y, grid_x, 4] = 1.0  # Confidence

                # One-hot class vector.  Storing the raw id would give class 0
                # a target of 0.0, i.e. "no class present".
                target[grid_y, grid_x, class_start:] = 0.0
                target[grid_y, grid_x, class_start + class_idx] = 1.0

        return target

In [None]:


def _batched_cell_iou(boxes_a, boxes_b):
    # IoU of broadcastable (..., 4) boxes given as (x_center, y_center, width, height).
    a_min = boxes_a[..., :2] - boxes_a[..., 2:4] / 2
    a_max = boxes_a[..., :2] + boxes_a[..., 2:4] / 2
    b_min = boxes_b[..., :2] - boxes_b[..., 2:4] / 2
    b_max = boxes_b[..., :2] + boxes_b[..., 2:4] / 2

    inter_wh = (torch.minimum(a_max, b_max) - torch.maximum(a_min, b_min)).clamp(min=0)
    inter_area = inter_wh[..., 0] * inter_wh[..., 1]
    union_area = (
        boxes_a[..., 2] * boxes_a[..., 3]
        + boxes_b[..., 2] * boxes_b[..., 3]
        - inter_area
    )
    # Degenerate (non-positive) unions count as IoU 0.
    return torch.where(union_area > 0, inter_area / union_area, torch.zeros_like(inter_area))


def yolo_loss(preds, targets, S=7, B=2, lambda_coord=5.0, lambda_noobj=0.5, C=1):
    """Simplified YOLOv1 loss, computed for the whole batch at once.

    Per grid cell whose target objectness (index 4) is > 0:
      * the predicted box with the highest IoU against the target box is
        "responsible" (first box wins ties, and also when every IoU is 0);
      * ``lambda_coord * mean((xywh_pred - xywh_target)^2)`` over its 4 coords;
      * ``(conf_pred - conf_target)^2`` for its confidence;
      * ``mean((class_pred - class_target)^2)`` over the ``C`` class scores.
    Per grid cell without an object:
      * ``lambda_noobj * conf^2`` summed over all ``B`` predicted boxes.

    The confidences of the non-responsible boxes in object cells are not
    penalised; this matches the previous per-cell loop implementation.

    Args:
        preds: predictions with ``batch * S * S * (5*B + C)`` elements, batch
            first, e.g. ``(batch, S, S, 5*B + C)`` or flattened per sample.
        targets: ground truth with the same number of elements; the box lives
            in slots ``[0:5]`` and the class vector starts at index ``5*B``.
        S: grid cells per side.
        B: predicted boxes per cell.
        lambda_coord: weight of the coordinate term.
        lambda_noobj: weight of the no-object confidence term.
        C: number of class scores per cell (default 1).

    Returns:
        0-dim tensor: the summed loss divided by the batch size.
    """
    batch_size = preds.size(0)
    class_start = 5 * B  # was hard-coded as 10, which is only right for B == 2
    depth = class_start + C

    preds = preds.reshape(batch_size, S, S, depth)
    targets = targets.reshape(batch_size, S, S, depth)

    obj_mask = targets[..., 4] > 0                                   # (N, S, S)
    pred_boxes = preds[..., :class_start].reshape(batch_size, S, S, B, 5)
    target_box = targets[..., :5]                                    # (N, S, S, 5)

    # IoU is only used to choose the responsible box, so no gradient is needed.
    with torch.no_grad():
        ious = _batched_cell_iou(pred_boxes[..., :4], target_box[..., None, :4])
        best_idx = ious.argmax(dim=-1)                               # (N, S, S)

    gather_idx = best_idx[..., None, None].expand(-1, -1, -1, 1, 5)
    best_pred = pred_boxes.gather(3, gather_idx).squeeze(3)          # (N, S, S, 5)

    coord_loss = ((best_pred[..., :4] - target_box[..., :4]) ** 2).mean(dim=-1) * lambda_coord
    conf_loss = (best_pred[..., 4] - target_box[..., 4]) ** 2
    class_loss = ((preds[..., class_start:] - targets[..., class_start:]) ** 2).mean(dim=-1)
    obj_loss = (coord_loss + conf_loss + class_loss)[obj_mask].sum()

    noobj_loss = (pred_boxes[..., 4] ** 2).sum(dim=-1)[~obj_mask].sum() * lambda_noobj

    return (obj_loss + noobj_loss) / batch_size

def calculate_iou(box1, box2):
    """Intersection-over-union of two axis-aligned boxes.

    Each box is a 4-sequence ``(x_center, y_center, width, height)``.
    Returns ``intersection / union``, or ``0`` when the union area is not
    positive.
    """
    cx_a, cy_a, w_a, h_a = box1
    cx_b, cy_b, w_b, h_b = box2

    # Half extents, used to turn centre/size boxes into corner coordinates.
    half_w_a, half_h_a = w_a / 2, h_a / 2
    half_w_b, half_h_b = w_b / 2, h_b / 2

    left_a, right_a = cx_a - half_w_a, cx_a + half_w_a
    top_a, bottom_a = cy_a - half_h_a, cy_a + half_h_a
    left_b, right_b = cx_b - half_w_b, cx_b + half_w_b
    top_b, bottom_b = cy_b - half_h_b, cy_b + half_h_b

    # Overlap along each axis; a negative extent means the boxes are disjoint.
    overlap_w = max(0, min(right_a, right_b) - max(left_a, left_b))
    overlap_h = max(0, min(bottom_a, bottom_b) - max(top_a, top_b))

    inter_area = overlap_w * overlap_h
    union_area = w_a * h_a + w_b * h_b - inter_area

    if union_area > 0:
        return inter_area / union_area
    return 0

In [None]:


# Hyperparameters
S = 7            # grid cells per image side
B = 2            # predicted boxes per grid cell
C = 1            # number of classes
learning_rate = 0.001
num_epochs = 10
batch_size = 16
image_size = 224  # the network's linear head expects a 224x224 input for S = 7

# Transform: resize to the network input size, convert to a tensor, then
# normalise each channel with the mean/std constants below.
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Dataset and DataLoader
# The dataset root is defined once.  Set the WIDERFACE_ROOT environment
# variable to run on another machine without editing the notebook; the default
# is the original location.
dataset_root = os.environ.get(
    "WIDERFACE_ROOT",
    "/home/cse/Desktop/Apurbo/datasets/WiderFace/archive/WIDER Face Dataset For YOLOv12/WIDER Face Dataset For YOLOv12",
)
train_img_dir = os.path.join(dataset_root, "train", "images")
train_label_dir = os.path.join(dataset_root, "train", "labels")
val_img_dir = os.path.join(dataset_root, "val", "images")
val_label_dir = os.path.join(dataset_root, "val", "labels")

train_dataset = WiderFaceDataset(train_img_dir, train_label_dir, S=S, transform=transform)
val_dataset = WiderFaceDataset(val_img_dir, val_label_dir, S=S, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = YOLOv1(S=S, B=B, C=C).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Metric
metric = MeanAveragePrecision().to(device)

In [None]:
# Train for `num_epochs` passes over `train_loader`, logging every 10th step,
# then write the final weights to disk.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        # Inputs and targets must be on the same device as the model.
        images = batch[0].to(device)
        targets = batch[1].to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = yolo_loss(outputs, targets, S=S, B=B)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if not batch_idx % 10:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {total_loss / len(train_loader):.4f}")

# Only the parameters (state_dict) are saved, not the full module object.
torch.save(model.state_dict(), "yolov1_widerface_model.pth")

Epoch [1/10], Step [0/805], Loss: 12.6961
Epoch [1/10], Step [10/805], Loss: 13.9542
Epoch [1/10], Step [20/805], Loss: 11.0309
Epoch [1/10], Step [30/805], Loss: 7.0369
Epoch [1/10], Step [40/805], Loss: 7.2621
Epoch [1/10], Step [50/805], Loss: 11.4739
Epoch [1/10], Step [60/805], Loss: 7.0607
Epoch [1/10], Step [70/805], Loss: 11.4232
Epoch [1/10], Step [80/805], Loss: 8.2501
Epoch [1/10], Step [90/805], Loss: 10.6414
Epoch [1/10], Step [100/805], Loss: 8.0437
Epoch [1/10], Step [110/805], Loss: 13.1536
Epoch [1/10], Step [120/805], Loss: 8.5123
Epoch [1/10], Step [130/805], Loss: 8.4753
Epoch [1/10], Step [140/805], Loss: 13.8317
Epoch [1/10], Step [150/805], Loss: 8.5037
Epoch [1/10], Step [160/805], Loss: 9.4029
Epoch [1/10], Step [170/805], Loss: 6.3362
Epoch [1/10], Step [180/805], Loss: 7.3290
Epoch [1/10], Step [190/805], Loss: 11.3190
Epoch [1/10], Step [200/805], Loss: 7.8236
Epoch [1/10], Step [210/805], Loss: 13.6501
Epoch [1/10], Step [220/805], Loss: 7.8885
Epoch [1/10]