# Yolo Notebook

Importing libraries

In [4]:
import cv2
import torch

Model Backbone

In [5]:
import torch.nn as nn

class YOLO(nn.Module):
    def __init__(self, num_classes = 6, num_anchors = 3, grid_size = 7):
        super(YOLO, self).__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        self.grid_size = grid_size
        
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        
        self.detector = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(256 * (grid_size // 4)**2, grid_size * grid_size * (num_anchors * 5 + num_classes))
        )
        
    def forward(self, x):
        features = self.backbone(x)
        predictions = self.detector(features)
        return predictions.view(-1, self.grid_size, self.grid_size, self.num_anchors * 5 + self.num_classes)


In [6]:
model = YOLO(num_classes=6)
model

YOLO(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (detector): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Flatten(start_dim=1, end_dim=-1)
    (4): Linear(in_features=256, out_features=1029, bias=True)
  )
)

Modular Building Blocks
Create a block

In [7]:
class ConvBlock(nn.Module):
    #This block contains Conv2d, BatchNorm, ReLU.
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU
        
    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

In [8]:
class YOLOBackbone(nn.Module):
    def __init__(self):
        super(YOLOBackbone, self).__init__()
        self.layers = nn.Sequential(
            ConvBlock(3,32,3,1,1),
            nn.MaxPool2d(2,2),
            ConvBlock(32,64,3,1,1),
            nn.MaxPool2d(2,2),
            ConvBlock(64,128,3,1,1),
            nn.MaxPool2d(2,2)
        )
    def forward(self, x):
        return self.layers(x)

Detection Head

In [9]:
class YOLOHead(nn.Module):
    def __init__(self, grid_size, num_classes, num_anchors):
        super(YOLOHead, self).__init__()
        self.grid_size = grid_size
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        self.detector = nn.Conv2d(128, num_anchors * (5 + num_classes), kernel_size=1)

    def forward(self, x):
        return self.detector(x).permute(0,2,3,1).contiguous()

assemble YOLO Model

In [10]:
class YOLO(nn.Module):
    def __init__(self, grid_size=7, num_classes=20, num_anchors=3):
        super(YOLO, self).__init__()
        self.backbone = YOLOBackbone()
        self.head = YOLOHead(grid_size, num_classes, num_anchors)

    def forward(self, x):
        features = self.backbone(x)
        predictions = self.head(features)
        return predictions

# Example usage
model = YOLO(grid_size=7, num_classes=20, num_anchors=3)
print(model)


YOLO(
  (backbone): YOLOBackbone(
    (layers): Sequential(
      (0): ConvBlock(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (2): ConvBlock(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): ConvBlock(
        (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (head): YOLOHead(
    (detector): Conv2d(128, 75, k

Anchor Boxes

In [11]:
import numpy as np

def generate_anchors(scales, ratios):
    anchors = []
    for s in scales:
        for r in ratios:
            w = s * np.sqrt(r)
            h = s / np.sqrt(r)
            anchors.append((w,h))
    return anchors


YOLO expects bounding box coordinates relative to the image size (values between 0 and 1). Here’s how you can convert VOC annotations to YOLO format:

In [12]:
def convert_to_yolo_format(w, h, bbx):
    x_min, y_min, x_max, y_max = bbx
    x_center = (x_min + x_max) / 2 / w
    y_center = (y_min + y_max) / 2 / h
    b_w = (x_min - x_max) / w
    b_h = (x_min - x_max) / h
    return [x_center, y_center, b_w, b_h]

Data augmentation

In [13]:
import torchvision.transforms as transforms

train_transforms = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor()
])


Loss Function

In [14]:
def yolo_loss(predictions, targets, num_classes, lambda_coord=5, lambda_noobj=0.5):
    """
    Computes YOLO loss.
    - predictions: Predicted tensor.
    - targets: Ground truth tensor.
    """
    # Unpack predictions and targets
    pred_boxes = predictions[..., :4]
    pred_conf = predictions[..., 4]
    pred_classes = predictions[..., 5:]
    target_boxes = targets[..., :4]
    target_conf = targets[..., 4]
    target_classes = targets[..., 5:]
    
    # Localization Loss
    box_loss = lambda_coord * torch.sum((pred_boxes - target_boxes) ** 2)

    # Confidence Loss
    obj_loss = torch.sum((pred_conf - target_conf) ** 2)
    noobj_loss = lambda_noobj * torch.sum((pred_conf[target_conf == 0]) ** 2)

    # Classification Loss
    class_loss = torch.sum((pred_classes - target_classes) ** 2)

    # Total Loss
    total_loss = box_loss + obj_loss + noobj_loss + class_loss
    return total_loss

dataset class

In [4]:
pip install requests

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Downloading certifi-2025.1.31-py3-none-any.whl (166 kB)
Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl (102 kB)
Downloading idna-3.10-py3-none-any.whl (70 kB)
Downloading urllib3-2.3.0-py3-none-any.whl (128 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2025.1.31 charset-normalizer-3.4.1 idna-3.10 requests-2.32.3 urll

https://colab.research.google.com/drive/1QuLLsvX-DnOcOVWxcKWglIzDnxV_OHxE?usp=sharing#scrollTo=9ILzwujhyDmX

In [8]:
!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
!unzip /content/annotations_trainval2014.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
from coco_dataset import coco_dataset_download as cocod

class_names = ['newspaper', 'cardboard box', 'tissue box', 'bottle', 'wine glass', 'cup']  # List of class names you want to download
images_count = 500  # Count of images
annotations_path = r'C:\Users\YOGA\Desktop\Master s3\NoSQL\repo\intelligent-recycling-system\content\annotations\instances_train2014.json'

# Loop through each class and call the download function
for class_name in class_names:
    print(f"Downloading images for class: {class_name}")
    cocod.coco_dataset_download(class_name, images_count, annotations_path)


Downloading images for class: newspaper
loading annotations into memory...
Done (t=29.06s)
creating index...
index created!
no.of image: 1
no.of image: 2
no.of image: 3
no.of image: 4
no.of image: 5
no.of image: 6
no.of image: 7
no.of image: 8
no.of image: 9
no.of image: 10
no.of image: 11
no.of image: 12
no.of image: 13
no.of image: 14
no.of image: 15
no.of image: 16
no.of image: 17
no.of image: 18
no.of image: 19
no.of image: 20
no.of image: 21
no.of image: 22
no.of image: 23
no.of image: 24
no.of image: 25
no.of image: 26
no.of image: 27
no.of image: 28
no.of image: 29
no.of image: 30
no.of image: 31
no.of image: 32
no.of image: 33
no.of image: 34
no.of image: 35
no.of image: 36
no.of image: 37
no.of image: 38
no.of image: 39
no.of image: 40
no.of image: 41
no.of image: 42
no.of image: 43
no.of image: 44
no.of image: 45
no.of image: 46
no.of image: 47
no.of image: 48
no.of image: 49
no.of image: 50
no.of image: 51
no.of image: 52
no.of image: 53
no.of image: 54
no.of image: 55
no.of

In [22]:
from torch.utils.data import Dataset
import os

class YOLODataset(Dataset):
    def __init__(self, img_dir, label_dir, transforms=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transforms = transforms
        self.images = os.listdir(img_dir)
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir,self.images[idx])
        label_path = os.path.join(self.label_dir, self.images[idx].replace(".jpg", ".txt"))
        
        # Load Image
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Load Annotations
        boxes = []
        with open(label_path, "r") as f:
            for l in f.readlines():
                class_label, x, y, w, h = map(float, l.strip().split())
                boxes.append([class_label, x, y, w, h])
                
        if self.transforms:
            img = self.transforms(img)
        return img, torch.Tensor(boxes)

In [31]:
# Example: Initialize DataLoader
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
img_path = 'C:/Users/YOGA/Desktop/Master^ s3/NoSQL/repo/intelligent-recycling-system/data'
lbl_path = 'C:/Users/YOGA/Desktop/Master^ s3/NoSQL/repo/intelligent-recycling-system/label'
train_dataset = YOLODataset(img_dir=img_path, label_dir=lbl_path, transforms=ToTensor())
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:/Users/YOGA/Desktop/Master^ s3/NoSQL/repo/intelligent-recycling-system/data'

In [27]:
train_dataset[1]

error: OpenCV(4.10.0) C:\b\abs_e4cxka7_7g\croot\opencv-suite_1738943368733\work\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'


## Training

In [24]:
import torch.optim as optim

# init model, optimizer and loss function
model = YOLO(grid_size=7, num_classes=6, num_anchors=3)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = yolo_loss

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for imgs, trgts in train_loader:
        
        #forward pass
        preds = model(imgs)
        
        #loss calcs
        loss = criterion(preds,trgts, num_classes=6)
        
        #Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

        

error: OpenCV(4.10.0) C:\b\abs_e4cxka7_7g\croot\opencv-suite_1738943368733\work\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'


# Video capture 

In [None]:
# To quit press (q)
cap = cv2.VideoCapture(0)
cap.set(3, 640)
cap.set(4, 480)

while True:
    ret, img= cap.read()
    cv2.imshow('Webcam', img)

    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


# What objects we will detect:
Glasses
Paper
Plastic