<a href="https://colab.research.google.com/github/Arity-T/yolo-in-pytorch/blob/main/train_yolo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the Python interpreter version and the NVIDIA GPU/driver status of the runtime.
!python --version
!nvidia-smi

In [None]:
# Clone the project into /content and make the repository root the working
# directory, so that later cells can import the local modules and use
# repository-relative paths.
%cd /content/
!git clone https://github.com/Arity-T/yolo-in-pytorch.git
%cd /content/yolo-in-pytorch/

In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells read the
# dataset archives and model weights from it and write the checkpoint back.
from google.colab import drive
drive.mount('/content/drive')

## VOC dataset

Download from pjreddie.com

In [None]:
# Uncomment to download the VOC 2007/2012 archives into the current directory.
# !wget https://pjreddie.com/media/files/VOCtrainval_11-May-2012.tar
# !wget https://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar
# !wget https://pjreddie.com/media/files/VOCtest_06-Nov-2007.tar

Or copy the archives from Google Drive

In [None]:
# Google Drive folder from which the VOC .tar archives are copied in the next cell.
drive_voc_path = "/content/drive/MyDrive/Projects/YOLO/VOC"
# List the folder as a quick check that Drive is mounted and the path exists.
!ls "$drive_voc_path"

In [None]:
# Copy the three VOC archives from Drive into the current working directory.
!cp "$drive_voc_path/VOCtrainval_11-May-2012.tar" .
!cp "$drive_voc_path/VOCtrainval_06-Nov-2007.tar" .
!cp "$drive_voc_path/VOCtest_06-Nov-2007.tar" .

Extract and convert labels

In [None]:
# Unpack the three archives into the current directory.
!tar xf VOCtrainval_11-May-2012.tar
!tar xf VOCtrainval_06-Nov-2007.tar
!tar xf VOCtest_06-Nov-2007.tar
# Run the repository's label conversion script.
# NOTE(review): presumably this generates the voc20xx_*.txt image-set files
# consumed by the datasets further down -- confirm against convert_voc_labels.py.
!python convert_voc_labels.py

## Imports

In [None]:
import multiprocessing as mp

import albumentations as A
import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils.data import DataLoader
from tqdm import tqdm

import utils
import yolov1

## Model

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

Create model with pretrained backbone

In [None]:
# Uncomment this cell (and skip the weight-loading cell below) to build a fresh
# model on top of a torchvision ResNet-34 backbone with its default pretrained weights.
# from torchvision.models import resnet34

# model = yolov1.Model(backbone=resnet34(weights="DEFAULT")).to(device)

Or load a previously saved model from Google Drive

In [None]:
drive_weights_path = "/content/drive/MyDrive/Projects/YOLO/model_loss_1.05.pth"

# The result of torch.load is used directly as the model, so the file is
# expected to hold a whole pickled module (the save cell of this notebook writes
# exactly that with torch.save(model, ...)), not a state_dict.
# PyTorch >= 2.6 defaults to weights_only=True, which refuses to unpickle
# arbitrary classes, so full unpickling has to be requested explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoint files
# that you created yourself.
model = torch.load(drive_weights_path, map_location=device, weights_only=False)

## Datasets and dataloaders

In [None]:
# List of augmentations: every image is resized to 448x448; bounding boxes are
# declared to be in YOLO format so albumentations handles them with the image.
augs = A.Compose(
    [
        # `always_apply` is deprecated in recent albumentations releases;
        # p=1.0 is the supported way to apply a transform unconditionally.
        A.Resize(448, 448, p=1.0),
    ],
    bbox_params=A.BboxParams(format="yolo"),
)
trfs = torchvision.transforms.ToTensor()

# Datasets: all listed VOC2007 splits plus VOC2012 train are used for training,
# VOC2012 val is held out for validation.
train_ds = yolov1.Dataset(
    img_sets=[
        "voc2007_train.txt",
        "voc2007_val.txt",
        "voc2007_test.txt",
        "voc2012_train.txt",
    ],
    augmentations=augs,
    transforms=trfs,
)
val_ds = yolov1.Dataset(
    img_sets=["voc2012_val.txt"], augmentations=augs, transforms=trfs
)

# Dataloaders
batch_size = 64
num_workers = mp.cpu_count()
print("\nCPU count:", num_workers)
train_dl = DataLoader(
    train_ds,
    batch_size=batch_size,
    # Reshuffle the training samples every epoch; without this the batches are
    # identical and in the same (file) order in every epoch.
    shuffle=True,
    collate_fn=yolov1.collate_fn,
    num_workers=num_workers,
)
val_dl = DataLoader(
    val_ds,
    batch_size=batch_size,
    collate_fn=yolov1.collate_fn,
    num_workers=num_workers,
)

# Class names, split on whitespace. The context manager closes the file handle,
# which the previous bare open(...).read() left to the garbage collector.
with open("voc_classes.txt") as classes_file:
    labels = classes_file.read().split()

In [None]:
# Sanity check: take the first batch from the training loader and render its
# first image together with its annotations via utils.draw_bboxes.
imgs, annots = next(iter(train_dl))

fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(utils.draw_bboxes(imgs[0], annots[0]))

## Model training

When a pretrained backbone is used, it is better to freeze most of its layers and fine-tune only the last few.

In [None]:
# Start with gradients disabled for the entire backbone ...
model.backbone.requires_grad_(False)

# ... then re-enable them only for the last child of backbone[0] and for
# backbone[1] and backbone[2], which remain trainable.
trainable_parts = (model.backbone[0][-1], model.backbone[1], model.backbone[2])
for part in trainable_parts:
    part.requires_grad_(True)

In [None]:
# Number of full passes over train_dl performed by the training loop below.
epochs = 10

In [None]:
# Alternative optimizer kept for reference: SGD with momentum and weight decay.
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.0005, momentum=0.9)
learning_rate = 0.0001
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# NOTE(review): the "labmda_*" spelling is kept exactly as this call already
# uses it; presumably it matches the keyword names of yolov1.Loss -- confirm.
loss_fn = yolov1.Loss(labmda_noobj=0.5, labmda_coord=5.0)

In [None]:
# Put the model into training mode before optimising.
model.train()

for epoch in tqdm(range(epochs)):
    # Scalar loss of every batch in this epoch, averaged for the report below.
    batch_losses = []
    for imgs, annots in tqdm(train_dl, leave=False):
        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        predicted_grids = model(imgs.to(device))
        loss = loss_fn(predicted_grids, annots)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    print("\nMean loss:", sum(batch_losses) / len(batch_losses))

In [None]:
# Best loss: 0.83

## Save model

In [None]:
# Show what is already stored in the Drive project folder before saving into it.
!ls /content/drive/MyDrive/Projects/YOLO

In [None]:
# Persist the whole model object (not only a state_dict) to Drive as the latest checkpoint.
last_checkpoint_path = "/content/drive/MyDrive/Projects/YOLO/model_last.pth"
torch.save(model, last_checkpoint_path)

## Compute metrics

In [None]:
# Ground-truth annotations and model predictions for the whole training set,
# accumulated batch by batch in matching order.
train_annots = []
train_predictions = []

# The training cell leaves the model in train mode. Switch to eval mode and
# disable autograd for inference so that layers such as batch norm/dropout
# behave deterministically and no computation graph is kept in memory.
# (Harmless if model.predict already does this internally.)
model.eval()
with torch.no_grad():
    for imgs, annots in tqdm(train_dl, leave=False):
        train_annots += annots
        # NOTE(review): presumably `threshold` is the confidence cut-off and
        # `iou_threshold` the NMS overlap limit -- confirm in yolov1.Model.predict.
        train_predictions += model.predict(
            imgs.to(device), threshold=0.25, iou_threshold=0.5
        )

In [None]:
# Score the training-set predictions with utils.compute_map at an IoU
# threshold of 0.5; the result unpacks into an overall value and a second,
# per-class value (going by the name it is bound to).
train_map_results = utils.compute_map(
    train_predictions,
    train_annots,
    number_of_classes=20,
    iou_threshold=0.5,
)
train_map, train_map_by_classes = train_map_results
print("MAP@0.5 on train:", train_map)

In [None]:
# Ground-truth annotations and model predictions for the whole validation set,
# accumulated batch by batch in matching order.
val_annots = []
val_predictions = []

# The training cell leaves the model in train mode. Switch to eval mode and
# disable autograd for inference so that layers such as batch norm/dropout
# behave deterministically and no computation graph is kept in memory.
# (Harmless if model.predict already does this internally.)
model.eval()
with torch.no_grad():
    for imgs, annots in tqdm(val_dl, leave=False):
        val_annots += annots
        # NOTE(review): presumably `threshold` is the confidence cut-off and
        # `iou_threshold` the NMS overlap limit -- confirm in yolov1.Model.predict.
        val_predictions += model.predict(
            imgs.to(device), threshold=0.25, iou_threshold=0.5
        )

In [None]:
# Score the validation-set predictions with utils.compute_map at an IoU
# threshold of 0.5; the result unpacks into an overall value and a second,
# per-class value (going by the name it is bound to).
val_map_results = utils.compute_map(
    val_predictions,
    val_annots,
    number_of_classes=20,
    iou_threshold=0.5,
)
val_map, val_map_by_classes = val_map_results
print("MAP@0.5 on val:", val_map)

## Some examples

In [None]:
# Take images from training set
imgs_train, annots_train = next(iter(train_dl))
print("Batch size:", len(imgs_train))

# Show at most 10 examples; the cap keeps the indexing valid when the batch
# holds fewer than 10 images.
example_indexes = range(min(10, len(imgs_train)))

# Predict in eval mode and without autograd: the training cell leaves the
# model in train mode, and no gradients are needed for visualisation.
# (Harmless if model.predict already does this internally.)
model.eval()
with torch.no_grad():
    example_predictions = model.predict(
        imgs_train[example_indexes].to(device), threshold=0.2
    )

utils.show_examples(
    imgs_train[example_indexes],
    example_predictions,
    [annots_train[i] for i in example_indexes],
)

In [None]:
# Take images from validation set
imgs_test, annots_test = next(iter(val_dl))
print("Batch size:", len(imgs_test))

# Show at most 10 examples; the cap keeps the indexing valid when the batch
# holds fewer than 10 images.
example_indexes = range(min(10, len(imgs_test)))

# Predict in eval mode and without autograd: the training cell leaves the
# model in train mode, and no gradients are needed for visualisation.
# (Harmless if model.predict already does this internally.)
model.eval()
with torch.no_grad():
    example_predictions = model.predict(
        imgs_test[example_indexes].to(device), threshold=0.2
    )

utils.show_examples(
    imgs_test[example_indexes],
    example_predictions,
    [annots_test[i] for i in example_indexes],
)