In [1]:
import time

import torch # version 2.1.2
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.transforms.functional as FT
from torch.utils.data import DataLoader
from tqdm import tqdm

from model import YoloV1
from dataset import VOCDataset
from loss import YoloLoss
from utils import (
    intersection_over_union,
    non_max_suppression,
    mean_average_precision,
    cellboxes_to_boxes,
    get_bboxes,
    plot_image,
    save_checkpoint,
    load_checkpoint
)

seed = 3301 # fixed pseudorandom seed so dataset loading is the same on every run
torch.manual_seed(seed)

# Debugging aid: with anomaly detection on, autograd reports the forward-pass
# operation behind a failing backward call. The PyTorch docs warn that it slows
# execution, so it is presumably meant to be switched off once the backward
# error recorded further down is resolved -- TODO confirm.
# As the last expression of the cell, this call's return value is echoed as
# cell output.
torch.autograd.set_detect_anomaly(True)

[CNNBlock(
  (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (leakyrelu): LeakyReLU(negative_slope=0.1)
), MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), CNNBlock(
  (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (batchnorm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (leakyrelu): LeakyReLU(negative_slope=0.1)
), MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), CNNBlock(
  (conv): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (batchnorm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (leakyrelu): LeakyReLU(negative_slope=0.1)
), CNNBlock(
  (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (batchnorm): BatchNorm2d(256, eps=

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f3fa1119950>

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print("Using device", DEVICE)

Using device cuda


In [3]:
# Hyperparameters for our model.
# DEVICE is intentionally not redefined here: it is already set (and printed)
# by the device-selection cell that runs before this one.
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
WEIGHT_DECAY = 0    # no weight decay (no regularization), kept off for fast training
EPOCHS = 100

# Other variables for training
NUM_WORKERS = 2                     # DataLoader worker processes
PIN_MEMORY = True                   # forwarded to DataLoader(pin_memory=...)
LOAD_MODEL = False                  # when True, restore LOAD_MODEL_FILE before training
LOAD_MODEL_FILE = "overfit.pth.tar" # checkpoint path, used for both loading and saving
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"

In [4]:
def train_fn(train_loader, model, optimizer, loss_fn):
    """Train `model` for one full pass over `train_loader`.

    Each batch is moved to DEVICE, run through the model, scored with
    `loss_fn`, and used for a single optimizer step. The per-batch loss is
    shown on the tqdm progress bar and the mean over all batches is printed
    when the pass finishes.

    Args:
        train_loader: iterable yielding `(x, y)` batches; both elements must
            support `.to(DEVICE)`.
        model: callable mapping the input batch to predictions.
        optimizer: optimizer whose `zero_grad()` / `step()` drive the update.
        loss_fn: callable `(predictions, targets) -> scalar loss tensor`.

    Returns:
        The mean batch loss as a float, or None when the loader produced no
        batches (for example `drop_last=True` on a dataset smaller than the
        batch size).
    """
    loop = tqdm(train_loader, leave=True)
    batch_losses = []

    for x, y in loop:
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running list and the
        # progress bar instead of calling .item() twice per step.
        loss_value = loss.item()
        batch_losses.append(loss_value)
        loop.set_postfix(loss=loss_value)

    if not batch_losses:
        # Nothing was trained; say so instead of failing with ZeroDivisionError.
        print("Mean loss was undefined: train_loader yielded no batches")
        return None

    mean_loss = sum(batch_losses) / len(batch_losses)
    print(f"Mean loss was {mean_loss}")
    return mean_loss


In [5]:
# --- Model, optimizer and loss ----------------------------------------------
model = YoloV1(split_size = 7, num_boxes = 2, num_classes = 20).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE, weight_decay = WEIGHT_DECAY)
loss_fn = YoloLoss()

if LOAD_MODEL:
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU machine can still be restored when DEVICE
    # has fallen back to the CPU.
    load_checkpoint(torch.load(LOAD_MODEL_FILE, map_location = DEVICE), model, optimizer)

# --- Data ---------------------------------------------------------------------
# Every image is resized to 448x448 and converted to a tensor.
transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor()
])

train_dataset = VOCDataset(
    "data/8examples.csv",
    transform = transform,
    img_dir = IMG_DIR,
    label_dir = LABEL_DIR
)

# NOTE(review): test_dataset is built but not used anywhere in this cell (no
# loader is created for it) -- presumably kept for a later evaluation step.
test_dataset = VOCDataset(
    "data/test.csv",
    transform = transform,
    img_dir = IMG_DIR,
    label_dir = LABEL_DIR
)

train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    num_workers = NUM_WORKERS,
    pin_memory = PIN_MEMORY,
    shuffle = True,
    # Keep the partial batch: the 8-example set is smaller than BATCH_SIZE, so
    # dropping it would leave no batches at all. May be set to True for
    # datasets larger than BATCH_SIZE.
    drop_last = False
)

# --- Training loop --------------------------------------------------------------
for epoch in range(EPOCHS):
    # Score the current weights on the training set before this epoch's update.
    pred_boxes, target_boxes = get_bboxes(train_loader, model, iou_threshold = 0.5, threshold = 0.4)
    mean_avg_prec = mean_average_precision(pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint")
    print(f"Train mAP: {mean_avg_prec}")

    if mean_avg_prec > 0.9:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint, filename= LOAD_MODEL_FILE)
        # Pause after saving -- presumably so the checkpoint message can be
        # noticed before the progress bar resumes; TODO confirm still wanted.
        time.sleep(10)

    # NOTE(review): the recorded run fails inside this call during backward
    # with an "inplace operation" RuntimeError on a [8, 7, 7, 2] tensor. The
    # offending write is not in this notebook; presumably it is in YoloLoss or
    # the model forward pass -- check there for in-place slice assignment.
    train_fn(train_loader, model, optimizer, loss_fn)

Train mAP: 0.0


  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/anthony/miniconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/anthony/miniconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 1077, in launch_instance
    app.start()
  File "/home/anthony/miniconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/home/anthony/miniconda3/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/home/anthony/miniconda3/lib/python3.11/asyncio/base_events.py", line 607, in run_forever
    self._run_once()
  File "/home/anthony/miniconda3/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once
    handle._run()
  File "/home/anthony/miniconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.r

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [8, 7, 7, 2]], which is output 0 of AsStridedBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [6]:
# Sanity check: print the shape of the first image batch only, then stop.
for images, _targets in train_loader:
    print(images.shape)
    break

torch.Size([8, 3, 448, 448])


In [7]:
# Rebuild the 8-example dataset so this inspection cell can be run on its own.
transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor()
])

train_dataset = VOCDataset(
    "data/8examples.csv",
    transform = transform,
    img_dir = IMG_DIR,
    label_dir = LABEL_DIR
)

# Fetch one sample through normal indexing rather than calling __getitem__
# directly. Binding the result to a name keeps the notebook from echoing the
# full tensor repr, so no blank print() is needed to suppress it.
sample = train_dataset[2]

torch.Size([3, 448, 448])


(tensor([[[0.0118, 0.0118, 0.0118,  ..., 0.0078, 0.0118, 0.0157],
          [0.0039, 0.0157, 0.0157,  ..., 0.0039, 0.0039, 0.0000],
          [0.0118, 0.0078, 0.0078,  ..., 0.0078, 0.0078, 0.0039],
          ...,
          [0.0039, 0.0039, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0078, 0.0118,  ..., 0.0078, 0.0078, 0.0078],
          [0.0078, 0.0078, 0.0078,  ..., 0.0078, 0.0078, 0.0078]],
 
         [[0.0118, 0.0118, 0.0118,  ..., 0.0078, 0.0118, 0.0157],
          [0.0039, 0.0157, 0.0157,  ..., 0.0039, 0.0039, 0.0000],
          [0.0118, 0.0078, 0.0078,  ..., 0.0078, 0.0078, 0.0039],
          ...,
          [0.0039, 0.0039, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0078, 0.0118,  ..., 0.0078, 0.0078, 0.0078],
          [0.0078, 0.0078, 0.0078,  ..., 0.0078, 0.0078, 0.0078]],
 
         [[0.0118, 0.0118, 0.0118,  ..., 0.0078, 0.0118, 0.0157],
          [0.0039, 0.0157, 0.0157,  ..., 0.0039, 0.0039, 0.0000],
          [0.0118, 0.0078, 0.0078,  ...,