In [1]:
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
from voc import get_dataloader
from main_utils import set_seed
from model_factory import get_model

# Seed once, before any loader/model is built, so that a Restart & Run All is repeatable.
# NOTE(review): which RNGs main_utils.set_seed covers (random / numpy / torch) is not
# visible from this notebook -- confirm.
set_seed(42)

In [2]:
# Target (height, width) handed to A.Resize in each transform pipeline in this cell.
SIZE = (256, 256)
# NOTE(review): not referenced by any visible cell; presumably a score cut-off for
# predictions / pseudo-labels -- confirm or remove.
CONFIDENCE_THRESHOLD = 0.7
# Per-channel mean/std; only referenced by the commented-out A.Normalize steps in this cell.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

def scale_to_01(image, **kwargs):
    """Return ``image`` cast to float32 and divided by 255.

    Written as an ``A.Lambda`` image callback: any extra keyword arguments
    supplied by the caller are accepted and ignored.
    """
    as_float = image.astype('float32')
    return as_float / 255.0

# Labeled-branch pipeline: resize, random horizontal flip, scale pixels to [0, 1]
# via scale_to_01, then convert to a tensor. Boxes are declared in pascal_voc format.
train_labeled_transforms = A.Compose(
    transforms=[
        A.Resize(height=SIZE[0], width=SIZE[1]),
        A.HorizontalFlip(p=0.5),
        A.Lambda(image=scale_to_01),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='pascal_voc'),
)

# Unlabeled-branch pipeline: resize followed by strong photometric augmentation
# (colour jitter, Gaussian blur, coarse dropout), then scaling and tensor conversion.
train_unlabeled_transforms = A.Compose(
        [
            A.Resize(SIZE[0], SIZE[1]),
            A.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1, p=0.8),
            A.GaussianBlur(blur_limit=(3, 7), sigma_limit=(0.1, 2.0), p=0.5),
            # A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
            A.CoarseDropout(num_holes_range=(3, 3), hole_height_range=(0.05, 0.1),
                             hole_width_range=(0.05, 0.1), p=0.5),
            # ToTensorV2 only changes layout/type, it does not rescale. With Normalize
            # disabled, scale explicitly so this branch produces float32 images in
            # [0, 1] exactly like train_labeled_transforms does.
            A.Lambda(image=scale_to_01),
            ToTensorV2(),
        ],
        bbox_params=A.BboxParams(format='pascal_voc')
    )

# Deterministic evaluation pipeline: resize, scale to [0, 1], convert to a tensor.
test_transforms = A.Compose([
    A.Resize(SIZE[0], SIZE[1]),
    # A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    # ToTensorV2 does not rescale pixel values; without this step the test images
    # would stay in the raw 0-255 range while the labeled training images are
    # float32 in [0, 1], so the model would be evaluated on a different input scale.
    A.Lambda(image=scale_to_01),
    ToTensorV2(),
], bbox_params=A.BboxParams(format='pascal_voc'))

# Data loaders, one per split/year. The third positional argument is 4 everywhere
# (presumably the batch size -- TODO confirm against voc.get_dataloader).
dt_train_labeled = get_dataloader(
    "trainval", "2007", 4, transform=train_labeled_transforms
)
dt_train_unlabeled = get_dataloader(
    "trainval", "2012", 4, transform=train_unlabeled_transforms
)
dt_test = get_dataloader(
    "test", "2007", 4, transform=test_transforms, shuffle=False
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = get_model(device=device)

# SGD with momentum; StepLR multiplies the learning rate by 0.1 after every
# 10 calls to lr_scheduler.step().
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [3]:
# images, targets = next(iter(dt_train_labeled))
# for target in targets:
#             target["boxes"] = target["boxes"].to(device)
#             target["labels"] = target["labels"].to(device)
# images = images.to(device)
# loss_for_each = model(images, targets)
# print(loss_for_each)
# sum(loss_for_each.values())

In [4]:
import matplotlib.pyplot as plt
# Loss series tracked per epoch and drawn by plot_losses: four component loss names
# plus "total", the key under which train() accumulates the summed loss.
METRIC_KEYS = ["loss_classifier", "loss_box_reg", "loss_objectness", "loss_rpn_box_reg", "total"]

def plot_losses(history):
    """Draw one line per entry of METRIC_KEYS showing its value at each epoch.

    Args:
        history: Mapping from metric name to a list with one value per epoch.
            The length of ``history["total"]`` determines the x axis.
    """
    n_epochs = len(history["total"])
    epochs = range(1, n_epochs + 1)

    fig, ax = plt.subplots(figsize=(7, 5))
    for comp in METRIC_KEYS:
        ax.plot(epochs, history[comp], label=f"Train {comp}", linewidth=2)

    ax.set_title("Train results over epochs")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.grid(True, linestyle="--", alpha=0.3)
    ax.legend()
    fig.tight_layout()
    plt.show()


In [None]:
from tqdm import tqdm

def train(model, optimizer, dt_train_labeled, device, max_batches=5):
    """Run one (optionally truncated) training pass and return the mean losses.

    Args:
        model: Module that, once put in train mode, returns a dict of named
            loss tensors when called as ``model(images, targets)``.
        optimizer: Optimizer over the model parameters; stepped once per batch.
        dt_train_labeled: Iterable yielding ``(images, targets)`` batches.
            ``images`` must support ``.to(device)``; ``targets`` is an iterable
            of dicts holding ``"boxes"`` and ``"labels"`` tensors.
        device: Device the batch is moved to before the forward pass.
        max_batches: Upper bound on the number of batches processed in this
            call. Defaults to 5, the cap that used to be hard-coded here for
            quick smoke runs; pass ``None`` to consume the whole loader.

    Returns:
        dict mapping every name in METRIC_KEYS (plus any further loss name the
        model reports) to its mean over the processed batches. When no batch
        was processed, every value is NaN instead of raising ZeroDivisionError.
    """
    model.train()
    train_batches = 0
    history = {key: 0.0 for key in METRIC_KEYS}

    for images, targets in tqdm(dt_train_labeled, desc="Training"):
        if max_batches is not None and train_batches >= max_batches:
            break

        # Targets are moved in place, one dict per image.
        for target in targets:
            target["boxes"] = target["boxes"].to(device)
            target["labels"] = target["labels"].to(device)
        images = images.to(device)

        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .get() keeps a loss name that is missing from METRIC_KEYS from
        # aborting the run with a KeyError.
        for k, v in loss_dict.items():
            history[k] = history.get(k, 0.0) + v.item()
        history["total"] += loss.item()
        train_batches += 1

    if train_batches == 0:
        # Empty loader (or max_batches == 0): there is nothing to average.
        return {key: float("nan") for key in history}

    return {key: value / train_batches for key, value in history.items()}


def pipeline(epochs, model, optimizer, dt_train_labeled, device, lr_scheduler=None):
    """Train for ``epochs`` epochs, re-plotting the loss curves after each one.

    Args:
        epochs: Number of calls to ``train``.
        model: Model forwarded to ``train``.
        optimizer: Optimizer forwarded to ``train``.
        dt_train_labeled: Loader of labeled batches forwarded to ``train``.
        device: Device forwarded to ``train``.
        lr_scheduler: Optional learning-rate scheduler. When given, its
            ``step()`` is called once at the end of every epoch; without this
            call a scheduler attached to the optimizer never changes the
            learning rate.

    Returns:
        dict mapping each metric name to the list of per-epoch mean values.
    """
    history = {key: [] for key in METRIC_KEYS}
    for epoch in range(epochs):
        print(f"\n==================== Epoch {epoch+1}/{epochs} ====================\n")
        train_history = train(model, optimizer, dt_train_labeled, device)
        # setdefault tolerates metric names that are not listed in METRIC_KEYS.
        for key, val in train_history.items():
            history.setdefault(key, []).append(val)
        if lr_scheduler is not None:
            lr_scheduler.step()
        plot_losses(history)
    return history


# Keep the returned curves in a variable so they stay inspectable afterwards
# (and so the cell does not echo the whole dict as its output).
loss_history = pipeline(30, model, optimizer, dt_train_labeled, device,
                        lr_scheduler=lr_scheduler)





Training:   0%|          | 5/1253 [00:03<15:44,  1.32it/s]


{'loss_classifier': [1.8295207977294923], 'loss_box_reg': [0.10072046448476613], 'loss_objectness': [0.4210142493247986], 'loss_rpn_box_reg': [0.05141953900456429], 'total': [2.4026750206947325]}




Training:   0%|          | 5/1253 [00:03<13:51,  1.50it/s]


{'loss_classifier': [1.8295207977294923, 1.1688620388507842], 'loss_box_reg': [0.10072046448476613, 0.16546856313943864], 'loss_objectness': [0.4210142493247986, 0.11648987978696823], 'loss_rpn_box_reg': [0.05141953900456429, 0.02231107037514448], 'total': [2.4026750206947325, 1.473131537437439]}




Training:   0%|          | 5/1253 [00:03<14:02,  1.48it/s]


{'loss_classifier': [1.8295207977294923, 1.1688620388507842, 0.6384583473205566], 'loss_box_reg': [0.10072046448476613, 0.16546856313943864, 0.17361959517002107], 'loss_objectness': [0.4210142493247986, 0.11648987978696823, 0.09805498719215393], 'loss_rpn_box_reg': [0.05141953900456429, 0.02231107037514448, 0.025340333580970764], 'total': [2.4026750206947325, 1.473131537437439, 0.9354732275009155]}




Training:   0%|          | 5/1253 [00:03<13:58,  1.49it/s]


{'loss_classifier': [1.8295207977294923, 1.1688620388507842, 0.6384583473205566, 0.3640890508890152], 'loss_box_reg': [0.10072046448476613, 0.16546856313943864, 0.17361959517002107, 0.13405905961990355], 'loss_objectness': [0.4210142493247986, 0.11648987978696823, 0.09805498719215393, 0.09399413466453552], 'loss_rpn_box_reg': [0.05141953900456429, 0.02231107037514448, 0.025340333580970764, 0.017204658314585685], 'total': [2.4026750206947325, 1.473131537437439, 0.9354732275009155, 0.6093469321727752]}




Training:   0%|          | 5/1253 [00:03<14:01,  1.48it/s]


{'loss_classifier': [1.8295207977294923, 1.1688620388507842, 0.6384583473205566, 0.3640890508890152, 0.2723005011677742], 'loss_box_reg': [0.10072046448476613, 0.16546856313943864, 0.17361959517002107, 0.13405905961990355, 0.12415132075548171], 'loss_objectness': [0.4210142493247986, 0.11648987978696823, 0.09805498719215393, 0.09399413466453552, 0.0948096290230751], 'loss_rpn_box_reg': [0.05141953900456429, 0.02231107037514448, 0.025340333580970764, 0.017204658314585685, 0.023886961676180363], 'total': [2.4026750206947325, 1.473131537437439, 0.9354732275009155, 0.6093469321727752, 0.5151484131813049]}




Training:   0%|          | 4/1253 [00:02<15:20,  1.36it/s]


KeyboardInterrupt: 