In [None]:
import os
import sys
import PIL
import numpy as np

from matplotlib import pyplot as plt

In [None]:
import time
import json
import types
import random
import argparse
import datetime
from pathlib import Path
from functools import partial

import torch
from torch import nn
from torchvision.models import resnet50
import torch.nn.functional as F
import torchvision as TV
import torchvision.transforms as T
from torch.utils.data import DataLoader, DistributedSampler

import datasets
import util.misc as utils
from models import build_model
from engine import evaluate, train_one_epoch
from datasets import build_dataset, get_coco_api_from_dataset

In [None]:
import warnings
# Install a blanket "ignore" filter so that no warning of any category is shown
# in the notebook output for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Available Device: {device}")

In [None]:
class DETRdemo(nn.Module):
    """
    Demo DETR-style detector on top of a frozen DINOv2 (``dinov2_vitl14``) backbone.

    Pipeline implemented by ``forward``:
    1. Patch tokens are extracted from the images with DINOv2.
    2. The tokens are laid out on their 2-D patch grid, adaptively
       average-pooled to a fixed ``POOLED_GRID`` and linearly projected to
       ``hidden_dim``.
    3. Learned row/column positional encodings are combined with the
       projected tokens and fed, together with learned object queries,
       through a default ``nn.Transformer``.
    4. Two linear heads produce class logits and sigmoid-squashed boxes.

    Differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    Supports multiple batch sizes.

    Args:
        num_classes: number of object classes; the classifier gets one extra output.
        hidden_dim: transformer width; also the output width of the backbone projection.
        nheads: number of attention heads of the transformer.
        num_encoder_layers: number of transformer encoder layers.
        num_decoder_layers: number of transformer decoder layers.
        num_queries: number of learned object queries (predictions per image).
    """

    # Side length (in pixels) of one backbone patch; used to recover the token grid.
    PATCH_SIZE = 14
    # Width of the backbone patch tokens consumed by ``dino_projector``.
    BACKBONE_DIM = 1024
    # Spatial size the patch grid is pooled to before entering the transformer.
    POOLED_GRID = (7, 7)

    def __init__(self, num_classes, hidden_dim=256, nheads=8,
                 num_encoder_layers=6, num_decoder_layers=6, num_queries=100):
        super().__init__()

        # create dinov2 backbone and then
        # arrange the other projections
        self.dino_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')

        self.avg_pool = nn.AdaptiveAvgPool2d(self.POOLED_GRID)
        # Project to hidden_dim (not a fixed 256) so the tokens always match the
        # transformer width and the positional encodings.
        self.dino_projector = nn.Linear(self.BACKBONE_DIM, hidden_dim)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR linear_bbox layer is 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
        self.linear_bbox = nn.Linear(hidden_dim, 4)

        # output positional encodings (object queries)
        self.num_queries = num_queries
        self.query_pos = nn.Parameter(torch.rand(num_queries, hidden_dim))

        # spatial positional encodings
        # note that in baseline DETR we use sine positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))

        # Freeze all layers by setting requires_grad=False for all parameters
        for param in self.dino_model.parameters():
            param.requires_grad = False

        print("The Dino model is now frozen and will not require gradients during training.")

    def forward(self, inputs):
        """
        Run detection on a batch of images.

        Args:
            inputs: 4-D image batch laid out as (batch, channels, height, width);
                height and width must be multiples of ``PATCH_SIZE``.

        Returns:
            dict with
            * ``'pred_logits'``: (batch, num_queries, num_classes + 1) class logits
            * ``'pred_boxes'``: (batch, num_queries, 4) values in [0, 1]

        Raises:
            ValueError: if the number of backbone tokens does not match the
                patch grid derived from the image size.
        """
        # propagate inputs through DinoV2
        x = self.dino_model.forward_features(inputs)["x_norm_patchtokens"]
        _, _, img_h, img_w = inputs.shape  # NCHW: height comes before width
        B, N, D = x.shape

        grid_h = img_h // self.PATCH_SIZE
        grid_w = img_w // self.PATCH_SIZE
        if grid_h * grid_w != N:
            raise ValueError(
                f"expected {grid_h}x{grid_w} patch tokens for a {img_h}x{img_w} "
                f"input, but the backbone returned {N}")

        # Reshape the token sequence back to its 2-D grid, pool it to a fixed
        # size and project it to the transformer width.
        x = x.reshape(B, grid_h, grid_w, D)
        x = self.avg_pool(x.permute(0, 3, 1, 2))   # [B, D, H, W]
        x = x.flatten(2).permute(0, 2, 1)          # [B, H*W, D]
        x = self.dino_projector(x)                 # [B, H*W, hidden_dim]
        h = x.permute(1, 0, 2)                     # [H*W, B, hidden_dim]

        # Construct positional encodings for the pooled grid
        H, W = self.POOLED_GRID

        # The singleton batch axis lets the encoding broadcast over the batch.
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)

        # Repeat query_pos for batch size
        query_pos = self.query_pos.unsqueeze(1).repeat(1, B, 1)

        # Propagate through the transformer (sequence-first layout), then
        # move the batch axis to the front.
        h = self.transformer(pos + 0.1 * h, query_pos).transpose(0, 1)

        # Project transformer outputs to class labels and bounding boxes
        return {
            'pred_logits': self.linear_class(h),
            'pred_boxes': self.linear_bbox(h).sigmoid()
        }

In [None]:
# Build the detector for our 7 target classes and place it on the selected device.
bddk_num_classes = 7
model = DETRdemo(num_classes=bddk_num_classes)
model = model.to(device)

# Report only the parameters that will receive gradients.
n_parameters = sum(
    weight.numel()
    for weight in filter(lambda w: w.requires_grad, model.parameters())
)
print('number of params:', n_parameters)

In [None]:
# Load the weights of the pretrained DETR demo checkpoint
state_dict = torch.hub.load_state_dict_from_url(
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)

# Since we will fine-tune for different number of classes (COCO:91 v Our Problem: 7)
# the prediction heads are dropped and keep their fresh initialisation.
layers_to_remove = ["linear_class.weight", "linear_class.bias", "linear_bbox.weight", "linear_bbox.bias"]

for layer in layers_to_remove:
    state_dict.pop(layer, None)


# 1. Get model and pretrained state_dict
model_state_dict = model.state_dict()
pretrained_state_dict = state_dict

# 2. Find common keys whose tensors also have identical shapes.
#    load_state_dict(strict=False) tolerates missing/unexpected keys but still
#    raises on a size mismatch, so mismatching entries must be filtered out here.
matching_keys = {
    key
    for key in model_state_dict.keys() & pretrained_state_dict.keys()
    if model_state_dict[key].shape == pretrained_state_dict[key].shape
}

# 3. Create a filtered state_dict with only the matching layers
filtered_state_dict = {key: pretrained_state_dict[key] for key in matching_keys}

# 4. Load the filtered state_dict into the model; everything else stays as initialised
missing_keys, unexpected_keys = model.load_state_dict(filtered_state_dict, strict=False)

print(f'# of Missing: {len(missing_keys)} and Matched: {len(filtered_state_dict)}')

In [None]:
# If you uncomment and run the loop below, you will see that the missing keys belong to:
## 1) the DINO backbone
## 2) the final layers used for classification and bounding-box prediction
"""
for element in missing_keys:
    print(element)
"""

In [None]:
# Smoke test: push a random batch of four 448x448 RGB tensors through the model
# and report the shapes of both prediction heads.
dummy_inputs = torch.randn(4, 3, 448, 448)
dummy_inputs = dummy_inputs.to(device)

dummy_outputs = model(dummy_inputs)

for _label, _key in (("Pred Logits Shape:", 'pred_logits'),
                     ("Pred Boxes Shape:", 'pred_boxes')):
    print(_label, dummy_outputs[_key].shape)
print()

In [None]:
# Smoke test with preprocessing: a random uint8 image is wrapped as a PIL image,
# sent through the transform pipeline, duplicated into a batch of five and
# passed through the model.
dummy_inputs = PIL.Image.fromarray(
    np.random.randint(0, 256, (448, 448, 3), dtype=np.uint8)
)

# Tensor conversion followed by per-channel mean/std normalisation.
normalize = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dummy_transformation = T.Compose([
    T.RandomHorizontalFlip(),
    T.Resize((448, 448)),
    normalize,
])

# Add the leading batch axis after moving the image to the device.
dummy_transformed = dummy_transformation(dummy_inputs).to(device).unsqueeze(0)
print(dummy_transformed.shape)

dummy_transformed_duplicated = dummy_transformed.repeat(5, 1, 1, 1)
print(dummy_transformed_duplicated.shape)

dummy_outputs = model(dummy_transformed_duplicated)

for _label, _key in (("Pred Logits Shape:", 'pred_logits'),
                     ("Pred Boxes Shape:", 'pred_boxes')):
    print(_label, dummy_outputs[_key].shape)
print()

In [None]:
from models import detr as detr_models
from models import matcher as detr_matcher

from datasets.coco import CocoDetection as coco_dataset

In [None]:
## Loss function (criterion) and post-processor
# Every loss term gets weight 1, stored as a float32 scalar tensor on `device`.
weight_dict = {
    loss_name: torch.tensor(1, dtype=torch.float32, device=device)
    for loss_name in ('loss_ce', 'loss_bbox', 'loss_giou')
}

losses = ['labels', 'boxes', 'cardinality']

# Also kept as a float32 scalar tensor on `device`.
eos_coef = torch.tensor(0.1, dtype=torch.float32, device=device)

bddk_matcher = detr_matcher.HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)

criterion = detr_models.SetCriterion(
    bddk_num_classes,
    matcher=bddk_matcher,
    weight_dict=weight_dict,
    eos_coef=eos_coef,
    losses=losses,
    device=device,
)
postprocessors = {'bbox': detr_models.PostProcess()}

In [None]:
## Optimizer and learning-rate schedule
lr = 5e-5
weight_decay = 1e-4
# Period (in epochs) of the StepLR schedule below.
# NOTE(review): with a period of 2 the learning rate is decayed many times over
# a 20-epoch run — confirm this is intended.
lr_drop = 2

# Optimise only parameters that require gradients.
# NOTE(review): the backbone attribute of the model is named `dino_model`, so the
# "backbone" name filter presumably never matches; the requires_grad check is
# what keeps the frozen weights out of the optimizer.
param_dicts = [{
    "params": [
        param
        for name, param in model.named_parameters()
        if param.requires_grad and "backbone" not in name
    ],
}]

optimizer = torch.optim.AdamW(params=param_dicts, lr=lr, weight_decay=weight_decay)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_drop)

In [None]:
## Definition of dataset and transformation function

def dino_transforms(image_set):
    """
    Build the image preprocessing pipeline for one dataset split.

    Both splits resize to 448x448, convert to a tensor and normalise each
    channel with fixed mean/std values; the 'train' split additionally
    applies a random horizontal flip before resizing.

    NOTE(review): these are image-only torchvision transforms — confirm that
    the dataset keeps the box annotations consistent with the flip/resize.

    Args:
        image_set: split name, either 'train' or 'val'.

    Returns:
        A ``T.Compose`` with the steps for the requested split.

    Raises:
        ValueError: for any other split name.
    """
    steps = []
    if image_set == 'train':
        # Augmentation is applied to the training split only.
        steps.append(T.RandomHorizontalFlip())
    elif image_set != 'val':
        raise ValueError(f'unknown {image_set}')

    steps.append(T.Resize((448, 448)))
    steps.append(T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]))
    return T.Compose(steps)


# Root folder holding the `train`, `val` and `test` image sub-folders.
# Set the BDD100K_IMAGES_DIR environment variable to point at another location;
# the previous hard-coded path remains the default.
images_main_path = os.environ.get('BDD100K_IMAGES_DIR', r'C:\BDD100K\100k_images')
train_images_path = os.path.join(images_main_path, 'train')
val_images_path = os.path.join(images_main_path, 'val')
test_images_path = os.path.join(images_main_path, 'test')

# One dataset per split; only the training split uses the augmenting pipeline,
# validation and test share the deterministic 'val' pipeline.
print('-*'*20)
dataset_train = coco_dataset(train_images_path, '../train_subset_COCO_Format.json', transforms=dino_transforms('train'), return_masks=False)
print('-*'*20)
dataset_val = coco_dataset(val_images_path, '../val_subset_COCO_Format.json', transforms=dino_transforms('val'), return_masks=False)
print('-*'*20)
dataset_test = coco_dataset(test_images_path, '../test_subset_COCO_Format.json', transforms=dino_transforms('val'), return_masks=False)
print('-*'*20)

In [None]:
def my_collate_fn(batch):
    """
    Collate dataset samples into one zero-padded image tensor plus the rest.

    Each sample is a sequence whose first element is a 3-D image tensor. The
    images are copied into the top-left corner of a zero tensor that is as
    large as the biggest image along every axis, so images of different
    sizes can share one batch. All remaining sample fields are returned
    unchanged, grouped per field.

    Args:
        batch: non-empty list of samples, e.g. ``[(image, target), ...]``.

    Returns:
        Tuple whose first element is the padded image batch of shape
        ``(len(batch), max_dim0, max_dim1, max_dim2)`` and whose remaining
        elements are tuples with the corresponding field of every sample.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("my_collate_fn received an empty batch")

    # Transpose [(img, tgt), ...] into [(img, ...), (tgt, ...)].
    columns = list(zip(*batch))
    images = columns[0]

    # Largest extent along every axis over all images in the batch.
    max_size = [max(dims) for dims in zip(*(img.shape for img in images))]

    # Padded batch with the dtype/device of the first image.
    padded = torch.zeros([len(images)] + max_size,
                         dtype=images[0].dtype, device=images[0].device)

    # Copy every image into the top-left corner of its slot; the rest stays zero.
    for i, img in enumerate(images):
        padded[i, :img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)

    columns[0] = padded
    return tuple(columns)

In [None]:
## Definition of the data loaders
batch_size = 2

# Training samples are drawn in random order; validation and test keep dataset order.
sampler_train = torch.utils.data.RandomSampler(dataset_train)
sampler_val = torch.utils.data.SequentialSampler(dataset_val)
sampler_test = torch.utils.data.SequentialSampler(dataset_test)

# Training batches always have exactly `batch_size` samples (incomplete last batch dropped).
batch_sampler_train = torch.utils.data.BatchSampler(
    sampler_train, batch_size=batch_size, drop_last=True)

data_loader_train = DataLoader(
    dataset=dataset_train,
    batch_sampler=batch_sampler_train,
    collate_fn=my_collate_fn,  # utils.collate_fn
)

data_loader_val = DataLoader(
    dataset=dataset_val,
    batch_size=batch_size,
    sampler=sampler_val,
    drop_last=False,
    collate_fn=my_collate_fn,  # utils.collate_fn
)

data_loader_test = DataLoader(
    dataset=dataset_test,
    batch_size=batch_size,
    sampler=sampler_test,
    drop_last=False,
    collate_fn=my_collate_fn,  # utils.collate_fn
)

In [None]:
## START TRAINING (FINE-TUNING)
start_epoch = 0
n_epoch = 20

# Gradient-clipping threshold handed to train_one_epoch, kept as a float32 tensor on `device`.
clip_max_norm = 0.1
clip_max_norm = torch.tensor(clip_max_norm, dtype=torch.float32, device=device)

output_dir = Path('weights')
if output_dir:
    # Checkpoints, log.txt and the eval/ folder are all written below this
    # directory, so make sure it exists before the first epoch finishes.
    output_dir.mkdir(parents=True, exist_ok=True)

# The ground-truth COCO API of the validation set does not change between
# epochs, so it is looked up once instead of on every iteration.
base_ds = get_coco_api_from_dataset(dataset_val)

print("Start training")
start_time = time.time()
for epoch in range(start_epoch, n_epoch):

    train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch, clip_max_norm)

    lr_scheduler.step()

    if output_dir:
        # 'checkpoint.pth' is overwritten every epoch and always holds the latest state.
        checkpoint_paths = [output_dir / 'checkpoint.pth']

        # extra checkpoint before LR drop and every 2 epochs
        if (epoch + 1) % lr_drop == 0 or (epoch + 1) % 2 == 0:
            checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
        for checkpoint_path in checkpoint_paths:
            utils.save_on_master({
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch
            }, checkpoint_path)

    # Evaluate on the validation split after every epoch.
    test_stats, coco_evaluator = evaluate(
        model, criterion, postprocessors, data_loader_val, base_ds, device, output_dir
    )

    log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

    if output_dir and utils.is_main_process():
        # One JSON record per epoch, appended to the running log.
        with (output_dir / "log.txt").open("a") as f:
            f.write(json.dumps(log_stats) + "\n")

        # for evaluation logs
        if coco_evaluator is not None:
            (output_dir / 'eval').mkdir(parents=True, exist_ok=True)
            if "bbox" in coco_evaluator.coco_eval:
                # 'latest.pth' is overwritten each epoch; a per-epoch copy is kept as well.
                filenames = ['latest.pth', f'{epoch:03}.pth']
                for name in filenames:
                    torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                output_dir / "eval" / name)


total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))