In [None]:
import os
import gc
import torch
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import (AutoImageProcessor, AutoModelForObjectDetection, 
                          TrainingArguments, Trainer, TrainerCallback)
from torchmetrics.detection.mean_ap import MeanAveragePrecision

In [None]:
# Checkpoint identifier shared by the image processor below and by the
# detection model that is instantiated later in the notebook.
model_id = "microsoft/conditional-detr-resnet-50"

# Pre-/post-processing pipeline matching the checkpoint. The `size` override is
# forwarded as-is to the processor.
# NOTE(review): presumably this bounds the resized image to 640 px on both
# edges — confirm against the processor documentation.
image_processor = AutoImageProcessor.from_pretrained(
    model_id,
    size=dict(longest_edge=640, shortest_edge=640),
)

In [None]:
def prepare_and_split_data(root_path, subfolder, seed=42):
    """Index a labelled image folder and split it 70/15/15 into train/val/test.

    Every ``*.png`` file in ``root_path/subfolder`` becomes one record. When a
    file with the same stem and a ``.txt`` extension sits next to the image,
    each non-blank line is read as ``class x_center y_center width height``,
    where the four box values are fractions of the image width/height. Boxes
    are converted to absolute-pixel ``[x_min, y_min, width, height]``.

    Args:
        root_path: Directory that contains the dataset subfolders.
        subfolder: Name of the subfolder holding the images and label files.
        seed: Seed forwarded to both ``train_test_split`` calls (default 42).

    Returns:
        dict with keys ``'train'``, ``'val'`` and ``'test'``. Each row holds
        ``image_path``, ``image_id``, ``objects`` (list of annotation dicts)
        and ``orig_size`` as ``[height, width]``.

    Raises:
        ValueError: If a non-blank label line does not consist of exactly five
            numeric fields.
    """
    img_dir = os.path.join(root_path, subfolder)
    image_files = sorted(f for f in os.listdir(img_dir) if f.endswith('.png'))

    dataset_list = []
    for idx, img_name in enumerate(image_files):
        img_path = os.path.join(img_dir, img_name)
        # splitext only swaps the extension; str.replace would also rewrite a
        # ".png" occurring in a directory name or earlier in the file name and
        # the label file would then silently not be found.
        txt_path = os.path.splitext(img_path)[0] + '.txt'

        # Only the header is needed for the size; the context manager closes
        # the file right away.
        with Image.open(img_path) as img:
            width, height = img.size

        objs = []
        if os.path.exists(txt_path):
            with open(txt_path, 'r') as f:
                for line_no, line in enumerate(f, start=1):
                    fields = line.split()
                    if not fields:
                        # Blank / trailing empty lines carry no annotation.
                        continue
                    if len(fields) != 5:
                        raise ValueError(
                            f"{txt_path}:{line_no}: expected 5 fields "
                            f"'class x_center y_center width height', got {len(fields)}"
                        )
                    cls, x_c, y_c, w, h = map(float, fields)
                    abs_w, abs_h = w * width, h * height
                    # Centre-based relative box -> top-left absolute box.
                    x_min = (x_c * width) - (abs_w / 2)
                    y_min = (y_c * height) - (abs_h / 2)
                    objs.append({
                        "id": len(objs), "image_id": idx, "category_id": int(cls),
                        "bbox": [x_min, y_min, abs_w, abs_h], "area": abs_w * abs_h, "iscrowd": 0
                    })
        dataset_list.append({"image_path": img_path, "image_id": idx, "objects": objs, "orig_size": [height, width]})

    full_ds = Dataset.from_list(dataset_list)
    # Split: 70% Train, 15% Val, 15% Test (the 30% hold-out is halved).
    train_test = full_ds.train_test_split(test_size=0.3, seed=seed)
    test_val = train_test['test'].train_test_split(test_size=0.5, seed=seed)

    return {
        'train': train_test['train'],
        'val': test_val['train'],
        'test': test_val['test']
    }

In [None]:
def transform_batch(examples):
    """On-the-fly batch transform: load images and run them through the processor.

    Args:
        examples: Batch dict with parallel lists under ``"image_path"``,
            ``"image_id"`` and ``"objects"``.

    Returns:
        Whatever ``image_processor`` returns for the RGB images and their
        annotations, requested as PyTorch tensors (``return_tensors="pt"``).
    """
    images = []
    for path in examples["image_path"]:
        # Close the file handle as soon as the converted copy exists instead
        # of leaving it open until the image object is garbage collected.
        with Image.open(path) as img:
            images.append(img.convert("RGB"))

    # One target per image: the image id plus its list of annotation dicts.
    targets = [
        {"image_id": image_id, "annotations": objects}
        for image_id, objects in zip(examples["image_id"], examples["objects"])
    ]
    return image_processor(images=images, annotations=targets, return_tensors="pt")

In [None]:
def collate_fn(batch):
    """Assemble a list of transformed samples into one model-ready batch.

    The per-sample ``pixel_values`` are handed to ``image_processor.pad`` and
    the resulting ``pixel_values`` / ``pixel_mask`` entries are returned
    together with the untouched per-sample ``labels`` list.
    """
    padded = image_processor.pad(
        [sample["pixel_values"] for sample in batch],
        return_tensors="pt",
    )
    collated = {key: padded[key] for key in ("pixel_values", "pixel_mask")}
    # Labels stay a plain list: one entry per sample, in batch order.
    collated["labels"] = [sample["labels"] for sample in batch]
    return collated


In [None]:
def val_compute_metrics(evaluation_pred):
    """Placeholder metrics hook: ignores ``evaluation_pred`` and reports nothing.

    Always returns a new, empty dict.
    """
    no_metrics = dict()
    return no_metrics

In [None]:
def _ground_truth_target(objects):
    """Turn ``[x, y, w, h]`` annotation dicts into xyxy box / label tensors."""
    boxes = [[x, y, x + w, y + h] for x, y, w, h in (obj["bbox"] for obj in objects)]
    labels = [obj["category_id"] for obj in objects]
    return dict(
        # Explicit (n, 4) shape so an image without objects yields a (0, 4)
        # tensor; reshape(-1, 4) is ambiguous for zero elements.
        boxes=torch.tensor(boxes, dtype=torch.float32).reshape(len(boxes), 4),
        labels=torch.tensor(labels, dtype=torch.int64),
    )


def run_resnet_experiment(split_ds, name):
    """Fine-tune the detector on one dataset variant and report test-set mAP.

    Args:
        split_ds: Mapping with ``'train'``, ``'val'`` and ``'test'`` datasets
            whose rows carry ``image_path``, ``image_id``, ``objects`` and
            ``orig_size`` (``[height, width]``). The datasets themselves are
            left untransformed, so the same mapping can be passed again.
        name: Short tag used for the output directory, the progress bar and
            the printed report.

    Returns:
        The result of ``MeanAveragePrecision.compute()``; its ``'map_50'`` and
        ``'map'`` entries are printed before returning.
    """
    # with_transform returns transformed views instead of mutating the
    # caller's datasets. set_transform made a second call on the same
    # `split_ds` fail, because the raw `orig_size`/`objects` columns were no
    # longer reachable.
    train_ds = split_ds["train"].with_transform(transform_batch)
    val_ds = split_ds["val"].with_transform(transform_batch)
    test_ds = split_ds["test"].with_transform(transform_batch)
    raw_test = split_ds["test"]  # untransformed rows: ground truth + original size

    model = AutoModelForObjectDetection.from_pretrained(
        model_id, num_labels=3, ignore_mismatched_sizes=True,
        id2label={0: "door", 1: "wall", 2: "window"},
        label2id={"door": 0, "wall": 1, "window": 2}
    )

    args = TrainingArguments(
        output_dir=f"./results_resnet_{name}",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=25,
        # Mixed precision needs a CUDA device; fall back to fp32 elsewhere
        # instead of failing at argument validation.
        fp16=torch.cuda.is_available(),
        learning_rate=1e-4,
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        # The transform reads image_path / objects, so the Trainer must not
        # strip columns that are not model inputs.
        remove_unused_columns=False,
        report_to="none",
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model=model, args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=image_processor,
        data_collator=collate_fn
    )

    trainer.train()

    model.eval()
    metric = MeanAveragePrecision(box_format='xyxy')
    for i in tqdm(range(len(test_ds)), desc=f"Final Test {name}"):
        pixel_values = test_ds[i]["pixel_values"].unsqueeze(0).to(model.device)
        raw_item = raw_test[i]
        # Post-processing rescales boxes to the original (height, width).
        orig_size = torch.tensor([raw_item["orig_size"]])

        with torch.no_grad():
            outputs = model(pixel_values)

        results = image_processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=orig_size)[0]

        # Images without ground truth are scored too: any detection on them
        # is a false positive, and skipping them would inflate the metric.
        metric.update(
            [dict(boxes=results["boxes"].cpu(), scores=results["scores"].cpu(), labels=results["labels"].cpu())],
            [_ground_truth_target(raw_item["objects"])]
        )

    final_res = metric.compute()
    print(f"\n--- {name.upper()} FINAL TEST RESULTS ---")
    print(f"mAP50: {final_res['map_50']:.4f}")
    print(f"mAP50-95: {final_res['map']:.4f}")
    return final_res

In [1]:
# Collect garbage first so tensors that are no longer referenced are actually
# freed; only then can empty_cache() hand their cached CUDA blocks back.
gc.collect()
torch.cuda.empty_cache()

data_colorful = prepare_and_split_data("data", "colorful")
res_color = run_resnet_experiment(data_colorful, "colorful")

# Same cleanup between experiments so the second run does not start with
# memory still held by leftovers of the first.
gc.collect()
torch.cuda.empty_cache()

data_bw = prepare_and_split_data("data", "black_and_white")
res_bw = run_resnet_experiment(data_bw, "bw")

Some weights of ConditionalDetrForObjectDetection were not initialized from the model checkpoint at microsoft/conditional-detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.bias: found shape torch.Size([91]) in the checkpoint and torch.Size([3]) in the model instantiated
- class_labels_classifier.weight: found shape torch.Size([91, 256]) in the checkpoint and torch.Size([3, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Epoch,Training Loss,Validation Loss
0,3.1426,2.897964
1,2.7036,2.782233
2,2.4897,2.707849
4,2.3548,2.499986
5,2.2265,2.445969
6,2.1816,2.305512
8,2.1801,2.231669
9,2.0268,2.234604
10,2.0576,2.198721
12,1.97,2.122638


Final Test colorful:   0%|          | 0/42 [00:00<?, ?it/s]


--- COLORFUL FINAL TEST RESULTS ---
mAP50: 0.3144
mAP50-95: 0.1346


Some weights of ConditionalDetrForObjectDetection were not initialized from the model checkpoint at microsoft/conditional-detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.bias: found shape torch.Size([91]) in the checkpoint and torch.Size([3]) in the model instantiated
- class_labels_classifier.weight: found shape torch.Size([91, 256]) in the checkpoint and torch.Size([3, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
0,2.2785,2.354462
1,2.1195,2.145136
2,2.0724,2.080273
4,1.7905,1.837766
5,1.7815,1.792887
6,1.7322,1.714936
8,1.6309,1.693833
9,1.5845,1.643676
10,1.5164,1.594365
12,1.4521,1.531413




Final Test bw:   0%|          | 0/709 [00:00<?, ?it/s]


--- BW FINAL TEST RESULTS ---
mAP50: 0.5422
mAP50-95: 0.2625
