In [None]:
! pip install accelerate
! pip install datasets
! pip install timm
! pip install evaluate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import os
import json
import torch
from torch.utils.data import Dataset
from torch.optim import AdamW,lr_scheduler
import torchvision
from datasets import load_dataset
from transformers import AutoImageProcessor,AutoModelForObjectDetection
from transformers import TrainingArguments,Trainer
from PIL import Image,ImageDraw
import matplotlib.pyplot as plt
import requests
import torch.nn.functional as F
import evaluate
from tqdm import tqdm

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
checkpoint = "facebook/detr-resnet-50"
id2label = {0:'human_face'}
label2id = {'human_face':0}
id2label,label2id
num_epoch = 500

processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    num_labels=1,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
optimizer = AdamW(model.parameters(),lr=1e-5)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=num_epoch)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

In [7]:
wider_face = load_dataset('wodeyuanbukongda/detr_format')
# weight = torch.ones(len(wider_face['train']))
# a = torch.multinomial(weight,1000,replacement=False)
# wider_face = wider_face['train'].select(a)
wider_face = wider_face['train'].select(range(0,1000))
wider_face = wider_face.train_test_split(test_size=0.05)
wider_face

DatasetDict({
    train: Dataset({
        features: ['image', 'image_id', 'area', 'bbox', 'label'],
        num_rows: 950
    })
    test: Dataset({
        features: ['image', 'image_id', 'area', 'bbox', 'label'],
        num_rows: 50
    })
})

In [None]:
import albumentations
import numpy as np
import torch

transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=1.0),
        albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

In [None]:
def formatted_anns(image_id, category, area, bbox):
  annotations = []
  for i in range(0, len(category)):
      new_ann = {
          "image_id": image_id,
          "category_id": category[i],
          "isCrowd": 0,
          "area": area[i],
          "bbox": list(bbox[i]),
      }
      annotations.append(new_ann)

  return annotations

In [None]:
def transform_fn(examples):
  image_ids = examples['image_id']
  images, bboxes, areas, categories = [], [], [], []
  for image, area, bbox, label in zip(examples["image"], examples["area"], examples['bbox'], examples['label']):
    image = np.array(image.convert("RGB"))[:, :, ::-1]
    out = transform(image=image, bboxes=bbox, category=label)

    areas.append(area)
    images.append(out["image"])
    bboxes.append(out["bboxes"])
    categories.append(out["category"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, areas, bboxes)
    ]

    return processor(images=images, annotations=targets, return_tensors="pt")

In [None]:
wider_face_transform = wider_face.with_transform(transform_fn)
wider_face_transform[0]

In [71]:
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["pixel_mask"] = encoding["pixel_mask"]
    batch["labels"] = labels
    return batch

In [None]:
training_args = TrainingArguments(
    output_dir="detr-resnet-50_finetuned_widerface",
    per_device_train_batch_size=64,
    num_train_epochs=num_epoch,
    fp16=True,
    save_steps=200,
    logging_steps=30,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=wider_face_transform,
    optimizers=(optimizer,scheduler),
    tokenizer=processor,
)

In [None]:
trainer.train()

In [18]:
# image = wider_face['train'][0]['image']
image = Image.open('/content/drive/MyDrive/2024Spring/640ComputerVision/YOLO/Image/0122.jpg')
#image = Image.open('/content/drive/MyDrive/2024Spring/640ComputerVision/YOLO/Image/0014.jpg')
model.eval()
with torch.no_grad():
    inputs = processor(images=image, return_tensors="pt")
    # pixel_values = inputs['pixel_values'].to(device)
    # pixel_mask = inputs['pixel_mask'].to(device)
    # inputs = {'pixel_values':pixel_values,'pixel_mask':pixel_mask}
    outputs = model(**inputs)
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(outputs, threshold=0.8, target_sizes=target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

Detected human_face with confidence 0.869 at location [36.63, 622.52, 199.98, 816.68]
Detected human_face with confidence 0.902 at location [860.35, 659.39, 1002.66, 846.39]


In [None]:
draw = ImageDraw.Draw(image)

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    x, y, x2, y2 = tuple(box)
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    draw.text((x, y), model.config.id2label[label.item()], fill="white")

image

In [20]:
torch.save({
      'model_state_dict': model.state_dict(),
  }, r'/content/drive/MyDrive/2024Spring/640ComputerVision/YOLO/checkpoint.pth')

In [42]:
checkpoint = torch.load(r'/content/drive/MyDrive/2024Spring/640ComputerVision/YOLO/checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [75]:
def val_formatted_anns(image_id, category, area, bbox):
  annotations = []
  for i in range(0, len(category)):
      new_ann = {
          "id": image_id,
          "image_id": image_id,
          "category_id": category[i],
          "iscrowd": 0,
          "area": area[i],
          "bbox": list(bbox[i]),
      }
      annotations.append(new_ann)

  return annotations

In [76]:
def save_annotation_file_images(dataset):
    output_json = {}
    path_output = f"{os.getcwd()}/dataset/"

    if not os.path.exists(path_output):
        os.makedirs(path_output)

    path_anno = os.path.join(path_output, "wider_face.json")
    categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label]
    output_json["images"] = []
    output_json["annotations"] = []
    for example in dataset:
        ann = val_formatted_anns(example["image_id"],example["label"], example["area"], example["bbox"])
        output_json["images"].append(
            {
                "id": example["image_id"],
                "width": example["image"].width,
                "height": example["image"].height,
                "file_name": f"{example['image_id']}.png",
            }
        )
        output_json["annotations"].extend(ann)
    output_json["categories"] = categories_json

    with open(path_anno, "w") as file:
        json.dump(output_json, file, ensure_ascii=False, indent=4)

    for im, img_id in zip(dataset["image"], dataset["image_id"]):
        path_img = os.path.join(path_output, f"{img_id}.png")
        im.save(path_img)

    return path_output, path_anno

In [77]:
class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, image_processor, ann_file):
        super().__init__(img_folder, ann_file)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        # read in PIL image and target in COCO format
        img, target = super(CocoDetection, self).__getitem__(idx)

        # preprocess image and target: converting target to DETR format,
        # resizing + normalization of both image and target)
        image_id = self.ids[idx]
        target = {"image_id": image_id, "annotations": target}
        encoding = self.image_processor(images=img, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()  # remove batch dimension
        target = encoding["labels"][0]  # remove batch dimension

        return {"pixel_values": pixel_values, "labels": target}

path_output, path_anno = save_annotation_file_images(wider_face["test"])
test_ds_coco_format = CocoDetection(path_output, processor, path_anno)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [79]:
module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco)
val_dataloader = torch.utils.data.DataLoader(
    test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
with torch.no_grad():
    for idx, batch in enumerate(tqdm(val_dataloader)):
        pixel_values = batch["pixel_values"].to(device)
        pixel_mask = batch["pixel_mask"].to(device)

        labels = [
            {k: v for k, v in t.items()} for t in batch["labels"]
        ]  # these are in DETR format, resized + normalized

        # forward pass
        outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
        results = processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)

        module.add(prediction=results, reference=labels)
        del batch

results = module.compute()
print(results)

  self.pid = os.fork()
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
  self.pid = os.fork()
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
100%|██████████| 7/7 [00:20<00:00,  2.92s/it]


Accumulating evaluation results...
DONE (t=0.08s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.102
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.003
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.010
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.013
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.003
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.025
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la