## Setup 
This is originally on Google Colab.

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Direct to your assignment folder.
%cd /content/drive/MyDrive/ML_P2_Illustrations

%load_ext autoreload
%autoreload 1

# Install
!pip install torch torchvision torchaudio ultralytics supervision
!!pip install transformers accelerate sentencepiece


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ML_P2_Illustrations




## Benchmarking for Entity Detection/Classification
We benchmarked three entity detection models: YOLOv10, Faster R-CNN and Detr on a manually annotated sample set. The confidence threshold is set at 5% to ensure more accurate classification: `Human` (image containing entities of "person" and no any animals), `Animal` (image containing entities of animals and no "person"), `Both` (image containing entities of "person" and animals), and `Neither` (image containing neither entities of "person" nor animals).

In [None]:
import os
import pandas as pd
import torch
import cv2
import matplotlib.pyplot as plt
from ultralytics import YOLO
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from transformers import DetrForObjectDetection, DetrImageProcessor
import supervision as sv

# Settings
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
CONFIDENCE_THRESHOLD = 0.05

# Load YOLO model
yolo_model = YOLO("yolov10x.pt")

# Load Faster R-CNN model
faster_rcnn_model = fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn_model.to(DEVICE)
faster_rcnn_model.eval()

# Load DETR model and processor
CHECKPOINT = 'facebook/detr-resnet-50'
detr_processor = DetrImageProcessor.from_pretrained(CHECKPOINT)
detr_model = DetrForObjectDetection.from_pretrained(CHECKPOINT)
detr_model.to(DEVICE)
detr_model.eval()

# Define category mappings
category_dict = {
    0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
    6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
    11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
    16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
    22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie',
    28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball',
    33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard',
    37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass',
    41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana',
    47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot',
    52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
    57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
    62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
    67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
    72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
    77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
}


animal_classes = {'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe'}

# Classification logic
def classify_detections(detected_classes):
    has_human = 'person' in detected_classes
    has_animal = any(animal in detected_classes for animal in animal_classes)
    if has_human and has_animal:
        return "Both"
    elif has_human:
        return "Human"
    elif has_animal:
        return "Animal"
    else:
        return "neither"

# Function to process images with YOLO
def process_with_yolo(image_path):
    results = yolo_model(image_path, conf=CONFIDENCE_THRESHOLD, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(results)
    detected_classes = {category_dict[class_id] for class_id in detections.class_id}
    return classify_detections(detected_classes)

# Function to process images with Faster R-CNN
def process_with_faster_rcnn(image_path):
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    tensor_image = F.to_tensor(image_rgb).unsqueeze(0).to(DEVICE)
    outputs = faster_rcnn_model(tensor_image)[0]

    # Filter detections to include only valid class IDs
    detected_classes = {
        category_dict[label.item()] for label, score in zip(outputs['labels'], outputs['scores'])
        if score.item() > CONFIDENCE_THRESHOLD and label.item() in category_dict
    }
    return classify_detections(detected_classes)


# Function to process images with DETR
def process_with_detr(image_path):
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = detr_processor(images=image_rgb, return_tensors="pt").to(DEVICE)
    outputs = detr_model(**inputs)

    logits = outputs.logits[0]
    detected_classes = {
        category_dict[label.item()] for label, score in zip(logits.argmax(-1), logits.max(-1).values)
        if score.item() > CONFIDENCE_THRESHOLD and label.item() in category_dict
    }
    return classify_detections(detected_classes)


# Load annotations
annotations_path = "Annotation.xlsx"
annotations_df = pd.read_excel(annotations_path)

# Define base folder for images
base_folder = "preprocessingV2/out"

# Run benchmarking
results = []
for _, row in annotations_df.iterrows():
    folder = row['Book']
    page_number = row['Page number']
    image_path = os.path.join(base_folder, folder, f"page{page_number}.png")

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    human_class = row['Yann']  # Human annotation

    # Process image with each model
    yolo_class = process_with_yolo(image_path)
    faster_rcnn_class = process_with_faster_rcnn(image_path)
    detr_class = process_with_detr(image_path)

    # Append results
    results.append({
        'Image': image_path,
        'Human Annotation': human_class,
        'YOLO Classification': yolo_class,
        'Faster R-CNN Classification': faster_rcnn_class,
        'DETR Classification': detr_class
    })

# Create a DataFrame of results
results_df = pd.DataFrame(results)

# Compute accuracies
def compute_accuracy(model_column):
    correct_classifications = (results_df['Human Annotation'] == results_df[model_column]).sum()
    return correct_classifications / len(results_df)

yolo_accuracy = compute_accuracy('YOLO Classification')
faster_rcnn_accuracy = compute_accuracy('Faster R-CNN Classification')
detr_accuracy = compute_accuracy('DETR Classification')

# Output the accuracies and results
print(f"YOLO Accuracy: {yolo_accuracy * 100:.2f}%")
print(f"Faster R-CNN Accuracy: {faster_rcnn_accuracy * 100:.2f}%")
print(f"DETR Accuracy: {detr_accuracy * 100:.2f}%")


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


YOLO Accuracy: 69.00%
Faster R-CNN Accuracy: 14.00%
DETR Accuracy: 11.00%


In [None]:
results_df

Unnamed: 0,Image,Human Annotation,YOLO Classification,Faster R-CNN Classification,DETR Classification
0,preprocessingV2/out/abcbooka00nulliala/page1.png,Animal,Animal,Animal,Animal
1,preprocessingV2/out/abcbooka00nulliala/page2.png,Animal,Animal,Animal,Animal
2,preprocessingV2/out/abcbooka00nulliala/page3.png,Animal,Animal,Animal,Animal
3,preprocessingV2/out/abcbooka00nulliala/page4.png,Animal,Animal,Animal,Animal
4,preprocessingV2/out/adriftinwildsora00elli/pag...,Human,Both,Animal,neither
...,...,...,...,...,...
95,preprocessingV2/out/commodorebarneys00otis/pag...,Human,Human,neither,neither
96,preprocessingV2/out/commodorebarneys00otis/pag...,Human,Human,Animal,neither
97,preprocessingV2/out/defenseofcastles00jenk/pag...,Human,Human,Animal,neither
98,preprocessingV2/out/defenseofcastles00jenk/pag...,Human,Human,Animal,neither
