# Detection and Classification of Military Planes: A Comparative Study of CNN, YOLO, Faster R-CNN, RetinaNet, and EfficientDet


---

In the midst of the ongoing war in Ukraine, the ability to accurately detect and classify military aircraft is of critical importance for surveillance, defense, and strategic planning. This capability can significantly contribute to national security and defense efforts.

Traditionally, image recognition tasks have relied on simple Convolutional Neural Networks (CNNs) for their relatively straightforward architecture and ease of implementation. However, with the availability of more complex models such as YOLO (You Only Look Once), Faster R-CNN, RetinaNet, and EfficientDet, the potential for higher accuracy and faster detection speeds has increased.

This project seeks to explore and compare the performance of these advanced models against a simple CNN baseline for the task of military aircraft detection and classification. By conducting this comparative study, I aim to identify the most suitable model for practical applications in military contexts.

I have decided to use the Military Aircraft Recognition dataset from Kaggle (kaggle.com). This dataset includes 3842 images, 20 aircraft types, and 22341 instances annotated with horizontal bounding boxes and oriented bounding boxes.

To simplify the work, I have copied the whole dataset into my git repository. Let's start by importing the project code and data from that repository:


In [None]:
# Clone the repository
!git clone https://github.com/AndriyDragan/HawkEye.git

# Install Ultralytics (provides the YOLO class used below)
%pip install -U ultralytics

# Install EfficientDet
%pip install -U effdet

# Data analysis and preparation

Now let's import the dependencies and take a look at our data:

In [None]:
import os
import gc
import itertools
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import matplotlib.pyplot as plt
import cv2
from collections import Counter, defaultdict
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision.ops import box_iou
import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn, retinanet_resnet50_fpn
from torchvision.models.detection.retinanet import RetinaNetHead
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from ultralytics import YOLO
from effdet import create_model, DetBenchTrain
from tqdm.notebook import tqdm
import time
import random

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy, TensorFlow and PyTorch (all CUDA devices too
    when CUDA is available).

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same order as before: Python, NumPy, TensorFlow, PyTorch (CPU)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(42)


# Root folder of the dataset inside the cloned repository
data_dir = 'HawkEye/Data'

# Full paths of all files in the Images folder whose extension is exactly '.jpg'
images_dir = os.path.join(data_dir, 'Images')
imfiles = [
    os.path.join(images_dir, fname)
    for fname in os.listdir(images_dir)
    if os.path.splitext(fname)[1] == '.jpg'
]

def imread(filename):
    """Load an image from disk and return it with channels in RGB order.

    The image is decoded with OpenCV and converted from BGR to RGB so that it
    can be displayed directly with matplotlib.

    Args:
        filename: Path of the image file to read.

    Returns:
        The decoded image array converted with ``cv2.COLOR_BGR2RGB``.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    bgr = cv2.imread(filename)
    if bgr is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an obscure cvtColor error
        raise OSError(f'Could not read image: {filename}')
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

# Pick one image at random and load it as an RGB array
sample = random.choice(imfiles)
image = imread(sample)
# The array shape is unpacked as (rows, cols, channels)
rows, cols, channels = image.shape

# Show the sample together with the dataset size and the sample's shape
plt.imshow(image)
print('Number of samples:', len(imfiles))
print('Image shape:      ', image.shape)

To verify whether all images have the same size, I will iterate over all of them and group them by shape:

In [None]:
# Number of images found for each array shape
image_shapes = defaultdict(int)

# Read every image once and tally its shape
for imfile in tqdm(imfiles):
    image = cv2.imread(imfile)
    if image is None:
        # Files OpenCV cannot decode are left out of the statistics
        continue
    image_shapes[image.shape] += 1

# Report the count for every distinct shape
for shape, count in image_shapes.items():
    print(f'Shape: {shape}, Count: {count}')

As we can see, there is a big variety of image shapes, yet the main cluster is Shape: (800, 800, 3), Count: 3167. This will be the base of our dataset:

In [None]:
# Function to read annotations
def read_annotations(xml_path):
    """Parse an annotation XML file into a list of labelled boxes.

    Args:
        xml_path: Path of the XML annotation file.

    Returns:
        A list with one ``(name, (xmin, ymin, xmax, ymax))`` tuple per
        ``object`` element of the file; the coordinates are integers.
    """
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    parsed = []
    for node in ET.parse(xml_path).getroot().findall('object'):
        class_name = node.find('name').text
        box_node = node.find('bndbox')
        corners = tuple(int(box_node.find(tag).text) for tag in corner_tags)
        parsed.append((class_name, corners))
    return parsed


def read_data(file_names, data_dir):
    """Collect the 800x800 images of the dataset together with their labels.

    For every base name the image ``<data_dir>/Images/<name>.jpg`` and the
    annotation file ``<data_dir>/Labels/Horizontal Bounding Boxes/<name>.xml``
    are looked up. Images whose size is not 800x800 are skipped.

    Args:
        file_names: Iterable of image base names (without extension).
        data_dir: Root directory of the dataset.

    Returns:
        A ``(data, labels, dims)`` tuple where

        * ``data`` holds one ``(img_path, xml_path, box_dims)`` tuple per kept
          image; ``box_dims`` is the ``(width, height)`` of the last annotated
          box of that image, or ``None`` when the image has no annotations,
        * ``labels`` holds the class name of every annotated box,
        * ``dims`` holds the ``(width, height)`` of every annotated box.
    """
    data = []
    labels = []
    dims = []

    for file_name in file_names:
        img_path = os.path.join(data_dir, 'Images', file_name + '.jpg')
        xml_path = os.path.join(data_dir, 'Labels', 'Horizontal Bounding Boxes', file_name + '.xml')

        # Only the size is needed, so the file is closed again right away
        with Image.open(img_path) as img:
            img_size = img.size
        if img_size != (800, 800):
            continue

        # Reset per image: an image without objects must not inherit the box
        # size of a previously processed image (or hit an undefined name)
        last_box_dims = None
        for name, (xmin, ymin, xmax, ymax) in read_annotations(xml_path):
            last_box_dims = (abs(xmin - xmax), abs(ymin - ymax))
            dims.append(last_box_dims)
            labels.append(name)

        data.append((img_path, xml_path, last_box_dims))

    return data, labels, dims

# Base names (without extension) of the JPEG images. splitext keeps dots that
# belong to the name, files that are not '.jpg' are ignored (read_data would
# look for a non-existent '<name>.jpg' otherwise), and sorting makes the order
# independent of the file system so that seeded shuffles are reproducible.
file_names = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(os.path.join(data_dir, 'Images'))
    if os.path.splitext(f)[1] == '.jpg'
)
data, labels, dims = read_data(file_names, data_dir)


Let's display a few more random samples from our dataset, this time with their annotations drawn:

In [None]:
# Display random samples with their bounding boxes and class labels
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()

for ax in axes:
    # randint's upper bound is exclusive, so len(data) keeps the last sample
    # eligible and also works when the dataset holds a single image
    idx = np.random.randint(len(data))
    img_path, xml_path, _ = data[idx]
    img = Image.open(img_path)
    draw = ImageDraw.Draw(img)

    for label, (xmin, ymin, xmax, ymax) in read_annotations(xml_path):
        draw.rectangle([xmin, ymin, xmax, ymax], outline='red')
        # The label is written at the top-right corner of its box
        draw.text((xmax, ymin), label, fill='red')

    ax.imshow(img)
    ax.axis('off')

plt.tight_layout()
plt.show()

I would like to visualise some additional properties of the dataset we will work with:

In [None]:
# Count the annotated instances of every class and show them as a bar chart
hist = Counter(labels)

plt.figure(figsize=(15, 5))
plt.bar(hist.keys(), hist.values())
plt.title('Per-Class Histogram')
plt.xlabel('Class Label')
plt.ylabel('Counts')
plt.grid(True)
plt.show()

In [None]:
# Print the per-class distribution, ordered by class label
sorted_hist = {class_label: hist[class_label] for class_label in sorted(hist)}
for class_label, count in sorted_hist.items():
    print(f'Class {class_label}: {count} instances')

In [None]:
# Sample a random subset of the dataset for brightness analysis.
# The sample is capped at the dataset size because sampling without
# replacement cannot draw more items than exist.
sample_size = min(1000, len(data))
sample_data_indices = np.random.choice(len(data), sample_size, replace=False)
brightness = []

for idx in sample_data_indices:
    img_path = data[idx][0]
    # Brightness is the mean pixel value of the grayscale ('L') image;
    # the context manager closes the file once it has been read
    with Image.open(img_path) as img:
        brightness.append(np.mean(np.array(img.convert('L'))))

sample_brightness = pd.DataFrame(brightness, columns=['Brightness'])

# Plot brightness distribution for the sample
plt.figure(figsize=(15, 5))
plt.hist(sample_brightness['Brightness'], bins=50, alpha=0.7)
plt.xlabel('Brightness')
plt.ylabel('Frequency')
plt.title('Brightness Distribution (Sample)')
plt.show()

In [None]:
# Brightness distribution per class for the sample.
# `labels` holds one entry per annotated box while `sample_data_indices`
# indexes images, so the classes of each sampled image are read from its own
# annotation file; the image brightness is attributed once to every class
# that appears in the image.
sample_labels = []
sample_values = []
for idx, value in zip(sample_data_indices, brightness):
    classes_in_image = {name for name, _ in read_annotations(data[idx][1])}
    for class_name in sorted(classes_in_image):
        sample_labels.append(class_name)
        sample_values.append(value)

brightness_per_class = (
    pd.DataFrame({'ClassId': sample_labels, 'Brightness': sample_values})
    .groupby('ClassId')['Brightness']
    .mean()
)

plt.figure(figsize=(15, 5))
plt.bar(brightness_per_class.index, brightness_per_class.values)
plt.xlabel('Class Label')
plt.ylabel('Average Brightness')
plt.title('Average Brightness per Class (Sample)')
plt.grid(True)
plt.show()

I am a bit worried by the severely unequal per-class distribution and would like to mitigate the risk of some classes being undertrained. At first I decided that the simplest path would be to trim the overrepresented classes. Let's take a maximum of 500 instances of each class.

In [None]:
# Counter to count instances of each class
def filter_data(data, class_counter, max_count):
    """Select images while capping how many annotations are admitted per class.

    The annotations of every image are visited in order. An annotation is
    admitted while fewer than ``max_count`` annotations of its class have been
    admitted so far. An image is kept when at least one of its annotations was
    admitted; the kept entry is unchanged and still points at the full
    annotation file.

    Args:
        data: Iterable of ``(img_path, xml_path, dimensions)`` tuples.
        class_counter: Mapping whose keys are the known class names.
        max_count: Maximum number of admitted annotations per class.

    Returns:
        A list with the ``(img_path, xml_path, dimensions)`` tuples of the
        kept images, in their original order.
    """
    kept = []
    admitted = dict.fromkeys(class_counter, 0)
    for img_path, xml_path, dimensions in data:
        keep_image = False
        for class_name, _ in read_annotations(xml_path):
            if admitted[class_name] >= max_count:
                continue
            admitted[class_name] += 1
            keep_image = True
        if keep_image:
            kept.append((img_path, xml_path, dimensions))
    return kept

class_counter = Counter(labels)
max_count = 500
filtered_data = filter_data(data, class_counter, max_count)

# Labels of every annotation found in the images that survived the filtering
new_labels = [
    name
    for _, xml_path, _ in filtered_data
    for name, _ in read_annotations(xml_path)
]

# Per-class instance counts of the new dataset, ordered by class label
new_class_counter = Counter(new_labels)
sorted_new_class_counter = {name: new_class_counter[name] for name in sorted(new_class_counter)}

# Print new statistics
for class_label, count in sorted_new_class_counter.items():
    print(f'Class {class_label}: {count} instances')

In [None]:
# Bar chart of the per-class instance counts after filtering
plt.figure(figsize=(15, 5))
plt.bar(sorted_new_class_counter.keys(), sorted_new_class_counter.values())
plt.title('Per-Class Histogram After Filtering')
plt.xlabel('Class Label')
plt.ylabel('Counts')
plt.grid(True)
plt.show()

Yes, I made my dataset much smaller, yet I believe it is the best available option to mitigate the class imbalance. I have no time to look for more data, and proper data augmentation is also out of scope for this project. But I will use pre-trained models and hope this data will be enough for fine-tuning.

In [None]:
# List of class names in the order of their class IDs
class_names = ['A1', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19', 'A2', 'A20', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9']


def write_data_to_file(data, file_path):
    """Write one "<image path><TAB><annotation path>" line per sample.

    Args:
        data: Iterable of ``(img_path, xml_path, dimensions)`` tuples.
        file_path: Destination text file; it is overwritten if it exists.
    """
    # Create the target folder on demand so a fresh checkout does not fail
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    with open(file_path, 'w') as f:
        f.writelines(f"{img_path}\t{xml_path}\n" for img_path, xml_path, _ in data)


# Split data into training (80%) and validation (20%) sets
random.shuffle(filtered_data)
split_index = int(0.8 * len(filtered_data))
train_data = filtered_data[:split_index]
val_data = filtered_data[split_index:]

# The helper is defined before its first use (calling it earlier raised a
# NameError on a fresh kernel) and each list is written exactly once
write_data_to_file(train_data, os.path.join(data_dir, 'DataLists', 'train.txt'))
write_data_to_file(val_data, os.path.join(data_dir, 'DataLists', 'test.txt'))

# YOLO

In [None]:
import shutil

# Function to parse XML and convert to YOLO format
def convert_xml_to_yolo(xml_path, img_size=(800, 800)):
    """Convert the boxes of an annotation XML file to YOLO label lines.

    Args:
        xml_path: Path of the XML annotation file.
        img_size: ``(width, height)`` used to normalise the coordinates.

    Returns:
        A list with one ``"class_id x_center y_center width height"`` string
        per ``object`` element. ``class_id`` is the position of the object's
        name in ``class_names``; the other values are divided by the image
        width or height.

    Raises:
        ValueError: If an object's name is not listed in ``class_names``.
    """
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    label_lines = []

    for node in ET.parse(xml_path).getroot().findall('object'):
        class_id = class_names.index(node.find('name').text)

        box_node = node.find('bndbox')
        xmin, ymin, xmax, ymax = [int(box_node.find(tag).text) for tag in corner_tags]

        # Box centre and extent, normalised by the image width/height
        x_center = (xmin + xmax) / 2 / img_size[0]
        y_center = (ymin + ymax) / 2 / img_size[1]
        width = (xmax - xmin) / img_size[0]
        height = (ymax - ymin) / img_size[1]

        label_lines.append(f"{class_id} {x_center} {y_center} {width} {height}")

    return label_lines

# Function to copy files based on file path lists and convert labels
def copy_and_convert_files(file_list, img_dest, lbl_dest):
    """Copy the listed images and write a YOLO label file for each of them.

    Args:
        file_list: Text file with one "<image path><TAB><annotation path>"
            entry per line; lines with a different number of fields are
            reported and skipped.
        img_dest: Directory the images are copied to.
        lbl_dest: Directory that receives one ``<image base name>.txt`` label
            file per image.
    """
    with open(file_list, 'r') as listing:
        entries = listing.readlines()

    for entry in entries:
        fields = entry.strip().split('\t')
        if len(fields) != 2:
            print(f"Skipping line due to unexpected format: {entry}")
            continue

        img_path, xml_path = fields
        shutil.copy(img_path, img_dest)

        # Convert XML to YOLO and save it under the image's base name
        label_text = "\n".join(convert_xml_to_yolo(xml_path))
        stem = os.path.splitext(os.path.basename(img_path))[0]
        with open(os.path.join(lbl_dest, stem + '.txt'), 'w') as lbl_file:
            lbl_file.write(label_text)

# Folder layout of the YOLO dataset: <base>/{train,val}/{images,labels}
yolo_base_dir = '/content/datasets/dataset'
train_img_dir = os.path.join(yolo_base_dir, 'train/images')
train_lbl_dir = os.path.join(yolo_base_dir, 'train/labels')
val_img_dir = os.path.join(yolo_base_dir, 'val/images')
val_lbl_dir = os.path.join(yolo_base_dir, 'val/labels')

for directory in (train_img_dir, train_lbl_dir, val_img_dir, val_lbl_dir):
    os.makedirs(directory, exist_ok=True)

# Copy the images and convert the annotations of both splits
lists_dir = os.path.join(data_dir, 'DataLists')
copy_and_convert_files(os.path.join(lists_dir, 'train.txt'), train_img_dir, train_lbl_dir)
copy_and_convert_files(os.path.join(lists_dir, 'test.txt'), val_img_dir, val_lbl_dir)

In [None]:
# Create data.yaml with correct paths.
# The class ids in the label files are positions in `class_names`, so the
# names must be listed in exactly that order; a set has no defined order and
# would pair the ids with the wrong class names.
data_yaml_content = f"""
train: {os.path.join(yolo_base_dir, 'train')}
val: {os.path.join(yolo_base_dir, 'val')}
nc: {len(class_names)}
names: {class_names}
"""

with open('data.yaml', 'w') as f:
    f.write(data_yaml_content)

In [None]:
# Load the 'yolov5su.pt' checkpoint and fine-tune it on our dataset
yolo_model = YOLO('yolov5su.pt')

# Keep the object returned by the training run
train_settings = {'data': 'data.yaml', 'epochs': 10, 'imgsz': 800}
history = yolo_model.train(**train_settings)

# Run validation and show the result
yolo_results = yolo_model.val()
print(yolo_results)


In [None]:
# The object returned by train() is a metrics object, not a per-epoch log
# (subscripting it raises a TypeError), so the loss curves are read from the
# results.csv that the Ultralytics trainer writes into its run folder.
results_df = pd.read_csv(os.path.join(str(yolo_model.trainer.save_dir), 'results.csv'))
# Some releases pad the column names with spaces
results_df.columns = results_df.columns.str.strip()

# Every loss component has its own 'train/..._loss' / 'val/..._loss' column;
# the components are summed to a single curve per split
train_loss_cols = [c for c in results_df.columns if c.startswith('train/') and c.endswith('_loss')]
val_loss_cols = [c for c in results_df.columns if c.startswith('val/') and c.endswith('_loss')]
train_loss = results_df[train_loss_cols].sum(axis=1)
val_loss = results_df[val_loss_cols].sum(axis=1)
epochs = range(1, len(train_loss) + 1)

# Plot training history
plt.figure(figsize=(12, 5))

plt.subplot(121)
plt.plot(epochs, train_loss, 'b-', label='Training loss')
plt.plot(epochs, val_loss, 'r-', label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)

plt.show()

TypeError: 'DetMetrics' object is not subscriptable

# Faster R-CNN

In [None]:
class CustomDataset(Dataset):
    """Detection dataset backed by a list file of image/annotation paths.

    Every line of the list file holds "<image path><TAB><annotation path>".
    Items are ``(image, target)`` pairs where ``target`` is a dict with the
    keys ``"boxes"`` (float32, shape ``(N, 4)``, ``xmin, ymin, xmax, ymax``)
    and ``"labels"`` (int64, shape ``(N,)``).
    """

    def __init__(self, txt_file, transforms=None):
        """Read the image and annotation paths listed in ``txt_file``.

        Args:
            txt_file: Path of the tab-separated list file. Lines that do not
                consist of exactly two fields are reported and skipped.
            transforms: Optional callable applied to the loaded PIL image.
        """
        self.txt_file = txt_file
        self.transforms = transforms
        self.imgs = []
        self.annotations = []
        with open(txt_file) as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) != 2:
                    print(f"Skipping line due to unexpected format: {line}")
                    continue
                img_path, xml_path = parts
                self.imgs.append(img_path)
                self.annotations.append(xml_path)

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair at position ``idx``."""
        img_path = self.imgs[idx]
        xml_path = self.annotations[idx]
        img = Image.open(img_path).convert("RGB")
        target = self.parse_xml(xml_path)

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Return the number of usable entries read from the list file."""
        return len(self.imgs)

    def parse_xml(self, xml_path):
        """Parse an annotation file into a detection target dict.

        Args:
            xml_path: Path of the XML annotation file.

        Returns:
            A dict with ``"boxes"`` (float32 tensor of shape ``(N, 4)``) and
            ``"labels"`` (int64 tensor of shape ``(N,)``). Labels are the
            position of the class in ``class_names`` plus one.

        Raises:
            ValueError: If an object's name is not listed in ``class_names``.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()
        boxes = []
        labels = []
        for obj in root.findall('object'):
            name = obj.find('name').text
            bbox = obj.find('bndbox')
            xmin = int(bbox.find('xmin').text)
            ymin = int(bbox.find('ymin').text)
            xmax = int(bbox.find('xmax').text)
            ymax = int(bbox.find('ymax').text)
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_names.index(name) + 1)  # class label starts from 1

        # reshape keeps the (N, 4) layout even when the file has no objects;
        # an empty list would otherwise become a tensor of shape (0,)
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels

        return target

In [None]:
# The only preprocessing step is the conversion of the PIL image to a tensor
transforms = T.Compose([T.ToTensor()])

lists_dir = os.path.join(data_dir, 'DataLists')
train_dataset = CustomDataset(os.path.join(lists_dir, 'train.txt'), transforms)
val_dataset = CustomDataset(os.path.join(lists_dir, 'test.txt'), transforms)

# Each batch is regrouped from [(image, target), ...] into (images, targets)
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=lambda x: tuple(zip(*x)),
)
val_loader = DataLoader(
    val_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=lambda x: tuple(zip(*x)),
)

In [None]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the model: pretrained Faster R-CNN with a new box predictor sized
# for our classes
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = len(class_names) + 1  # number of classes + background
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop with history tracking
num_epochs = 30
train_loss_history = []
val_loss_history = []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for images, targets in train_loader:
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # In training mode the model returns a dict of loss terms
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

    avg_epoch_loss = epoch_loss / len(train_loader)
    train_loss_history.append(avg_epoch_loss)

    print(f'Epoch {epoch+1}, Loss: {avg_epoch_loss}')

    # Validation loss. torchvision detection models return their loss dict
    # only in training mode (in eval mode they return predictions, which have
    # no .values()), so the model deliberately stays in training mode here;
    # torch.no_grad() ensures that no gradients are computed.
    val_loss = 0
    with torch.no_grad():
        for images, targets in val_loader:
            images = list(img.to(device) for img in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            val_loss += losses.item()

    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)

    print(f'Epoch {epoch+1}, Validation Loss: {avg_val_loss}')

# Print the predictions for the validation set (eval mode returns detections)
model.eval()
with torch.no_grad():
    for images, targets in val_loader:
        images = list(img.to(device) for img in images)
        outputs = model(images)
        print(outputs)

In [None]:
# Plot the Faster R-CNN training and validation loss per epoch
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(12, 5))
plt.subplot(121)

plt.plot(epochs, train_loss_history, 'b-', label='Training loss')
plt.plot(epochs, val_loss_history, 'r-', label='Validation loss')

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()

plt.show()

# RetinaNet

In [None]:
import torch
from torchvision.ops import box_iou
from torchvision.models.detection import RetinaNet_ResNet50_FPN_Weights, retinanet_resnet50_fpn
from torchvision.models.detection.retinanet import RetinaNetClassificationHead, RetinaNetHead

# Load the pre-trained RetinaNet model
retinanet_model = retinanet_resnet50_fpn(weights=RetinaNet_ResNet50_FPN_Weights.DEFAULT)
# The dataset labels are positions in class_names shifted by one, so the head
# needs len(class_names) + 1 outputs to cover the largest label
num_classes = len(class_names) + 1

# Replace only the classification sub-head. A complete RetinaNetHead contains
# its own classification and regression heads and must not be plugged into
# the classification slot; the pretrained regression head is kept as is.
in_features = retinanet_model.backbone.out_channels
num_anchors = retinanet_model.head.classification_head.num_anchors
retinanet_model.head.classification_head = RetinaNetClassificationHead(in_features, num_anchors, num_classes)

retinanet_model.to(device)

In [None]:
# Define the optimizer: Adam over all RetinaNet parameters, learning rate 1e-3
optimizer = torch.optim.Adam(retinanet_model.parameters(), lr=0.001)

def compute_loss(model, images, targets):
    """Return the loss dict of a torchvision-style detection model for a batch.

    torchvision detection models compute their losses themselves when they are
    called with targets in training mode, so the batch is simply forwarded in
    that mode. The previous train/eval mode of the model is restored before
    returning. Gradients are tracked unless the caller wraps the call in
    ``torch.no_grad()`` (as the validation loop does).

    Args:
        model: Detection model that returns a dict of loss tensors when called
            as ``model(images, targets)`` in training mode.
        images: List of image tensors.
        targets: List of target dicts, one per image.

    Returns:
        The dict of loss tensors produced by the model.
    """
    was_training = model.training
    # The losses are only produced in training mode; in eval mode the model
    # would return detections instead
    model.train()
    try:
        return model(images, targets)
    finally:
        model.train(was_training)

# Training loop with history tracking
num_epochs = 10
train_loss_history = []
val_loss_history = []

for epoch in range(num_epochs):
    # ---- training pass ----
    retinanet_model.train()
    epoch_loss = 0
    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass; the individual loss terms are summed to one value
        loss_dict = compute_loss(retinanet_model, images, targets)
        losses = sum(loss_dict.values())

        # Backward pass and optimization
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

    avg_epoch_loss = epoch_loss / len(train_loader)
    train_loss_history.append(avg_epoch_loss)
    print(f'Epoch {epoch + 1}, Loss: {avg_epoch_loss}')

    # ---- validation pass (no gradients, no optimizer step) ----
    retinanet_model.eval()
    val_loss = 0
    with torch.no_grad():
        for images, targets in val_loader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = compute_loss(retinanet_model, images, targets)
            losses = sum(loss_dict.values())

            val_loss += losses.item()

    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss}')

In [None]:
# Plot the RetinaNet training and validation loss per epoch
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(12, 5))
plt.subplot(121)

plt.plot(epochs, train_loss_history, 'b-', label='Training loss')
plt.plot(epochs, val_loss_history, 'r-', label='Validation loss')

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()

plt.show()

# EfficientDet

In [None]:
import torch
from torch.utils.data import DataLoader
from effdet import create_model, DetBenchTrain, DetBenchEval
from effdet.evaluator import CocoEvaluator
from effdet.data import create_loader, create_dataset, resolve_input_config
from torchvision.transforms import ToTensor

In [None]:
# Initialize the model: EfficientDet-D0 created with pretrained=True and
# len(class_names) + 1 output classes
effdet_model = create_model('tf_efficientdet_d0', num_classes=len(class_names) + 1, pretrained=True)
# Wrap the network in the training bench.
# NOTE(review): recent effdet releases appear to define
# DetBenchTrain(model, create_labeler=True) without a `config` argument —
# confirm against the installed version.
effdet_model = DetBenchTrain(effdet_model, config=effdet_model.config)
effdet_model.to(device)

# Define the optimizer
optimizer = torch.optim.AdamW(effdet_model.parameters(), lr=0.001)

# Initialize variables to track loss history
train_loss_history = []
val_loss_history = []

# Training loop with history tracking
num_epochs = 10
for epoch in range(num_epochs):
    effdet_model.train()
    epoch_loss = 0
    for images, targets in train_loader:
        # The batch is passed as a list of image tensors and a list of
        # target dicts, the same form used for the torchvision models.
        # NOTE(review): DetBenchTrain presumably expects a batched image
        # tensor and a single target dict (e.g. 'bbox'/'cls' keys) — verify.
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = effdet_model(images, targets)
        # All values of the returned dict are summed into the training loss
        losses = sum(loss for loss in loss_dict.values())
        losses.backward()
        optimizer.step()
        epoch_loss += losses.item()

    # Mean loss over the batches of this epoch
    avg_epoch_loss = epoch_loss / len(train_loader)
    train_loss_history.append(avg_epoch_loss)

    print(f'Epoch {epoch + 1}, Loss: {avg_epoch_loss}')

In [None]:
# Validation: a single pass over the validation loader.
# NOTE(review): this cell is not part of the epoch loop, so each run appends
# one value to val_loss_history while train_loss_history holds one value per
# epoch.
effdet_model.eval()
val_loss = 0
with torch.no_grad():
    for images, targets in val_loader:
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = effdet_model(images, targets)
        # All values of the returned dict are summed into the validation loss
        losses = sum(loss for loss in loss_dict.values())
        val_loss += losses.item()

# Mean loss over the validation batches
avg_val_loss = val_loss / len(val_loader)
val_loss_history.append(avg_val_loss)

# `epoch` still holds the last index of the training loop in the previous cell
print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss}')

In [None]:
# Plot training history.
# The x axis follows the number of recorded training losses. Validation was
# not run once per epoch, so the available validation values are aligned with
# the last epochs instead of being plotted against the full epoch range
# (which fails when the lengths differ).
n_epochs = len(train_loss_history)
epochs = range(1, n_epochs + 1)
val_points = val_loss_history[-n_epochs:] if n_epochs else []
val_epochs = list(epochs)[n_epochs - len(val_points):]

plt.figure(figsize=(12, 5))

plt.subplot(121)
plt.plot(epochs, train_loss_history, 'b-', label='Training loss')
# The marker keeps a single validation value visible
plt.plot(val_epochs, val_points, 'ro-', label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)

plt.show()

# Evaluation of the models

In [None]:
# Function to evaluate a model
def _pairwise_iou(boxes_a, boxes_b):
    """Return the IoU matrix of two sets of (xmin, ymin, xmax, ymax) boxes."""
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    top_left = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])
    bottom_right = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])
    overlap = (bottom_right - top_left).clamp(min=0).prod(dim=2)
    union = area_a[:, None] + area_b[None, :] - overlap
    return overlap / union.clamp(min=1e-9)


def _average_precision(scored_hits, num_gt):
    """Return the area under the precision-recall curve of one class."""
    if not scored_hits or num_gt == 0:
        return 0.0
    ranked = sorted(scored_hits, key=lambda item: item[0], reverse=True)
    hits = np.array([hit for _, hit in ranked], dtype=float)
    cum_tp = np.cumsum(hits)
    cum_fp = np.cumsum(1.0 - hits)
    recall = np.concatenate(([0.0], cum_tp / num_gt, [1.0]))
    precision = np.concatenate(([0.0], cum_tp / (cum_tp + cum_fp), [0.0]))
    # Precision envelope: make the curve non-increasing from left to right
    precision = np.maximum.accumulate(precision[::-1])[::-1]
    steps = np.nonzero(recall[1:] != recall[:-1])[0]
    return float(np.sum((recall[steps + 1] - recall[steps]) * precision[steps + 1]))


def evaluate_model(model, dataloader, device, iou_threshold=0.5, score_threshold=0.5):
    """Compute detection metrics of ``model`` on ``dataloader``.

    Detections are matched to ground-truth boxes per image: in order of
    decreasing score, a detection is a true positive when it overlaps a not
    yet matched ground-truth box of the same class with an IoU of at least
    ``iou_threshold``; otherwise it is a false positive. Label arrays of
    different lengths are never compared element-wise.

    Args:
        model: Detection model that, in eval mode, maps a list of image
            tensors to a list of dicts with ``'boxes'``, ``'labels'`` and
            ``'scores'`` tensors (the torchvision detection output format).
        dataloader: Iterable of ``(images, targets)`` batches; every target
            is a dict with ``'boxes'`` and ``'labels'`` tensors.
        device: Device the images are moved to.
        iou_threshold: Minimum IoU for a detection to match a ground truth.
        score_threshold: Minimum score of the detections that count towards
            precision, recall and F1 (the mAP uses all detections).

    Returns:
        A ``(precision, recall, f1, mAP)`` tuple of floats, each averaged
        over the classes that occur in the ground truth (macro average).
        All values are 0.0 when there is no ground truth at all.
    """
    model.eval()
    gt_per_class = Counter()
    scored_hits = defaultdict(list)  # class -> [(score, is_true_positive), ...]

    with torch.no_grad():
        for images, targets in dataloader:
            images = [image.to(device) for image in images]
            outputs = model(images)

            for target, output in zip(targets, outputs):
                gt_boxes = target['boxes'].cpu().float().reshape(-1, 4)
                gt_labels = target['labels'].cpu().tolist()
                pred_boxes = output['boxes'].cpu().float().reshape(-1, 4)
                pred_labels = output['labels'].cpu().tolist()
                pred_scores = output['scores'].cpu().tolist()

                gt_per_class.update(gt_labels)
                iou = None
                if pred_labels and gt_labels:
                    iou = _pairwise_iou(pred_boxes, gt_boxes).tolist()

                claimed = set()  # ground-truth boxes already matched
                ranking = sorted(range(len(pred_scores)), key=pred_scores.__getitem__, reverse=True)
                for p in ranking:
                    best_iou, best_gt = iou_threshold, None
                    for g, gt_label in enumerate(gt_labels):
                        if gt_label != pred_labels[p] or g in claimed:
                            continue
                        if iou[p][g] >= best_iou:
                            best_iou, best_gt = iou[p][g], g
                    if best_gt is not None:
                        claimed.add(best_gt)
                    scored_hits[pred_labels[p]].append((pred_scores[p], best_gt is not None))

    if not gt_per_class:
        return 0.0, 0.0, 0.0, 0.0

    precisions, recalls, f1_scores, average_precisions = [], [], [], []
    for label, num_gt in gt_per_class.items():
        class_hits = scored_hits.get(label, [])
        confident = [hit for score, hit in class_hits if score >= score_threshold]
        true_positives = sum(confident)
        class_precision = true_positives / len(confident) if confident else 0.0
        class_recall = true_positives / num_gt
        if true_positives:
            class_f1 = 2 * class_precision * class_recall / (class_precision + class_recall)
        else:
            class_f1 = 0.0
        precisions.append(class_precision)
        recalls.append(class_recall)
        f1_scores.append(class_f1)
        average_precisions.append(_average_precision(class_hits, num_gt))

    precision = float(np.mean(precisions))
    recall = float(np.mean(recalls))
    f1 = float(np.mean(f1_scores))
    mAP = float(np.mean(average_precisions))

    return precision, recall, f1, mAP

# Function to measure inference time
def measure_inference_time(model, dataloader, device):
    """Measure how long ``model`` needs to process every batch of ``dataloader``.

    The measurement includes loading the batches and moving the images to
    ``device``. The model's train/eval mode is not changed here, so the
    caller has to put it into the desired mode beforehand.

    Args:
        model: Callable that accepts a list of image tensors.
        dataloader: Iterable of ``(images, targets)`` batches.
        device: Device the images are moved to.

    Returns:
        Elapsed time in seconds as a float.
    """
    # CUDA kernels run asynchronously; without synchronising, the clock would
    # be read before the queued work has finished
    use_cuda = torch.cuda.is_available() and torch.device(device).type == 'cuda'
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter is monotonic and has a higher resolution than time.time
    start_time = time.perf_counter()
    with torch.no_grad():
        for images, _ in dataloader:
            images = [image.to(device) for image in images]
            model(images)
    if use_cuda:
        torch.cuda.synchronize()

    return time.perf_counter() - start_time

In [None]:
# YOLO Evaluation
# The Ultralytics model does not return torchvision-style prediction dicts,
# so evaluate_model() cannot score it; its own validator reports the metrics
# instead. The model fine-tuned in the YOLO section is reused here rather
# than being reloaded from a placeholder path.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Results of all models are collected in one dict, created on first use
try:
    evaluation_results
except NameError:
    evaluation_results = {}

yolo_metrics = yolo_model.val()
precision = float(yolo_metrics.box.mp)  # mean precision over all classes
recall = float(yolo_metrics.box.mr)  # mean recall over all classes
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
mAP = float(yolo_metrics.box.map50)  # mAP at an IoU threshold of 0.5
# speed['inference'] is reported in milliseconds per image; it is scaled to
# seconds for the whole validation set to be comparable with the other models
inference_time = yolo_metrics.speed['inference'] * len(val_dataset) / 1000

evaluation_results['YOLO'] = {
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'mAP': mAP,
    'Inference Time': inference_time
}

print(evaluation_results['YOLO'])

In [None]:
# Faster R-CNN Evaluation
# Evaluate the detector that was fine-tuned in the Faster R-CNN section
# (`model`). Building a fresh fasterrcnn_resnet50_fpn here would score a
# network whose box predictor was never trained on this dataset.
faster_rcnn_model = model
faster_rcnn_model.to(device)

# Results of all models are collected in one dict, created on first use
try:
    evaluation_results
except NameError:
    evaluation_results = {}

precision, recall, f1, mAP = evaluate_model(faster_rcnn_model, val_loader, device)
inference_time = measure_inference_time(faster_rcnn_model, val_loader, device)

evaluation_results['Faster R-CNN'] = {
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'mAP': mAP,
    'Inference Time': inference_time
}

print(evaluation_results['Faster R-CNN'])

In [None]:
# RetinaNet Evaluation
# Evaluate the RetinaNet that was fine-tuned in the RetinaNet section.
# Re-creating the model here would discard the trained weights and score a
# freshly initialised classification head.
retinanet_model.to(device)

# Results of all models are collected in one dict, created on first use
try:
    evaluation_results
except NameError:
    evaluation_results = {}

precision, recall, f1, mAP = evaluate_model(retinanet_model, val_loader, device)
inference_time = measure_inference_time(retinanet_model, val_loader, device)

evaluation_results['RetinaNet'] = {
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'mAP': mAP,
    'Inference Time': inference_time
}

print(evaluation_results['RetinaNet'])

In [None]:
# EfficientDet Evaluation
# NOTE(review): create_model is called without pretrained=True and the
# result is assigned to effdet_model, so the instance trained in the
# EfficientDet section is replaced before evaluation — confirm this is intended.
effdet_model = create_model('tf_efficientdet_d0', num_classes=len(class_names) + 1)
# NOTE(review): `config` is not defined anywhere in the visible notebook code
# (the training cell passes effdet_model.config instead).
effdet_model = DetBenchTrain(effdet_model, config)
effdet_model.to(device)

# evaluate_model() and measure_inference_time() call the model with the
# images only (no targets).
# NOTE(review): a training bench presumably requires targets; a prediction
# wrapper may be needed here — verify against the effdet API.
precision, recall, f1, mAP = evaluate_model(effdet_model, val_loader, device)
inference_time = measure_inference_time(effdet_model, val_loader, device)

# Store the metrics next to those of the other models
evaluation_results['EfficientDet'] = {
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'mAP': mAP,
    'Inference Time': inference_time
}

print(evaluation_results['EfficientDet'])

Conclusion

1. Good data is key! Garbage in, garbage out.
2. A basic CNN does not work for combined object detection and classification; I wasted time on that track.