In [1]:
# 1. Import necessary libraries and modules

import os
import xml.etree.ElementTree as ET
from PIL import Image
import os


# The training data is expected in a "train" folder directly inside the
# directory the kernel is currently working in.
current_dir = os.getcwd()
relative_train_path = "train"
train_path = os.path.join(current_dir, relative_train_path)

In [6]:
# Every file in the training folder whose name ends in ".jpg" is an image.
# (The listing is not recursive and follows whatever order os.listdir returns.)
train_image_paths = [
    os.path.join(train_path, filename)
    for filename in os.listdir(train_path)
    if filename.endswith('.jpg')
]
# Each image's annotation shares its base name. Only the trailing extension is
# swapped: str.replace would also rewrite any other ".jpg" occurring earlier
# in the absolute path (e.g. inside a directory name).
train_label_paths = [os.path.splitext(path)[0] + '.xml' for path in train_image_paths]
train_image_paths[:5]

['/Users/owo/HOUSE/@Code/@Project/ID_FruitDetection/train/apple_75.jpg',
 '/Users/owo/HOUSE/@Code/@Project/ID_FruitDetection/train/apple_61.jpg',
 '/Users/owo/HOUSE/@Code/@Project/ID_FruitDetection/train/apple_49.jpg',
 '/Users/owo/HOUSE/@Code/@Project/ID_FruitDetection/train/orange_3.jpg',
 '/Users/owo/HOUSE/@Code/@Project/ID_FruitDetection/train/orange_28.jpg']

In [None]:
# 3. Convert XML annotations to YOLO format

def convert_coordinates(size, box):
    """Turn a pixel-space corner box into a normalized centre/size box.

    Args:
        size: Indexable ``(width, height)`` of the image in pixels.
        box: Indexable ``(xmin, xmax, ymin, ymax)`` in pixels.

    Returns:
        ``[x_center, y_center, width, height]`` where the horizontal values
        are scaled by ``1 / size[0]`` and the vertical ones by ``1 / size[1]``.

    Raises:
        ZeroDivisionError: if either entry of ``size`` is zero.
    """
    # Reciprocals of the image dimensions, used as scale factors below.
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]

    # Box centre and extent, still in pixels.
    center_x = (box[0] + box[1]) / 2.0
    center_y = (box[2] + box[3]) / 2.0
    box_width = box[1] - box[0]
    box_height = box[3] - box[2]

    return [
        center_x * scale_x,
        center_y * scale_y,
        box_width * scale_x,
        box_height * scale_y,
    ]


In [None]:
def convert_annotation(annotation_path, class_dict):
    """Read one XML annotation and return its objects as YOLO-style rows.

    The document must contain a ``<size>`` element with ``<width>`` and
    ``<height>``, and zero or more ``<object>`` elements, each with a
    ``<name>`` and a ``<bndbox>`` holding ``xmin``/``xmax``/``ymin``/``ymax``.

    Args:
        annotation_path: Path (or file object) accepted by ``ET.parse``.
        class_dict: Mapping from class name to integer class id. Objects
            whose name is not a key of this mapping are skipped.

    Returns:
        A list with one ``[class_id, x_center, y_center, width, height]`` row
        per kept object, the last four values being whatever
        ``convert_coordinates`` returns for the image size and box. The list
        is empty when no object is kept.

    Notes:
        Objects flagged ``<difficult>1</difficult>`` are skipped. The tag is
        treated as optional: a missing or empty ``<difficult>`` counts as
        "not difficult" instead of raising ``AttributeError``.
    """
    root = ET.parse(annotation_path).getroot()

    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    yolo_annotations = []
    for obj in root.iter('object'):
        cls = obj.find('name').text
        if cls not in class_dict:
            continue

        # findtext returns None when the tag is absent and '' when it is
        # present but empty; both mean the object is not marked difficult.
        difficult_text = obj.findtext('difficult')
        if difficult_text is not None and difficult_text.strip():
            if int(difficult_text) == 1:
                continue

        cls_id = class_dict[cls]
        xmlbox = obj.find('bndbox')
        # Box order expected by convert_coordinates: (xmin, xmax, ymin, ymax).
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text),
             float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        converted_coordinates = convert_coordinates((w, h), b)
        yolo_annotations.append([cls_id, *converted_coordinates])

    return yolo_annotations


In [None]:
# Class name (as it appears in the XML <name> tag) -> integer class id.
class_dict = {"apple": 0, "banana": 1, "orange": 2, "mixed": 3}

# Convert every XML annotation to YOLO rows (kept in memory here) and derive
# the ".txt" path next to each XML file where those rows will be stored.
yolo_train_annotations = [convert_annotation(path, class_dict) for path in train_label_paths]
# Swap only the trailing extension: str.replace would also rewrite any other
# ".xml" occurring earlier in the absolute path.
train_annotation_save_paths = [os.path.splitext(path)[0] + '.txt' for path in train_label_paths]


In [None]:
import numpy as np

# Write one label file per image: a row per object, formatted as
# "<class_id> <x_center> <y_center> <width> <height>".
for annotations, save_path in zip(yolo_train_annotations, train_annotation_save_paths):
    # Force a 2-D (n, 5) array. An image whose objects were all filtered out
    # yields [], which savetxt would otherwise treat as a single column and
    # reject against the 5-entry fmt; reshaped, it writes an empty file.
    rows = np.asarray(annotations, dtype=float).reshape(-1, 5)
    np.savetxt(save_path, rows, fmt=["%d", "%f", "%f", "%f", "%f"])


In [None]:
# 1. Data Loader and Transformations
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# Preprocessing applied to each image: resize to 416x416, then convert to a tensor.
transform = transforms.Compose(
    [transforms.Resize((416, 416)), transforms.ToTensor()]
)

# NOTE(review): `YOLODataset` and `collate_fn` are not defined or imported in
# the visible part of this notebook; they must exist before this cell runs.
train_dataset = YOLODataset(
    train_image_paths,
    train_annotation_save_paths,
    transform=transform,
)
# Shuffled mini-batches of 16 samples, assembled by the custom collate function.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)


# 2. Define the Simplified YOLO Architecture
import torch.nn as nn


class SimpleYOLO(nn.Module):
    """Small fully-convolutional detector backbone with a 1x1 prediction head.

    Three Conv(3x3) -> BatchNorm -> ReLU -> MaxPool(2) stages widen the
    channels 32 -> 64 -> 128 while each pooling step halves the spatial size,
    so an ``H x W`` input yields an ``H // 8 x W // 8`` grid (52 x 52 for a
    416 x 416 image). A 1x1 convolution then emits ``5 + num_classes``
    channels per grid cell.

    Args:
        num_classes: Number of object classes; sets the head's channel count.
        in_channels: Channels of the input image. Defaults to 3, the value
            that was previously hard-coded.
    """

    def __init__(self, num_classes, in_channels=3):
        super().__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        # Per-cell prediction head.
        # NOTE(review): the 5 presumably stands for 4 box values + 1
        # objectness score -- confirm against the loss implementation.
        self.detector = nn.Conv2d(128, 5 + num_classes, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Return the raw ``(N, 5 + num_classes, H // 8, W // 8)`` prediction map."""
        features = self.conv_layers(x)
        return self.detector(features)

# `import torch.nn as nn` binds only the name `nn`; the calls to torch.optim,
# torch.stack and torch.device below need the package name itself.
import torch

# One output class per entry of the label map.
num_classes = len(class_dict)
# Use a GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleYOLO(num_classes)


# 3. Loss Function and Optimizer

# NOTE(review): `YOLOLoss` is not defined or imported in the visible part of
# this notebook; it must exist before this cell runs.
yolo_loss = YOLOLoss(num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# 4. Training the Model

num_epochs = 5
model.train()
model.to(device)
yolo_loss.to(device)

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for images, labels_list in train_loader:
        optimizer.zero_grad()
        images = images.to(device)

        outputs = model(images)
        # Only element [0] of each image's label collection is used as the
        # target. NOTE(review): this drops any further boxes and fails with
        # IndexError for an image that has none -- confirm this is intended.
        labels = torch.stack([image_labels[0] for image_labels in labels_list]).to(device)
        loss = yolo_loss(outputs, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_epoch_loss = epoch_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}")

print("Training process summarized.")
