In [1]:
import os


relative_train_path = "train"
current_dir = os.getcwd()  #Here
train_path = os.path.join(current_dir, relative_train_path)

train_files = os.listdir(train_path)

train_files[:10]

['apple_49.xml',
 'apple_75.xml',
 'apple_61.xml',
 'apple_75.jpg',
 'apple_61.jpg',
 'apple_49.jpg',
 'orange_3.jpg',
 'orange_3.xml',
 'orange_28.jpg',
 'banana_50.jpg']

In [2]:
import os


relative_test_path = "test"
current_dir = os.getcwd()  #Here
test_path = os.path.join(current_dir, relative_test_path)

test_files = os.listdir(test_path)

test_files[:10]

['banana_93.xml',
 'banana_78.jpg',
 'banana_87.xml',
 'banana_93.jpg',
 'banana_87.jpg',
 'banana_78.xml',
 'banana_86.jpg',
 'banana_79.xml',
 'banana_92.jpg',
 'banana_79.jpg']

In [3]:
import xml.etree.ElementTree as ET

# Select a random XML file
xml_file = os.path.join(train_path, 'apple_28.xml')

# Parse the XML file
tree = ET.parse(xml_file)
root = tree.getroot()

# Print the entire XML content
ET.dump(root)


<annotation>
	<folder>train</folder>
	<filename>apple_28.jpg</filename>
	<path>C:\tensorflow1\models\research\object_detection\images\train\apple_28.jpg</path>
	<source>
		<database>Unknown</database>
	</source>
	<size>
		<width>0</width>
		<height>0</height>
		<depth>3</depth>
	</size>
	<segmented>0</segmented>
	<object>
		<name>apple</name>
		<pose>Unspecified</pose>
		<truncated>0</truncated>
		<difficult>0</difficult>
		<bndbox>
			<xmin>25</xmin>
			<ymin>42</ymin>
			<xmax>275</xmax>
			<ymax>297</ymax>
		</bndbox>
	</object>
</annotation>


In [4]:
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms as T
import glob

# Create a dictionary to map class names to integer IDs
class_dict = {"apple": 0, "banana": 1, "mixed": 2, "orange": 3}

def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    boxes = []
    labels = []

    for obj in root.iter("object"):
        label = class_dict[obj.find("name").text]

        bbox = obj.find("bndbox")
        xmin = int(bbox.find("xmin").text)
        ymin = int(bbox.find("ymin").text)
        xmax = int(bbox.find("xmax").text)
        ymax = int(bbox.find("ymax").text)

        boxes.append([xmin, ymin, xmax, ymax])
        labels.append(label)

    return {"boxes": boxes, "labels": labels}

In [5]:
class FruitDataset(Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform

        self.imgs = sorted(glob.glob(os.path.join(root, "*.jpg")))
        self.labels = sorted(glob.glob(os.path.join(root, "*.xml")))

    def __getitem__(self, idx):
        img_path = self.imgs[idx]
        label_path = self.labels[idx]

        img = Image.open(img_path).convert("RGB")

        # Parse the XML file
        label_data = parse_xml(label_path)

        # Convert the bounding boxes and labels to tensors
        boxes = torch.as_tensor(label_data["boxes"], dtype=torch.float32)
        labels = torch.as_tensor(label_data["labels"], dtype=torch.int64)

        # Compute the area of the bounding boxes
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # All instances are not crowd
        iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = torch.tensor([idx])
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transform:
            img, target = self.transform(img, target)

        return img, target

    def __len__(self):
        return len(self.imgs)

# Create instances of the FruitDataset
train_dataset = FruitDataset(train_path)
test_dataset = FruitDataset(test_path)

# Number of images in the training and test sets
len(train_dataset), len(test_dataset)


(240, 60)

In [6]:
# Calculate the number of fruits per image in the training set
num_fruits_train = [len(data[1]["labels"]) for data in train_dataset]

# Calculate the number of fruits per image in the test set
num_fruits_test = [len(data[1]["labels"]) for data in test_dataset]

# Compute some statistics
min_fruits_train, max_fruits_train = min(num_fruits_train), max(num_fruits_train)
min_fruits_test, max_fruits_test = min(num_fruits_test), max(num_fruits_test)

min_fruits_train, max_fruits_train, min_fruits_test, max_fruits_test




(1, 9, 1, 5)

In [7]:
def transform(image, target):
    # Resize the image and target
    resize = T.Resize((416, 416))
    image = resize(image)

    new_target = target.copy()
    new_target["boxes"] = target["boxes"] / torch.tensor([image.width, image.height, image.width, image.height])

    # Convert the bounding boxes to YOLO format
    new_target["boxes"] = torch.stack([
        torch.max(new_target["boxes"][:, 0], new_target["boxes"][:, 2]),  # x_center
        torch.max(new_target["boxes"][:, 1], new_target["boxes"][:, 3]),  # y_center
        torch.abs(new_target["boxes"][:, 2] - new_target["boxes"][:, 0]),  # width
        torch.abs(new_target["boxes"][:, 3] - new_target["boxes"][:, 1])  # height
    ], dim=1)

    # Normalize the image
    normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    image = normalize(T.ToTensor()(image))

    return image, new_target


In [8]:
def transform(image, target):
    # Resize the image and target
    resize = T.Resize((416, 416))
    image = resize(image)

    new_target = target.copy()
    new_target["boxes"] = target["boxes"] / torch.tensor([image.width, image.height, image.width, image.height])

    # Convert the bounding boxes to YOLO format
    new_target["boxes"] = torch.stack([
        (new_target["boxes"][:, 0] + new_target["boxes"][:, 2]) / 2,  # x_center
        (new_target["boxes"][:, 1] + new_target["boxes"][:, 3]) / 2,  # y_center
        new_target["boxes"][:, 2] - new_target["boxes"][:, 0],  # width
        new_target["boxes"][:, 3] - new_target["boxes"][:, 1]  # height
    ], dim=1)

    # Normalize the image
    normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    image = normalize(T.ToTensor()(image))

    return image, new_target

In [9]:
# Create instances of the FruitDataset with the transform
train_dataset = FruitDataset(train_path, transform=transform)
test_dataset = FruitDataset(test_path, transform=transform)

# Check the transformation
img, target = train_dataset[0]
img.shape, target

(torch.Size([3, 416, 416]),
 {'boxes': tensor([[0.4075, 0.4375, 0.7764, 0.8029]]),
  'labels': tensor([0]),
  'image_id': tensor([0]),
  'area': tensor([107882.]),
  'iscrowd': tensor([0])})

In [10]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [11]:
from torch.utils.data import DataLoader

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Test the data loading
next(iter(train_loader))

((tensor([[[ 1.1187,  1.0673,  0.9646,  ..., -0.5424, -0.4226, -0.4739],
           [ 1.1015,  1.0331,  0.9303,  ..., -0.2684, -0.0801, -0.0972],
           [ 1.1015,  1.0159,  0.8961,  ...,  0.0227,  0.2453,  0.2453],
           ...,
           [ 2.2318,  2.2318,  2.2489,  ...,  2.2318,  2.2318,  2.2318],
           [ 2.2147,  2.2147,  2.2318,  ...,  2.2318,  2.2318,  2.2318],
           [ 2.1975,  2.2318,  2.2489,  ...,  2.2318,  2.2318,  2.2318]],
  
          [[ 1.7283,  1.6933,  1.6583,  ..., -0.2325, -0.1099, -0.1625],
           [ 1.7108,  1.6758,  1.6232,  ...,  0.0826,  0.2577,  0.2227],
           [ 1.6933,  1.6583,  1.5882,  ...,  0.3978,  0.6078,  0.5378],
           ...,
           [ 0.1527,  0.1877,  0.2577,  ...,  1.2556,  1.2031,  1.1506],
           [ 0.0826,  0.1176,  0.1877,  ...,  1.2556,  1.2031,  1.1506],
           [ 0.0476,  0.1176,  0.1702,  ...,  1.2556,  1.2031,  1.1506]],
  
          [[-0.2881, -0.3404, -0.4450,  ..., -1.2293, -1.2467, -1.3339],
           

In [12]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

num_classes = len(class_dict) + 1

# Load a pre-trained version of the model
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)


device = 'mps' ## USE cpu or cuda if error


model.to(device)



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [13]:
import torch
# Define the training function
def train_one_epoch(model, optimizer, data_loader, device):
    model.train()
    total_loss = 0

    for images, targets in data_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        total_loss += losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_loader)
    return avg_loss

# Define the evaluation function
def evaluate(model, data_loader, device):
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            outputs = model(images)

            # Get the predicted classes
            _, predicted = torch.max(outputs.data, 1)

            # Compare with the targets
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [14]:
import torch

# Define the training parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Train the model for 10 epochs

num_epochs = 10
for epoch in range(num_epochs):
    # Train for one epoch
    train_one_epoch(model, optimizer, train_loader, device, epoch)
    # Update the learning rate
    lr_scheduler.step()


    # Evaluate on the test dataset
    evaluate(model, test_loader, device=device)


TypeError: train_one_epoch() takes 4 positional arguments but 5 were given

In [None]:
# Define the training function
def train_one_epoch(model, optimizer, data_loader, device):
    model.train()

    total_loss = 0
    for images, targets in data_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        total_loss += losses.item()

    return total_loss / len(data_loader)

# Define the evaluation function
@torch.no_grad()
def evaluate(model, data_loader, device):
    model.eval()
    total = 0
    correct = 0

    for images, targets in data_loader:
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    return correct / total

# Training the model for 10 epochs
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, optimizer, train_loader, device)
    accuracy = evaluate(model, test_loader, device)

    print(f"Epoch: {epoch+1}, Train Loss: {train_loss}, Accuracy: {accuracy}")

AssertionError: All bounding boxes should have positive height and width. Found invalid box [1.911520004272461, 0.9384245276451111, 1.3914570808410645, 1.4792898893356323] for target at index 0.