Import Required Modules

In [None]:
import torch
from torch.utils.data import DataLoader
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.datasets import ImageFolder
from torchvision import transforms
import torchvision.transforms as T
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [None]:
# Dataset classes (foreground only; the background class is implicit)
CLASSES = ['cardboard', 'glass', 'metal']

# Load the pre-trained Faster R-CNN model with a ResNet-50 FPN backbone
model = fasterrcnn_resnet50_fpn(pretrained=True)

# torchvision detection heads reserve class index 0 for background, so the
# predictor needs one output per dataset class plus one for background.
num_classes = len(CLASSES) + 1

# NOTE(review): because index 0 means background, training targets should use
# labels 1..len(CLASSES); YOLO class ids are 0-based and need shifting by one.

# Number of input features expected by the box classification head
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Replace the pre-trained head with a new one sized for this dataset
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Create custom dataset

Bounding box conversion (YOLO format to Pascal VOC format)

In [None]:
def convert_yolo_to_pascal(box, img_width, img_height):
    """Turn a YOLO box into Pascal VOC corner coordinates.

    Args:
        box: Four values ``(x_center, y_center, width, height)``.
        img_width: Factor applied to the horizontal coordinates.
        img_height: Factor applied to the vertical coordinates.

    Returns:
        ``[x_min, y_min, x_max, y_max]``. The unscaled corners are clamped to
        the range [0, 1] before being multiplied by the image dimensions.
    """
    x_center, y_center, width, height = box

    # Corners in the same units as the input, clamped to the unit square
    left = max(0, x_center - width / 2)
    top = max(0, y_center - height / 2)
    right = min(1, x_center + width / 2)
    bottom = min(1, y_center + height / 2)

    return [
        left * img_width,
        top * img_height,
        right * img_width,
        bottom * img_height,
    ]


In [None]:
import os
from torch.utils.data import Dataset
from PIL import Image

# Image preprocessing pipeline: currently a single step that converts the
# loaded image to a tensor (no resizing or normalization is applied here)
transform = T.Compose([T.ToTensor()])
# Custom Dataset class or using an existing one
class ModelDataset(Dataset):
    """Object-detection dataset pairing images with YOLO-format label files.

    Every file in ``image_dir`` is matched to ``<stem>.txt`` in ``label_dir``.
    Each label line is ``class_id x_center y_center width height``; the box is
    converted to absolute Pascal VOC corners scaled to the size of the image
    that is actually returned, so boxes and pixels always agree.

    Args:
        image_dir: Directory containing the images.
        label_dir: Directory containing one ``.txt`` label file per image.
        img_width: Fallback width for scaling boxes, used only when the size
            of the returned image cannot be determined.
        img_height: Fallback height, used under the same condition.
        transform: Optional callable applied to the loaded RGB PIL image.
        label_offset: Value added to every class id read from the label
            files. Defaults to 0 (ids are used unchanged). torchvision
            detection models reserve label 0 for background, so pass 1 when
            training such a model on 0-based YOLO class ids.
    """

    def __init__(self, image_dir, label_dir, img_width=1024, img_height=1024, transform=None,
                 label_offset=0):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.img_width = img_width
        self.img_height = img_height
        self.transform = transform
        self.label_offset = label_offset
        self.image_filenames = sorted(os.listdir(image_dir))  # Sort for consistent order

    def __len__(self):
        """Return the number of files found in ``image_dir``."""
        return len(self.image_filenames)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, target)`` where ``target`` is a dict with ``"boxes"``
            (float32, shape ``(N, 4)``) and ``"labels"`` (int64, shape ``(N,)``).

        Raises:
            FileNotFoundError: If the image file no longer exists.
            RuntimeError: If the image cannot be opened or decoded.
        """
        img_filename = self.image_filenames[idx]
        img_path = os.path.join(self.image_dir, img_filename)

        # Check if image exists
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image file {img_path} not found.")

        # Load image; the context manager closes the underlying file handle
        # once the pixel data has been copied by convert()
        try:
            with Image.open(img_path) as image_file:
                image = image_file.convert("RGB")
        except Exception as e:
            raise RuntimeError(f"Error loading image {img_path}: {e}") from e

        # Apply transforms first so boxes can be scaled to the final image size
        if self.transform:
            image = self.transform(image)
        width, height = self._output_size(image)

        # Load corresponding label file
        label_filename = os.path.splitext(img_filename)[0] + ".txt"
        label_path = os.path.join(self.label_dir, label_filename)
        boxes, labels = self._read_annotations(label_path, width, height)

        # Convert to tensors (explicit empty shapes keep images without objects valid)
        boxes = torch.tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4), dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64) if labels else torch.zeros((0,), dtype=torch.int64)

        target = {"boxes": boxes, "labels": labels}

        return image, target

    def _output_size(self, image):
        """Return ``(width, height)`` of the image handed to the model."""
        if isinstance(image, torch.Tensor) and image.dim() >= 2:
            # Tensor images are laid out (..., H, W)
            return int(image.shape[-1]), int(image.shape[-2])
        size = getattr(image, "size", None)
        if isinstance(size, tuple) and len(size) == 2:
            # PIL images expose (width, height)
            return size
        # Unknown image type: fall back to the configured dimensions
        return self.img_width, self.img_height

    @staticmethod
    def _parse_label_line(line):
        """Parse one label line into ``(class_id, [xc, yc, w, h])`` or ``None`` if malformed."""
        parts = line.split()
        if len(parts) != 5:
            return None  # Skip incorrect formats
        try:
            class_id = int(parts[0])
            coords = [float(value) for value in parts[1:]]
        except ValueError:
            return None  # Non-numeric fields are treated like any other bad line
        return class_id, coords

    def _read_annotations(self, label_path, width, height):
        """Read a label file and return ``(boxes, labels)`` as plain lists."""
        boxes, labels = [], []
        if not os.path.exists(label_path):
            print(f"Warning: Label file {label_path} not found. Assigning empty labels.")
            return boxes, labels

        with open(label_path, "r") as f:
            for line in f:
                parsed = self._parse_label_line(line)
                if parsed is None:
                    continue
                class_id, yolo_box = parsed
                # Convert YOLO to Pascal format (x_min, y_min, x_max, y_max)
                x_min, y_min, x_max, y_max = convert_yolo_to_pascal(yolo_box, width, height)
                # Drop zero-area or inverted boxes (this form also rejects NaN);
                # torchvision detection models raise on such boxes in training
                if not (x_max > x_min and y_max > y_min):
                    continue
                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(class_id + self.label_offset)
        return boxes, labels

    

In [None]:
from torch.utils.data import dataloader
# Build the training and validation datasets from their image/label folders
train_dataset = ModelDataset(
    image_dir="./datasets/dataset/train/images",
    label_dir="./datasets/dataset/train/labels",
    transform=transform,
)
val_dataset = ModelDataset(
    image_dir="./datasets/dataset/val/images",
    label_dir="./datasets/dataset/val/labels",
    transform=transform,
)


def collate_fn(batch):
    """Regroup a list of (image, target) samples into per-field tuples.

    Samples are kept side by side in tuples rather than stacked, so every
    image can carry a different number of annotated objects.
    """
    return tuple(zip(*batch))


# Wrap the datasets in loaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)
val_loader = DataLoader(val_dataset, collate_fn=collate_fn, shuffle=False, batch_size=4)

# Sanity check: print the first image's shape and its annotation, then stop
for images, targets in train_loader:
    print(images[0].shape, targets[0])
    break

Setup dataloader

Training

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimise only the parameters that still require gradients
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Step scheduler: multiplies the learning rate by 0.1 every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Train the model
print("Start training")
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for images, targets in train_loader:
        # Move every image and every target tensor to the training device
        images = [image.to(device) for image in images]
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        optimizer.zero_grad()

        # With targets supplied, the model returns a dict of loss terms
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        losses.backward()
        optimizer.step()
        train_loss += losses.item()

    # Advance the schedule once per epoch and report the mean batch loss
    lr_scheduler.step()
    print(f'Epoch: {epoch + 1}, Loss: {train_loss / len(train_loader)}')
print("Training complete!")

Evaluate

In [None]:
# Select the device again and make sure the model lives on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Switch to evaluation mode before running predictions
model.eval()

# Run the validation set through the model without tracking gradients
with torch.no_grad():
    for images, targets in val_loader:
        images = [img.to(device) for img in images]
        predictions = model(images)
        # Show the predicted boxes and labels for the first image of each batch
        first_prediction = predictions[0]
        print(first_prediction['boxes'])
        print(first_prediction['labels'])

Test on a single image

In [None]:
import cv2
from PIL import Image, ImageDraw, ImageFont
# Load the image, forcing three RGB channels as the dataset loader does;
# the context manager closes the file once the pixels have been copied
with Image.open("./datasets/dataset/val/images/3a5036dd-cardboard9.jpg") as image_file:
    img = image_file.convert("RGB")

# Apply the same transformation as for training, then move the tensor to the
# device that holds the model's weights (avoids a CPU/CUDA mismatch error)
model_device = next(model.parameters()).device
img = transform(img).to(model_device)

# Model prediction: the model takes a list of 3-D image tensors
model.eval()
with torch.no_grad():
    prediction = model([img])

# Print the predicted bounding boxes and labels
print(prediction[0]['boxes'])
print(prediction[0]['labels'])


Save model

In [None]:

import torch

# Persist only the model's state dict (its weights) to disk
state_dict = model.state_dict()
torch.save(state_dict, "FR-CNNmodel.pth")

Load Model

Load the saved weights


In [None]:
# Read the saved weights onto the CPU, then copy them into the existing model
saved_state = torch.load("FR-CNNmodel.pth", map_location=torch.device('cpu'))
model.load_state_dict(saved_state)
model.eval()  # Set to evaluation mode

In [None]:
# Initialize the webcam
cap = cv2.VideoCapture(0)  # 0 for default webcam

# Define transformation to convert frame to a tensor
transform = T.Compose([T.ToTensor()])

# Run inference on whichever device currently holds the model's weights, so
# the input tensor and the model never end up on different devices
model_device = next(model.parameters()).device
model.eval()

# try/finally guarantees the camera and windows are released even if a frame
# fails to process part-way through the loop
try:
    while cap.isOpened():
        # Read a frame from the webcam
        success, frame = cap.read()
        if not success:
            print("Error reading frame from webcam.")
            break

        # Convert BGR (OpenCV order) to RGB, then to a (C, H, W) float tensor
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame_rgb).to(model_device)

        # Run inference; the model takes a list of 3-D image tensors
        with torch.no_grad():
            outputs = model([frame_tensor])

        # Bring predictions back to the CPU before converting to NumPy
        # (calling .numpy() on a CUDA tensor raises)
        detections = outputs[0]
        scores = detections['scores'].cpu().numpy()
        labels = detections['labels'].cpu().numpy()
        boxes = detections['boxes'].cpu().numpy()

        # Draw bounding boxes if confidence > 0.5
        for score, label_id, box in zip(scores, labels, boxes):
            if score > 0.5:
                x1, y1, x2, y2 = map(int, box)  # Convert to integers
                label_id = int(label_id)
                # NOTE(review): assumes predicted ids index CLASSES directly,
                # matching the ids used by the training labels; adjust the
                # lookup if labels are shifted to reserve 0 for background.
                # Unknown ids get a generic name instead of an IndexError.
                if 0 <= label_id < len(CLASSES):
                    label = CLASSES[label_id]
                else:
                    label = f"class {label_id}"
                confidence = score

                # Draw bounding box and caption on the original BGR frame
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                text = f"{label}: {confidence:.2f}"
                cv2.putText(frame, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Display the annotated frame
        cv2.imshow("Faster R-CNN Object Detection", frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the webcam and close all windows
    cap.release()
    cv2.destroyAllWindows()