In [1]:
# Required libraries
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_fasterrcnn_model(num_classes):
    # Load a pre-trained model
    model = fasterrcnn_resnet50_fpn(pretrained=True)
    
    # Replace the classifier with a new one for custom number of classes
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    return model

# Example usage for a dataset with 5 classes (including background)
model = get_fasterrcnn_model(5)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "
Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /home/ellakiya/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [02:28<00:00, 1.13MB/s] 


In [3]:
from ultralytics import YOLO

def get_yolo_model(model_size='yolov8n.pt'):
    # Load a pre-trained YOLO model
    model = YOLO(model_size)
    return model

# Example usage
yolo_model = get_yolo_model()

In [9]:
from torch.utils.data import Dataset

class RobotVisionDataset(Dataset):
    def __init__(self, image_paths, annotations, transforms=None):
        self.image_paths = image_paths
        self.annotations = annotations
        self.transforms = transforms
        
    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = Image.open(img_path).convert("RGB")
        
        # Get annotations for this image
        anns = self.annotations[idx]
        boxes = anns['boxes']
        labels = anns['labels']
        
        # Convert to tensors
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        
        if self.transforms:
            img = self.transforms(img)
            
        return img, target
    
    def __len__(self):
        return len(self.image_paths)

In [10]:
def train_model(model, train_loader, optimizer, device, num_epochs=10):
    model.train()
    model.to(device)
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        
        for images, targets in train_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            optimizer.zero_grad()
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            losses.backward()
            optimizer.step()
            
            running_loss += losses.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")
    
    return model

In [11]:
def detect_objects(model, image_path, confidence_threshold=0.7):
    # Load image
    img = Image.open(image_path).convert("RGB")
    img_tensor = F.to_tensor(img).unsqueeze(0)
    
    # Set model to eval mode and run inference
    model.eval()
    with torch.no_grad():
        predictions = model(img_tensor)
    
    # Process predictions
    pred = predictions[0]
    boxes = pred['boxes'][pred['scores'] > confidence_threshold]
    labels = pred['labels'][pred['scores'] > confidence_threshold]
    scores = pred['scores'][pred['scores'] > confidence_threshold]
    
    # Convert to numpy for visualization
    img_np = np.array(img)
    
    # Draw bounding boxes
    for box, label, score in zip(boxes, labels, scores):
        box = box.cpu().numpy().astype(int)
        cv2.rectangle(img_np, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
        cv2.putText(img_np, f"{label}: {score:.2f}", (box[0], box[1]-10), 
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    
    return img_np

In [14]:
def evaluate_model(model, test_loader, device):
    model.eval()
    model.to(device)
    
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for images, targets in test_loader:
            images = list(image.to(device) for image in images)
            predictions = model(images)
            
            all_preds.extend(predictions)
            all_targets.extend(targets)
    
    # Calculate mAP (mean Average Precision)
    # Note: In practice, you'd use a proper evaluation function
    # This is a simplified placeholder
    mAP = calculate_mAP(all_preds, all_targets)
    
    print(f"Model mAP: {mAP:.4f}")
    return mAP