In [3]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import skimage.io
from skimage.segmentation import slic
from skimage.measure import find_contours
from skimage.util import img_as_float
import selectivesearch
from sklearn import svm
import os

In [None]:
def selective_search_proposal(img):
    """
    Generates region proposals using Selective Search.

    Args:
        img: A numpy array representing the input image.

    Returns:
        A list of bounding boxes (x1, y1, x2, y2).
    """
    # Perform selective search
    img_lbl, regions = selectivesearch.selective_search(img, scale=500, sigma=0.9, min_size=10)

    candidates = set()
    for r in regions:
        # excluding same areas
        if r['rect'] in candidates:
            continue
        # excluding small areas
        if r['size'] < (img.shape[0]*img.shape[1]*0.001): #the size is > 0.01% total area of image
            continue
        # distorted rects
        x, y, w, h = r['rect']
        if w > h:
            if w/h>3: #width is 3x >height
                continue
        else:
            if h/w>3: #vice versa
                continue
        candidates.add(r['rect'])

    boxes = [] #to return the bounding box locations
    for x, y, w, h in candidates:
        boxes.append((x, y, x + w, y + h))

    return boxes #List[(x1, y1, x2, y2)], bounding box candidates.

In [None]:
class CNNFeatureExtractor(nn.Module):
    def __init__(self, model_name='alexnet', use_cuda=True):
        super(CNNFeatureExtractor, self).__init__()
        self.use_cuda = use_cuda
        if model_name == 'alexnet':
            self.cnn = models.alexnet(pretrained=True)
            self.cnn = nn.Sequential(*list(self.cnn.children())[:-1]) # Remove classifier layer to get output pre the classifier
            self.feature_size = 256 * 6 * 6  # output dimension before flattening, should be 9216
        else:
            raise NotImplementedError

        if self.use_cuda:
            self.cnn = self.cnn.cuda()
        self.cnn.eval()  # Set to evaluation mode

    def forward(self, images):
        """
        Extracts CNN features from a batch of images.

        Args:
            images: A tensor of shape (B, 3, H, W) representing a batch of images.

        Returns:
            A tensor of shape (B, feature_size) representing the CNN features.
        """
        with torch.no_grad():  # Disable gradient calculation during inference
            features = self.cnn(images)
            features = features.view(features.size(0), -1) # Flatten the convolution layer.
        return features


def warp_proposal(image, bbox, target_size=(227, 227), padding=16):
    """
    Warps a region proposal to the target size, including context padding.

    Args:
        image: A PIL Image object.
        bbox: A tuple (x1, y1, x2, y2) representing the bounding box.
        target_size: A tuple (H, W) representing the target size.
        padding: Number of pixels for context padding (post warping).

    Returns:
        A PIL Image object representing the warped region.
    """
    x1, y1, x2, y2 = bbox
    width, height = image.size #image.width and image.height works as well for Pillow

    #Calculate Source Box Dimension before dilation
    W_proposal = x2-x1
    H_proposal = y2-y1

    #Calculate source box scale to make sure that after it's warped to the target Size, there is Padding pixels of context
    scale_x = float(target_size[0])/(W_proposal + padding*2) #new method with padding, and float cast, for greater precision and accuracy.
    scale_y = float(target_size[1])/(H_proposal + padding*2)

    # Calculate dilated Box
    dilated_width= int(W_proposal+(padding*2/scale_x)) #scale back to find the dilation amount
    dilated_height= int(H_proposal+(padding*2/scale_y))

    center_x = (x1+x2) //2 #avoid floating number issue.

    center_y = (y1+y2) //2

    x1_dilated = center_x - dilated_width//2 #integer floor division
    y1_dilated = center_y - dilated_height//2

    x2_dilated = center_x + dilated_width//2
    y2_dilated = center_y + dilated_height//2

    # Clip box to stay inside the image, so it doesn't access null values
    x1_clip = max(0,x1_dilated)
    y1_clip = max(0,y1_dilated)
    x2_clip = min(width,x2_dilated)
    y2_clip = max(height,y2_dilated)

    # Crop the warped image (using Pillow now)
    warped_image = image.crop((x1_clip,y1_clip,x2_clip,y2_clip))

    warped_image = warped_image.resize(target_size) #and after that you have a resized PIL image

    return warped_image


def extract_features(image_path, bboxes, cnn_model, transform, batch_size=32, use_cuda=True): #use CUDA here again
    """
    Extracts CNN features from region proposals in an image.

    Args:
        image_path: Path to the input image.
        bboxes: A list of bounding boxes (x1, y1, x2, y2).
        cnn_model: The CNNFeatureExtractor model.
        transform: PyTorch transform to apply to the warped images.
        batch_size: Batch size for CNN processing.
        use_cuda: A flag indicates whether to use CUDA.

    Returns:
        A numpy array of shape (num_proposals, feature_size) representing the CNN features.
    """
    image = Image.open(image_path).convert('RGB')
    num_proposals = len(bboxes)
    features = []

    for i in range(0, num_proposals, batch_size):
        batch_bboxes = bboxes[i:i + batch_size]
        batch_images = []
        for bbox in batch_bboxes:
            warped_image = warp_proposal(image, bbox)
            img_tensor = transform(warped_image)
            batch_images.append(img_tensor)

        batch_tensor = torch.stack(batch_images) # List[tensor] => tensor

        if use_cuda:
            batch_tensor = batch_tensor.cuda()

        batch_features = cnn_model(batch_tensor)
        features.append(batch_features.cpu().numpy())

    features = np.concatenate(features, axis=0)  #Join the batches together to return it
    return features

In [8]:
def create_training_data(image_dir, annotation_dir, bboxes, labels, feature_extraction_func,  iou_threshold_pos=0.5, iou_threshold_neg=0.3,  num_negatives_per_positive=3 ):
    """
    Creates training data for SVM classifiers using hard negative mining.

    Args:
        image_dir: Directory containing images.
        annotation_dir: Directory containing annotation files (e.g., PASCAL VOC XML).
        bboxes: bounding boxes from the proposal region (Selective Search most likely)
        feature_extraction_func: A function that takes an image path and bounding boxes and returns CNN features.
        iou_threshold_pos: IoU threshold for positive examples.
        iou_threshold_neg: IoU threshold for negative examples.
        num_negatives_per_positive: the amount of negative samples per positive sample

    Returns:
        A tuple (X, y) where X is a numpy array of CNN features and y is a numpy array of labels.
    """
    X = []
    y = []

    #for each image we go through and get the GT box info and proposals
    for image_name in os.listdir(image_dir): #iterate through all the images
        if not image_name.endswith(".jpg"): #only access jpg.
            continue

        image_path = os.path.join(image_dir, image_name)

        gt_boxes, gt_labels = load_annotations(os.path.join(annotation_dir, image_name + ".xml"))  # Use your annotation loading function
        image_bboxes = bboxes[image_name] #get the image's bounding boxes

        num_gt_boxes = len(gt_boxes)
        img_X, img_y = [], []  # lists to append within an image

        #for gt bounding box append GT box and also top N proposals
        for i, gt_box in enumerate(gt_boxes): #iterate through all the GT Boxes
            #Append positive data here.
            #GT box as positive label
            img_X.append(gt_box)  #append the ground truth box location here
            img_y.append(gt_labels[i])   #append the correct class name

            #for every GT box try to append K proposals with IoU < 0.3
            negative_candidates=[]

            for proposal_box in image_bboxes:
                iou = compute_iou(gt_box, proposal_box)

                if iou < iou_threshold_neg: #if iou < 0.3
                    negative_candidates.append(proposal_box)

            #append top K negatives.
            img_X.extend(negative_candidates[:num_negatives_per_positive])
            img_y.extend([0]*num_negatives_per_positive) # append 0 which is the negative class


        ###########################################################
        #At this point we have all positives and negatives in an image.
        ###########################################################

        #perform feature extraction on ALL proposals before going to next image.
        #the length here would be = numGTbox * N number of proposal * dimension,
        num_boxes = len(img_X) #total boxes
        features = feature_extraction_func(image_path, img_X)

        #Now extend all.
        X.extend(features)
        y.extend(img_y)

    return np.array(X), np.array(y)

def train_svms(X, y, classes, use_cuda=True): #Added Cuda
    """
    Trains a linear SVM classifier for each object category.

    Args:
        X: A numpy array of CNN features of shape (num_examples, feature_size).
        y: A numpy array of labels of shape (num_examples,).
        classes: A list of object category names.
        use_cuda: a flag indicates whether to use CUDA.

    Returns:
        A dictionary mapping class names to trained linear SVM classifiers.
    """
    svms = {}
    #Add a place to save results after each class
    svm_dir="./trained_svms/"
    if not os.path.exists(svm_dir):
        os.makedirs(svm_dir)

    for i, class_name in enumerate(classes):
        print("Training svm model {0}/{1}:{2}".format(i+1,len(classes),class_name))
        #create path to save file with model name

        #if model name exist, skip, so it doesn't need to train again and again.
        outputfile = svm_dir+class_name+".pth" #I added this part. so you don't need to keep training for multiple times. if model is there, it loads and it's done. saves you tremendous amount of time.

        #if already there then skip training
        if os.path.exists(outputfile): #skip it as it's already present
            model = svm.LinearSVC()
            model = torch.load(outputfile)
            svms[class_name] = model
            print("Model was loaded and will continue") #to show the progress,
            continue #then skip.

        #If Model is not found train
        # Select positive and negative samples for this class
        positive_indices = np.where(y == i)[0]
        negative_indices = np.where(y == 0)[0] #Assuming Background Index is Zero

        #SVM cannot take it if it's all one sided, it needs at least one negative/positive example
        if len(positive_indices)<1:
            print("{0} Skipped since number of positive examples are {1}".format(class_name,len(positive_indices)))
            continue
        if len(negative_indices)<1:
            print("{0} Skipped since number of negative examples are {1}".format(class_name,len(negative_indices)))
            continue

        X_train = np.concatenate((X[positive_indices], X[negative_indices]), axis=0)
        y_train = np.concatenate((np.ones(len(positive_indices)), np.zeros(len(negative_indices))))

        # Train a linear SVM classifier
        model = svm.LinearSVC(C=0.01, random_state=42, max_iter=1000) #I changed it a lil bit, #Set C=0.01 for higher regularization, to prevent the model from overfitting too much. and limited # of iters
        model.fit(X_train, y_train)

        svms[class_name] = model

        #save model to load for next use if something happens.
        torch.save(model, outputfile)
        print("Model done and saved in " + outputfile) #to show the progress.

    return svms

In [9]:
def predict(image_path, bboxes, cnn_model, svms, transform, classes, iou_threshold=0.3, confidence_threshold=0.5, use_cuda=True):
    """
    Performs object detection on an image using R-CNN.

    Args:
        image_path: Path to the input image.
        bboxes: A list of bounding boxes (x1, y1, x2, y2) from region proposals.
        cnn_model: The CNNFeatureExtractor model.
        svms: A dictionary mapping class names to trained linear SVM classifiers.
        transform: PyTorch transform to apply to the warped images.
        classes: A list of object category names.
        iou_threshold: IoU threshold for NMS.
        confidence_threshold: the confidence needed to classify as
    Returns:
        A list of dictionaries, where each dictionary represents a detection and contains:
            - 'bbox': A tuple (x1, y1, x2, y2) representing the bounding box.
            - 'class': The predicted object category.
            - 'score': The confidence score.
    """
    features = extract_features(image_path, bboxes, cnn_model, transform, use_cuda=use_cuda)
    detections = []

    for i, class_name in enumerate(classes):
        svm_model = svms[class_name]
        scores = svm_model.decision_function(features) #calculate the desicion,

        #We only get the scores above a confidence thresholds that we determine
        class_detections = []
        for j, score in enumerate(scores):
            if score >= confidence_threshold: #greater than confidence interval.
                class_detections.append((bboxes[j], score))  #create a detection tuple

        #apply NMS

        # Extract bboxes and scores
        bboxes_class, scores_class = zip(*class_detections) #reverse it into 2 lists

        #Convert to same format for simplicity
        bboxes_class = list(bboxes_class)
        scores_class = list(scores_class)

        #apply Non maximum supression
        nms_indices = non_max_suppression(bboxes_class, scores_class, iou_threshold) #the iou threshold are common, and we call the bboxes function
        for idx in nms_indices:
            detections.append({
                'bbox': bboxes_class[idx],
                'class': class_name,
                'score': scores_class[idx]
            })

    return detections

def compute_iou(box1, box2):
    """
    Computes the Intersection over Union (IoU) between two bounding boxes.

    Args:
        box1: A tuple (x1, y1, x2, y2) representing the first bounding box.
        box2: A tuple (x1, y1, x2, y2) representing the second bounding box.

    Returns:
        The IoU between the two bounding boxes.
    """
    x1_intersect = max(box1[0], box2[0])
    y1_intersect = max(box1[1], box2[1])
    x2_intersect = min(box1[2], box2[2])
    y2_intersect = min(box1[3], box2[3])

    intersect_width = max(0, x2_intersect - x1_intersect)
    intersect_height = max(0, y2_intersect - y1_intersect)
    intersect_area = intersect_width * intersect_height

    box1_width = box1[2] - box1[0]
    box1_height = box1[3] - box1[1]
    box1_area = box1_width * box1_height

    box2_width = box2[2] - box2[0]
    box2_height = box2[3] - box2[1]
    box2_area = box2_width * box2_height

    union_area = box1_area + box2_area - intersect_area
    iou = intersect_area / union_area if union_area > 0 else 0

    return iou

def non_max_suppression(boxes, scores, iou_threshold):
    """
    Applies Non-Maximum Suppression (NMS) to a list of bounding boxes.

    Args:
        boxes: A list of bounding boxes (x1, y1, x2, y2).
        scores: A list of confidence scores.
        iou_threshold: IoU threshold for NMS.

    Returns:
        A list of indices of the bounding boxes to keep after NMS.
    """
    # Convert the box coordinates to a NumPy array
    boxes = np.array(boxes)

    # If there are no boxes, return an empty list
    if len(boxes) == 0:
        return []

    # Initialize the list of picked indexes
    picked = []

    # Grab the coordinates of the bounding boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    # Compute the area of the bounding boxes and sort by the score
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = np.argsort(scores)[::-1]

    # Keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # Grab the last index in the indexes list and add the index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        picked.append(i)

        # Find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates for the end of the bounding box
        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        # Compute the width and height of the intersection
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        # Compute the ratio of intersection over union
        iou = (w * h) / area[idxs[:last]] + area[i] - (w * h) #calculate for the rest except the final one.
        iou= (w*h)/(area[idxs[:last]])

        # Delete all indexes from the index list that have intersection over union greater than the provided intersection over union threshold
        idxs = np.delete(idxs, np.concatenate(([last], np.where(iou > iou_threshold)[0])))

    # Return only the bounding boxes that were picked using non-maximum suppression
    return picked

In [10]:
def main():
    # 0. Setup variables
    image_path = "example.jpg" #example image
    annotation_dir="./Annotations/"

    #create class and bboxes as it appears in annotations folder
    class_names = ["Aeroplane", "Bicycle", "Bird", "Boat", "Bottle", "Bus", "Car", "Cat", "Chair", "Cow", "Diningtable", "Dog", "Horse", "Motorbike", "Person", "Pottedplant", "Sheep", "Sofa", "Train","Tvmonitor"] #PASCAL VOC classes from example
    bboxes={} #bboxes["image_name"] will contain proposal bboxes

    #1. Make necessary folders to do all those things.
    image_dir="./JPEGImages/"
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)
        #exit() #I added this part. so you don't need to keep training for multiple times. if model is there, it loads and it's done. saves you tremendous amount of time.

    annotation_dir="./Annotations/"
    if not os.path.exists(annotation_dir):
        os.makedirs(annotation_dir)
        #exit() #I added this part. so you don't need to keep training for multiple times. if model is there, it loads and it's done. saves you tremendous amount of time.

    #Now make a dummy XML folder so that GT Bounding Box are called during the training loop
    for img_file in os.listdir(image_dir):
        base_name = os.path.basename(img_file).split('.')[0]
        bboxes[base_name] = [] #create an image
        bboxes[base_name] = selective_search_proposal(skimage.io.imread(image_dir+img_file)) #put selective search to it.


    # 1. Load Image (placeholder)
    # ... Load your image and annotation data ...
    # For the example, let's assume loading bboxes and setting class_names

    # 2. Initialize CNN Feature Extractor
    transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    cnn_model = CNNFeatureExtractor(use_cuda=True)

    # 3. Create Training Data
    X, y = create_training_data(image_dir, annotation_dir, bboxes, class_names, lambda img,bb: extract_features(img, bb, cnn_model, transform))

    #4. Train SVMs
    svms = train_svms(X, y, class_names)

    # 5. Run Inference on a Test Image
    # Assuming you have test image, bboxes, CNN model, SVMs, transform, and classes
    test_image_path = "test_image.jpg"
    test_image = skimage.io.imread(test_image_path) #i need a BGR in order to apply selective search

    test_bboxes = selective_search_proposal(test_image) #get all the test_bboxes.

    detections = predict(test_image_path, test_bboxes, cnn_model, svms, transform, class_names, iou_threshold=0.3, confidence_threshold=0.5)

    # Print the results
    for det in detections:
        print(f"Detected: {det['class']} with score {det['score']} at {det['bbox']}")


# ---------------------------------------------------------------------
# Helper Functions (Implement these based on your data format)
# ---------------------------------------------------------------------

def load_annotations(annotation_path):
    """
    Loads object bounding boxes and labels from an annotation file (PASCAL VOC XML).

    Args:
        annotation_path: Path to the annotation file.

    Returns:
        A tuple (boxes, labels) where boxes is a list of (x1, y1, x2, y2) bounding boxes
        and labels is a list of corresponding class names.
    """
    # Implement loading XML annotations from PASCAL VOC format
    # ... (Use libraries like xml.etree.ElementTree)
    boxes=[]
    labels=[] #get gt labels and bounding boxes
    return boxes, labels


# ---------------------------------------------------------------------
# Run the Main Program
# ---------------------------------------------------------------------
if __name__ == "__main__":
    main()

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to C:\Users\Debojyoti Das/.cache\torch\hub\checkpoints\alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:13<00:00, 17.8MB/s] 


Training svm model 1/20:Aeroplane
Aeroplane Skipped since number of positive examples are 0
Training svm model 2/20:Bicycle
Bicycle Skipped since number of positive examples are 0
Training svm model 3/20:Bird
Bird Skipped since number of positive examples are 0
Training svm model 4/20:Boat
Boat Skipped since number of positive examples are 0
Training svm model 5/20:Bottle
Bottle Skipped since number of positive examples are 0
Training svm model 6/20:Bus
Bus Skipped since number of positive examples are 0
Training svm model 7/20:Car
Car Skipped since number of positive examples are 0
Training svm model 8/20:Cat
Cat Skipped since number of positive examples are 0
Training svm model 9/20:Chair
Chair Skipped since number of positive examples are 0
Training svm model 10/20:Cow
Cow Skipped since number of positive examples are 0
Training svm model 11/20:Diningtable
Diningtable Skipped since number of positive examples are 0
Training svm model 12/20:Dog
Dog Skipped since number of positive ex

FileNotFoundError: No such file: 'c:\Users\Debojyoti Das\Downloads\History\Rich feature hierarchies for accurate object detection and semantic segmentation\test_image.jpg'