In [1]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

In [2]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
import torch.nn as nn
import torch.nn.functional as F


class Net_4_layers(nn.Module):
    """Small CNN classifier: two conv/pool stages followed by four linear layers.

    The first linear layer takes 16 * 6 * 6 features, which is what the two
    5x5 convolutions and two 2x2 poolings produce for a single-channel
    36x36 input (36 -> 32 -> 16 -> 12 -> 6). The network returns two raw,
    un-normalised scores per sample (no softmax is applied).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 6 * 6, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 2)

    def forward(self, x):
        """Compute the two class scores for a batch of images.

        Args:
            x: tensor of shape (N, 1, 36, 36); a single unbatched image of
                shape (1, 36, 36) is also accepted and treated as N = 1.
        Returns:
            Tensor of shape (N, 2) holding the raw output of the last layer.
        Raises:
            RuntimeError: raised by the first linear layer when the spatial
                size of ``x`` does not reduce to 16 * 6 * 6 features.
        """
        # Add the batch axis explicitly for an unbatched (C, H, W) image so
        # that the per-sample flatten below keeps one row per image.
        if x.dim() == 3:
            x = x.unsqueeze(0)

        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        # Flatten per sample. A view(-1, 576) would silently spill extra
        # spatial positions into the batch axis for inputs that are not
        # 36x36; keeping the batch axis fixed makes fc1 reject them instead.
        x = x.flatten(start_dim=1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Build the classifier, then move its parameters onto the selected device.
model_1 = Net_4_layers()
model_1 = model_1.to(device)

In [4]:
# Restore the trained weights. map_location remaps the stored tensors onto the
# device chosen above, so a checkpoint saved from a GPU still loads on a
# CPU-only machine.
model_1.load_state_dict(
    torch.load('models/4-layer-lr-scheduling-net.pth', map_location=device)
)
# Switch to inference mode; as the last expression this also displays the model.
model_1.eval()

Net_4_layers(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=2, bias=True)
)

In [7]:
# Sliding-window geometry used by the detection loop
step_size = 18  # pixels between consecutive window positions (half a window side)
window_size = (36, 36)  # window extent; the loop reads [0] as width and [1] as height

In [46]:
# Photograph that will be scanned for faces
image_path = 'target_image_2.jpg'
image = Image.open(image_path)

# PIL reports the size as (width, height)
img_width, img_height = image.size

# Preprocessing applied before every forward pass: collapse to one grayscale
# channel, convert to a tensor, then normalise. With mean 0 and std 1 the
# normalisation step leaves the values unchanged.
transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0,), std=(1,)),
    ]
)

# Full-resolution image as a tensor on the selected device
image_tensor = transform(image).to(device)

In [47]:
import math

# Detections gathered over every scale, expressed in original-image pixels as
# (left, top, right, bottom, confidence) tuples of plain Python numbers.
face_boxes = []

# Perform sliding window detection at multiple scales (zoom levels).
# np.arange excludes the stop value, so the scales run from 0.1 up to 0.95.
for scale_factor in np.arange(0.1, 1.0, 0.05):
    # Dimensions of the image at the current scale
    scaled_img_width = int(img_width * scale_factor)
    scaled_img_height = int(img_height * scale_factor)

    # Skip scales at which the image cannot hold a single window. Checking
    # before resizing also avoids asking PIL for a zero-sized image.
    if scaled_img_width < window_size[0] or scaled_img_height < window_size[1]:
        continue

    # Resize the image to the current scale and convert it to a tensor
    scaled_image = image.resize((scaled_img_width, scaled_img_height))
    scaled_image_tensor = transform(scaled_image).to(device)

    # The "+ 1" keeps the last fully contained window position reachable;
    # without it nothing is scanned when a side equals the window size.
    for y in range(0, scaled_img_height - window_size[1] + 1, step_size):
        for x in range(0, scaled_img_width - window_size[0] + 1, step_size):
            # Extract the sub-image under the sliding window (channels, rows, cols)
            sub_image = scaled_image_tensor[:, y:y + window_size[1], x:x + window_size[0]]

            # Classify the window; unsqueeze adds the batch dimension
            with torch.no_grad():
                output = model_1(sub_image.unsqueeze(0))

            # Highest raw class score and the index of the class that produced it
            confidence, predicted = torch.max(output, 1)

            if predicted.item() == 1:
                # Map the window back to coordinates in the original image
                left = int(x / scale_factor)
                top = int(y / scale_factor)
                right = int((x + window_size[0]) / scale_factor)
                bottom = int((y + window_size[1]) / scale_factor)

                # Store the score as a float rather than a 1-element tensor
                face_boxes.append((left, top, right, bottom, confidence.item()))

In [12]:
def nms_pytorch(P: torch.Tensor, thresh_iou: float) -> list:
    """
    Apply non-maximum suppression to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        P: (tensor) The box predictions, Shape: [num_boxes, 5]. Each row is
            (x1, y1, x2, y2, score). A tensor with no elements (no
            detections) is accepted and yields an empty result.
        thresh_iou: (float) The overlap thresh for suppressing unnecessary boxes.
            A box survives only while its IoU with every kept box is
            strictly below this value.
    Returns:
        A list of the kept rows of P (1-D tensors of length 5), ordered
        from the highest score to the lowest.
    """

    # Nothing to suppress. This also covers the 1-D tensor produced by
    # torch.tensor([]), which cannot be indexed by column below.
    if P.numel() == 0:
        return []

    # coordinates of every prediction box present in P
    x1 = P[:, 0]
    y1 = P[:, 1]
    x2 = P[:, 2]
    y2 = P[:, 3]

    # confidence score of every box
    scores = P[:, 4]

    # area of every box in P
    areas = (x2 - x1) * (y2 - y1)

    # indices of the boxes sorted by ascending score, so the best
    # remaining candidate is always the last entry
    order = scores.argsort()

    # filtered prediction boxes
    keep = []

    while len(order) > 0:

        # index of the remaining prediction with the highest score (S)
        idx = order[-1]

        # S always survives
        keep.append(P[idx])

        # remove S from the candidates
        order = order[:-1]

        # no candidates left to compare against S
        if len(order) == 0:
            break

        # coordinates of the remaining candidates
        xx1 = torch.index_select(x1, dim=0, index=order)
        xx2 = torch.index_select(x2, dim=0, index=order)
        yy1 = torch.index_select(y1, dim=0, index=order)
        yy2 = torch.index_select(y2, dim=0, index=order)

        # corners of the intersection of every candidate with S
        xx1 = torch.max(xx1, x1[idx])
        yy1 = torch.max(yy1, y1[idx])
        xx2 = torch.min(xx2, x2[idx])
        yy2 = torch.min(yy2, y2[idx])

        # width and height of the intersections; clamped at zero because
        # non-overlapping boxes would otherwise give negative extents
        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)

        # intersection area
        inter = w * h

        # areas of the remaining candidates
        rem_areas = torch.index_select(areas, dim=0, index=order)

        # union of every candidate with S (areas[idx] is the area of S)
        union = (rem_areas - inter) + areas[idx]

        # IoU of every candidate with S
        IoU = inter / union

        # keep only the candidates whose IoU with S is below thresh_iou
        mask = IoU < thresh_iou
        order = order[mask]

    return keep

In [50]:
# Pack the detections into a [num_boxes, 5] float tensor. The reshape keeps the
# tensor two-dimensional even when nothing was detected, so the column
# indexing inside nms_pytorch does not fail on an empty list.
face_boxes_tensor = torch.tensor(face_boxes, dtype=torch.float32).reshape(-1, 5)
filtered_boxes = nms_pytorch(face_boxes_tensor, 0.1)
print(len(face_boxes), len(filtered_boxes))  # number of faces detected before and after the NMS

97 14


In [51]:
import cv2

# Load the original image for drawing. By default cv2.imread rotates the image
# according to its EXIF orientation tag, while the PIL image used for detection
# is not rotated; IMREAD_IGNORE_ORIENTATION keeps both on the same pixel grid
# so the boxes land where they were detected.
image_cv = cv2.imread(image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)

# cv2.imread signals failure by returning None instead of raising
if image_cv is None:
    raise FileNotFoundError(f"cv2.imread could not read image: {image_path}")

# Draw bounding boxes on the image
for box in filtered_boxes:
    # Each box is (left, top, right, bottom, confidence); OpenCV needs integer pixels
    left, top, right, bottom = (int(value) for value in box[:4])
    cv2.rectangle(image_cv, (left, top), (right, bottom), (0, 255, 0), 2)

# Create a window and display the image
cv2.imshow('Detected Faces', image_cv)

# Wait for a key press and then close the window
cv2.waitKey(0)
cv2.destroyAllWindows()