We will start by setting up our environment on Google Colab

In [None]:
import torch

# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# On a GPU runtime this shows a CUDA device; on a CPU-only runtime it shows "cpu".
print(device)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the coursework files can be read from it.
drive.mount('/content/drive')

We will now recall the path in our Google Drive where we uploaded the coursework materials

In [None]:
import os

# Location of the coursework folder inside "My Drive".
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Colab Notebooks/CompVision_Labs/Computer Vision Coursework'
# Anchor the path on the absolute mount point passed to drive.mount() so that it
# keeps working even if the notebook's working directory is changed.
GOOGLE_DRIVE_PATH = os.path.join('/content/drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
# Listing the folder doubles as a check that the path exists.
print(os.listdir(GOOGLE_DRIVE_PATH))

We will now define the paths to the folders holding the training images and their labels

In [None]:
# Training images and their label files live in sibling folders of the dataset.
images_folder, labels_folder = (
    os.path.join(GOOGLE_DRIVE_PATH, subpath)
    for subpath in ('CV2024_CW_Dataset (1)/train/images',
                    'CV2024_CW_Dataset (1)/train/labels')
)

print(images_folder)

We will now list the image files and the label files found in those two folders

In [None]:
# os.listdir returns entries in arbitrary order; sorting fixes the dataset order
# so that the seeded train/test split used later is reproducible between runs.
images_files = sorted(os.listdir(images_folder))
labels_files = sorted(os.listdir(labels_folder))

print(images_folder)
# Each image is expected to have one label file, so the two counts should match.
print("Number of image files:", len(images_files))
print("Number of label files:", len(labels_files))

We will now build the path for the test images

In [None]:
# Folder with the held-out test images, and the file names found in it.
test_img_path = os.path.join(GOOGLE_DRIVE_PATH,'CV2024_CW_Dataset (1)/test/images')
test_img_files = os.listdir(test_img_path)
# File names of the matching test labels.
test_lbl_path = os.path.join(GOOGLE_DRIVE_PATH,'CV2024_CW_Dataset (1)/test/labels')
test_lbl_files = os.listdir(test_lbl_path)

We will first load, resize and normalize all the images in the dataset. We convert them to grayscale to simplify the analysis and because our chosen descriptors operate on grayscale images. We open each image in the image list, transform it, append it to the new images list, and build the label list in parallel.

In [None]:
import torch
from torchvision import datasets, transforms
from PIL import Image
from google.colab import files
from skimage import io, color, img_as_ubyte
#We will define the transformations we want to appluy for each image

# Preprocessing applied to every image: resize to 224x224, convert to a single
# grayscale channel, turn into a float tensor in [0, 1], then rescale to [-1, 1].
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
 ])

# Parallel lists: images[k] is the transformed image whose label is labels[k].
images = []
labels = []

for image_file in images_files:
  image_path = os.path.join(images_folder, image_file)

  # The label file shares the image's base name and has a .txt extension.
  label_file = os.path.splitext(image_file)[0] + ".txt"
  label_path = os.path.join(labels_folder, label_file)

  # The context manager closes the image file as soon as it has been transformed,
  # so file handles do not pile up while looping over the whole dataset.
  with Image.open(image_path) as image_pil:
    image = transform(image_pil)

  with open(label_path, 'r') as f:
    label = f.read().strip()

  # Append only after both reads succeeded so the two lists stay aligned.
  images.append(image)
  labels.append(label)

print("Labels:", labels)
# Print a summary instead of dumping every tensor into the cell output.
print("Images:", len(images), "tensors of shape", tuple(images[0].shape) if images else None)


We will now convert the labels in the list to numerical values

In [None]:
# Labels were read from text files as strings; cast each one to an integer class id.
labels = list(map(int, labels))

In [None]:
# Print the labels one per line.
for item in labels:
  print(item)

In [None]:
# Show the whole label list at once.
print(labels)

After we obtained 2 separate lists with the transformed images and labels, we will create tuples of images and labels so we correlate them before applying our classification models. Each tuple will contain the transformed image and the corresponding label. We will use zip to make the process more efficient

In [None]:
# Pair every transformed image with the label at the same position in `labels`.
data = list(zip(images, labels))

We will do the same for test images

We will now apply 2 different descriptors that we will use for face recognition: HOG and LBP descriptors to each image in the dataset. We will create two lists containing the HOG and the LBP descriptors of each image so that we can use both for our classification models and see which works best

LBP descriptors

In [None]:
import cv2
from skimage import feature
import numpy as np
from skimage.feature import local_binary_pattern



#Compute the LBP descriptor
def compute_lbp(image):
  """Return the normalised histogram of uniform LBP codes of `image`.

  The Local Binary Pattern operator is run with radius 3 and 8 * 3 = 24
  sampling points using the 'uniform' method. The resulting code image is
  summarised as a histogram with n_points + 2 bins whose entries sum to
  (approximately) one.

  Args:
    image: 2-D array passed unchanged to `local_binary_pattern`.

  Returns:
    1-D float array holding the normalised LBP histogram.
  """
  radius = 3
  n_points = 8 * radius

  lbp_codes = local_binary_pattern(image, n_points, radius, 'uniform')

  # One bin per integer code value in [0, n_points + 2).
  bin_edges = np.arange(0, n_points + 3)
  counts, _ = np.histogram(lbp_codes.ravel(), bins=bin_edges, range=(0, n_points + 2))

  # Convert counts to frequencies; the epsilon guards against division by zero.
  counts = counts.astype("float")
  counts /= (counts.sum() + 1e-7)

  return counts


We will now store descriptors of each image

In [None]:
images_lbp = []

for image, _ in data:
  # The tensors were normalised to [-1, 1] with Normalize(mean=0.5, std=0.5).
  # Casting them straight to uint8 truncates almost every pixel to 0, so undo
  # the normalisation and rescale to 0-255 intensities before the cast.
  image_uint8 = ((image * 0.5 + 0.5) * 255).round().clamp(0, 255)
  image_np = image_uint8.squeeze().numpy().astype(np.uint8)

  # One LBP histogram per image, stored in the same order as `data`.
  images_lbp.append(compute_lbp(image_np))

We will now make tuples of the image descriptors and labels for each descriptor before training our SVM model

In [None]:
# Pair every LBP histogram with the label at the same position in `labels`.
data_lbp = list(zip(images_lbp, labels))

We will now extract the descriptors for each image and train the SVM model. Here, the image descriptors are the predictors while the label is the target variable. We start with a basic SVM.


We will now build the model for the LBP descriptors

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import time
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Separate the (descriptor, label) pairs into predictors and targets.
X = [image_lbp for image_lbp, label in data_lbp]
y = [label for image_lbp, label in data_lbp]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

start_time = time.time()

# Linear SVM with a one-vs-rest decision function.
svm_model = SVC(kernel='linear', decision_function_shape='ovr')
svm_model.fit(X_train, y_train)

end_time = time.time()
training_time = end_time - start_time
print("Training Time:", training_time, "seconds")

# Predict once and derive every metric from the same predictions.
y_pred = svm_model.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Compute performance metrics
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
# Stored as `f1` rather than `f1_score`: later cells import the sklearn function
# `f1_score`, and rebinding that name to a float here would break them if this
# cell is re-run afterwards.
f1 = metrics.f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion matrix to assess model performance: rows are true labels,
# columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Now I will Build my MLP model with one hidden Layer for the analysis. Because we have a 3 class classification task, we will use the softmax activation function for the output layer and the rectified linear unit activation function between the hidden layers.

In [None]:
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

class MLP1(nn.Module):
  """Multi-layer perceptron with one hidden ReLU layer and a softmax output.

  Args:
    input_size: Number of input features.
    hidden_size1: Number of units in the hidden layer.
    output_size: Number of output classes.
  """

  def __init__(self, input_size, hidden_size1, output_size):
    super().__init__()
    self.fc1 = nn.Linear(input_size, hidden_size1)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_size1, output_size)
    # Normalise over the class axis explicitly. Without `dim`, PyTorch emits a
    # deprecation warning and guesses the axis from the input rank, which is
    # wrong for inputs with more than two dimensions.
    # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself, so training
    # on these probabilities applies softmax twice; consider returning logits.
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x):
    """Return class probabilities for `x`; its last axis holds the features."""
    hidden = self.relu(self.fc1(x))
    return self.softmax(self.fc2(hidden))



We will now implement an MLP with 2 hidden layers

In [None]:
class MLP2(nn.Module):
  """Multi-layer perceptron with two hidden ReLU layers and a softmax output.

  Args:
    input_size: Number of input features.
    hidden_size1: Number of units in the first hidden layer.
    hidden_size2: Number of units in the second hidden layer.
    output_size: Number of output classes.
  """

  def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
    super().__init__()
    self.fc1 = nn.Linear(input_size, hidden_size1)
    self.relu1 = nn.ReLU()
    self.fc2 = nn.Linear(hidden_size1, hidden_size2)
    self.relu2 = nn.ReLU()
    self.fc3 = nn.Linear(hidden_size2, output_size)
    # Normalise over the class axis explicitly. Without `dim`, PyTorch emits a
    # deprecation warning and guesses the axis from the input rank, which is
    # wrong for inputs with more than two dimensions.
    # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself, so training
    # on these probabilities applies softmax twice; consider returning logits.
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x):
    """Return class probabilities for `x`; its last axis holds the features."""
    hidden = self.relu1(self.fc1(x))
    hidden = self.relu2(self.fc2(hidden))
    return self.softmax(self.fc3(hidden))

We will now initialize the model and set the hyperparameters

In [None]:
# Search grid for the MLP experiments: the grid-search cells train one model for
# every combination of these values.
learning_rate = [0.001, 0.01, 0.1 ]
momentum = [0.5, 0.95]
weight_decay = (0.0, 0.1)
hidden_size1 = [64, 128, 256]
# Sizes of the second hidden layer; only the two-hidden-layer MLP uses them.
hidden_size2 = [64, 128, 256]


# Number of output units passed to the MLP constructors.
output_size = 3

In [None]:
# Loss used by both MLP grid searches.
# NOTE(review): per the PyTorch docs nn.CrossEntropyLoss expects unnormalised
# logits, while MLP1/MLP2 end in a softmax layer; confirm this is intended.
criterion = nn.CrossEntropyLoss()


We will now train the MLP with one hidden layer on the LBP descriptors, trying every combination of the hyperparameters defined above

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Feature matrix (one LBP histogram per row) and integer class labels.
X = np.array([image_lbp for image_lbp, label in data_lbp], dtype=np.float32)
y = np.array([label for image_lbp, label in data_lbp], dtype=np.int32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

input_size = X_train.shape[1]

# Seed the weight initialisation so the grid search gives the same result on re-runs.
torch.manual_seed(42)

best_accuracy = 0.0
best_hyperparameters = None
# Copy of the weights of the best configuration seen so far.
best_state = None

# Test accuracy of every configuration, in training order, for the final plot.
accuracy_values = []

epochs = 50
y_test_np = y_test.numpy()

for lr in learning_rate:
  for mt in momentum:
    for hs1 in hidden_size1:
      for wd in weight_decay:
        print(f'lr={lr}, momentum={mt}, hidden_size1={hs1}, weight_decay={wd}')

        # A fresh model for each combination of hyperparameters.
        MLP1_model = MLP1(input_size, hs1, output_size)
        optimizer = optim.SGD(MLP1_model.parameters(), lr=lr, momentum=mt, weight_decay=wd)

        MLP1_model.train()

        # Full-batch gradient descent: one update per epoch on the whole training set.
        for epoch in range(epochs):
          y_pred = MLP1_model(X_train)
          loss = criterion(y_pred, y_train)

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

        MLP1_model.eval()
        with torch.no_grad():
          outputs = MLP1_model(X_test)
          _, predicted = torch.max(outputs, 1)
        predicted_np = predicted.numpy()

        accuracy = accuracy_score(y_test_np, predicted_np)
        accuracy_values.append(accuracy)
        # zero_division=0 keeps the score at 0 for classes that are never
        # predicted without emitting a warning for every such configuration.
        precision = precision_score(y_test_np, predicted_np, average='weighted', zero_division=0)
        recall = recall_score(y_test_np, predicted_np, average='weighted', zero_division=0)
        f1 = f1_score(y_test_np, predicted_np, average='weighted', zero_division=0)
        print(f'Accuracy on the test set: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1 Score: {f1:.4f}')

        conf_matrix = confusion_matrix(y_test_np, predicted_np)
        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.show()

        if best_state is None or accuracy > best_accuracy:
          best_accuracy = accuracy
          best_hyperparameters = (lr, mt, hs1, wd)
          # Clone the tensors: state_dict() returns references that the next
          # training run would otherwise keep alive only until it is rebound.
          best_state = {name: tensor.clone() for name, tensor in MLP1_model.state_dict().items()}

# Leave MLP1_model bound to the best configuration rather than to the last one
# trained, because later cells use MLP1_model for predictions.
if best_state is not None:
  MLP1_model = MLP1(input_size, best_hyperparameters[2], output_size)
  MLP1_model.load_state_dict(best_state)
  MLP1_model.eval()

print("Best Accuracy:", best_accuracy)
print("Best Hyperparameters:", best_hyperparameters)

plt.plot(accuracy_values)
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Accuracy of Models')
plt.show()

We will now train the MLP with 2 hidden layers

In [None]:
# Seed the weight initialisation so the grid search gives the same result on re-runs.
torch.manual_seed(42)

best_accuracy = 0.0
# Per-configuration test metrics, in training order.
accuracy_values = []
precision_values = []
recall_values = []
f1_values = []
best_hyperparameters = None
# Copy of the weights of the best configuration seen so far.
best_state = None

epochs = 50
y_test_np = y_test.numpy()

for lr in learning_rate:
  for mt in momentum:
    for hs1 in hidden_size1:
      for hs2 in hidden_size2:
        for wd in weight_decay:
          print(f'lr={lr}, momentum={mt}, hidden_size1={hs1}, hidden_size2={hs2}, weight_decay={wd}')

          # A fresh model for each combination of hyperparameters.
          MLP2_model = MLP2(input_size, hs1, hs2, output_size)
          optimizer = optim.SGD(MLP2_model.parameters(), lr=lr, momentum=mt, weight_decay=wd)

          MLP2_model.train()

          # Full-batch gradient descent: one update per epoch on the whole training set.
          for epoch in range(epochs):
            y_pred = MLP2_model(X_train)
            loss = criterion(y_pred, y_train)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (epoch + 1) % 10 == 0:
              print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

          MLP2_model.eval()
          with torch.no_grad():
            outputs = MLP2_model(X_test)
            _, predicted = torch.max(outputs, 1)
          predicted_np = predicted.numpy()

          accuracy = accuracy_score(y_test_np, predicted_np)
          # zero_division=0 keeps the score at 0 for classes that are never
          # predicted without emitting a warning for every such configuration.
          precision = precision_score(y_test_np, predicted_np, average='weighted', zero_division=0)
          recall = recall_score(y_test_np, predicted_np, average='weighted', zero_division=0)
          f1 = f1_score(y_test_np, predicted_np, average='weighted', zero_division=0)

          accuracy_values.append(accuracy)
          precision_values.append(precision)
          recall_values.append(recall)
          f1_values.append(f1)

          print(f'Accuracy on the test set: {accuracy:.4f}')
          print(f'Precision: {precision:.4f}')
          print(f'Recall: {recall:.4f}')
          print(f'F1 Score: {f1:.4f}')

          conf_matrix = confusion_matrix(y_test_np, predicted_np)
          # Plot confusion matrix
          plt.figure(figsize=(8, 6))
          sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
          plt.title("Confusion Matrix")
          plt.xlabel("Predicted Label")
          plt.ylabel("True Label")
          plt.show()

          if best_state is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparameters = (lr, mt, hs1, hs2, wd)
            best_state = {name: tensor.clone() for name, tensor in MLP2_model.state_dict().items()}

# Leave MLP2_model bound to the best configuration rather than to the last one trained.
if best_state is not None:
  MLP2_model = MLP2(input_size, best_hyperparameters[2], best_hyperparameters[3], output_size)
  MLP2_model.load_state_dict(best_state)
  MLP2_model.eval()

print("Best Accuracy:", best_accuracy)
print("Best Hyperparameters:", best_hyperparameters)

plt.plot(accuracy_values)
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Accuracy of Models')
plt.show()


We will now run a Convolutional Neural Network (CNN) on our Dataset

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
  """Small CNN: two conv + max-pool stages followed by three linear layers.

  The first linear layer expects 64 * 56 * 56 features, i.e. 64 channels of
  56x56 after two 2x2 poolings of a 224x224 single-channel input. The last
  layer produces three raw class scores (no softmax is applied).
  """

  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
    self.fc1 = nn.Linear(64 * 56 * 56, 128)
    self.fc2 = nn.Linear(128, 64)
    self.fc3 = nn.Linear(64, 3)

  def forward(self, x):
    """Return the three class scores for every image in `x`."""
    # Convolutional feature extractor: conv -> ReLU -> 2x2 max-pool, twice.
    features = self.pool(F.relu(self.conv1(x)))
    features = self.pool(F.relu(self.conv2(features)))
    # Flatten each feature map stack into one row per image.
    flat = features.view(-1, 64 * 56 * 56)
    # Fully connected classifier head.
    hidden = F.relu(self.fc1(flat))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)

# Module-level instance used by the training and prediction cells.
net = Net()



We will now train the model

In [None]:
import torch.optim as optim
# SGD with momentum over all parameters of the CNN.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# Rebinds the global `criterion` that the MLP cells also use (same loss type).
criterion = nn.CrossEntropyLoss()

Now we will prepare the data to train the model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import torchvision
from torchvision import transforms, datasets
import torch.nn.functional as F
torch.manual_seed(42)

from PIL import Image
from dataclasses import dataclass
import matplotlib.pyplot as plt
import random
import numpy as np

##We will convert lists of images and tables in tensors

# Stack the per-image tensors into one batch tensor and the labels into a vector.
images_tensor = torch.stack(images)
labels_tensor = torch.tensor(labels, dtype=torch.long)


dataset = TensorDataset(images_tensor, labels_tensor)

# Hold out 20% of the samples for evaluation. Building both loaders from the
# same dataset would measure accuracy on the very images the CNN was trained on.
# The seeded generator makes the split identical on every run.
test_size = int(0.2 * len(dataset))
train_size = len(dataset) - test_size
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42)
)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)
# Evaluation does not depend on sample order, so no shuffling is needed.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=2)


In [None]:
# Sanity check: there should be exactly one label per image.
print("Number of images:", images_tensor.size(0))
print("Number of labels:", labels_tensor.size(0))

We will now train our CNN model and Display the accuracy and the Confusion Matrix

In [None]:
import time
from sklearn.metrics import confusion_matrix

epochs = 2

# Test accuracy (in percent) and confusion matrix recorded after every epoch.
accuracy_list = []
conf_matrices = []

t0 = time.time()

for epoch in range(epochs):
    # Evaluation below switches the network to eval mode; switch back before training.
    net.train()
    running_loss = 0.0
    # The batch labels are called `targets` so that the loop does not overwrite
    # the notebook-level `labels` list that other cells depend on.
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Print statistics (loss.item() returns the mean loss in the mini-batch)
        running_loss += loss.item()
        if i % 2000 == 1999:  # Print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

    # After each epoch, evaluate the model on the test set
    net.eval()
    correct = 0
    total = 0
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

            # Append true and predicted labels for confusion matrix
            y_true.extend(targets.tolist())
            y_pred.extend(predicted.tolist())

    # Guard against an empty test loader instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0.0
    accuracy_list.append(accuracy)
    print('Accuracy after epoch %d: %f %%' % (epoch + 1, accuracy))

    # Compute confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    conf_matrices.append(conf_matrix)

print('Finished Training')
print('Training Time: %s seconds' % (time.time() - t0))

# Plot the accuracy development score
plt.plot(accuracy_list)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy Development')
plt.show()

# Plot the confusion matrix for the last epoch
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrices[-1], annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Mask Detection Function

In [None]:
import os
import cv2
import torch
import numpy as np
from torchvision.transforms import transforms
import matplotlib.pyplot as plt

def mask_detection(path_to_testset, model_type):
  """Show up to four random test images together with a model's prediction.

  Args:
    path_to_testset: Directory containing the test images.
    model_type: Which trained model to use: "svm_model", "MLP1_model" or "net".

  Raises:
    ValueError: If `model_type` is not one of the supported names.
  """
  # Validate up front so a typo fails before any file is read.
  if model_type not in ("svm_model", "MLP1_model", "net"):
    raise ValueError("Invalid model type specified.")

  test_files = os.listdir(path_to_testset)
  if not test_files:
    print("No files found in", path_to_testset)
    return

  # Sampling without replacement fails when asking for more items than exist.
  n_samples = min(4, len(test_files))
  selected_files = np.random.choice(test_files, n_samples, replace=False)

  for image_file in selected_files:
    img_path = os.path.join(path_to_testset, image_file)

    # The context manager closes the file once the image has been shown.
    with Image.open(img_path) as img:
      # Same preprocessing as for the training images.
      img_transformed = transform(img)

      if model_type == "net":
        # Add the batch axis: (1, H, W) -> (1, 1, H, W).
        with torch.no_grad():
          output = net(img_transformed.unsqueeze(0))
        # Use this image's own `output`; the global `outputs` left over from
        # training would give the same stale prediction for every image.
        prediction = output.argmax(dim=1).item()
      else:
        # `transform` leaves pixels in [-1, 1]; casting that directly to uint8
        # wipes out the image. Undo Normalize(mean=0.5, std=0.5) and rescale to
        # 0-255 first. The training-time LBP features must use the same conversion.
        img_uint8 = ((img_transformed * 0.5 + 0.5) * 255).round().clamp(0, 255)
        img_np = img_uint8.squeeze().numpy().astype(np.uint8)
        img_lbp = compute_lbp(img_np)

        if model_type == "svm_model":
          # scikit-learn expects a 2-D array with one row per sample.
          prediction = int(svm_model.predict(img_lbp.reshape(1, -1))[0])
        else:
          features = torch.tensor(img_lbp, dtype=torch.float32).unsqueeze(0)
          with torch.no_grad():
            output = MLP1_model(features)
          prediction = output.argmax(dim=1).item()

      plt.imshow(img)
      plt.title(f"Prediction: {prediction}, Filename: {image_file}")
      plt.axis('off')
      plt.show()


We will now test the function

In [None]:
# Show predictions of the CNN (`net`) on random images from the test folder.
mask_detection(test_img_path, model_type="net")

Mask Detection Video

In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from google.colab.patches import cv2_imshow

def MaskDetectionVideo(video_path):
  """Classify every 100th frame of a video with the CNN and display it annotated.

  Each processed frame is classified as a whole by `net`; every face found by
  the Haar cascade is then boxed and tagged with that frame-level label.
  NOTE: a later cell in this notebook defines a function with the same name,
  which replaces this one once that cell has been run.

  Args:
    video_path: Path of the video file to read.
  """
  cap = cv2.VideoCapture(video_path)
  if not cap.isOpened():
    print("Could not open video:", video_path)
    cap.release()
    return

  # Pretrained Haar cascade face detector shipped with OpenCV.
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

  # Label text and BGR drawing colour per predicted class index.
  class_styles = {
      0: ("No Mask", (0, 0, 255)),          # red
      1: ("Mask", (0, 255, 0)),             # green
  }
  default_style = ("Mask Improperly", (0, 165, 255))  # orange

  frame_index = 0
  try:
    while True:
      # Read every frame so the counter tracks the real position in the video;
      # reading only on every 100th iteration would process consecutive frames.
      ret, img = cap.read()
      if not ret:
        break

      is_selected = frame_index % 100 == 0
      frame_index += 1
      if not is_selected:
        continue

      # OpenCV decodes frames as BGR, while PIL expects RGB channel order.
      img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
      img_transformed = transform(Image.fromarray(img_rgb))
      with torch.no_grad():
        # Add the batch axis expected by the network.
        output = net(img_transformed.unsqueeze(0))
      prediction = output.argmax(dim=1).item()
      label, color = class_styles.get(prediction, default_style)

      # NOTE(review): the whole frame is classified, not each face crop; confirm
      # that this matches the kind of images the network was trained on.
      img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
      faces = face_cascade.detectMultiScale(img_gray, scaleFactor=1.1, minNeighbors=5)

      for (x, y, w, h) in faces:
        # Draw bounding box
        cv2.rectangle(img, (x, y), (x+w, y+h), color, 2)
        # Add label just above the box
        cv2.putText(img, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

      # Display the processed frame
      cv2_imshow(img)
      cv2.waitKey(1)
  finally:
    # Release the video even if processing a frame raised.
    cap.release()
    cv2.destroyAllWindows()



Video Function 2.0

In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow

def MaskDetectionVideo(video_path, frame_stride=10):
    """Classify sampled frames of a video with the CNN and display them annotated.

    Each processed frame is classified as a whole by `net`; every face found by
    the Haar cascade is then boxed and tagged with that frame-level label.

    Args:
        video_path: Path of the video file to read.
        frame_stride: Process every `frame_stride`-th frame (default 10).

    Raises:
        ValueError: If `frame_stride` is smaller than 1.
    """
    if frame_stride < 1:
        raise ValueError("frame_stride must be at least 1")

    # Load the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Could not open video:", video_path)
        cap.release()
        return

    # Pretrained Haar cascade face detector shipped with OpenCV.
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    # Label text and BGR drawing colour per predicted class index.
    class_styles = {
        0: ("No Mask", (0, 0, 255)),          # red
        1: ("Mask", (0, 255, 0)),             # green
    }
    default_style = ("Mask Improperly", (0, 165, 255))  # orange

    frame_count = 0
    try:
        while True:
            ret, img = cap.read()
            if not ret:
                break

            frame_count += 1
            if frame_count % frame_stride != 0:
                continue

            # OpenCV decodes frames as BGR, while PIL expects RGB channel order.
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_transformed = transform(Image.fromarray(img_rgb))
            with torch.no_grad():
                # Add the batch axis expected by the network.
                output = net(img_transformed.unsqueeze(0))
            prediction = output.argmax(dim=1).item()
            label, color = class_styles.get(prediction, default_style)

            # NOTE(review): the whole frame is classified, not each face crop;
            # confirm that this matches the images the network was trained on.
            img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Detect faces
            faces = face_cascade.detectMultiScale(img_gray, scaleFactor=1.1, minNeighbors=5)

            # Loop through each detected face
            for (x, y, w, h) in faces:
                # Draw bounding box
                cv2.rectangle(img, (x, y), (x+w, y+h), color, 2)

                # Add label just above the box
                cv2.putText(img, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

            # Display the processed frame
            cv2_imshow(img)
            cv2.waitKey(1)
    finally:
        # Release the video capture object even if processing a frame raised.
        cap.release()
        cv2.destroyAllWindows()

We will now try our Mask Detection Video Pipeline

In [None]:
# The video is read from the coursework folder on Google Drive.
video_path = os.path.join(GOOGLE_DRIVE_PATH, 'videoplayback.mp4')

# Run the mask-detection pipeline on that video.
MaskDetectionVideo(video_path)