<a href="https://colab.research.google.com/github/AvivBGU/DeepLearning_Assignment2/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TODOS:
1. Create a proper training setup and integrate the data-loading code into the relevant functions.
2. Make sure that the loss and iteration count, as well as general data, are printed.
3. Have a testing setup for an arbitrary model.
4. Create a model factory to properly balance different parameters of the network.
5. Make sure that the testing setup allows images to be displayed for visual verification.

# Imports & Constants

<font size="4">Imports </font>

Install PyTorch

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

In [None]:
import torch
import os
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler
import zipfile
import requests
import numpy as np
import torch.utils.data as data
import time
import matplotlib.pyplot as plt
import random
import copy

from torchvision import transforms
from glob import glob
from PIL import Image

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

print("Using torch", torch.__version__)

<font size="4">Constants</font>

In [None]:
# --- Paths and remote resources ---
current_working_directory = os.getcwd()
DATA_BASE_DIRECTORY: str = os.path.join(current_working_directory, 'data')
TRAINING_SET_URL='https://web.archive.org/web/20241214060505/https://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt'
TEST_SET_URL='https://web.archive.org/web/20241214070147/https://vis-www.cs.umass.edu/lfw/pairsDevTest.txt#expand'

# --- Image handling ---
MAX_PIXEL_VALUE: float = 255.0
IMAGE_SIZE: tuple[int, int] = (105, 105)
BATCH_SIZE: int = 128
IMAGE_MODE: str = 'L' # PIL mode the images are expected to have ('L' is 8-bit greyscale)
DEVICE_TO_USE: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Training configuration ---
RANDOM_SEED: int = 10 # For reproducibility
TRAINING_VALIDATION_DIVISION: float = 0.1 # Fraction of the training pairs held out for validation
MAX_EPOCHS_FOR_TRAINING: int = 200
PATIANCE_FACTOR: float = 1e-4 # Validation-loss tolerance used by the early-stopping check
ALLOWED_PATIANCE_ITERATIONS: int = 20 # Epochs without improvement tolerated before stopping
EARLY_STOP: bool = True
print(f'Using device: {DEVICE_TO_USE}')

# Seed every RNG the notebook relies on: `random` drives the validation split,
# while torch's generator drives DataLoader shuffling and weight initialisation.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Acquiring & Handling Data

<font size="6">Acquiring_Data</font>

In [None]:
# TODO Refine this mechanism to use python natively.

!pip install -q gdown

def download_images_from_drive(file_id: str, zip_path: str) -> str:
  """
  Downloads images from drive and return the path to the extracted folder, but 1
  level down assuming the structure of the directories are known in advance.
  """
  file_location: str = os.path.join(DATA_BASE_DIRECTORY, 'lfw2', 'lfw2')
  if os.path.exists(file_location):
    print(f"Dataset already downloaded to {file_location}")
    return file_location
  
  !gdown {file_id} -O {zip_path}

  os.makedirs(DATA_BASE_DIRECTORY, exist_ok=True)
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      zip_ref.extractall(DATA_BASE_DIRECTORY)
  !rm {zip_path}
  print(f"Dataset extracted to {DATA_BASE_DIRECTORY}")
  return file_location

# Fetch the dataset (or reuse an existing extraction) and remember where the
# per-person image folders live.
updated_dir_location: str = download_images_from_drive(
    file_id="1p1wjaqpTh_5RHfJu4vUh8JJCdKwYMHCp",
    zip_path="dataset.zip"
    )

<font size="4">Preprocessing function</font>

<font size="4">Loading file paths to memory</font>

In [None]:
def loads_files_paths_to_memory(base_directory: str, image_format: str = '.jpg') -> dict[str, dict[int, str]]:
    """
    Index every image file below `base_directory` by person and image number.

    Each sub-directory is treated as one person (the directory name is the
    person's name). The image number is parsed from the last `_`-separated
    token of the file name, e.g. `Some_Name_0003.jpg` -> 3.

    Args:
        base_directory: Root folder holding one sub-directory per person.
        image_format: File suffix of the images to index.

    Returns:
        Mapping `person name -> {image index -> absolute/relative file path}`.

    Raises:
        ValueError: If two files of one person share an index, if an index
            cannot be parsed as an integer, or if no person folder was found.
    """
    import warnings  # Local import keeps this cell independent of the import cell.

    images: dict[str, dict[int, str]] = dict()
    images_loaded: int = 0
    for root, _subdirs, files in os.walk(base_directory):
        if root == base_directory:
            continue
        person_name: str = os.path.basename(root)
        person_images: dict[int, str] = images.setdefault(person_name, dict())
        for file in files:
            if not file.endswith(image_format):
                # Warn and skip, as the message promises; raising here would abort the scan.
                warnings.warn(f"File {file} is not a {image_format} file. Continuing...")
                continue
            # removesuffix drops exactly the suffix; rstrip would strip any trailing
            # run of the characters '.', 'j', 'p', 'g'.
            stripped_image: str = file.removesuffix(image_format)
            image_index: int = int(stripped_image.split('_')[-1])
            if image_index in person_images:
                raise ValueError(f"Index: {image_index} collision for: {person_name}")
            person_images[image_index] = os.path.join(root, file)
            images_loaded += 1
    if not images:
        raise ValueError(f"No images were found in {base_directory}, aborting...")
    print(f"People scanned: {len(images)}")
    print(f"Images loaded: {images_loaded}")
    return images

# person name -> {image index (int) -> file path}
loaded_images: dict[str, dict[int, str]] = loads_files_paths_to_memory(updated_dir_location)

<font size="4">Organizing According to train-test</font>

<font size="4">Get train-test division and parse it</font>

In [None]:
def parse_train_test_txt(url_to_use: str) -> list[tuple[tuple[str, int], tuple[str, int], int]]:
    """
    Download a pairs file and parse it into labelled image pairs.

    Args:
        url_to_use: URL of the tab-separated pairs text file.

    Returns:
        A list of `((person, image_index), (person, image_index), label)`
        entries, where label is 1 for two pictures of the same person and 0
        for pictures of two different people.

    Raises:
        ValueError: If the server does not answer with HTTP 200, or if an
            image index in the file is not an integer.
    """
    # Without a timeout `requests` may wait indefinitely on an unresponsive server.
    url_response = requests.get(url_to_use, timeout=60)
    if url_response.status_code != 200:
        raise ValueError("Invalid URL")
    return _parse_pairs_text(url_response.text)


def _parse_pairs_text(text_content: str) -> list[tuple[tuple[str, int], tuple[str, int], int]]:
    """Turn the raw text of a pairs file into labelled examples."""
    examples: list[tuple[tuple[str, int], tuple[str, int], int]] = list()
    for text in text_content.splitlines():
        separated_by_tabs: list[str] = text.split('\t')
        if len(separated_by_tabs) == 3:
            # Positive example: one person, two image indices.
            person = separated_by_tabs[0]
            examples.append(
                (
                    (person, int(separated_by_tabs[1])),
                    (person, int(separated_by_tabs[2])),
                    1,
                )
            )
        elif len(separated_by_tabs) == 4:
            # Negative example: two people, one image index each.
            examples.append(
                (
                    (separated_by_tabs[0], int(separated_by_tabs[1])),
                    (separated_by_tabs[2], int(separated_by_tabs[3])),
                    0,
                )
            )
        # Any other field count (the leading pair count, blank lines) is skipped.
    return examples

# Each entry is ((person, index), (person, index), label) with label 1 = same person, 0 = different.
training_set: list[tuple[tuple[str, int], tuple[str, int], int]] = parse_train_test_txt(TRAINING_SET_URL)
test_set: list[tuple[tuple[str, int], tuple[str, int], int]] = parse_train_test_txt(TEST_SET_URL)

<font size="4">Getting validation set</font>

In [None]:
# TODO MAKE SURE THE VALIDATION SET TAKES AN EQUAL AMOUNT OF SAME PICTURE AND NOT SAME PICTURE.
# Hold out a random fraction of the training pairs as the validation set.
samples_to_select: int = int(len(training_set)*TRAINING_VALIDATION_DIVISION)
validation_set: list[tuple[tuple[str, int], tuple[str, int], bool]] = random.sample(training_set, samples_to_select)
# Examples are hashable tuples, so a set gives O(1) membership tests instead of
# scanning the validation list once per training example.
validation_lookup = set(validation_set)
training_set: list[tuple[tuple[str, int], tuple[str, int], bool]] = [sample for sample in training_set if sample not in validation_lookup]
print(f'training_set_size: {len(training_set)}')
print(f'validation_set_size: {len(validation_set)}')
print(f'test_set_size: {len(test_set)}')

In [None]:
def load_images(images_file_paths_dict: dict[str, dict[int, str]],
                examples_list: list[tuple[tuple[str, int], tuple[str, int], bool]]) -> tuple[list[tuple], list[int]]:
  """
  Load the image pairs described by `examples_list` into memory.

  Args:
    images_file_paths_dict: Mapping person name -> {image index -> file path}.
    examples_list: Entries of the form ((person, index), (person, index), label).

  Returns:
    Two parallel lists: the image pairs and their labels. A pair holds PIL
    images when both already have IMAGE_SIZE; otherwise both members are
    resized and returned as tensors scaled by `transforms.ToTensor`.

  Raises:
    KeyError: If a person or image index is missing from the path mapping.
    ValueError: If either image is not in IMAGE_MODE.
  """
  data_to_ret: list = list()
  labels_to_ret: list[int] = list() # Returned labels, 1 if same person, 0 otherwise.
  transform = transforms.Compose([ # Applied only to pairs that need resizing.
      # IMAGE_SIZE is square, so PIL's (width, height) and Resize's
      # (height, width) orderings coincide.
      transforms.Resize(IMAGE_SIZE),
      transforms.ToTensor(),
  ]) # Important to note, if the input is transformed, then it's normalized.

  for example in examples_list:
    first_person, first_image_index = example[0]
    second_person, second_image_index = example[1]
    is_same = example[2]
    first_image_path = images_file_paths_dict[first_person][first_image_index]
    second_image_path = images_file_paths_dict[second_person][second_image_index]
    first_image = Image.open(first_image_path)
    second_image = Image.open(second_image_path)
    # Image.open is lazy and keeps the file handle open; load() reads the pixels
    # and releases the handle, so thousands of pairs do not exhaust descriptors.
    first_image.load()
    second_image.load()
    if (first_image.mode != IMAGE_MODE) or (second_image.mode != IMAGE_MODE):
      raise ValueError(
          f"Expected both images in mode '{IMAGE_MODE}', "
          f"got '{first_image.mode}' and '{second_image.mode}'."
      )
    if (first_image.size != IMAGE_SIZE) or (second_image.size != IMAGE_SIZE):
        # Resizing instead of throwing error. Both members are transformed
        # whenever either is off-size, so a pair never mixes PIL images and tensors.
        first_image = transform(first_image)
        second_image = transform(second_image)
    data_to_ret.append((first_image, second_image))
    labels_to_ret.append(is_same)
  return data_to_ret, labels_to_ret

# Materialise every split as (image pairs, labels).
training_data, training_labels = load_images(loaded_images, training_set)
validation_data, validation_labels = load_images(loaded_images, validation_set)
test_data, test_labels = load_images(loaded_images, test_set)

<font size="4">Converting images to arrays to allow the usage of the PyTorch DataLoader</font>

In [None]:
def convert_images_to_array(image_tuple_list: list[tuple[Image.Image, Image.Image]]) -> list[tuple[np.ndarray, np.ndarray]]:
  """
  Convert every image pair into a pair of numpy arrays.

  An array whose maximum exceeds 1 is divided by MAX_PIXEL_VALUE; an array
  whose maximum is at most 1 is returned unchanged.
  """
  def _as_scaled_array(picture):
    pixels = np.array(picture)
    if pixels.max() > 1:
      return pixels / MAX_PIXEL_VALUE
    return pixels

  return [
      (_as_scaled_array(left_picture), _as_scaled_array(right_picture))
      for left_picture, right_picture in image_tuple_list
  ]

# Array versions of every split; values above 1 have been scaled by MAX_PIXEL_VALUE.
arrayed_training_data = convert_images_to_array(training_data)
arrayed_validation_data = convert_images_to_array(validation_data)
arrayed_test_data = convert_images_to_array(test_data)

In [None]:
class SiameseDataset(data.Dataset):
    """
    Dataset of image pairs with one label per pair.

    Each item is `(img1, img2, label)` where both images are float32 tensors
    with a leading channel axis and the label is a float32 scalar tensor.
    """

    def __init__(self, image_pairs: list[tuple], labels: list[int]):
        """
        Args:
            image_pairs: Pairs of array-like images.
            labels: One label per pair, in the same order.

        Raises:
            ValueError: If the number of pairs and labels differ.
        """
        if len(image_pairs) != len(labels):
            raise ValueError(
                f"Got {len(image_pairs)} image pairs but {len(labels)} labels."
            )
        self.image_pairs = image_pairs
        self.labels = labels

    def __len__(self):
        return len(self.image_pairs)

    @staticmethod
    def _to_channel_first_tensor(image) -> torch.Tensor:
        """Convert one image to float32 and add a channel axis if it is 2-D."""
        tensor = torch.tensor(np.array(image), dtype=torch.float32)
        if tensor.ndim == 2:
            tensor = tensor.unsqueeze(0)
        return tensor

    def __getitem__(self, idx):
        img1, img2 = self.image_pairs[idx]
        # The channel axis is decided per image: unsqueezing both whenever either
        # is 2-D would give an already channel-first image a spurious extra axis.
        img1 = self._to_channel_first_tensor(img1)
        img2 = self._to_channel_first_tensor(img2)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return img1, img2, label

# Every split must be built from its normalised array version; feeding the raw
# `validation_data` here would validate on inputs scaled differently from training.
training_dataset: data.Dataset = SiameseDataset(arrayed_training_data, training_labels)
validation_dataset: data.Dataset = SiameseDataset(arrayed_validation_data, validation_labels)
test_dataset: data.Dataset = SiameseDataset(arrayed_test_data, test_labels)
training_loader: data.DataLoader = torch.utils.data.DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
validation_loader: data.DataLoader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
# The test loader yields one pair at a time, in file order.
test_loader: data.DataLoader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, pin_memory=True)

# Neural Network definition

<font size="6">Creating Network</font>

In [None]:
class ModularSiameseNetwork(nn.Module):
    def __init__(self,
                 first_conv_layer: tuple[int, int, int, torch.nn.modules.pooling.MaxPool2d | None],
                 other_layers: list[(int, int, torch.nn.modules.pooling.MaxPool2d | None)],
                 input_size: list[int, int, int], # channels, height, width
                 fully_connected_layer_size: int):
        """
        First_conv_layer: [in_channels, out_channels, kernel_size, should use pooling]
        other_layers [(out_channels, kernel_size, should_use_pooling)]
        """
        super().__init__()

        self.main_network_block = nn.Sequential()
        first_layer_in_channel, first_layer_out_channel, first_layer_kernel_size, first_layer_use_max_pool = first_conv_layer
        self.main_network_block.append(
            nn.Conv2d(in_channels=first_layer_in_channel,
                      out_channels=first_layer_out_channel,
                      kernel_size=first_layer_kernel_size)
        )
        self.main_network_block.append(
            nn.ReLU(),
        )
        if first_layer_use_max_pool:
            self.main_network_block.append(
                first_layer_use_max_pool
            )
        prev_layer_output: int = first_layer_out_channel
        for current_layer in other_layers:
            current_layer_output_channels, current_layer_kernel_size, max_pool = current_layer
            self.main_network_block.append(
                nn.Conv2d(prev_layer_output,
                          current_layer_output_channels,
                          kernel_size=current_layer_kernel_size)
            )
            self.main_network_block.append(
                nn.ReLU()
            )
            if max_pool: # If maxpooling should be used.
              self.main_network_block.append(
                  max_pool
              )
            prev_layer_output = current_layer_output_channels

        with torch.no_grad(): # Dynamically calculate the size of the layer according to expected input.
            dummy = torch.zeros(1, *input_size)
            dummy_out = self.main_network_block(dummy)
    
            flattened_size = dummy_out.view(1, -1).size(1)

        self.fully_connected_layer = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, fully_connected_layer_size,),
            nn.ReLU()
        )

        self.output_layer = nn.Sequential(
            nn.Linear(fully_connected_layer_size, 1),
            nn.Sigmoid()
        )

        # === Initialization ===
        with torch.no_grad():
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    nn.init.normal_(m.weight, mean=0.0, std=1e-2)
                    nn.init.normal_(m.bias, mean=0.5, std=1e-2)
                elif isinstance(m, nn.Linear):
                    nn.init.normal_(m.weight, mean=0.0, std=1e-2)
                    nn.init.normal_(m.bias, mean=0.5, std=2e-1)

    def forward_once(self, input):
        network_block_output = self.main_network_block(input)
        fully_connected_layer_output = self.fully_connected_layer(network_block_output)
        return fully_connected_layer_output

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        return self.output_layer(torch.abs(output1 - output2)) # L1 distance.

<font size="6">Loss & Optimizer</font>

In [None]:
class RegularizedBinaryCrossEntropyLoss(nn.Module):
    """
    Binary cross-entropy plus an L2 penalty on the model's weight tensors.

    The penalty is `lambda_reg` times the sum of squared entries of every
    trainable parameter with more than one dimension (so bias vectors are
    excluded).
    """

    def __init__(self, model, lambda_reg=1e-4):
        """
        Args:
            model: Module whose parameters are penalised.
            lambda_reg: Weight of the L2 penalty.
        """
        super().__init__()
        self.model = model
        self.lambda_reg = lambda_reg
        self.bce = nn.BCELoss()

    def forward(self, outputs, targets):
        """Return BCE(outputs, targets) plus the weighted L2 penalty."""
        loss = self.bce(outputs, targets.float())
        squared_norms = [
            torch.sum(p.pow(2))
            for p in self.model.parameters()
            if p.requires_grad and p.ndim > 1
        ]
        if not squared_norms:
            # Nothing to penalise; sum() over an empty sequence would be the int 0,
            # which has no `.to` method.
            return loss
        # Follow the loss tensor's device instead of a module-level constant.
        reg_loss = sum(squared_norms).to(loss.device)
        return loss + self.lambda_reg * reg_loss

<font size="6">Network initialization</font>

In [None]:
# Optimiser hyper-parameters.
initial_lr = 0.01
momentum = 0.5
l2_regularition_strength = 1e-4

# Four convolution stages; the last one has no pooling.
model = ModularSiameseNetwork(
    first_conv_layer=(1, 64, 10, nn.MaxPool2d(2)),
    other_layers=[
        (128, 7, nn.MaxPool2d(2)),
        (128, 4, nn.MaxPool2d(2)),
        (256, 4, None),
    ],
    input_size=[1, *IMAGE_SIZE],
    fully_connected_layer_size=4096,
)
model_in_gpu = model.to(DEVICE_TO_USE)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=initial_lr,
    momentum=momentum,
    weight_decay=l2_regularition_strength,  # L2 regularization
)


# Learning Rate Scheduler Configuration
def learning_rate_decay_func(epoch):
    """Return the constant factor the learning rate is multiplied by per scheduler step."""
    return 0.99


scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=learning_rate_decay_func)
model

<font size="6">Network training</font>

Calculate the loss for one batch after moving it to the device

In [None]:
def calculate_loss(model, loss_criterion, input_1, input_2, targets):
    """
    Move one batch to DEVICE_TO_USE, run the model on it and return the loss.

    `targets` gets a trailing axis (via `unsqueeze(1)`) before it is compared
    with the model output.
    """
    first_batch, second_batch = (
        batch.to(DEVICE_TO_USE, non_blocking=True) for batch in (input_1, input_2)
    )
    expected = targets.unsqueeze(1).to(DEVICE_TO_USE, non_blocking=True)
    return loss_criterion(model(first_batch, second_batch), expected)

In [None]:
# Early-stopping state. `validation_loss` holds the best validation loss seen so
# far and `best_model_state_dict` the weights that produced it.
patiance_for_improvement: int = 0
validation_loss: float = float('inf')
best_model_state_dict = None
early_stop_triggered: bool = False
total_time_start = time.time()
for epoch in range(MAX_EPOCHS_FOR_TRAINING):
    torch.cuda.empty_cache()
    model_in_gpu.train()
    start_time = time.time()

    if epoch + 1 == 6:  # epoch is 0-indexed
        # From the sixth epoch on, train with a higher momentum.
        for param_group in optimizer.param_groups:
            param_group["momentum"] = 0.9

    running_loss = 0.0
    print(f"\nEpoch [{epoch+1}/{MAX_EPOCHS_FOR_TRAINING}]")
    for batch_idx, (input1, input2, targets) in enumerate(training_loader):
        optimizer.zero_grad()
        loss = calculate_loss(model_in_gpu, criterion, input1, input2, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if (batch_idx + 1) % 2 == 0 or (batch_idx + 1) == len(training_loader):
            print(f"  Batch [{batch_idx+1}/{len(training_loader)}], Loss: {loss.item():.4f}")

    # Validation pass: mean of the per-batch losses.
    model_in_gpu.eval()
    with torch.no_grad():
        running_validation_loss = 0.0
        for validation_input_1, validation_input_2, validation_targets in validation_loader:
            running_validation_loss += calculate_loss(model_in_gpu, criterion, validation_input_1, validation_input_2, validation_targets).item()
        running_validation_loss = running_validation_loss / len(validation_loader)

    # Only a drop of more than PATIANCE_FACTOR below the best loss counts as an
    # improvement; a loss that merely got "not much worse" must neither reset the
    # patience counter nor overwrite the best snapshot.
    if validation_loss - running_validation_loss > PATIANCE_FACTOR:
        validation_loss = running_validation_loss
        best_model_state_dict = copy.deepcopy(model_in_gpu.state_dict())
        patiance_for_improvement = 0
    else:
        patiance_for_improvement += 1
    print(f" Validation Set Loss: {running_validation_loss} ")

    avg_loss = running_loss / len(training_loader)
    elapsed_time = time.time() - start_time
    print(f"Epoch [{epoch+1}] completed in {elapsed_time:.2f}s, Average Loss: {avg_loss:.4f}")

    # Decide about early stopping right after validation, so no further epoch
    # (or extra validation pass) is started once patience has run out.
    if EARLY_STOP and patiance_for_improvement >= ALLOWED_PATIANCE_ITERATIONS:
        print("Early stopping due to no improvement was triggered.")
        early_stop_triggered = True
        print("Ending training early since no validation improvement triggered.")
        break
    scheduler.step()
model_in_gpu.eval()
print(f"Finished training in: {time.time() - total_time_start}s.")




<font size="6">Evaluation</font>

Display an array as a picture.

In [None]:
def display_data_loaded_to_loader(
        test_example_1, 
        test_example_2, 
        prediction: float = -1, 
        expected_result: float = -1):
    """
    Show the first image of each of the two given batches side by side.

    `prediction` and `expected_result` are printed after the plot when they
    are greater than -1 (their default marks them as not provided).
    """
    # Bring both batches to the CPU first, then take element 0 of each and
    # drop its leading axis for plotting.
    cpu_batches = [test_example_1.cpu(), test_example_2.cpu()]
    pictures = [batch[0].squeeze(0).numpy() for batch in cpu_batches]

    # Plotting
    _figure, axes = plt.subplots(1, 2)
    for axis, picture, title in zip(axes, pictures, ('Image 1', 'Image 2')):
        axis.imshow(picture, cmap='gray')
        axis.set_title(title)
        axis.axis('off')
    plt.show()

    if prediction > -1:
        print(f"prediction: {prediction}")
    if expected_result > -1:
        print(f"expected result: {expected_result}")

In [None]:
# Testing
# Restore the weights snapshotted during training (when a snapshot exists) and
# switch the model to evaluation mode before testing.
if best_model_state_dict is not None:
    model_in_gpu.load_state_dict(best_model_state_dict)
    model_in_gpu.eval()

In [None]:
# Sanity check: score one random image against itself and against another one.
model_in_gpu.eval()
with torch.no_grad():
    dummy_1 = torch.randn(1, 1, 105, 105).to(DEVICE_TO_USE)
    dummy_2 = torch.randn(1, 1, 105, 105).to(DEVICE_TO_USE)
    dummy_3 = dummy_1.clone()  # identical to dummy_1

    # Should output different results if model is not collapsed
    out_same = model_in_gpu(dummy_1, dummy_3)
    out_diff = model_in_gpu(dummy_1, dummy_2)

    same_score = out_same.item()
    diff_score = out_diff.item()
    print("Output (same):", same_score)
    print("Output (diff):", diff_score)
    print("Abs difference:", abs(same_score - diff_score))

In [None]:
# Accumulators for the test pass.
test_loss = 0.0
correct_predictions = 0
total_predictions = 0

# Budget of example plots per class; only used by the disabled block below.
plots_to_open: int = 6
iter_for_same = plots_to_open
iter_for_different = plots_to_open
with torch.no_grad():
    for test_input1, test_input2, test_targets in test_loader:
        test_input1 = test_input1.to(DEVICE_TO_USE, non_blocking=True)
        test_input2 = test_input2.to(DEVICE_TO_USE, non_blocking=True)
        test_targets = test_targets.unsqueeze(1).to(DEVICE_TO_USE, non_blocking=True)
        test_output = model_in_gpu(test_input1, test_input2)
        # Disabled visual check, kept for reference:
        # expected_result = test_targets[0, 0].item()
        # if expected_result > 0.9:
        #     if iter_for_same > 0:
        #         display_data_loaded_to_loader(
        #             test_input1,
        #             test_input2,
        #             prediction=test_output,
        #             expected_result=expected_result
        #         )
        #         iter_for_same -= 1
                
        # else:
        #     if iter_for_different > 0:
        #         display_data_loaded_to_loader(
        #             test_input1,
        #             test_input2,
        #             prediction=test_output,
        #             expected_result=expected_result
        #         )
        #         iter_for_different -= 1
        batch_loss = criterion(test_output, test_targets)
        test_loss += batch_loss.item()

        # A score above 0.5 is read as label 1.
        predictions = test_output > 0.5
        matches = predictions.float() == test_targets
        correct = matches.sum().item()
        correct_predictions += correct
        total_predictions += test_targets.size(0)


# Mean per-batch loss and overall accuracy.
test_loss /= len(test_loader)
accuracy = correct_predictions / total_predictions

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")