In [1]:
import torch, os
import torchvision.transforms as T
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd 
from tqdm import tqdm
from PIL import Image
from sklearn.neighbors import NearestNeighbors

In [2]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
train_df = pd.read_csv("./data/COMP90086_2021_Project_train/train.csv")
test_df = pd.read_csv("./data/COMP90086_2021_Project_test/imagenames.csv")
test_df

Unnamed: 0,id
0,IMG4287_3
1,IMG4288_5
2,IMG4289_5
3,IMG4290_4
4,IMG4291_5
...,...
1195,IMG5482_1
1196,IMG5483_2
1197,IMG5484_4
1198,IMG5485_3


In [5]:
# SOURCE: https://medium.com/pytorch/image-similarity-search-in-pytorch-1a744cf3469
# Dataset class converting all images in the train/test folder to PyTorch dataset
class FolderDataset(Dataset):
    """
    Creates a PyTorch dataset from a given folder containing images and returns two tensor images. 
    :param main_dir: directory where images are stored.
    #:param transform: torchvision transforms to be applied while making dataset (optional) 
    :return: two images, one as input to the model and another image to compare with the original image for reconstruction.
    """
    def __init__(self, main_dir, transform=None):
        self.main_dir = main_dir
        self.transform = transform
        self.all_imgs = os.listdir(main_dir)

    def __len__(self):
        return len(self.all_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.all_imgs[idx])

        
        image = Image.open(img_loc).convert("RGB") 
        crop = T.Compose([T.CenterCrop((480, 640))])
        image = crop(image)

        if self.transform is not None:
            tensor_image = self.transform(image)

        return tensor_image, tensor_image
        

In [6]:
class ConvEncoder(nn.Module):
    """
    Convolutional Encoder Model which is a a repetition of convolutional, relu and maxpool layers.
    It converts an input images to a feature representation of size (1, 256, 16, 16).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, (3, 3), padding=(1, 1))
        self.relu1 = nn.ReLU(inplace=True)
        self.maxpool1 = nn.MaxPool2d((2, 2))
        self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=(1, 1))
        self.relu2 = nn.ReLU(inplace=True)
        self.maxpool2 = nn.MaxPool2d((2, 2))
        self.conv3 = nn.Conv2d(32, 64, (3, 3), padding=(1, 1))
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool3 = nn.MaxPool2d((2, 2))
        self.conv4 = nn.Conv2d(64, 128, (3, 3), padding=(1, 1))
        self.relu4 = nn.ReLU(inplace=True)
        self.maxpool4 = nn.MaxPool2d((2, 2))
        self.conv5 = nn.Conv2d(128, 256, (3, 3), padding=(1, 1))
        self.relu5 = nn.ReLU(inplace=True)
        self.maxpool5 = nn.MaxPool2d((2, 2))

    def forward(self, x):
        # Downscale the image with conv maxpool
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.maxpool3(x)
        x = self.conv4(x)
        x = self.relu4(x)
        x = self.maxpool4(x)
        x = self.conv5(x)
        x = self.relu5(x)
        x = self.maxpool5(x)
        
        return x

In [7]:
class ConvDecoder(nn.Module):
    """
    Convolutional Decoder Model takes an input of feature representations and reconstructs back the image
    It upscales the feature representations to the original image using transposed convolution layers of kernel size (2, 2) and stride (2, 2)
    """

    def __init__(self):
        super().__init__()
        self.deconv1 = nn.ConvTranspose2d(256, 128, (2, 2), stride=(2, 2))
        self.relu1 = nn.ReLU(inplace=True)
        self.deconv2 = nn.ConvTranspose2d(128, 64, (2, 2), stride=(2, 2))
        self.relu2 = nn.ReLU(inplace=True)
        self.deconv3 = nn.ConvTranspose2d(64, 32, (2, 2), stride=(2, 2))
        self.relu3 = nn.ReLU(inplace=True)
        self.deconv4 = nn.ConvTranspose2d(32, 16, (2, 2), stride=(2, 2))
        self.relu4 = nn.ReLU(inplace=True)
        self.deconv5 = nn.ConvTranspose2d(16, 3, (2, 2), stride=(2, 2))
        self.relu5 = nn.ReLU(inplace=True)

    def forward(self, x):
        # Upscale the image with convtranspose etc.
        x = self.deconv1(x)
        x = self.relu1(x)
        x = self.deconv2(x)
        x = self.relu2(x)
        x = self.deconv3(x)
        x = self.relu3(x)
        x = self.deconv4(x)
        x = self.relu4(x)
        x = self.deconv5(x)
        x = self.relu5(x)
        return x

In [8]:
def train_step(encoder, decoder, train_loader, loss_fn, optimizer, device):
    """
    Performs a single training step
    :param encoder: the convolutional Encoder defined as ConvEncoder
    :param decoder: A convolutional Decoder defined as ConvDecoder
    :param train_loader: PyTorch dataloader, containing (images, images).
    :param loss_fn: PyTorch loss_fn to compute loss between 2 images.
    :param optimizer: PyTorch optimizer (eg. AdamW)
    :param device: "cuda" or "cpu"
    :returns: Train Loss
    """
    #  Set networks to train mode.
    encoder.train()
    decoder.train()

    for batch_idx, (train_img, target_img) in enumerate(train_loader):
        train_img = train_img.to(device)
        target_img = target_img.to(device)
        
        # Zero grad the optimizer
        optimizer.zero_grad()
        # Feed the train images to encoder
        enc_output = encoder(train_img)
        dec_output = decoder(enc_output)
        # Compute loss between the reconstructed image and orginal image which is target image.
        loss = loss_fn(dec_output, target_img)
        loss.backward()
        # Apply the optimizer to network
        optimizer.step()
    return loss.item()

def val_step(encoder, decoder, val_loader, loss_fn, device):
    """
    Performs a single training step
    :param encoder: A convolutional Encoder like torch_model ConvEncoder
    :param decoder: A convolutional Decoder like torch_model ConvDecoder
    :param val_loader: PyTorch dataloader containing (images, images)
    :param loss_fn: PyTorch loss_fnto compute loss between 2 images
    :param device: "cuda" or "cpu"
    :returns: Validation Loss
    """
    # Set to eval mode.
    encoder.eval()
    decoder.eval()
    
    # gradients do not need to be computed  for validation???
    with torch.no_grad():
        for batch_idx, (train_img, target_img) in enumerate(val_loader):
            train_img = train_img.to(device)
            target_img = target_img.to(device)
            # Feed the train images to encoder
            enc_output = encoder(train_img)
            # Feed the encoder output to Decoder to reconstruct the image
            dec_output = decoder(enc_output)
            # Find the validation loss for the encoder and decoder 
            loss = loss_fn(dec_output, target_img)
    return loss.item()

In [9]:
def create_embedding(encoder, full_loader, embedding_dim, device):
    """
    Creates embedding using encoder from dataloader and saves our image embeddings.
    :param encoder: A convolutional Encoder like torch_model ConvEncoder
    :param full_loader: PyTorch dataloader, containing (images, images) over the entire dataset
    :param embedding_dim: Tuple (c, h, w) Dimension of embedding = output of encoder dimesntions.
    :param device: "cuda" or "cpu"
    : return: Embedding of size (num_images_in_loader + 1, c, h, w)
    """
    # Set encoder to eval mode.
    encoder.eval()
    # Just a place holder for our 0th image embedding.
    embedding = torch.randn(embedding_dim)
    
    # no_grad as we do not compute loss here
    with torch.no_grad():
        for batch_idx, (train_img, target_img) in enumerate(full_loader):
            train_img = train_img.to(device)
            # Get encoder outputs and move outputs to cpu
            enc_output = encoder(train_img).cpu()
            # Add the outputs to embeddings
            embedding = torch.cat((embedding, enc_output), 0)
    return embedding

In [10]:
# Create the PyTorch `dataset` and the `dataloaders`
transforms = T.Compose([T.ToTensor()]) # Normalize the pixels and convert to tensor
# Create folder dataset
full_dataset = FolderDataset("./data/COMP90086_2021_Project_train/train", transforms) 

train_size = int(0.75 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Split data to train and test
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

# Create the train dataloader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# Create the validation dataloader
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# Create the full dataloader
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=32)

# Mean squared lossto compute difference between two images
encoder = ConvEncoder() 
decoder = ConvDecoder()
device = "cuda" 
max_loss = 9999

# Shift models to GPU
encoder.to(device)
decoder.to(device)

# The enocder and decoder parameters
print(encoder.parameters())
autoencoder_params = list(encoder.parameters()) + list(decoder.parameters())
# Adam Optimizer
optimizer = optim.AdamW(autoencoder_params, lr=1e-3)

total_epochs = 20


for epoch in tqdm(range(total_epochs)):
        train_loss = train_step(encoder, decoder, train_loader, nn.MSELoss(), optimizer, device=device)   
        print(f"Epochs = {epoch}, Training Loss : {train_loss}")
        val_loss = val_step(encoder, decoder, val_loader, nn.MSELoss(), device=device)   
        print(f"Epochs = {epoch}, Validation Loss : {val_loss}")
        # Simple Best Model saving
        if val_loss < max_loss:
            print("Validation Loss decreased, saving new best model")
            torch.save(encoder.state_dict(), "encoder_model.pt")
            torch.save(decoder.state_dict(), "decoder_model.pt")

<generator object Module.parameters at 0x7f354b0b46d0>


  0%|          | 0/20 [00:00<?, ?it/s]

Epochs = 0, Training Loss : 0.01252690702676773


  5%|▌         | 1/20 [01:59<37:46, 119.29s/it]

Epochs = 0, Validation Loss : 0.01323419064283371
Validation Loss decreased, saving new best model
Epochs = 1, Training Loss : 0.007552537601441145


 10%|█         | 2/20 [03:46<33:35, 112.00s/it]

Epochs = 1, Validation Loss : 0.010019776411354542
Validation Loss decreased, saving new best model
Epochs = 2, Training Loss : 0.006785946898162365


 15%|█▌        | 3/20 [05:33<31:04, 109.70s/it]

Epochs = 2, Validation Loss : 0.008748156018555164
Validation Loss decreased, saving new best model
Epochs = 3, Training Loss : 0.004905642941594124


 20%|██        | 4/20 [07:20<28:57, 108.59s/it]

Epochs = 3, Validation Loss : 0.006146470550447702
Validation Loss decreased, saving new best model
Epochs = 4, Training Loss : 0.004520830698311329


 25%|██▌       | 5/20 [09:07<27:02, 108.17s/it]

Epochs = 4, Validation Loss : 0.005630096886307001
Validation Loss decreased, saving new best model
Epochs = 5, Training Loss : 0.003535803873091936


 30%|███       | 6/20 [10:54<25:09, 107.82s/it]

Epochs = 5, Validation Loss : 0.0053911879658699036
Validation Loss decreased, saving new best model
Epochs = 6, Training Loss : 0.004147941246628761


 35%|███▌      | 7/20 [12:41<23:19, 107.62s/it]

Epochs = 6, Validation Loss : 0.00537729961797595
Validation Loss decreased, saving new best model
Epochs = 7, Training Loss : 0.0033578905276954174


 40%|████      | 8/20 [14:28<21:29, 107.47s/it]

Epochs = 7, Validation Loss : 0.005101591348648071
Validation Loss decreased, saving new best model
Epochs = 8, Training Loss : 0.0041295913979411125


 45%|████▌     | 9/20 [16:16<19:41, 107.43s/it]

Epochs = 8, Validation Loss : 0.005190403666347265
Validation Loss decreased, saving new best model
Epochs = 9, Training Loss : 0.00450908113270998


 50%|█████     | 10/20 [18:03<17:53, 107.39s/it]

Epochs = 9, Validation Loss : 0.004838699474930763
Validation Loss decreased, saving new best model
Epochs = 10, Training Loss : 0.0036463518626987934


 55%|█████▌    | 11/20 [19:51<16:06, 107.43s/it]

Epochs = 10, Validation Loss : 0.00461912015452981
Validation Loss decreased, saving new best model
Epochs = 11, Training Loss : 0.0035323232877999544


 60%|██████    | 12/20 [21:38<14:19, 107.38s/it]

Epochs = 11, Validation Loss : 0.004435521550476551
Validation Loss decreased, saving new best model
Epochs = 12, Training Loss : 0.003761634463444352


 65%|██████▌   | 13/20 [23:25<12:31, 107.41s/it]

Epochs = 12, Validation Loss : 0.0042784851975739
Validation Loss decreased, saving new best model
Epochs = 13, Training Loss : 0.004761117976158857


 70%|███████   | 14/20 [25:13<10:44, 107.43s/it]

Epochs = 13, Validation Loss : 0.0072531807236373425
Validation Loss decreased, saving new best model
Epochs = 14, Training Loss : 0.0032836939208209515


 75%|███████▌  | 15/20 [27:01<08:57, 107.53s/it]

Epochs = 14, Validation Loss : 0.004034943878650665
Validation Loss decreased, saving new best model
Epochs = 15, Training Loss : 0.0032015249598771334


 80%|████████  | 16/20 [28:48<07:10, 107.54s/it]

Epochs = 15, Validation Loss : 0.003900214098393917
Validation Loss decreased, saving new best model
Epochs = 16, Training Loss : 0.0038795953150838614


 85%|████████▌ | 17/20 [30:36<05:22, 107.61s/it]

Epochs = 16, Validation Loss : 0.0038666885811835527
Validation Loss decreased, saving new best model
Epochs = 17, Training Loss : 0.003770400770008564


 90%|█████████ | 18/20 [32:24<03:35, 107.69s/it]

Epochs = 17, Validation Loss : 0.004188248887658119
Validation Loss decreased, saving new best model
Epochs = 18, Training Loss : 0.002613143529742956


 95%|█████████▌| 19/20 [34:12<01:47, 107.72s/it]

Epochs = 18, Validation Loss : 0.0037254728376865387
Validation Loss decreased, saving new best model
Epochs = 19, Training Loss : 0.004715657792985439


100%|██████████| 20/20 [36:00<00:00, 108.00s/it]

Epochs = 19, Validation Loss : 0.0037532318383455276
Validation Loss decreased, saving new best model





RuntimeError: torch.cat(): Sizes of tensors must match except in dimension 0. Got 16 and 15 in dimension 2 (The offending index is 1)

In [11]:
# Got these values from the encoder
embedding_shape = (1, 256, 15, 20)


# We need feature representations for the complete dataset as well (i.e full_loader)
embedding = create_embedding(encoder, full_loader, embedding_shape, device)
# Convert embedding to numpy and save them
numpy_embedding = embedding.cpu().detach().numpy()
print('numpy_embedding:', numpy_embedding)
num_images = numpy_embedding.shape[0]

# Save the embeddings for complete dataset
flattened_embedding = numpy_embedding.reshape((num_images, -1))
np.save("data_embedding.npy", flattened_embedding)  

numpy_embedding: [[[[-2.36840653e+00  6.26653016e-01  2.72425920e-01 ...  8.52612436e-01
     1.43433189e+00 -5.39098144e-01]
   [-2.94320762e-01  2.51943135e+00 -7.61376679e-01 ...  9.80363488e-01
     1.47020829e+00  2.02829868e-01]
   [-3.96223569e+00 -1.96240079e+00  5.09149730e-01 ... -1.19797754e+00
     1.05913246e+00  2.77655363e-01]
   ...
   [-7.77682215e-02  8.89263809e-01 -3.44714433e-01 ...  1.40728140e+00
    -9.38679934e-01 -9.03505445e-01]
   [ 5.48914708e-02  1.41774490e-01  9.01549160e-01 ...  1.65012228e+00
    -4.43023592e-01  5.18196933e-02]
   [ 1.08886015e+00  1.04408896e+00  1.37463880e+00 ... -3.49601358e-02
    -2.09944099e-01 -3.88877690e-01]]

  [[ 6.13528609e-01  6.78441107e-01  4.42746542e-02 ...  2.66247940e+00
    -1.39707431e-01 -5.24485826e-01]
   [ 4.67617035e-01  6.21322751e-01  3.41062307e-01 ...  3.40247720e-01
     1.01849988e-01  5.26344836e-01]
   [-2.03190279e+00 -6.24652505e-01 -2.50390083e-01 ... -6.34851158e-01
    -7.30638504e-01 -1.4827104

In [12]:
print('embedding shape:', embedding.shape)
last_img_embedding = embedding[7500:, :, :, ]
print('\n embedding shape of the last image for instance:', last_img_embedding.shape)

embedding shape: torch.Size([7501, 256, 15, 20])

 embedding shape of the last image for instance: torch.Size([1, 256, 15, 20])


In [15]:
def find_similar_images(test_image, num_sim_images, embedding, device, knn):
    """
    Searches for a similar image given an image and number of similar images to search.
    :param image: Image whose similar images need to be found
    :param num_sim_images: Number of similar images to find (K argument in K-Nearest Neighbors algorithm)
    :param embedding: A (num_sim_images, embedding_dim) embedding of images learnt from auto-encoder.
    :param device: "cuda" or "cpu" device
    :returns: the indices of similar image to the test image
    """
    crop = T.Compose([T.CenterCrop((480, 640))])
    test_image = crop(test_image)
    
    image_tensor = T.ToTensor()(test_image)
    image_tensor = image_tensor.unsqueeze(0)
    image_tensor = image_tensor.to(device)
    with torch.no_grad():
        # the test image whose similar images are required need to be converted to feature representation
        # This line raised error as the model is on GPU, but data is on the CPU. So, we need to send our image tensor to GPU

        #image_embedding = encoder(image_tensor).cpu().detach().numpy()
        image_embedding = encoder(image_tensor).cpu().detach().numpy()
        #print(image_embedding.shape)
        #print('image_embedding:', image_embedding)
        
    flattened_embedding = image_embedding.reshape((image_embedding.shape[0], -1))
    #print(flattened_embedding)

    _, indices = knn.kneighbors(flattened_embedding)
    return indices.tolist()

In [16]:
test_image_path = "./data/COMP90086_2021_Project_test/test/"
num_sim_images = 1
encode_model_path = "encoder_model.pt"
embedding_path = "data_embedding.npy"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

encoder = ConvEncoder()
# Load the state dict of encoder
encoder.load_state_dict(torch.load(encode_model_path, map_location=device))
encoder.eval()
encoder.to(device)

# Loads the embedding
embedding = np.load(embedding_path)

# Creates K-NN graph
knn = NearestNeighbors(n_neighbors=num_sim_images, metric="cosine")
knn.fit(embedding)

most_similar_images_indices = []

for test_image in test_df['id']:
    print("Image name: ",test_image)
    test_image = Image.open(test_image_path + test_image + '.jpg').convert("RGB")
    
    similiar_images_indices = find_similar_images(test_image, num_sim_images, embedding, device, knn)
    
    # get first element and subtract -1 from index position
    top_match_index = similiar_images_indices[0][0] - 1
    
    print(top_match_index)
    
    # append only top one prediction
    most_similar_images_indices.append(top_match_index)
    #plot_similar_images(indices_list)

Image name:  IMG4287_3
954
Image name:  IMG4288_5
7357
Image name:  IMG4289_5
2921
Image name:  IMG4290_4
3124
Image name:  IMG4291_5
5263
Image name:  IMG4292_3
2079
Image name:  IMG4293_3
4255
Image name:  IMG4294_1
5521
Image name:  IMG4295_1
3449
Image name:  IMG4296_1
1764
Image name:  IMG4297_4
5680
Image name:  IMG4298_1
2192
Image name:  IMG4299_3
468
Image name:  IMG4300_5
3184
Image name:  IMG4301_3
2895
Image name:  IMG4302_2
5390
Image name:  IMG4303_2
3127
Image name:  IMG4304_5
1016
Image name:  IMG4305_5
1922
Image name:  IMG4306_4
1016
Image name:  IMG4307_4
5952
Image name:  IMG4308_3
5521
Image name:  IMG4309_1
7129
Image name:  IMG4310_4
695
Image name:  IMG4311_4
2339
Image name:  IMG4312_2
2120
Image name:  IMG4313_3
2749
Image name:  IMG4314_5
3514
Image name:  IMG4315_5
4174
Image name:  IMG4316_2
2975
Image name:  IMG4317_2
429
Image name:  IMG4318_2
5120
Image name:  IMG4319_2
4455
Image name:  IMG4320_2
6372
Image name:  IMG4321_5
6510
Image name:  IMG4322_4
4

In [17]:
current_id = 7357
train_df.iloc[current_id, 1]

6.419321919

In [18]:
preds = []

for current_index in most_similar_images_indices:
    x = train_df.iloc[current_index, 1]
    y = train_df.iloc[current_index, 2]
    pred = (x, y)
    preds.append(pred)

In [19]:
len(most_similar_images_indices)

1200

## Write submission file

In [20]:
import csv

test_filenames = []
with open("./data/COMP90086_2021_Project_test/imagenames.csv", newline='') as inputfile:
    reader = csv.reader(inputfile)
    # skip header
    header = next(reader)
    
    for row in reader:
        test_filenames.append(str(row[0]))
    
test_filenames[0]

'IMG4287_3'

In [21]:
with open("submission.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["id","x","y"])
    index = 0
    for prediction in preds:
        writer.writerow([test_filenames[index], preds[index][0], preds[index][1]])
        index+=1