In [1]:
import torch, os
import torchvision.transforms as T
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd 
from tqdm import tqdm
from PIL import Image


In [2]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/imagenames.csv")
test_df

Unnamed: 0,id
0,IMG4287_3
1,IMG4288_5
2,IMG4289_5
3,IMG4290_4
4,IMG4291_5
...,...
1195,IMG5482_1
1196,IMG5483_2
1197,IMG5484_4
1198,IMG5485_3


In [5]:
# SOURCE: https://medium.com/pytorch/image-similarity-search-in-pytorch-1a744cf3469
# Dataset class converting all images in the train/test folder to PyTorch dataset
class FolderDataset(Dataset):
    """
    Creates a PyTorch dataset from a given folder containing images and returns two tensor images. 
    :param main_dir: directory where images are stored.
    #:param transform: torchvision transforms to be applied while making dataset (optional) 
    :return: two images, one as input to the model and another image to compare with the original image for reconstruction.
    """
    def __init__(self, main_dir, transform=None):
        self.main_dir = main_dir
        self.transform = transform
        self.all_imgs = os.listdir(main_dir)

    def __len__(self):
        return len(self.all_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.all_imgs[idx])
        image = Image.open(img_loc).convert("RGB")   # our image size is (680, 490), I had to scale them to 512*512 to handle a bug related to tensor size
        scale = T.Compose([T.Scale((512,512))])
        image = scale(image)

        if self.transform is not None:
            tensor_image = self.transform(image)

        return tensor_image, tensor_image
        

In [6]:
class ConvEncoder(nn.Module):
    """
    Convolutional Encoder Model which is a a repetition of convolutional, relu and maxpool layers.
    It converts an input images to a feature representation of size (1, 256, 16, 16).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, (3, 3), padding=(1, 1))
        self.relu1 = nn.ReLU(inplace=True)
        self.maxpool1 = nn.MaxPool2d((2, 2))
        self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=(1, 1))
        self.relu2 = nn.ReLU(inplace=True)
        self.maxpool2 = nn.MaxPool2d((2, 2))
        self.conv3 = nn.Conv2d(32, 64, (3, 3), padding=(1, 1))
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool3 = nn.MaxPool2d((2, 2))
        self.conv4 = nn.Conv2d(64, 128, (3, 3), padding=(1, 1))
        self.relu4 = nn.ReLU(inplace=True)
        self.maxpool4 = nn.MaxPool2d((2, 2))
        self.conv5 = nn.Conv2d(128, 256, (3, 3), padding=(1, 1))
        self.relu5 = nn.ReLU(inplace=True)
        self.maxpool5 = nn.MaxPool2d((2, 2))

    def forward(self, x):
        # Downscale the image with conv maxpool
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.maxpool3(x)
        x = self.conv4(x)
        x = self.relu4(x)
        x = self.maxpool4(x)
        x = self.conv5(x)
        x = self.relu5(x)
        x = self.maxpool5(x)
        
        return x

In [7]:
class ConvDecoder(nn.Module):
    """
    Convolutional Decoder Model takes an input of feature representations and reconstructs back the image
    It upscales the feature representations to the original image using transposed convolution layers of kernel size (2, 2) and stride (2, 2)
    """

    def __init__(self):
        super().__init__()
        self.deconv1 = nn.ConvTranspose2d(256, 128, (2, 2), stride=(2, 2))
        self.relu1 = nn.ReLU(inplace=True)
        self.deconv2 = nn.ConvTranspose2d(128, 64, (2, 2), stride=(2, 2))
        self.relu2 = nn.ReLU(inplace=True)
        self.deconv3 = nn.ConvTranspose2d(64, 32, (2, 2), stride=(2, 2))
        self.relu3 = nn.ReLU(inplace=True)
        self.deconv4 = nn.ConvTranspose2d(32, 16, (2, 2), stride=(2, 2))
        self.relu4 = nn.ReLU(inplace=True)
        self.deconv5 = nn.ConvTranspose2d(16, 3, (2, 2), stride=(2, 2))
        self.relu5 = nn.ReLU(inplace=True)

    def forward(self, x):
        # Upscale the image with convtranspose etc.
        x = self.deconv1(x)
        x = self.relu1(x)
        x = self.deconv2(x)
        x = self.relu2(x)
        x = self.deconv3(x)
        x = self.relu3(x)
        x = self.deconv4(x)
        x = self.relu4(x)
        x = self.deconv5(x)
        x = self.relu5(x)
        return x

In [8]:
def train_step(encoder, decoder, train_loader, loss_fn, optimizer, device):
    """
    Performs a single training step
    :param encoder: the convolutional Encoder defined as ConvEncoder
    :param decoder: A convolutional Decoder defined as ConvDecoder
    :param train_loader: PyTorch dataloader, containing (images, images).
    :param loss_fn: PyTorch loss_fn to compute loss between 2 images.
    :param optimizer: PyTorch optimizer (eg. AdamW)
    :param device: "cuda" or "cpu"
    :returns: Train Loss
    """
    #  Set networks to train mode.
    encoder.train()
    decoder.train()

    for batch_idx, (train_img, target_img) in enumerate(train_loader):
        train_img = train_img.to(device)
        target_img = target_img.to(device)
        
        # Zero grad the optimizer
        optimizer.zero_grad()
        # Feed the train images to encoder
        enc_output = encoder(train_img)
        dec_output = decoder(enc_output)
        # Compute loss between the reconstructed image and orginal image which is target image.
        loss = loss_fn(dec_output, target_img)
        loss.backward()
        # Apply the optimizer to network
        optimizer.step()
    return loss.item()

def val_step(encoder, decoder, val_loader, loss_fn, device):
    """
    Performs a single training step
    :param encoder: A convolutional Encoder like torch_model ConvEncoder
    :param decoder: A convolutional Decoder like torch_model ConvDecoder
    :param val_loader: PyTorch dataloader containing (images, images)
    :param loss_fn: PyTorch loss_fnto compute loss between 2 images
    :param device: "cuda" or "cpu"
    :returns: Validation Loss
    """
    # Set to eval mode.
    encoder.eval()
    decoder.eval()
    
    # gradients do not need to be computed  for validation???
    with torch.no_grad():
        for batch_idx, (train_img, target_img) in enumerate(val_loader):
            train_img = train_img.to(device)
            target_img = target_img.to(device)
            # Feed the train images to encoder
            enc_output = encoder(train_img)
            # Feed the encoder output to Decoder to reconstruct the image
            dec_output = decoder(enc_output)
            # Find the validation loss for the encoder and decoder 
            loss = loss_fn(dec_output, target_img)
    return loss.item()

In [9]:
def create_embedding(encoder, full_loader, embedding_dim, device):
    """
    Creates embedding using encoder from dataloader and saves our image embeddings.
    :param encoder: A convolutional Encoder like torch_model ConvEncoder
    :param full_loader: PyTorch dataloader, containing (images, images) over the entire dataset
    :param embedding_dim: Tuple (c, h, w) Dimension of embedding = output of encoder dimesntions.
    :param device: "cuda" or "cpu"
    : return: Embedding of size (num_images_in_loader + 1, c, h, w)
    """
    # Set encoder to eval mode.
    encoder.eval()
    # Just a place holder for our 0th image embedding.
    embedding = torch.randn(embedding_dim)
    
    # no_grad as we do not compute loss here
    with torch.no_grad():
        for batch_idx, (train_img, target_img) in enumerate(full_loader):
            train_img = train_img.to(device)
            # Get encoder outputs and move outputs to cpu
            enc_output = encoder(train_img).cpu()
            # Add the outputs to embeddings
            embedding = torch.cat((embedding, enc_output), 0)
    return embedding

In [10]:
# Create the PyTorch `dataset` and the `dataloaders`
transforms = T.Compose([T.ToTensor()]) # Normalize the pixels and convert to tensor
# Create folder dataset
full_dataset = FolderDataset("/content/drive/MyDrive/train", transforms) 

train_size = int(0.75 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Split data to train and test
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

# Create the train dataloader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# Create the validation dataloader
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# Create the full dataloader
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=32)

# Mean squared lossto compute difference between two images
encoder = ConvEncoder() 
decoder = ConvDecoder()
device = "cuda" 
max_loss = 9999

# Shift models to GPU
encoder.to(device)
decoder.to(device)

# The enocder and decoder parameters
print(encoder.parameters())
autoencoder_params = list(encoder.parameters()) + list(decoder.parameters())
# Adam Optimizer
optimizer = optim.AdamW(autoencoder_params, lr=1e-3)

total_epochs = 10
# Got these values from the encoder
embedding_shape = (1, 256, 16, 16)

for epoch in tqdm(range(total_epochs)):
        train_loss = train_step(encoder, decoder, train_loader, nn.MSELoss(), optimizer, device=device)   
        print(f"Epochs = {epoch}, Training Loss : {train_loss}")
        val_loss = val_step(encoder, decoder, val_loader, nn.MSELoss(), device=device)   
        print(f"Epochs = {epoch}, Validation Loss : {val_loss}")
        # Simple Best Model saving
        if val_loss < max_loss:
            print("Validation Loss decreased, saving new best model")
            torch.save(encoder.state_dict(), "encoder_model.pt")
            torch.save(decoder.state_dict(), "decoder_model.pt")

# We need feature representations for the complete dataset as well (i.e full_loader)
embedding = create_embedding(encoder, full_loader, embedding_shape, device)
# Convert embedding to numpy and save them
numpy_embedding = embedding.cpu().detach().numpy()
print('numpy_embedding:', numpy_embedding)
num_images = numpy_embedding.shape[0]

# Save the embeddings for complete dataset
flattened_embedding = numpy_embedding.reshape((num_images, -1))
np.save("data_embedding.npy", flattened_embedding)            


<generator object Module.parameters at 0x7f59042378d0>


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epochs = 0, Training Loss : 0.011093203909695148


 10%|█         | 1/10 [24:16<3:38:26, 1456.24s/it]

Epochs = 0, Validation Loss : 0.010468840599060059
Validation Loss decreased, saving new best model
Epochs = 1, Training Loss : 0.005519292317330837


 20%|██        | 2/10 [26:20<1:29:40, 672.53s/it] 

Epochs = 1, Validation Loss : 0.005793792195618153
Validation Loss decreased, saving new best model
Epochs = 2, Training Loss : 0.005569281987845898


 30%|███       | 3/10 [28:23<49:11, 421.57s/it]  

Epochs = 2, Validation Loss : 0.0041328356601297855
Validation Loss decreased, saving new best model
Epochs = 3, Training Loss : 0.00417742133140564


 40%|████      | 4/10 [30:25<30:20, 303.42s/it]

Epochs = 3, Validation Loss : 0.003605941543355584
Validation Loss decreased, saving new best model
Epochs = 4, Training Loss : 0.0045937891118228436


 50%|█████     | 5/10 [32:28<19:51, 238.38s/it]

Epochs = 4, Validation Loss : 0.003322718432173133
Validation Loss decreased, saving new best model
Epochs = 5, Training Loss : 0.0032663573510944843


 60%|██████    | 6/10 [34:31<13:16, 199.17s/it]

Epochs = 5, Validation Loss : 0.0031313274521380663
Validation Loss decreased, saving new best model
Epochs = 6, Training Loss : 0.003957951907068491


 70%|███████   | 7/10 [36:33<08:42, 174.02s/it]

Epochs = 6, Validation Loss : 0.003285658545792103
Validation Loss decreased, saving new best model
Epochs = 7, Training Loss : 0.003755912883207202


 80%|████████  | 8/10 [38:38<05:16, 158.45s/it]

Epochs = 7, Validation Loss : 0.0028864911291748285
Validation Loss decreased, saving new best model
Epochs = 8, Training Loss : 0.005124012939631939


 90%|█████████ | 9/10 [40:41<02:27, 147.15s/it]

Epochs = 8, Validation Loss : 0.002975414041429758
Validation Loss decreased, saving new best model
Epochs = 9, Training Loss : 0.004047607071697712


100%|██████████| 10/10 [42:44<00:00, 256.48s/it]

Epochs = 9, Validation Loss : 0.0032191702630370855
Validation Loss decreased, saving new best model





numpy_embedding: [[[[-2.15307802e-01  1.27419400e+00  1.11482489e+00 ... -4.03616101e-01
    -1.31118751e+00  2.06213951e-01]
   [ 1.31526768e+00  8.20715845e-01  5.52603006e-01 ...  2.58888632e-01
     5.48661888e-01 -1.01492679e+00]
   [ 5.29578090e-01  8.09661150e-01  1.71489269e-02 ...  2.62416512e-01
     4.13850218e-01  1.60760179e-01]
   ...
   [-1.26794803e+00  4.29791272e-01  2.93290466e-01 ...  6.52712941e-01
     9.34922695e-03 -1.77564859e+00]
   [ 2.72137928e+00  5.97640216e-01  1.00864685e+00 ...  5.01684010e-01
     1.51797503e-01 -5.23506582e-01]
   [-3.59835923e-01 -1.24471593e+00  3.38557065e-01 ...  9.64882076e-01
    -8.83942306e-01  1.18420672e+00]]

  [[ 1.15755737e+00 -2.47312859e-01 -1.85232544e+00 ...  5.41178048e-01
     2.69179404e-01 -2.12892199e+00]
   [-4.04597163e-01 -2.57741719e-01  1.39649844e+00 ... -6.92095280e-01
    -3.76643211e-01  6.14588022e-01]
   [-1.12125665e-01 -6.27754629e-02  1.99460149e-01 ... -1.53630793e-01
     8.07094753e-01 -1.9274396

In [11]:
print('embedding shape:', embedding.shape)
last_img_embedding = embedding[7500:, :, :, ]
print('\n embedding shape of the last image for instance:', last_img_embedding.shape)

embedding shape: torch.Size([7501, 256, 16, 16])

 embedding shape of the last image for instance: torch.Size([1, 256, 16, 16])


In [20]:
def find_similar_images(test_image, num_sim_images, embedding, device):
    """
    Searches for a similar image given an image and number of similar images to search.
    :param image: Image whose similar images need to be found
    :param num_sim_images: Number of similar images to find (K argument in K-Nearest Neighbors algorithm)
    :param embedding: A (num_sim_images, embedding_dim) embedding of images learnt from auto-encoder.
    :param device: "cuda" or "cpu" device
    :returns: the indices of similar image to the test image
    """

    image_tensor = T.ToTensor()(test_image)
    image_tensor = image_tensor.unsqueeze(0)
    with torch.no_grad():
        # the test image whose similar images are required need to be converted to feature representation
        # This line raised error as the model is on GPU, but data is on the CPU. So, we need to send our image tensor to GPU

        #image_embedding = encoder(image_tensor).cpu().detach().numpy()
        image_embedding = encoder(image_tensor.to(device)).detach().numpy()
        print('image_embedding:', image_embedding)
    flattened_embedding = image_embedding.reshape((image_embedding.shape[0], -1))
    print(flattened_embedding)
    knn = NearestNeighbors(n_neighbors=num_sim_images, metric="cosine")
    knn.fit(embedding)
    _, indices = knn.kneighbors(flattened_embedding)
    return indices.tolist()

In [None]:
Test_iamge_path = "/content/drive/MyDrive/test/"
num_sim_images = 1
encode_model_path = "/content/encoder_model.pt"
embedding_path = "/content/data_embedding.npy"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
encoder = ConvEncoder()
# Load the state dict of encoder
encoder.load_state_dict(torch.load(encode_model_path, map_location=device))
encoder.eval()
encoder.to(device)

# Loads the embedding
embedding = np.load(embedding_path)
for test_image in test_df['id']:
  test_image = Image.open(Test_iamge_path + test_image + '.jpg').convert("RGB")
  indices_list = find_similar_images(test_image, num_sim_images, embedding, device)
  print(indices_list)
  #plot_similar_images(indices_list)