In [None]:
import torch
import torchvision.transforms as T
import os
from tqdm import tqdm
from torch.utils.data import Dataset
import numpy as np
import torch.nn as nn
import torch.optim as optim
from PIL import Image

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the image folder
# used further down (/content/drive/MyDrive/train) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# SOURCE: https://medium.com/pytorch/image-similarity-search-in-pytorch-1a744cf3469
# Dataset class converting all images in the train/test folder to PyTorch dataset
class FolderDataset(Dataset):
    """
    Creates a PyTorch dataset from the image files of one folder, returning two tensor images.
    Parameters:
    main_dir : directory where images are stored. Only regular files directly inside it
        are used (sub-directories are skipped) and they are sorted by name so that an
        index always refers to the same image across runs.
    transform (optional) : torchvision transforms to be applied after resizing.
        Falls back to ``T.ToTensor()`` when omitted so tensors are always returned.
    image_size (optional) : (height, width) every image is resized to. Default (512, 512).
    Return: two images, one as input to the model and another image to compare with the
    original image for reconstruction (both are the same tensor object).
    """

    def __init__(self, main_dir, transform=None, image_size=(512, 512)):
        self.main_dir = main_dir
        self.transform = transform
        self.image_size = image_size
        # os.listdir order is arbitrary and includes directories; keep files only, sorted.
        with os.scandir(main_dir) as entries:
            self.all_imgs = sorted(entry.name for entry in entries if entry.is_file())

    def __len__(self):
        return len(self.all_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.all_imgs[idx])
        # convert() returns a fully loaded copy, so the file handle can be closed right away.
        with Image.open(img_loc) as raw_image:
            image = raw_image.convert("RGB")   # our image size is (680, 490)

        # T.Scale was deprecated and later removed from torchvision; T.Resize is its replacement.
        image = T.Resize(self.image_size)(image)

        # Without a user transform still return tensors (the original left the result unbound).
        transform = self.transform if self.transform is not None else T.ToTensor()
        tensor_image = transform(image)

        return tensor_image, tensor_image
        

In [None]:
# The encoder model is a repetition of convolutional, relu and maxpool layers
# Converts our input image to a feature representation of size (1, 256, 16, 16).
class ConvEncoder(nn.Module):
    """
    A simple Convolutional Encoder Model.

    Five identical stages are stacked; each one is a 3x3 convolution (padding 1),
    an in-place ReLU and a 2x2 max-pool. Every stage halves the spatial size, and
    the channel count grows 3 -> 16 -> 32 -> 64 -> 128 -> 256.
    """

    # Channel count before the first stage and after each of the five stages.
    _CHANNELS = (3, 16, 32, 64, 128, 256)

    def __init__(self):
        super().__init__()

        # Register the layers stage by stage under the names conv<i>, relu<i>, maxpool<i>.
        channel_pairs = zip(self._CHANNELS[:-1], self._CHANNELS[1:])
        for stage, (in_channels, out_channels) in enumerate(channel_pairs, start=1):
            setattr(self, f"conv{stage}", nn.Conv2d(in_channels, out_channels, (3, 3), padding=(1, 1)))
            setattr(self, f"relu{stage}", nn.ReLU(inplace=True))
            setattr(self, f"maxpool{stage}", nn.MaxPool2d((2, 2)))

    def forward(self, x):
        # Downscale the image: conv -> relu -> maxpool, once per stage.
        for stage in range(1, len(self._CHANNELS)):
            convolve = getattr(self, f"conv{stage}")
            activate = getattr(self, f"relu{stage}")
            downsample = getattr(self, f"maxpool{stage}")
            x = downsample(activate(convolve(x)))

        return x

In [None]:
class ConvDecoder(nn.Module):
    """
    A simple Convolutional Decoder Model.

    Takes the encoder's feature representation and reconstructs an image from it.
    Five stages are stacked; each one is a transposed convolution with kernel size
    (2, 2) and stride (2, 2) followed by an in-place ReLU. Every stage doubles the
    spatial size, and the channel count shrinks 256 -> 128 -> 64 -> 32 -> 16 -> 3.
    """

    # Channel count before the first stage and after each of the five stages.
    _CHANNELS = (256, 128, 64, 32, 16, 3)

    def __init__(self):
        super().__init__()
        # Register the layers stage by stage under the names deconv<i>, relu<i>.
        channel_pairs = zip(self._CHANNELS[:-1], self._CHANNELS[1:])
        for stage, (in_channels, out_channels) in enumerate(channel_pairs, start=1):
            setattr(self, f"deconv{stage}", nn.ConvTranspose2d(in_channels, out_channels, (2, 2), stride=(2, 2)))
            setattr(self, f"relu{stage}", nn.ReLU(inplace=True))

    def forward(self, x):
        # Upscale the features: transposed conv -> relu, once per stage.
        for stage in range(1, len(self._CHANNELS)):
            upsample = getattr(self, f"deconv{stage}")
            activate = getattr(self, f"relu{stage}")
            x = activate(upsample(x))
        return x

In [None]:
def train_step(encoder, decoder, train_loader, loss_fn, optimizer, device):
    """
    Trains the autoencoder for one full pass over train_loader (one epoch).
    Args:
    encoder: A convolutional Encoder. E.g. torch_model ConvEncoder
    decoder: A convolutional Decoder. E.g. torch_model ConvDecoder
    train_loader: PyTorch dataloader, containing (images, images).
    loss_fn: PyTorch loss_fn, computes loss between 2 images.
    optimizer: PyTorch optimizer.
    device: "cuda" or "cpu"
    Returns: Train loss as a float, averaged over all batches of the pass
        (not just the last batch, which is a noisy estimate).
    Raises: ValueError if train_loader yields no batches.
    """
    #  Set networks to train mode.
    encoder.train()
    decoder.train()

    # Detached per-batch losses; kept as tensors so the device is synced only once at the end.
    batch_losses = []

    for train_img, target_img in train_loader:
        # Move images to device
        train_img = train_img.to(device)
        target_img = target_img.to(device)

        # Zero grad the optimizer
        optimizer.zero_grad()
        # The output of the encoder is the input of the decoder;
        # the decoder output is the reconstructed image.
        dec_output = decoder(encoder(train_img))

        # Compute loss between the reconstruction and the original (target) image.
        loss = loss_fn(dec_output, target_img)
        # Backpropagate
        loss.backward()
        # Apply the optimizer to network by calling step.
        optimizer.step()

        batch_losses.append(loss.detach())

    if not batch_losses:
        raise ValueError("train_loader yielded no batches; cannot compute a training loss")

    # Return the mean loss over the epoch
    return torch.stack(batch_losses).mean().item()

def val_step(encoder, decoder, val_loader, loss_fn, device):
    """
    Evaluates the autoencoder on one full pass over val_loader, without updating weights.
    Args:
    encoder: A convolutional Encoder. E.g. torch_model ConvEncoder
    decoder: A convolutional Decoder. E.g. torch_model ConvDecoder
    val_loader: PyTorch dataloader, containing (images, images).
    loss_fn: PyTorch loss_fn, computes loss between 2 images.
    device: "cuda" or "cpu"
    Returns: Validation loss as a float, averaged over all batches of the pass
        (not just the last batch, which is a noisy estimate).
    Raises: ValueError if val_loader yields no batches.
    """

    # Set to eval mode.
    encoder.eval()
    decoder.eval()

    # Per-batch losses; kept as tensors so the device is synced only once at the end.
    batch_losses = []

    # We don't need to compute gradients while validating.
    with torch.no_grad():
        for val_img, target_img in val_loader:
            # Move to device
            val_img = val_img.to(device)
            target_img = target_img.to(device)

            # Encoder output feeds the decoder, which reconstructs the image.
            dec_output = decoder(encoder(val_img))

            # Validation loss for encoder and decoder.
            batch_losses.append(loss_fn(dec_output, target_img))

    if not batch_losses:
        raise ValueError("val_loader yielded no batches; cannot compute a validation loss")

    # Return the mean loss over the pass
    return torch.stack(batch_losses).mean().item()

In [None]:
def create_embedding(encoder, full_loader, embedding_dim, device, include_placeholder=True):
    """
    Creates embedding using encoder from dataloader.
    encoder: A convolutional Encoder. E.g. torch_model ConvEncoder
    full_loader: PyTorch dataloader, containing (images, images) over entire dataset.
    embedding_dim: Tuple (1, c, h, w) shape of the placeholder row; (c, h, w) must match the encoder output.
    device: "cuda" or "cpu"
    include_placeholder: When True (default, the historical behaviour) row 0 of the result is a
        random placeholder, so the embedding of image i is stored at row i + 1. Pass False to get
        exactly one row per image with no random row.
    Returns: CPU tensor of size (num_images_in_loader + 1, c, h, w), or
        (num_images_in_loader, c, h, w) when include_placeholder is False.
    """
    # Set encoder to eval mode.
    encoder.eval()

    # Collect the pieces and concatenate once at the end; concatenating inside the loop
    # re-copies everything gathered so far on every batch.
    chunks = []
    if include_placeholder:
        # Random place holder occupying row 0 (it is NOT the embedding of any image).
        chunks.append(torch.randn(embedding_dim))

    # Again we do not compute loss here so. No gradients.
    with torch.no_grad():
        for train_img, _ in full_loader:
            # Encode on the requested device, then move the outputs to cpu.
            chunks.append(encoder(train_img.to(device)).cpu())

    if not chunks:
        # Empty loader and no placeholder: return an empty tensor with the right trailing dims.
        return torch.empty((0, *tuple(embedding_dim[1:])))

    # Return the embeddings
    return torch.cat(chunks, 0)

In [None]:
# Create the PyTorch `dataset` and the `dataloaders`
transforms = T.Compose([T.ToTensor()])  # Convert PIL images to float tensors scaled to [0, 1]
# Create folder dataset
full_dataset = FolderDataset("/content/drive/MyDrive/train", transforms)

train_size = int(0.75 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Split data to train and validation
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

# Create the train dataloader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# Create the validation dataloader
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# Create the full dataloader (unshuffled, so embedding rows follow dataset order)
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=32)

encoder = ConvEncoder()
decoder = ConvDecoder()
# Use the GPU only when one is actually present instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
# Best (lowest) validation loss seen so far; the name is kept for compatibility.
max_loss = float("inf")

# Shift models to the selected device
encoder.to(device)
decoder.to(device)

# Mean squared loss to compute the difference between two images (built once, reused every epoch)
loss_fn = nn.MSELoss()

# The encoder and decoder parameters are optimized jointly
autoencoder_params = list(encoder.parameters()) + list(decoder.parameters())
print(f"Trainable parameters: {sum(p.numel() for p in autoencoder_params)}")
# AdamW optimizer
optimizer = optim.AdamW(autoencoder_params, lr=1e-3)


total_epochs = 10
# Encoder output shape for one 512x512 image (five 2x2 max-pools: 512 / 2**5 = 16)
embedding_shape = (1, 256, 16, 16)

for epoch in tqdm(range(total_epochs)):
    train_loss = train_step(encoder, decoder, train_loader, loss_fn, optimizer, device=device)
    print(f"Epochs = {epoch}, Training Loss : {train_loss}")
    val_loss = val_step(encoder, decoder, val_loader, loss_fn, device=device)
    print(f"Epochs = {epoch}, Validation Loss : {val_loss}")
    # Simple best-model saving: only save when the validation loss actually improved.
    if val_loss < max_loss:
        # Remember the new best; without this update every epoch would be saved as "best".
        max_loss = val_loss
        print("Validation Loss decreased, saving new best model")
        torch.save(encoder.state_dict(), "encoder_model.pt")
        torch.save(decoder.state_dict(), "decoder_model.pt")

# Restore the best checkpoint so the embeddings saved below match encoder_model.pt
# (the last epoch is not necessarily the best one).
if max_loss < float("inf"):
    encoder.load_state_dict(torch.load("encoder_model.pt", map_location=device))
    decoder.load_state_dict(torch.load("decoder_model.pt", map_location=device))

# We need feature representations for the complete dataset as well (i.e full_loader).
# Note: row 0 of the result is create_embedding's random placeholder, so image i is at row i + 1.
embedding = create_embedding(encoder, full_loader, embedding_shape, device)
# Convert embedding to numpy and save them
numpy_embedding = embedding.cpu().detach().numpy()
# Show only the shape; dumping the array itself floods the cell output.
print('numpy_embedding shape:', numpy_embedding.shape)
num_images = numpy_embedding.shape[0]

# Save the embeddings for complete dataset, not just train
flattened_embedding = numpy_embedding.reshape((num_images, -1))
np.save("data_embedding.npy", flattened_embedding)


<generator object Module.parameters at 0x7f1c45c293d0>




Epochs = 0, Training Loss : 0.009722529910504818


 10%|█         | 1/10 [02:04<18:38, 124.28s/it]

Epochs = 0, Validation Loss : 0.011058343574404716
Validation Loss decreased, saving new best model
Epochs = 1, Training Loss : 0.006044135894626379


 20%|██        | 2/10 [04:08<16:35, 124.42s/it]

Epochs = 1, Validation Loss : 0.006075538229197264
Validation Loss decreased, saving new best model
Epochs = 2, Training Loss : 0.003948246594518423


 30%|███       | 3/10 [06:14<14:36, 125.22s/it]

Epochs = 2, Validation Loss : 0.004409089684486389
Validation Loss decreased, saving new best model
Epochs = 3, Training Loss : 0.0043773651123046875


 40%|████      | 4/10 [08:19<12:30, 125.07s/it]

Epochs = 3, Validation Loss : 0.0039024578873068094
Validation Loss decreased, saving new best model
Epochs = 4, Training Loss : 0.004546602256596088


 50%|█████     | 5/10 [10:27<10:29, 125.95s/it]

Epochs = 4, Validation Loss : 0.00370001420378685
Validation Loss decreased, saving new best model
Epochs = 5, Training Loss : 0.003492643591016531


 60%|██████    | 6/10 [12:33<08:23, 125.89s/it]

Epochs = 5, Validation Loss : 0.0034036722499877214
Validation Loss decreased, saving new best model
Epochs = 6, Training Loss : 0.00336197461001575


 70%|███████   | 7/10 [14:38<06:17, 125.72s/it]

Epochs = 6, Validation Loss : 0.00344817410223186
Validation Loss decreased, saving new best model
Epochs = 7, Training Loss : 0.0031349333003163338


 80%|████████  | 8/10 [16:42<04:10, 125.30s/it]

Epochs = 7, Validation Loss : 0.003134028520435095
Validation Loss decreased, saving new best model
Epochs = 8, Training Loss : 0.003520687110722065


 90%|█████████ | 9/10 [18:47<02:04, 124.96s/it]

Epochs = 8, Validation Loss : 0.003042994998395443
Validation Loss decreased, saving new best model
Epochs = 9, Training Loss : 0.002975636627525091


100%|██████████| 10/10 [20:54<00:00, 125.42s/it]

Epochs = 9, Validation Loss : 0.0029887717682868242
Validation Loss decreased, saving new best model





numpy_embedding: [[[[ 4.29379463e-01  1.40495654e-02  2.50384808e-01 ...  9.99481320e-01
     5.47401547e-01 -1.71330646e-01]
   [ 2.97027171e-01  7.69898415e-01  1.25629759e+00 ... -2.96386391e-01
    -7.68322885e-01  1.28212422e-01]
   [ 1.58863950e+00 -4.80478734e-01  1.36131505e-02 ... -1.55936193e+00
     1.41450238e+00  3.18982363e-01]
   ...
   [ 3.66349757e-01 -1.06628311e+00 -2.82197855e-02 ...  1.83258593e-01
     4.16724354e-01 -2.80501151e+00]
   [-1.02106023e+00 -1.37051141e+00  1.56487370e+00 ...  6.20287120e-01
     9.01270330e-01  5.23280978e-01]
   [-3.26249689e-01  3.15327734e-01  1.63362160e-01 ...  4.14147750e-02
    -3.16214785e-02 -1.24172473e+00]]

  [[ 9.33707356e-01 -6.91357329e-02  3.76285911e-01 ...  1.20830953e+00
    -1.00412741e-02 -2.35696936e+00]
   [-6.22728884e-01  2.32567251e-01  7.16500878e-01 ... -1.87769282e+00
    -5.18301308e-01  2.15433594e-02]
   [ 3.70566934e-01  9.41954494e-01 -1.45646110e-01 ...  8.10753822e-01
    -1.64592242e+00 -2.8293177

In [None]:
print('embedding shape:', embedding.shape)
# Take the last row with a slice (not a hard-coded index such as 7500) so this keeps
# working for any dataset size; slicing also preserves the leading dimension.
last_img_embedding = embedding[-1:]
print('\n embedding shape of the last image for instance:', last_img_embedding.shape)

embedding shape: torch.Size([7501, 256, 16, 16])

 embedding shape of the last image for instance: torch.Size([1, 256, 16, 16])


In [None]:
def compute_similar_images(image, num_images, embedding, device):
    """
    Given an image and number of similar images to search.
    Returns the num_images closest nearest images.
    Args:
    image: Image whose similar images are to be found. It must already have the size the
        embeddings were built from, otherwise the feature dimensions will not match.
    num_images: Number of similar images to find.
    embedding : A (num_images, embedding_dim) Embedding of images learnt from auto-encoder.
    device : "cuda" or "cpu" device. The module-level `encoder` is expected to live on it.
    Returns: Nested list (one inner list per query image) of row indices into `embedding`,
        ordered from most to least similar by cosine distance.
    Raises: ValueError if `embedding` is not 2-D or its feature dimension differs from the
        query image's flattened encoder output.
    """
    # Imported locally: the notebook's import cell does not bring this name into scope.
    from sklearn.neighbors import NearestNeighbors

    # Add the batch dimension and move the query to the device the encoder runs on.
    image_tensor = T.ToTensor()(image).unsqueeze(0).to(device)

    # Uses the module-level `encoder`; eval mode to match how the embeddings were created.
    encoder.eval()
    with torch.no_grad():
        image_embedding = encoder(image_tensor).cpu().numpy()

    flattened_embedding = image_embedding.reshape((image_embedding.shape[0], -1))

    # Fail early with a readable message instead of an opaque error from inside the search.
    index_shape = np.shape(embedding)
    if len(index_shape) != 2 or index_shape[1] != flattened_embedding.shape[1]:
        raise ValueError(
            f"embedding must have shape (n, {flattened_embedding.shape[1]}) to match the "
            f"query image's encoder output, got {tuple(index_shape)}"
        )

    knn = NearestNeighbors(n_neighbors=num_images, metric="cosine")
    knn.fit(embedding)

    _, indices = knn.kneighbors(flattened_embedding)
    indices_list = indices.tolist()
    return indices_list