In [None]:
from utils import index_files

# Run the project-local `index_files` helper (imported from utils) on the image folder.
# NOTE(review): presumably this renames/indexes the files using 'wb' as a prefix —
# confirm against utils.index_files before re-running on an already indexed folder.
index_files('dataset/images', 'wb')

In [None]:
import tensorflow as tf
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def load_images(path):
  """Load every supported image found directly inside `path`, resized to 224x224.

  Args:
    path: Directory (str or Path) searched non-recursively for files matching
      *.jpg, *.jpeg, *.png and *.gif.

  Returns:
    A tuple `(images, filenames)` of two parallel lists: the objects returned
    by `tf.keras.preprocessing.image.load_img` and the matching file names
    (name only, without the directory). Both lists are empty when no file
    matches.
  """
  path = Path(path)
  # Only files with a known image extension are picked up. The paths are sorted
  # because glob order is filesystem-dependent and the position of an image in
  # the returned lists is used as its index downstream.
  patterns = ('*.jpg', '*.jpeg', '*.png', '*.gif')
  img_paths = sorted(p for pattern in patterns for p in path.glob(pattern))
  images = []
  filenames = []
  print(f'Loading {len(img_paths)} images')
  for img_path in tqdm(img_paths):
    # load image, resized to the 224x224 input size used by the network
    img = tf.keras.preprocessing.image.load_img(img_path, target_size=(224,224))
    images.append(img)
    filenames.append(img_path.name)
  if images:
    print('images loaded as' , type(images[0]), 'type')
  else:
    # Previously this case crashed with IndexError on images[0].
    print(f'No images found in {path}')
  return images, filenames

In [None]:
# Folder scanned for images; files must sit directly inside it (load_images does not recurse).
path = 'dataset/images/'
# `images` and `filenames` are parallel lists that the embedding cell further down consumes.
images, filenames = load_images(path)

In [None]:
from keras import backend as K 

# Stock VGG19 classifier with ImageNet weights, including the fully connected top.
model = tf.keras.applications.VGG19(
    include_top=True,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000,
    classifier_activation="softmax",)

model.summary()

# Every layer except the first (input) and the last (softmax classifier).
feature_layers = model.layers[1:-1]
layer_outputs = [layer.output for layer in feature_layers]  # Identifies layer outputs
# Creates a model that will return the layer feature maps as outputs for a given image
visual_model = tf.keras.models.Model(inputs = model.input, outputs = layer_outputs)

# Take the names from the same list that defines `layer_outputs`, so that the
# index printed below is exactly the index into `visual_model.predict(...)`
# (and therefore the value to put in `f_levels`). Enumerating
# `visual_model.layers` instead would also count the input layer and shift
# every index by one.
layer_names = [layer.name for layer in feature_layers]

for i, layer in enumerate(layer_names):
    print(i, layer)

In [None]:
def _layer_descriptor(feature_map):
  """Turn the activation of one layer (batch of one image) into a flat 1D vector.

  4D maps (e.g. conv layers) are summarised by their flattened Gram matrix;
  anything else (e.g. FC layers) is simply flattened.
  """
  if len(feature_map.shape) == 4:
    # Collapse the two spatial axes so every row is one spatial position and
    # every column one filter, then drop the batch axis (batch size is 1).
    A = feature_map.reshape(feature_map.shape[0], feature_map.shape[1]*feature_map.shape[2],
                            feature_map.shape[3], order='F')
    A = A.reshape(A.shape[1], A.shape[2])
    # Gram matrix: cumulative co-activation of every pair of filters in the layer
    G = np.matmul(np.transpose(A), A)
    # float64 keeps the dtype the single-conv-layer case has always produced
    return G.flatten().astype(np.float64)
  return feature_map.flatten()


def get_embeddings(f_levels, images, filenames=None):
  """Compute one embedding per image from selected outputs of `visual_model`.

  Relies on the notebook globals `visual_model` (multi-output Keras model) and
  `tf`. Each image is pre-processed for VGG19 and pushed through the model; the
  descriptors of all requested levels are concatenated, in the order given by
  `f_levels`, into a single 1D embedding.

  Args:
    f_levels: Indices into the list returned by `visual_model.predict`.
    images: Iterable of images, each convertible to an array by
      `np.expand_dims(img, axis=0)`.
    filenames: Optional list parallel to `images`. When given, every embedding
      is also saved to `./dataset/embeddings/<levels>/<stem>.npy`, where
      `<levels>` is the `f_levels` joined by '_' and `<stem>` is the file name
      without its last extension.

  Returns:
    `np.array` stacking the embeddings, one row per image.
  """
  # Create a single string with all the f_levels; it names the output folder
  levels = '_'.join([str(f) for f in f_levels])
  path = Path(f'./dataset/embeddings/{levels}')
  path.mkdir(parents=True, exist_ok=True)
  # Initialize embeddings
  E = []
  for i, img in enumerate(images):
    print('embedding image', i)
    img = np.expand_dims(img, axis=0)
    img = tf.keras.applications.vgg19.preprocess_input(img)       # Pre-processes image for VGG19
    # run an image through the network by making a prediction
    feature_maps = visual_model.predict(img)

    # Concatenate the descriptor of every requested level. The previous
    # implementation never accumulated across levels, so only the last entry
    # of f_levels ended up in the embedding.
    parts = [_layer_descriptor(feature_maps[level]) for level in f_levels]
    emb = np.concatenate(parts) if parts else np.zeros((0))

    E.append(emb)

    if filenames is not None:
      # Path.stem only strips the final extension, so 'a.b.jpg' -> 'a.b'
      # instead of colliding with every other file that starts with 'a.'.
      stem = Path(filenames[i]).stem
      # save embeddings as .npy files in the folder created above
      np.save(str(path / f'{stem}.npy'), emb)

    if i%10 == 0:
      print(f'Getting embedding of img no. {i}, with shape {E[i].shape}, and {len(f_levels)} layers')

  return np.array(E)


# Select which outputs of `visual_model` feed the embedding. Each entry is an
# index into the list returned by `visual_model.predict` (conv1_1 is index 0).
f_levels = [23]        # Choose from [1,4,9,14,19,22,23]

# Compute embeddings for every loaded image. Because `filenames` is passed, each
# embedding is also written as a .npy file under ./dataset/embeddings/<levels>/.
embeddings = get_embeddings(f_levels, images, filenames=filenames)

In [None]:
import shutil
from pathlib import Path
import numpy as np

def remove_duplicates(dir, layer, thresh=0.3, delete=False):
    """Find near-duplicate images via nearest-neighbour search on saved embeddings.

    Loads every `<dir>/embeddings/<layer>/*.npy`, queries up to 4 nearest
    neighbours of each embedding and, for every neighbour closer than `thresh`,
    copies both images into `<dir>/duplicates/images`:

      * the query image as      `<query name>_<distance><query extension>`
      * the neighbour image as  `<query name>_dup_<neighbour name>`

    Args:
        dir: Dataset root containing `images/` and `embeddings/<layer>/`.
        layer: Name of the embeddings sub-folder to analyse (e.g. '23').
        thresh: Distance below which two embeddings count as duplicates.
        delete: Accepted for interface compatibility but currently ignored;
            this function only copies files and never deletes anything.

    Returns:
        None. Progress and the number of distinct duplicate pairs are printed.
    """
    root = Path(dir)
    images_dir = root / 'images'
    dup_images_dir = root / 'duplicates' / 'images'

    # create duplicates folders inside dir
    dup_images_dir.mkdir(parents=True, exist_ok=True)
    (root / 'duplicates' / 'embeddings').mkdir(parents=True, exist_ok=True)

    # Index the real image files by base name so an embedding can be mapped back
    # to its image whatever the extension (.jpg, .jpeg, .png, .gif, ...).
    # Exact stems take precedence; the text before the first '.' is registered
    # as a secondary key for embeddings that were saved under that shorter name.
    image_files = sorted(p for p in images_dir.iterdir() if p.is_file()) if images_dir.is_dir() else []
    image_by_key = {}
    for img in image_files:
        image_by_key.setdefault(img.stem, img.name)
    for img in image_files:
        image_by_key.setdefault(img.name.split('.')[0], img.name)

    # Load embeddings (sorted so indices are reproducible between runs)
    embeddings = []
    img_filenames = []
    emb_filenames = []
    embedding_path = root / 'embeddings' / layer
    for file in sorted(embedding_path.glob('*.npy')):
        embeddings.append(np.load(file))
        emb_filenames.append(file.name)
        # Fall back to the historical '<name>.jpg' guess when no image matches.
        img_filenames.append(image_by_key.get(file.stem, file.name.split('.')[0] + '.jpg'))
    embeddings = np.array(embeddings)

    print('embeddings shape', embeddings.shape)

    n_samples = embeddings.shape[0]
    if n_samples < 2:
        # Nothing to compare against; fitting a neighbours model would fail.
        print('Found a total of', 0, 'duplicates')
        return

    # Fit a Nearest Neighbors model (5 neighbours, or fewer for tiny datasets:
    # asking for more neighbours than samples is an error).
    n_neighbors = min(5, n_samples)
    knnbr = NearestNeighbors(n_neighbors=n_neighbors).fit(embeddings)
    all_distances, all_indexes = knnbr.kneighbors(embeddings)

    # Unordered (i, j) index pairs already reported, for an exact final count
    duplicate_pairs = set()

    # Iterate through all embeddings
    for i in range(n_samples):

        print('Analyzing embedding and image', emb_filenames[i], img_filenames[i])

        # Drop the query itself by index rather than by position: with exact
        # duplicates (distance 0) the query is not guaranteed to come first.
        neighbors = [(int(index), distance)
                     for index, distance in zip(all_indexes[i], all_distances[i])
                     if index != i][:n_neighbors - 1]

        for index, distance in neighbors:
            if distance < thresh:
                duplicate_pairs.add((min(i, index), max(i, index)))
                print('found duplicate', emb_filenames[i])
                image_path = images_dir / img_filenames[i]
                neighbor_image_path = images_dir / img_filenames[index]
                # Keep the source extension, and include the neighbour's name so
                # several neighbours of one query do not overwrite each other.
                dup_image_path = dup_images_dir / f'{img_filenames[i]}_{distance}{image_path.suffix}'
                dup_neighbor_image_path = dup_images_dir / f'{img_filenames[i]}_dup_{img_filenames[index]}'
                # copy files to duplicates folder
                shutil.copy(image_path, dup_image_path)
                shutil.copy(neighbor_image_path, dup_neighbor_image_path)

    print('Found a total of', len(duplicate_pairs), 'duplicates')
                

# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the kernel session.
dir = 'dataset'
# Analyse the embeddings stored under dataset/embeddings/23 and copy every pair closer
# than `thresh` into dataset/duplicates/images. `delete` is not used by remove_duplicates.
remove_duplicates(dir, '23', thresh=30.0, delete=False)