In [None]:
from utils import index_files

# Prepare the raw image folder before anything is loaded from it.
# NOTE(review): `index_files` comes from the local `utils` module, which is not
# shown here; presumably it renames the files in the folder using the 'wb'
# prefix (later cells list files such as wb_0.jpg) — confirm against utils.py.
index_files('dataset/images', 'wb')

In [2]:
import tensorflow as tf
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def load_images(path):
  """Load every supported image found directly inside `path`, resized to 224x224.

  Only files matching `*.jpg`, `*.jpeg`, `*.png` or `*.gif` are considered;
  sub-directories are not searched.

  Args:
    path: directory to scan, given as a string or a `Path`.

  Returns:
    A pair `(images, filenames)` of parallel lists: the objects returned by
    `load_img` and the matching file names (without directory). Both lists
    are empty when no matching file is found.
  """
  path = Path(path)
  # Ensures only valid image files are loaded
  patterns = ('*.jpg', '*.jpeg', '*.png', '*.gif')
  img_paths = [img_path for pattern in patterns for img_path in path.glob(pattern)]
  print(f'Loading {len(img_paths)} images')
  if not img_paths:
    # Nothing matched: return early instead of failing on images[0] below.
    return [], []

  images = []
  filenames = []
  for img_path in tqdm(img_paths):
    # load image
    img = tf.keras.preprocessing.image.load_img(img_path, target_size=(224,224))
    images.append(img)
    filenames.append(img_path.name)
  print('images loaded as' , type(images[0]), 'type')
  return images, filenames

In [3]:
# Load every supported image from the dataset folder. `images` and `filenames`
# are parallel lists that the embedding cell further down passes to
# `get_embeddings`.
path = 'dataset/images/'
images, filenames = load_images(path)

Loading 1112 images


100%|██████████| 1112/1112 [01:35<00:00, 11.67it/s]

images loaded as <class 'PIL.Image.Image'> type





In [4]:
from keras import backend as K 

# VGG19 classifier with ImageNet weights, including the fully connected top.
model = tf.keras.applications.VGG19(
    include_top=True,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000,
    classifier_activation="softmax",)

model.summary()

# Outputs of every layer except the first (input) and the last (classifier) one
layer_outputs = [layer.output for layer in model.layers[1:-1]]  # Identifies layer outputs
# Creates a model that will return the layer feature maps as outputs for a given image
visual_model = tf.keras.models.Model(inputs = model.input, outputs = layer_outputs)

layer_names = [layer.name for layer in visual_model.layers]

# Print each layer next to its position in `layer_outputs`, which is the index
# `get_embeddings` expects in `f_levels`. Enumerating `visual_model.layers`
# would also count the input layer and shift every printed index by one.
for i, layer in enumerate(model.layers[1:-1]):
    print(i, layer.name)

Model: "vgg19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [7]:
def get_embeddings(f_levels, images, filenames=None):
  """Build one embedding vector per image from selected VGG19 feature maps.

  Each image is run through the module-level `visual_model`. For every index
  in `f_levels` the corresponding feature map is taken from the prediction:
  a 4D map is replaced by its flattened Gram matrix, any other map is simply
  flattened. The pieces are concatenated in the order given by `f_levels`.

  Args:
    f_levels: non-empty sequence of indices into the list returned by
      `visual_model.predict`.
    images: iterable of images accepted by `np.expand_dims`.
    filenames: optional sequence parallel to `images`. When given, each
      embedding is also saved as `./dataset/embeddings/<levels>/<stem>.npy`,
      where `<levels>` is the `f_levels` joined with '_' and `<stem>` is the
      file name without its last extension.

  Returns:
    `np.array` built from the list of per-image embeddings (one row per image).

  Raises:
    ValueError: if `f_levels` is empty.
  """
  f_levels = list(f_levels)
  if not f_levels:
    raise ValueError('f_levels must contain at least one feature-map index')

  # Create a single string with all the f_levels
  levels = '_'.join([str(f) for f in f_levels])
  out_dir = Path(f'./dataset/embeddings/{levels}')
  out_dir.mkdir(parents=True, exist_ok=True)

  # Initialize embeddings
  E = []
  for i, img in enumerate(images):
    print('embedding image', i)
    img = np.expand_dims(img, axis=0)
    img = tf.keras.applications.vgg19.preprocess_input(img)       # Pre-processes image for VGG19
    # run an image through the network by making a prediction
    feature_maps = visual_model.predict(img)

    # One flat vector per requested level; joined after the loop so that
    # every level contributes (not just the last one).
    parts = []
    for level in f_levels:
      A = feature_maps[level]
      if len(A.shape) == 4:
        # Collapse the two middle axes so the map becomes a 2D matrix whose
        # columns are the entries of the last axis (batch size is 1 here).
        A = A.reshape(A.shape[0], A.shape[1]*A.shape[2], A.shape[3], order='F')
        A = A.reshape(A.shape[1], A.shape[2])
        # Compute Gram matrix (cumulative co-activation of filters per layer)
        G = np.matmul(np.transpose(A), A)
        # Stored as float64, which is what this branch has always produced.
        parts.append(G.flatten().astype(np.float64))
      else:
        # Already flat enough (e.g. FC layer): keep it as a 1D vector
        parts.append(A.flatten())
    emb = np.concatenate(parts)

    E.append(emb)

    if filenames is not None:
      # save embedding as a .npy file (np.save appends the extension)
      np.save(out_dir / Path(filenames[i]).stem, emb)

    if i%10 == 0:
      print(f'Getting embedding of img no. {i}, with shape {E[i].shape}, and {len(f_levels)} layers')

  return np.array(E)


# Select the feature maps used to build the embedding. Each value indexes the
# list returned by `visual_model.predict`, i.e. the outputs of
# `model.layers[1:-1]`, so index 0 is the first conv layer (conv1_1).
f_levels = [14]        # Choose from [1,4,9,14,19,22,23]

# Compute the embeddings of every loaded image. Because `filenames` is passed,
# each embedding is also written as a .npy file under ./dataset/embeddings/.
embeddings = get_embeddings(f_levels, images, filenames=filenames)

embedding image 0
Getting embedding of img no. 0, with shape (262144,), and 1 layers
embedding image 1
embedding image 2
embedding image 3
embedding image 4
embedding image 5
embedding image 6
embedding image 7
embedding image 8
embedding image 9
embedding image 10
Getting embedding of img no. 10, with shape (262144,), and 1 layers
embedding image 11
embedding image 12
embedding image 13
embedding image 14
embedding image 15
embedding image 16
embedding image 17
embedding image 18
embedding image 19
embedding image 20
Getting embedding of img no. 20, with shape (262144,), and 1 layers
embedding image 21
embedding image 22
embedding image 23
embedding image 24
embedding image 25
embedding image 26
embedding image 27
embedding image 28
embedding image 29
embedding image 30
Getting embedding of img no. 30, with shape (262144,), and 1 layers
embedding image 31
embedding image 32
embedding image 33
embedding image 34
embedding image 35
embedding image 36
embedding image 37
embedding image 3

In [5]:
import shutil
from pathlib import Path
import numpy as np

def _find_image_name(images_dir, stem):
    """Return the name of the image in `images_dir` whose stem is `stem`.

    Tries the extensions accepted by `load_images` and falls back to
    `<stem>.jpg` when no such file exists.
    """
    for ext in ('.jpg', '.jpeg', '.png', '.gif'):
        candidate = stem + ext
        if (images_dir / candidate).exists():
            return candidate
    return stem + '.jpg'


def remove_duplicates(dir, layer, thresh=0.3, delete=False):
    """Find near-duplicate images via nearest-neighbour search on saved embeddings.

    Embeddings are read from `<dir>/embeddings/<layer>/*.npy` in sorted order.
    Two images form a duplicate pair when one is returned among the (up to 5)
    nearest embeddings of the other and their distance is below `thresh`.
    Every pair is handled once: the query image is copied from `<dir>/images`
    to `<dir>/duplicates/images` under its own name, and its neighbour is
    copied next to it as `<query stem>_<rounded distance>_dup<ext>`.

    Args:
        dir: dataset root containing the `images/` and `embeddings/` folders.
        layer: name of the embeddings sub-folder to read, e.g. '23'.
        thresh: distance below which two embeddings count as duplicates.
        delete: accepted for backward compatibility; currently has no effect,
            nothing is ever removed from `<dir>/images`.

    Returns:
        None. Progress and the number of duplicate pairs are printed.
    """
    root = Path(dir)
    images_dir = root / 'images'
    dup_images_dir = root / 'duplicates' / 'images'

    # create duplicates folder from dir
    dup_images_dir.mkdir(parents=True, exist_ok=True)
    (root / 'duplicates' / 'embeddings').mkdir(parents=True, exist_ok=True)

    # Load embeddings (sorted so that repeated runs process files in the same order)
    embeddings = []
    img_filenames = []
    emb_filenames = []
    embedding_path = root / 'embeddings' / layer
    for file in sorted(embedding_path.glob('*.npy')):
        embeddings.append(np.load(file))
        emb_filenames.append(file.name)
        img_filenames.append(_find_image_name(images_dir, file.stem))
    embeddings = np.array(embeddings)

    print('embeddings shape', embeddings.shape)

    # With fewer than two embeddings there is nothing to compare.
    if len(embeddings) < 2:
        print('Found a total of', 0, 'duplicates')
        return

    # Fit a Nearest Neighbors model; never ask for more neighbours than samples.
    knnbr = NearestNeighbors(n_neighbors=min(5, len(embeddings))).fit(embeddings)
    all_distances, all_indexes = knnbr.kneighbors(embeddings)

    # Pairs (low index, high index) already copied, so each pair is handled once.
    seen_pairs = set()
    for i in range(embeddings.shape[0]):
        print('Analyzing embedding and image', emb_filenames[i], img_filenames[i])

        for distance, index in zip(all_distances[i], all_indexes[i]):
            index = int(index)
            # The query is usually returned as its own neighbour: skip it by
            # index rather than assuming it is always listed first.
            if index == i:
                continue

            if distance < thresh:
                pair = (min(i, index), max(i, index))
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                print('found duplicate', emb_filenames[i])

                # Copy the duplicated image into the duplicates folder
                image_path = images_dir / img_filenames[i]
                dup_image_path = dup_images_dir / img_filenames[i]

                neighbor_image_path = images_dir / img_filenames[index]
                dup_name = (Path(img_filenames[i]).stem + '_' + str(round(distance))
                            + '_dup' + Path(img_filenames[index]).suffix)
                dup_neighbor_image_path = dup_images_dir / dup_name
                # copy files to duplicates folder
                shutil.copy(image_path, dup_image_path)
                shutil.copy(neighbor_image_path, dup_neighbor_image_path)

    print('Found a total of', len(seen_pairs), 'duplicates')
                

# Run duplicate detection on the embeddings stored in dataset/embeddings/23.
# That folder is only populated when `get_embeddings` has been run with
# f_levels=[23] and filenames provided.
# NOTE: `dir` shadows the Python builtin of the same name for the rest of the session.
dir = 'dataset'
remove_duplicates(dir, '23', thresh=30.0, delete=False)

embeddings shape (1112, 4096)
Analyzing embedding and image wb_0.npy wb_0.jpg
Analyzing embedding and image wb_1.npy wb_1.jpg
Analyzing embedding and image wb_10.npy wb_10.jpg
Analyzing embedding and image wb_100.npy wb_100.jpg
Analyzing embedding and image wb_1000.npy wb_1000.jpg
Analyzing embedding and image wb_1001.npy wb_1001.jpg
Analyzing embedding and image wb_1002.npy wb_1002.jpg
Analyzing embedding and image wb_1003.npy wb_1003.jpg
Analyzing embedding and image wb_1004.npy wb_1004.jpg
Analyzing embedding and image wb_1005.npy wb_1005.jpg
Analyzing embedding and image wb_1006.npy wb_1006.jpg
Analyzing embedding and image wb_1007.npy wb_1007.jpg
Analyzing embedding and image wb_1008.npy wb_1008.jpg
Analyzing embedding and image wb_1009.npy wb_1009.jpg
Analyzing embedding and image wb_101.npy wb_101.jpg
Analyzing embedding and image wb_1010.npy wb_1010.jpg
Analyzing embedding and image wb_1011.npy wb_1011.jpg
Analyzing embedding and image wb_1012.npy wb_1012.jpg
Analyzing embeddin