In [75]:
import cv2
import rawpy
import os
import os.path as path
import glob
import numpy as np
import scipy.spatial
import ipdb
import os
import platform
import multiprocessing
import json
from functools import lru_cache
from collections import namedtuple
from matplotlib import pyplot as plot

In [2]:
# One loaded photo. Elsewhere in this notebook `icon` is what gets drawn,
# `features` is what the distance functions compare, `filename` is what gets
# printed/exported and `timestamp` is the sort key for temporal grouping.
Image = namedtuple('Image', ['icon', 'features', 'filename', 'timestamp'])

In [3]:
def insensitive_glob(pattern):
    """Glob for `pattern`, matching every alphabetic character in either case.

    Each letter is expanded into a two-character class (``a`` -> ``[aA]``)
    before the pattern is handed to ``glob.glob``; every other character is
    passed through untouched.
    """
    pieces = []
    for char in pattern:
        if char.isalpha():
            pieces.append('[%s%s]' % (char.lower(), char.upper()))
        else:
            pieces.append(char)
    return glob.glob(''.join(pieces))

In [4]:
def extract_features(image, vector_size=32):
    """Build a fixed-length KAZE descriptor vector for `image`.

    The `vector_size` keypoints with the strongest response are described and
    their descriptors are flattened into one vector, which is zero-padded up
    to ``vector_size * 64`` entries when fewer descriptors are available.

    Args:
        image: image array passed straight to OpenCV's KAZE detector.
        vector_size: maximum number of keypoints to keep.

    Returns:
        A 1-D numpy array, or None when OpenCV raises ``cv2.error`` (the
        error is printed).
    """
    # Each descriptor is taken to be 64 values long, as the original code
    # assumed for KAZE.
    needed_size = vector_size * 64

    try:
        # KAZE is used because SIFT, ORB and others were moved to an
        # additional module, which makes installation more painful.
        alg = cv2.KAZE_create()
        # Find the image keypoints.
        kps = alg.detect(image)
        # The number of keypoints varies with image size and colour palette,
        # so keep only the strongest ones (a bigger response is better).
        kps = sorted(kps, key=lambda x: -x.response)[:vector_size]
        # Compute the descriptor vectors for the kept keypoints.
        kps, dsc = alg.compute(image, kps)
    except cv2.error as e:
        print('Error: ', e)
        return None

    if dsc is None:
        # No keypoints (e.g. a featureless image): OpenCV hands back no
        # descriptor matrix at all, so start from an empty vector instead of
        # failing on `None.flatten()`.
        dsc = np.zeros(0)
    else:
        # Flatten all descriptors into one big feature vector.
        dsc = dsc.flatten()

    if dsc.size < needed_size:
        # Fewer than `vector_size` descriptors: pad the tail with zeros so
        # every image yields a vector of the same length.
        dsc = np.concatenate([dsc, np.zeros(needed_size - dsc.size)])

    return dsc

In [5]:
def sort_by_distance(point, other_points, distance_function):
  """Return `other_points` as a new list ordered by increasing distance from `point`.

  The sort key is `distance_function(point, candidate)`; candidates at equal
  distance keep their input order.
  """
  ordered = list(other_points)
  ordered.sort(key=lambda candidate: distance_function(point, candidate))
  return ordered

In [6]:
class AugmentedPoint:
  """An input point paired with the id of the cluster it currently belongs to.

  A cluster id of -1 means the point has not been assigned to a cluster yet.
  """

  def __init__(self, point, cluster):
    self.point = point
    self.cluster = cluster


def cluster_images_with_dbscan(points, distance_function, e):
  """Group `points` so that points within distance `e` of each other share a cluster.

  Two points are linked when `distance_function(p, q) <= e` holds for either
  argument order, and clusters are the groups of points connected through
  such links. Despite the name there is no minimum-neighbour count and no
  noise class: an isolated point becomes a cluster of its own.
  See https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68#4591

  Args:
    points: sequence of arbitrary objects to cluster.
    distance_function: callable taking two points and returning a number.
    e: inclusive distance threshold for linking two points.

  Returns:
    A list of non-empty clusters (lists of the original point objects).
    Clusters are ordered by the input position of their first member and the
    members of each cluster keep their input order. Empty input gives [].
  """
  augmented_points = [AugmentedPoint(point=point, cluster=-1) for point in points]
  next_cluster_number = 0

  for current_point in augmented_points:
    if current_point.cluster == -1:
      # Not absorbed by an earlier point: start a fresh cluster. A point that
      # already has a cluster keeps it, even if it has no neighbours itself.
      current_point.cluster = next_cluster_number
      next_cluster_number += 1

    for other_point in augmented_points:
      # Skip the point itself by identity; equal-valued duplicates are
      # distinct AugmentedPoint objects and are still considered.
      if other_point is current_point:
        continue

      if distance_function(current_point.point, other_point.point) <= e:
        if other_point.cluster == -1:
          # Unclustered neighbour: absorb it.
          other_point.cluster = current_point.cluster
        elif other_point.cluster != current_point.cluster:
          # Neighbour belongs to another cluster: move our whole cluster into
          # it. The id is snapshotted first because current_point itself is
          # relabelled part-way through the loop.
          absorbed_cluster = current_point.cluster
          for member in augmented_points:
            if member.cluster == absorbed_cluster:
              member.cluster = other_point.cluster

  # Group by the surviving cluster ids. Ids emptied by a merge never show up
  # here, so no empty clusters are returned.
  clusters = []
  position_of_cluster = {}
  for augmented_point in augmented_points:
    if augmented_point.cluster not in position_of_cluster:
      position_of_cluster[augmented_point.cluster] = len(clusters)
      clusters.append([])
    clusters[position_of_cluster[augmented_point.cluster]].append(augmented_point.point)

  return clusters


def test_cluster_images_with_dbscan():
    """Sanity checks for cluster_images_with_dbscan on 2-D points with Euclidean distance."""
    def distance_function(a, b):
        dx = a[0] - b[0]
        dy = a[1] - b[1]
        return np.sqrt(dx**2 + dy**2)

    # No points in, no clusters out.
    assert(0 == len(cluster_images_with_dbscan([], distance_function, 5)))

    # Each case is (points, e, expected clusters).
    cases = [
        # A single point forms a single group.
        ([(0, 0)], 5, [[(0, 0)]]),
        # A second point within range joins the same group.
        ([(0, 0), (1, 1)], 5, [[(0, 0), (1, 1)]]),
        # A second point out of range gets its own group.
        ([(0, 0), (1, 1)], 1, [[(0, 0)], [(1, 1)]]),
        # Three well-separated groups of three.
        ([(0, 0), (1, 1), (-1, -1),
          (20, 20), (19, 19), (21, 21),
          (-20, -20), (-21, -21), (-19, -19)],
         2,
         [[(0, 0), (1, 1), (-1, -1)],
          [(20, 20), (19, 19), (21, 21)],
          [(-20, -20), (-21, -21), (-19, -19)]]),
        # Labelled as "two sub-clusters joining together"; with e=10 every
        # point is already within range of the first one.
        ([(4, 4), (1, 1), (0, 0)], 10, [[(4, 4), (1, 1), (0, 0)]]),
        # Labelled as "merging two full groups"; again everything is within
        # range of the first point.
        ([(0.48, 0.69), (0.53, 0.73), (0, 0), (0.35, 0)], 10,
         [[(0.48, 0.69), (0.53, 0.73), (0, 0), (0.35, 0)]]),
    ]
    for points, e, expected in cases:
        assert(expected == cluster_images_with_dbscan(points, distance_function, e))

    print("Tests passed!")

test_cluster_images_with_dbscan()

Tests passed!


In [7]:
def creation_date(path_to_file):
    """
    Return the creation time of `path_to_file`, or its last-modified time
    when the platform does not expose a creation time.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    on_windows = platform.system() == 'Windows'
    if on_windows:
        return os.path.getctime(path_to_file)

    file_stat = os.stat(path_to_file)
    if hasattr(file_stat, 'st_birthtime'):
        return file_stat.st_birthtime
    # No st_birthtime: we're probably on Linux, where there is no easy way to
    # get creation dates, so settle for when the content was last modified.
    return file_stat.st_mtime

In [79]:
def load_image(image_filename):
    """Decode one RAW file into an `Image` record.

    The RAW data is developed at half size with the camera white balance and
    8 bits per channel, then resized to a 100x100 thumbnail. The thumbnail is
    stored both as `icon` and as `features`; `timestamp` comes from
    `creation_date`.
    """
    # The with-block closes the RAW file as soon as the developed pixels have
    # been produced, instead of leaving one open handle per loaded photo.
    with rawpy.imread(image_filename) as raw_image:
        rgb_image = raw_image.postprocess(half_size=True, use_camera_wb=True, output_bps=8)

    icon = cv2.resize(rgb_image, (100, 100))
    features = icon
    modified_time = creation_date(image_filename)

    return Image(icon, features, image_filename, timestamp=modified_time)


image_directory = 'test/resources'
# Sorted so the load order does not depend on the filesystem's listing order.
image_filenames = sorted(insensitive_glob(path.join(image_directory, "*.arw")))

#image_directory = '/media/anima/Auxillary/Camera Photos/20.02.2019'
#image_filenames = insensitive_glob(path.join(image_directory, "*.arw"))[:200]

print("Loading images")

# Images are loaded sequentially. The previous version also created a
# multiprocessing.Pool with 2x CPU-count workers here, but it was never used
# (the `cpu_pool.map(load_image, ...)` call was commented out) and never
# closed, so it only leaked idle worker processes.
images = [load_image(image_filename) for image_filename in image_filenames]

print("Done!")

Loading images
Done!


In [16]:
def print_clusters(clusters):
    """Print every cluster as a 1-based numbered heading followed by its members' filenames."""
    for number, cluster in enumerate(clusters, start=1):
        print("Cluster %s: " % number)
        for member in cluster:
            print("  - %s" % member.filename)
    
%matplotlib qt
def visualise_clusters(clusters):
    """Draw every cluster's icons into one image and show it in an OpenCV window.

    Icons are stacked top to bottom, 12 per column, with a 25-pixel gap added
    to the vertical offset after each cluster. The side length of an icon
    slot is taken from the first icon's height, so every icon must fit a
    (size, size, 3) slot.

    Icons are assumed to be RGB, which is what `load_image` gets from rawpy;
    they are flipped to the BGR order `cv2.imshow` expects.

    Args:
        clusters: list of lists of objects with an `icon` array attribute.

    Returns:
        None. Nothing is shown when there are no images at all.
    """
    non_empty_clusters = [cluster for cluster in clusters if len(cluster) > 0]
    if not non_empty_clusters:
        # Nothing to draw; previously this crashed on clusters[0][0].
        return

    # Use the first available icon: the first cluster itself may be empty.
    image_size = non_empty_clusters[0][0].icon.shape[0]
    number_of_images = sum(len(cluster) for cluster in clusters)
    hr_height = 25
    vr_length = 25

    max_images_per_column = 12
    columns = int(number_of_images / max_images_per_column) + 1

    # The canvas is deliberately generous: it is sized as if all images were
    # in one column and every cluster added a gap.
    width = (image_size + vr_length) * columns
    height = image_size * number_of_images + hr_height * len(clusters)
    buffer = np.zeros((height, width, 3))

    i = 0
    hr_offset = 0
    for cluster in clusters:
        for iImage, image in enumerate(cluster):
            column = int(i / max_images_per_column)
            if column != int((i - 1) / max_images_per_column):
                # First image of a new column: restart the accumulated gap,
                # keeping one gap if this image also starts a cluster.
                if iImage == 0:
                    hr_offset = hr_height
                else:
                    hr_offset = 0
            row_offset = i * image_size + hr_offset - column * max_images_per_column * image_size
            column_offset = column * image_size
            buffer[row_offset:row_offset + image_size,column_offset:column_offset + image_size,:] = image.icon
            i = i + 1
        hr_offset = hr_offset + hr_height

    # Scale to [0, 1] for display; an all-black canvas is left as zeros
    # instead of being divided by zero.
    peak = np.max(buffer)
    if peak > 0:
        buffer = buffer / peak

    # RGB -> BGR for OpenCV; made contiguous because the flip is a
    # negative-stride view.
    cv2.imshow("Clusters", np.ascontiguousarray(buffer[:, :, ::-1]))
    cv2.waitKey(1)

In [10]:
# Author's note: doesn't work well because it is not normalised.
def image_distance_mse(a, b):
    """Sum of squared element-wise differences between `a.features` and `b.features`.

    The features are promoted to float64 before subtracting. On unsigned
    8-bit arrays, which is what `load_image` stores, both the difference and
    its square would otherwise wrap around modulo 256.
    """
    difference = np.asarray(a.features, dtype=np.float64) - np.asarray(b.features, dtype=np.float64)
    return np.sum(difference ** 2)

# Author's note: pretty good.
def image_distance_mse_sum_norm(a, b):
    """Sum of squared differences after dividing each feature array by its own sum."""
    a_share = a.features / np.sum(a.features)
    b_share = b.features / np.sum(b.features)
    return np.sum((a_share - b_share)**2)

# Author's note: not very good, not properly normalised.
def image_distance_mse_mean_norm(a, b):
    """Sum of squared differences after dividing each feature array by its own mean."""
    a_scaled = a.features / np.mean(a.features)
    b_scaled = b.features / np.mean(b.features)
    return np.sum((a_scaled - b_scaled)**2)

In [11]:
# Cluster all loaded images with the sum-normalised distance and print the
# resulting groups by filename.
print("Clustering images")

# Distance threshold passed as `e`: images at or below this distance are linked.
# NOTE(review): `a` is rebound to an unrelated array in a later cell.
a =0.00002
images_to_group = images

# Debug helper: list each image's index next to its filename.
# print([(str(i) + " - " + image.filename) for i, image in enumerate(images_to_group)])

print_clusters(cluster_images_with_dbscan(images_to_group, image_distance_mse_sum_norm, a))

Clustering images
Cluster 1: 
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02198.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02199.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02200.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02201.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02204.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02205.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02206.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02207.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02208.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02209.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02210.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02211.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02212.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02213.ARW
  - /media/anima/Auxillary/Camera Photos/20.02.2019/DSC02214

In [52]:
# Print the mean-normalised distance for every pair among the first four images.
# Requires at least four entries in `images`.
print(image_distance_mse_mean_norm(images[0], images[1]))
print(image_distance_mse_mean_norm(images[0], images[2]))
print(image_distance_mse_mean_norm(images[0], images[3]))
print(image_distance_mse_mean_norm(images[1], images[2]))
print(image_distance_mse_mean_norm(images[1], images[3]))
print(image_distance_mse_mean_norm(images[2], images[3]))

# Six hard-coded values, scaled below so the largest becomes 1.
# NOTE(review): these do not match the six numbers printed above; presumably
# they were pasted from an earlier run with a different distance function.
# TODO confirm.
# This also rebinds `a`, which an earlier cell uses as the clustering threshold.
a = np.array(  [22594.711631993814,
                47146.801882724394,
                41796.4206178841,
                55572.482975170744,
                49932.742614843504,
                19480.308860878984])

a = a/np.max(a)
print(a)

419.32858419666303
817.5009533159625
864.4394283491508
583.5984933187045
691.2085125261216
199.09619596457537
[0.40658093 0.84838394 0.75210641 1.         0.8985156  0.35053875]


# Temporal Sorting

In [80]:
# Order the photos by timestamp so that only consecutive shots are compared.
images = sorted(images, key=lambda x: x.timestamp)

# A new group starts wherever two consecutive photos differ by more than this.
difference_threshold = 0.00002
difference_function = image_distance_mse_sum_norm

# differences[i] is the distance between images[i] and images[i + 1]. Built in
# one pass rather than by repeatedly re-allocating the array with np.append.
differences = np.array(
    [difference_function(earlier, later) for earlier, later in zip(images, images[1:])],
    dtype=float)

# breaking_points[i] is 1 when images[i] starts a new group (never for the
# first image); the running sum then gives each image its group index.
breaking_points = np.append([0], differences > difference_threshold)
groups = np.cumsum(breaking_points)

# With no images there are no groups at all (this used to raise IndexError).
grouped_images = [[] for _ in range(groups[-1] + 1)] if images else []
for image, group in zip(images, groups):
    grouped_images[group].append(image)
    

In [81]:
# Show the timestamp-ordered groups in the OpenCV "Clusters" window.
visualise_clusters(grouped_images)

In [82]:
# Reduce every group to its members' filenames so the grouping can be written as JSON.
textified_groups = [[image.filename for image in group] for group in grouped_images]

with open('grouped_images.json', 'w') as outfile:
    json.dump(textified_groups, outfile)