In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Function to show a random selection of N images in a 4-column grid
def show_plots(N, imgs, title):
    """Plot ``N`` randomly chosen images from ``imgs`` in a grid of four columns.

    Parameters
    ----------
    N : int
        Number of images to draw. Nothing is plotted when ``N`` is not positive.
    imgs : sequence
        Collection supporting ``len()`` and integer indexing whose items can
        be rendered by ``imshow``.
    title : str
        Title placed above the whole figure.

    Notes
    -----
    Images are sampled without replacement whenever ``N <= len(imgs)``, so
    requesting ``len(imgs)`` images shows every image exactly once. Sampling
    falls back to replacement only when more images are requested than exist.
    The figure is created but not shown; the caller decides when to display it.
    """
    if N <= 0 or len(imgs) == 0:
        # Nothing to draw: a grid with zero rows cannot be created and an
        # empty collection cannot be sampled from.
        return
    rows = (N + 3) // 4
    picks = np.random.choice(len(imgs), size=N, replace=N > len(imgs))
    # squeeze=False keeps `ax` two-dimensional even for a single row, so the
    # same flat iteration works for every grid size.
    _, ax = plt.subplots(rows, 4, sharex='col', sharey='row',
                         figsize=(20, 3 * rows), squeeze=False)
    plt.suptitle(title, size=20)
    for a, idx in zip(ax.flat, picks):
        a.imshow(imgs[idx])
    # Hide the frame of every cell, including the unused trailing ones when N
    # is not a multiple of four.
    for a in ax.flat:
        a.set_axis_off()

In [None]:
import random, glob
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is no longer available in recent SciPy releases;
    # scikit-image (already used later in this notebook) also provides an
    # `imread` that returns the image as an array.
    from skimage.io import imread
select = 500 # Only load 500 images for speed
# Data loading: pick a random subset of at most `select` training images.
all_files = sorted(glob.glob('train/*/*.jpg'))
train_files = random.sample(all_files, min(select, len(all_files)))
images = [imread(path) for path in train_files]
# The images do not all share one size, so store them in an explicit 1-D
# object array (one image per element). Letting np.array() infer the layout
# from a list of differently shaped arrays is unreliable and is rejected
# outright by newer NumPy versions.
train = np.empty(len(images), dtype=object)
for i, image in enumerate(images):
    train[i] = image
print('Number of training samples {}'.format(len(train)))

In [None]:
import pandas as pd
# Tabulate how many of the sampled images have each shape. The shape is
# stringified so that it can be used as a plain label for counting and for
# boolean masking in the following cell.
print('Image sizes in training sample:')
shape_labels = [str(picture.shape) for picture in train]
shapes = np.array(shape_labels)
shape_counts = pd.Series(shapes).value_counts()
shape_counts

In [None]:
# Draw one preview figure (four random images) per distinct image shape.
unique_shapes = pd.Series(shapes).unique()
for shape_label in unique_shapes:
    same_shape = train[shapes == shape_label]
    figure_title = 'Images with shape: {}'.format(shape_label)
    show_plots(4, same_shape, figure_title)
    plt.show()

In [None]:
import multiprocessing, progressbar
from skimage.transform import resize

def _standardize(img):
    """Return ``img`` shifted to zero mean and, when possible, scaled to unit std."""
    centered = img - img.mean()
    spread = img.std()
    # A constant image has zero spread; dividing would turn every value into
    # NaN, so leave it as the all-zero centered image instead.
    return centered / spread if spread > 0 else centered


# Function for computing distance between images
def compare(args):
    """Mean absolute difference between two standardized images.

    Parameters
    ----------
    args : tuple
        Pair ``(img, img2)`` of arrays with broadcast-compatible shapes. The
        pair is passed as one argument so the function can be used with
        ``pool.map``.

    Returns
    -------
    float
        Mean of ``|a - b|`` where ``a`` and ``b`` are the inputs after
        subtracting their own mean and dividing by their own standard
        deviation. A constant image is treated as all zeros rather than
        producing NaN.
    """
    img, img2 = args
    return np.mean(np.abs(_standardize(img) - _standardize(img2)))

# Resize the images to speed it up.
train_rescaled = [resize(img, (224, 224)) for img in train]

# Create the distance matrix row by row, spreading the comparisons for each
# row over a pool of worker processes.
pool = multiprocessing.Pool(8)
distances = np.zeros((len(train_rescaled), len(train_rescaled)))
bar = progressbar.ProgressBar(maxval=len(train_rescaled))
bar.start()
try:
    for i, img in enumerate(train_rescaled):
        bar.update(i)
        # Pair the current image with every image (including itself).
        all_imgs = [(img, f) for f in train_rescaled]
        distances[i, :] = pool.map(compare, all_imgs)
    bar.finish()
finally:
    # Always release the worker processes, even if a comparison failed;
    # otherwise every re-run of this cell leaves eight more processes behind.
    pool.close()
    pool.join()

In [None]:
# Sanity-check the distance matrix and look at how the pairwise distances
# are distributed. print() is called with a single argument so the cell runs
# the same under Python 2 and Python 3, matching the other cells.
print(distances.shape)
print(distances.size)
plt.hist(distances.ravel(), bins=50)
plt.title('Histogram of distance matrix')

In [None]:
from sklearn import cluster
# Cluster the images with DBSCAN directly on the precomputed distance matrix.
# Label -1 marks samples that were not assigned to any cluster.
cls = cluster.DBSCAN(eps=0.6, min_samples=3, metric='precomputed')
y = cls.fit(distances).labels_
print(y)
print('Cluster sizes:')
cluster_sizes = pd.Series(y).value_counts()
print(cluster_sizes)

In [None]:
# Preview every cluster (largest first), showing at most 12 images each.
# The list of rescaled images is converted to an array once, outside the
# loop, instead of being rebuilt several times per cluster.
rescaled = np.array(train_rescaled)
for uniq in pd.Series(y).value_counts().index:
    if uniq == -1:
        # -1 is the label for unclustered samples; handled in the next cell.
        continue
    members = rescaled[y == uniq]
    size = len(members)
    show_plots(min(size, 12), members, 'BoatID: {} - Image count {}'.format(uniq, size))
    plt.show()

In [None]:
# Show the images that were not assigned to any cluster (label -1).
unclassified = np.asarray(train)[y == -1]
size = len(unclassified)
if size:
    show_plots(size, unclassified, 'BoatID: {} (Unclassified images) - Image count {}'.format(-1, size))
else:
    # Every image ended up in a cluster; an empty image grid cannot be drawn.
    print('No unclassified images')

In [None]:
# Heatmap of the pairwise distances between the members of cluster 15:
# select the rows and the columns belonging to that cluster in one step.
in_cluster = (y == 15)
same = distances[np.ix_(in_cluster, in_cluster)]
plt.imshow(same, interpolation='none')
plt.colorbar()

In [None]:
# List the sample indices belonging to clusters 28 and 15, then the distances
# from every member of cluster 15 to sample 433, and show sample 335.
# Indices are derived from len(y) rather than a hard-coded 500 so the cell
# still works when fewer images were loaded.
sample_ids = np.arange(len(y))
print(sample_ids[y == 28])
print(sample_ids[y == 15])
print(distances[sample_ids[y == 15], 433])
show_plots(1, [train_rescaled[335]], "")

In [None]:
# Collect the images of clusters 15 and 28 and show them side by side.
mask = [label in (15, 28) for label in y]
similar = [picture for picture, keep in zip(train_rescaled, mask) if keep]
show_plots(len(similar), similar, "same boat but different categories")

In [None]:
from skimage import feature, color
# Compare samples 335 and 433: grayscale version on the left, Canny edge map
# on the right, one sample per row.
img1 = color.rgb2gray(train_rescaled[335])
edges1 = feature.canny(img1, sigma=2)
img2 = color.rgb2gray(train_rescaled[433])
edges2 = feature.canny(img2, sigma=2)
_, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 6), sharex=True, sharey=True)
panels = [img1, edges1, img2, edges2]
# ravel() walks the 2x2 axes grid row by row, matching the panel order.
for a, im in zip(ax.ravel(), panels):
    a.imshow(im, cmap='gray')
    a.axis('off')
    