# Fischer's Fritz fischt frische Fische

Let's start with defining a helper function to display multiple images.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

def show_plots(N, imgs, fig_title="", sub_titles=None):
    """
    Show a random selection of images in a grid with four columns.

    N          ... maximum number of images to show; if N exceeds len(imgs),
                   every image is shown exactly once
    imgs       ... sequence of images accepted by `imshow`
                   (e.g. numpy 3D arrays for RGB)
    fig_title  ... optional title for the whole figure
    sub_titles ... optional sequence holding one title per entry of `imgs`

    Nothing is drawn (and no figure is created) if there is nothing to show.
    """
    n_show = min(N, len(imgs))
    if n_show <= 0:
        # plt.subplots cannot build a grid with zero rows
        return
    # Copy the titles into a list: this avoids a shared mutable default and
    # guarantees positional lookups even for label-indexed inputs such as
    # a pandas.Series.
    titles = list(sub_titles) if sub_titles is not None else []
    if not titles:
        titles = [""] * len(imgs)
    rows = (n_show + 3) // 4
    # replace=False: never pick the same image twice
    select_index = np.random.choice(len(imgs), size=n_show, replace=False)
    select_imgs = [imgs[i] for i in select_index]
    select_titles = [titles[i] for i in select_index]
    _, ax = plt.subplots(rows, 4, sharex='col', sharey='row', figsize=(20, 3 * rows))
    if fig_title:
        plt.suptitle(fig_title, y = 0.98 + .1/rows, size=20)
    for i, (img, title) in enumerate(zip(select_imgs, select_titles)):
        # a single row yields a 1D array of axes, several rows a 2D array
        a = ax[i] if ax.ndim < 2 else ax[i // 4, i % 4]
        a.imshow(img)
        if title:
            a.set_title(title)
        a.set_axis_off()
    # hide the grid cells which did not receive an image
    for a in ax.ravel()[n_show:]:
        a.set_axis_off()

Now it's time to load the training data. First, get a list of all image file names.

In [None]:
import glob
# Collect every JPEG stored as train/<sub-directory>/<name>.jpg and report the count.
training_filenames = glob.glob('train/*/*.jpg')
n_training_files = len(training_filenames)
print('Found {} training images.'.format(n_training_files))

For future processing it may be useful to have unique identifiers for each image such that one can easily associate training and classification information (e.g. boat type, day/night etc, predicted category). We could use a database here, but let's keep it simple for the moment. We are going to use a `pandas.DataFrame` with an MD5 hash of the filename as index. In addition to the filename itself, we will already store the true label (i.e. the fish species).  
In order to save time in future iterations, the data frame will be saved into a `pickle` file which can be loaded in subsequent runs. Make sure that you save the data frame if you added valuable information (e.g. output from clustering).

In [None]:
import pandas as pd

def init_data_frame(pickle_file="df.pickle", filenames=None):
    """
    load a pandas.DataFrame from given pickle file or initialise if file does not exist

    pickle_file ... path of the pickle file used as cache
    filenames   ... image paths of the form 'train/<label>/img_*.jpg';
                    defaults to the global `training_filenames`

    Returns a data frame with the columns 'filename' and 'true label' which
    is indexed by the MD5 hex digest of each filename. A freshly created
    data frame is written to `pickle_file` before it is returned.
    Raises ValueError if a filename does not match the expected layout.
    """
    import hashlib
    import os
    import re

    # check for pickle file and load data frame if it exists
    if os.path.isfile(pickle_file):
        print("load from pickle file '{}'".format(pickle_file))
        # NOTE: unpickling can execute arbitrary code -- only load pickle
        # files which were written by this notebook.
        return pd.read_pickle(pickle_file)

    # no pickle file found -> create data frame
    print('create pandas.DataFrame from scratch')
    if filenames is None:
        filenames = training_filenames

    # get MD5 hash of filename as index (md5 needs bytes, so encode text first)
    index = [hashlib.md5(f if isinstance(f, bytes) else f.encode('utf-8')).hexdigest()
             for f in filenames]

    # extract true label from filename
    label_pattern = re.compile(r'train/([^/]+)/img_.*jpg')
    true_labels = []
    for f in filenames:
        found = label_pattern.match(f)
        if found is None:
            raise ValueError("cannot extract label from filename '{}'".format(f))
        true_labels.append(found.group(1))

    # construct the dataframe
    df = pd.DataFrame(data={'filename': list(filenames), 'true label': true_labels}, index=index)
    print("save data frame to '{}'".format(pickle_file))
    df.to_pickle(pickle_file)
    return df

# Build the bookkeeping data frame (or reload it from the pickle cache)
# and preview its first rows.
df = init_data_frame()
df.head(n=5)

Finally, let's actually load some of the images. To avoid any bias, we pick the subset of training images randomly.

In [None]:
from scipy.misc import imread
import numpy as np
n_images = 1000
# never request more rows than available -- DataFrame.sample would raise
train_df = df.sample(min(n_images, len(df)))
# The images come in different sizes, hence they are stored in a 1D object
# array. Filling it element by element yields the same layout no matter
# whether the image shapes happen to agree or not.
loaded_images = [imread(img) for img in train_df['filename']]
train = np.empty(len(loaded_images), dtype=object)
for i, img in enumerate(loaded_images):
    train[i] = img
print('Number of training images loaded {}.'.format(len(train)))

When performing image processing, it is always important to know the dimensions of the images. So let's have a look at what we've got.

In [None]:
print('Image sizes in training sample:')
# one textual shape label per image, e.g. '(720, 1280, 3)'
shape_labels = []
for img in train:
    shape_labels.append(str(img.shape))
shapes = np.array(shape_labels)
# how often does each image size occur?
shape_series = pd.Series(shapes)
shape_series.value_counts()

We can also plot some example images for each image size.

In [None]:
for uniq in pd.Series(shapes).unique():
    same_shape = shapes == uniq
    # Hand over the file names as a plain list: show_plots looks titles up by
    # position, which is not reliable on a Series indexed by MD5 hashes.
    file_titles = train_df['filename'].values[same_shape].tolist()
    show_plots(4, train[same_shape], 'Images with shape: {}'.format(uniq), file_titles)
    plt.show()

As a first pre-processing step, we may want to try to cluster images into categories where each category corresponds to one fishing cutter. This could potentially be useful for the following steps:
* identify areas of the image which do not change (e.g. structure of the boat itself) and therefore limit the area where to look for the fish
* build a different model for each boat (my assumption is that the prior distribution for each fish species depends on the fishing region, and thus, maybe on the boat as well)
We will use `sklearn.cluster.DBSCAN` as the clustering algorithm, which requires as input the pair-wise distance matrix between all images. As a start, the distance between two images is defined as the mean absolute pixel error after normalisation.

In [None]:
import cv2, multiprocessing, progressbar

# Function for computing distance between images
def _standardise(img):
    """Return `img` shifted to zero mean and scaled to unit standard deviation."""
    centred = img - img.mean()
    spread = img.std()
    # A constant image has zero spread: its centred version is all zeros
    # already, so skip the division instead of producing NaN/inf.
    return centred / spread if spread > 0 else centred


def compare(args):
    """
    Compute the distance between two images.

    args ... pair (img, img2) of numpy arrays with identical shape; packed
             into one argument so that the function can be used with `Pool.map`

    Both images are standardised (zero mean, unit standard deviation) and the
    mean absolute difference of the results is returned.
    """
    img, img2 = args
    return np.mean(np.abs(_standardise(img) - _standardise(img2)))

# Resize the images to speed it up.
train_rescaled = [cv2.resize(img, dsize=(50, 50)) for img in train]

# Create the distance matrix row by row using a pool of worker processes
n_rescaled = len(train_rescaled)
distances = np.zeros((n_rescaled, n_rescaled))
pool = multiprocessing.Pool(4)
bar = progressbar.ProgressBar(maxval=n_rescaled)
bar.start()
try:
    for i, img in enumerate(train_rescaled):
        # pair the current image with every image (including itself)
        all_imgs = [(img, f) for f in train_rescaled]
        dists = pool.map(compare, all_imgs)
        distances[i, :] = dists
        bar.update(i + 1)
finally:
    # always release the worker processes, even if a comparison failed
    pool.close()
    pool.join()
    bar.finish()

In [None]:
# print(...) with a single argument works on Python 2 and Python 3 alike
print(distances.shape)
# total number of entries (same value as the length of the flattened matrix)
print(distances.size)
# ravel() avoids the extra copy made by flatten()
plt.hist(distances.ravel(), bins=50)
plt.title('Histogram of distance matrix')

In [None]:
from sklearn import cluster
# Cluster on the precomputed distance matrix; `y` holds one cluster label
# per image.
cls = cluster.DBSCAN(eps=0.6, min_samples=5, metric='precomputed')
y = cls.fit_predict(distances)
print(y)
print('Cluster sizes:')
cluster_sizes = pd.Series(y).value_counts()
print(cluster_sizes)

In [None]:
# convert once instead of rebuilding the array several times per cluster
rescaled_arr = np.array(train_rescaled)
for uniq in pd.Series(y).value_counts().index:
    if uniq == -1:
        # cluster -1 is skipped in this cell
        continue
    members = rescaled_arr[y == uniq]
    size = len(members)
    # show at most 12 example images per cluster
    show_plots(min(size, 12), members, 'BoatID: {} - Image count {}'.format(uniq, size))
    plt.show()

In [None]:
# images carrying the cluster label -1
unclassified = np.asarray(train)[y == -1]
size = len(unclassified)
if size:
    show_plots(size, unclassified, 'BoatID: {} (Unclassified images) - Image count {}'.format(-1, size))
else:
    # an empty selection cannot be drawn (the plot grid would have zero rows)
    print('No unclassified images.')

In [None]:
# pair-wise distances restricted to the members of cluster 15
in_cluster = y == 15
same = distances[in_cluster][:, in_cluster]
plt.imshow(same, interpolation='none')
plt.colorbar()

In [None]:
# Positions of the images belonging to clusters 28 and 15. flatnonzero adapts
# to the actual number of labels instead of assuming exactly 500 images.
print(np.flatnonzero(y == 28))
print(np.flatnonzero(y == 15))
# distances between the members of cluster 15 and image 433
print(distances[np.flatnonzero(y == 15), 433])
# NOTE: the indices 335 and 433 are hard-coded and refer to one particular
# random sample -- adjust them after re-sampling the training images.
show_plots(1, [train_rescaled[335]], "")

In [None]:
# select every image assigned to cluster 15 or cluster 28
mask = [label in (15, 28) for label in y]
similar = [picture for picture, keep in zip(train_rescaled, mask) if keep]
show_plots(len(similar), similar, "same boat but different categories")

In [None]:
from skimage import feature, color
# grey-scale versions of the two images and their Canny edge maps
img1, img2 = [color.rgb2gray(train_rescaled[idx]) for idx in (335, 433)]
edges1, edges2 = [feature.canny(gray, sigma=2) for gray in (img1, img2)]
_, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 6), sharex=True, sharey=True)
# one row per image: grey-scale picture on the left, edges on the right
panels = [img1, edges1, img2, edges2]
for a, im in zip(ax.ravel(), panels):
    a.imshow(im, cmap='gray')
    a.axis('off')
    