<a href="https://colab.research.google.com/github/Beno71/humpback-whale-classification/blob/master/Colab_siamese_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os 
print(os.getcwd())
!ls

# siamese networks for whale classification

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.rendered_html { font-size: 18px; }</style>"))

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os, time, itertools
from skimage import io, transform
import skimage
import glob
from tqdm import tnrange, tqdm
from collections import Counter
from random import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import euclidean_distances
from IPython.display import clear_output

%matplotlib inline

# Load Data

In [3]:
# some prep steps
# for google colab
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
  
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive/')
  data_folder = "/content/drive/My Drive/Colab Notebooks/data/"
else:
  data_folder = "data/"


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
#load .npz-file from folder



loader = np.load(data_folder+"humpback_300x100_gray_no_new.npz")
features = loader["features"]
labels = loader["labels"]

n_rows = labels.shape[0]



split_ratio = 0.8
data_size = 400
expansion_factor = 1

#take the sample from the most common classes
ar = np.array(Counter(labels).most_common())
count = 0
label_list = []
for idx, tup in enumerate(ar):
    label_list.append(tup[0])
    count += tup[1]
    if count > data_size:
        break
label_list

label_in_list=[x in label_list for x in labels]
labels = labels[label_in_list]
features = features[label_in_list]

def standardize(X):
    X = X.astype(np.float32)
    X = (X - np.mean(X, axis=(1,2), keepdims=True)) / np.std(X, axis=(1,2), keepdims=True)
    return X

features = standardize(features)
features = np.expand_dims(features, axis=1)

# not needed
#all_combinations_without_labels = np.array(list(itertools.combinations(range(data_size),2)))
#same_label_indices = labels[all_combinations_without_labels[:,0]] == labels[all_combinations_without_labels[:,1]]
#all_combinations = np.append(all_combinations_without_labels, same_label_indices.reshape(-1,1), axis=1)

In [29]:
np.unique(labels)

array([ 12,  18,  32,  58, 166, 241, 265])

In [0]:
def train_test_split_old(split_ratio):
    """
    Get unique combinations from [0..data_size].
    Shuffle same_label_pairs and dif_label_pairs individually.
    Sample the first split_ratio*num pairs of each class for training and 
    sample the the rest (1-split_ratio)*num pairs of each class for validation
    
    """
    same_label_pairs = np.random.permutation(all_combinations[same_label_indices])
    dif_label_pairs = np.random.permutation(all_combinations[~same_label_indices])
    
    num_pairs = int(1/2*expansion_factor*data_size)
    num_train = int(split_ratio*num_pairs)
   
    train_matchings = np.append(same_label_pairs[:num_train], dif_label_pairs[:num_train], axis = 0)
    val_matchings = np.append(same_label_pairs[num_train:num_pairs], dif_label_pairs[num_train:num_pairs], axis = 0)
    
    np.random.shuffle(train_matchings), np.random.shuffle(val_matchings)
    
    return train_matchings.astype(int), val_matchings.astype(int)

In [0]:
def train_test_split(split_ratio):
    """
    Split data into train, validation and test set.
    Test set contains just pictures. 
    Validation and Train set are pairs of pictures (unique matches).
    """
    perm = np.random.permutation(range(data_size))
    train_idx = perm[:int(split_ratio*data_size)]
    test_idx = perm[int(split_ratio*data_size):]
    
    combinations = np.array(list(itertools.combinations(train_idx,2)))
    
    same_label_indices = labels[combinations[:,0]] == labels[combinations[:,1]]
    combinations = np.append(combinations, same_label_indices.reshape(-1,1), axis=1)

    same_label_pairs = np.random.permutation(combinations[same_label_indices])
    dif_label_pairs = np.random.permutation(combinations[~same_label_indices])
    
    num_pairs = int(1/2*expansion_factor*data_size)
    num_train = int(split_ratio*num_pairs)
   
    train_matchings = np.append(same_label_pairs[:num_train], dif_label_pairs[:num_train], axis = 0)
    val_matchings = np.append(same_label_pairs[num_train:num_pairs], dif_label_pairs[num_train:num_pairs], axis = 0)
    
    np.random.shuffle(train_matchings), np.random.shuffle(val_matchings)
    return train_matchings.astype(int), val_matchings.astype(int), test_idx.astype(int)

### Train Test Split

In [43]:
train_matchings, val_matchings, test_idx = train_test_split(split_ratio)
train_matchings.shape, val_matchings.shape, test_idx.shape


((320, 3), (80, 3), (80,))

## Helper functions for calculation of accuracy

In [60]:
a = np.array([1,1,2,1,4,5,6,7])
np.argsort(a)

array([0, 1, 3, 2, 4, 5, 6, 7])

In [0]:
def get_unique_N(iterable, N):
    """Yields (in order) the first N unique elements of iterable. 
    Might yield less if data too short."""
    seen = set()
    for e in iterable:
        if e in seen: # ES IST NIE IN SEEN :D
            continue
        seen.add(e)
        yield e
        if len(seen) == N:
            return

In [0]:
def get_k_nearest(distance_matrix):
    dm = distance_matrix.copy()
    for i in range(dm.shape[1]):    
        dm[i,i] += 100000 
    top5_nearest = np.empty((distance_matrix.shape[0], 5))
    for idx, line in enumerate(dm):
        sorted_indices = np.argsort(line)
        top5_nearest[idx,:] = np.fromiter(get_unique_N(labels[sorted_indices], 5), int)
    #top5_nearest = labels[top5_nearest.astype(int)]
    return top5_nearest.astype(int)

In [0]:
weights_standard = np.array([1, 0.8, 0.6, 0.4, 0.2])
weights_first = np.array([1,0,0,0,0])
weights_half = np.array([1,0.5,0.33,0.25,0.20])
def calculate_accuracy_score(outputs_1, outputs_2=None, weights=weights_standard):
    distance_matrix = euclidean_distances(outputs_1, outputs_1[test_idx])
    top5_nearest = get_k_nearest(distance_matrix)
    #print(top5_nearest)
    true_labels = np.repeat(np.array([labels]), 5, axis=0).T
    prediction_matrix = top5_nearest == true_labels
    #print(np.count_nonzero(prediction_matrix))
    for row in prediction_matrix:
        for i, elt in enumerate(row):
            if elt:
                row[i+1:] = 0
                break
    #print(np.count_nonzero(prediction_matrix),"/",labels.shape[0],"unique whales have correct labels in the top 5")
    scores_per_image = prediction_matrix@weights_half
    score = np.sum(scores_per_image)/scores_per_image.shape[0]
    return score

In [0]:
def batch_data(num_data, batch_size):
    """ Yield batches with indices until epoch is over.
    
    Parameters
    ----------
    num_data: int
        The number of samples in the dataset.
    batch_size: int
        The batch size used using training.

    Returns
    -------
    batch_ixs: np.array of ints with shape [batch_size,]
        Yields arrays of indices of size of the batch size until the epoch is over.
    """
    
    data_ixs = np.random.permutation(np.arange(num_data))
    ix = 0
    while ix + batch_size < num_data:
        batch_ixs = data_ixs[ix:ix+batch_size]
        ix += batch_size
        yield batch_ixs

In [25]:
np.unique(labels)

array([ 12,  18,  32,  58, 166, 241, 265, 416, 502])

# Check initializer of variables

In [0]:
class SiamN:
    
    def __init__(self, name, learning_rate=0.001, length=300, height=100, channels=1, margin=0.5):
        
        self.name = name
        self.dropout = tf.placeholder_with_default(0.0, shape=(), name="dropout")
        self.learning_rate = learning_rate
        self.weights =[]
        self.biases =[]
        self.margin = margin
        
        self.X1 = tf.placeholder(shape=[None, channels, height, length], dtype=tf.float32, name="data_1") 
        self.X2 = tf.placeholder(shape=[None, channels, height, length], dtype=tf.float32, name="data_2") 
        self.Y = tf.placeholder(shape=[None,], dtype=tf.float32, name="labels") 
        
        self.output1 = self.forward_pass(self.X1, reuse=False)
        self.output2 = self.forward_pass(self.X2, reuse=True)
        self.loss = tf.contrib.losses.metric_learning.contrastive_loss(self.Y, self.output1, self.output2, self.margin)
        #self.loss = self.contrastive_loss(self.Y, self.output1, self.output2, self.margin)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        
    
    def forward_pass(self, X, reuse=False):
        
        convkwargs = {'padding': 'same', 'activation_fn': tf.nn.relu, 'reuse': reuse}
        poolkwargs = {'kernel_size': 2, 'stride': 2, 'padding': 'same'}
        
        with tf.variable_scope("conv1") as scope:
            x = tf.contrib.layers.conv2d(X, 64, kernel_size=9, stride=2, scope=scope, **convkwargs)
            x = tf.contrib.layers.batch_norm(x, scope=scope, reuse=reuse)
            x = tf.contrib.layers.max_pool2d(x, **poolkwargs)
        with tf.variable_scope("conv2") as scope:
            x = tf.contrib.layers.conv2d(x, 64, kernel_size=7, stride=1, scope=scope, **convkwargs)
            x = tf.contrib.layers.batch_norm(x, scope=scope, reuse=reuse)
            x = tf.contrib.layers.max_pool2d(x, **poolkwargs)
        with tf.variable_scope("conv3") as scope:
            x = tf.contrib.layers.conv2d(x, 128, kernel_size=5, stride=1, scope=scope, **convkwargs)
            x = tf.contrib.layers.batch_norm(x, scope=scope, reuse=reuse)
            x = tf.contrib.layers.max_pool2d(x, **poolkwargs)
        with tf.variable_scope("conv4") as scope:
            x = tf.contrib.layers.conv2d(x, 256, kernel_size=3, stride=1, scope=scope, **convkwargs)
            x = tf.contrib.layers.batch_norm(x, scope=scope, reuse=reuse)
            x = tf.contrib.layers.max_pool2d(x, **poolkwargs)
                
        
        x = tf.contrib.layers.flatten(x)
        x = tf.contrib.layers.fully_connected(x, 500)
        x = tf.contrib.layers.fully_connected(x, 20)
        
        output = x
            
        return output
            
        #deprecated
    def contrastive_loss(self, Y, output1, output2, margin=0.5):
        distance = tf.norm(output2 - output1)
        similarity = Y * tf.square(distance)                                           # keep the similar label (1) close to each other
        dissimilarity = (1-Y) * tf.square(tf.maximum((margin - distance), 0))        # give penalty to dissimilar label if the distance is bigger than margin
        loss = tf.reduce_mean((dissimilarity + similarity) / 2)
        return loss

    
    def train(self, features, train_matchings, val_matchings, epochs=20, dropout=0.0, batch_size=512):

        train_losses = []
        val_losses = []
        acc_scores = []
        
        config = tf.ConfigProto()
        #config.gpu_options.allow_growth=True
        self.session = tf.Session(config=config)
        session = self.session
        
        session.run(tf.global_variables_initializer())
        
        train_loss = session.run(self.loss, feed_dict={self.X1: features[train_matchings[:,0]], self.X2: features[train_matchings[:,1]], self.Y: train_matchings[:,2]})
        val_loss = session.run(self.loss, feed_dict={self.X1: features[val_matchings[:,0]], self.X2: features[val_matchings[:,1]], self.Y: val_matchings[:,2]})
        
        acc_score_output = session.run(self.output1, feed_dict={self.X1: features})
        acc_score = calculate_accuracy_score(acc_score_output)
                
        train_losses.append(round(train_loss/train_matchings.shape[0], 7))
        val_losses.append(round(val_loss/val_matchings.shape[0], 7))
        acc_scores.append(int(acc_score*100))
        print(f"Epoch 0/{epochs} train_loss: {train_losses[-1]} val_loss: {val_losses[-1]} acc_score: {acc_scores[-1]}")
        
        for epoch in range(epochs):
            if (epoch+1) % 5 == 0:
                print(f"Epoch {epoch+1}/{epochs} train_loss: {train_losses[-1]} val_loss: {val_losses[-1]} acc_score: {acc_scores[-1]}")  
            for batch_ixs in batch_data(train_matchings.shape[0], batch_size):
                    _ = session.run( self.optimizer, feed_dict={self.X1: features[train_matchings[batch_ixs,0]], self.X2: features[train_matchings[batch_ixs,1]], self.Y: train_matchings[batch_ixs,2]})  
            
            #TODO: remove boilerplate code, define function to calc accs and errors with flag print=TRUE/FALSE
            train_loss = session.run(self.loss, feed_dict={self.X1: features[train_matchings[:,0]], self.X2: features[train_matchings[:,1]], self.Y: train_matchings[:,2]})
            val_loss = session.run(self.loss, feed_dict={self.X1: features[val_matchings[:,0]], self.X2: features[val_matchings[:,1]], self.Y: val_matchings[:,2]})

            output = session.run(self.output1, feed_dict={self.X1: features})
            acc_score = calculate_accuracy_score(output)

            #train_losses.append(round(train_loss/train_matchings.shape[0], 7))
            #val_losses.append(round(val_loss/val_matchings.shape[0], 7))
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            acc_scores.append(int(acc_score*100))

        
        
        self.hist={'train_loss': np.array(train_losses),
           'val_loss': np.array(val_losses), "epochs_trained": epoch}
        
        

In [51]:
i#model1.session.close()

NameError: ignored

In [54]:
#You can change layer types and the number of neurons by changing the following variables.
t = time.time()
epochs = 200
batch_size = 256

tf.reset_default_graph()
model1 = SiamN("first_model", learning_rate = 0.0005, margin = 2)

model1.train(features, train_matchings, val_matchings, epochs, batch_size=batch_size)
print("Training finished in", time.time()-t,"s.")


#shape of tuple can not be built
# in particular:
# features[train_matchings[:,0]].shape.ndims
# probably should be tensor instead of np.array

Epoch 0/200 train_loss: 0.0517379 val_loss: 0.2197217 acc_score: 34
Epoch 5/200 train_loss: 0.8946079015731812 val_loss: 1.4131824970245361 acc_score: 35
Epoch 10/200 train_loss: 0.37902483344078064 val_loss: 1.3338706493377686 acc_score: 36
Epoch 15/200 train_loss: 0.2908611297607422 val_loss: 1.3860896825790405 acc_score: 37
Epoch 20/200 train_loss: 0.25481367111206055 val_loss: 1.679944634437561 acc_score: 34
Epoch 25/200 train_loss: 0.2160298377275467 val_loss: 1.6481382846832275 acc_score: 35
Epoch 30/200 train_loss: 0.20402464270591736 val_loss: 1.7248280048370361 acc_score: 37
Epoch 35/200 train_loss: 0.20690540969371796 val_loss: 1.861559510231018 acc_score: 34
Epoch 40/200 train_loss: 0.21182648837566376 val_loss: 2.0203349590301514 acc_score: 34
Epoch 45/200 train_loss: 0.20793752372264862 val_loss: 1.94972825050354 acc_score: 34
Epoch 50/200 train_loss: 0.20287743210792542 val_loss: 2.1302225589752197 acc_score: 32
Epoch 55/200 train_loss: 0.19625599682331085 val_loss: 1.965

KeyboardInterrupt: ignored

In [64]:
test = tf.constant([[1,2,3], [1,2,3]])
test.shape.ndims

2

In [0]:
import pdb; pdb.pm()

> /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/variable_scope.py(861)_get_single_variable()
-> name, "".join(traceback.format_list(tb))))
(Pdb) model1.X1
*** NameError: name 'model1' is not defined
(Pdb) quit


In [0]:
scores_1, scores_2

In [0]:
calculate_accuracy_score(outputs)

NameError: ignored

In [0]:
score

In [0]:
idx = np.random.randint(3200)
pic_a = features[train_matchings[idx,0]]
pic_b = features[train_matchings[idx,1]]
print(f"same whales? {bool(train_matchings[idx,2])}")    
print("Index: ", idx)
distance = model1.session.run(model1.distance, feed_dict={model1.X_1: np.array([pic_a]), model1.X_2: np.array([pic_b]), model1.Y: np.array([train_matchings[idx,2]])})
print(f"distance: {distance}")
plt.figure(figsize=(10,5))
plt.imshow(pic_a[0], cmap="gray")
plt.figure(figsize=(10,5))
plt.imshow(pic_b[0], cmap="gray")

In [0]:

print("model 1")
plt.figure(figsize=(10,5))
plt.plot(model1.hist['train_loss'][5::], label="Training")
plt.plot(model1.hist['val_loss'][5::], label="Validation")

plt.xlabel("Epoch", fontsize=20)
plt.ylabel("Loss", fontsize=20)
plt.legend()
plt.show()

In [0]:
print(model1.hist["train_loss"])

# Experimental: tensorflow datasets

In [0]:
train_data = tf.data.Dataset.from_tensor_slices({"feature": train_data, "label": train_labels})
val_data = tf.data.Dataset.from_tensor_slices({"feature": val_data, "label": val_labels})
train_data

In [0]:
train_data.output_types

In [0]:
#build batches
batch_size = 500
train_data.shuffle(30000)
batches = dataset.batch(batch_size)

In [0]:


sess = tf.Session()
iterator = batches.make_one_shot_iterator()
next_element = iterator.get_next()
no_of_batches = int(np.ceil(labels.shape[0] / batch_size))
counter = 1
for i in tqdm(range(no_of_batches)):
    value = sess.run(next_element)
    print(value["feature"].shape)
    print(counter)
    counter+=1

sess.close()

In [0]:
# run this to check available gpu memory (i got 5gb)

# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()