In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage
import pandas as pd
from torch import optim
import re
import pretrainedmodels
from torch.nn.functional import cross_entropy

import albumentations
from utils import *

In [2]:
# fixes ulimit issue: https://github.com/pytorch/pytorch/issues/973

import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

In [3]:
import fastai
fastai.__version__

'1.0.46.dev0'

In [4]:
import fastai
from fastprogress import force_console_behavior
import fastprogress
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()
fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar

In [5]:
from zen_dataset import *
from zen_dataset.utils import *

## Setup

Below follows a lot of code to set things up. I give an overview of how it all works together before I start to train the model.

In [6]:
aug = albumentations.Compose([
    albumentations.RandomBrightnessContrast(p=0.75),
    albumentations.ShiftScaleRotate(shift_limit=0, scale_limit=0.1, rotate_limit=10, interpolation=2, p=0.75)
])

In [7]:
composed_augs = albumentations.Compose([
    albumentations.RandomBrightnessContrast(p=0.75),
    albumentations.ShiftScaleRotate(shift_limit=0, scale_limit=0.1, rotate_limit=10, interpolation=2, p=0.75)
])

In [8]:
def augment(ary):
    return composed_augs(image=ary)['image']

In [51]:
class Reader():
    def __init__(self, path, augment_fn=None):
        self.path = path
        self.augment_fn = augment_fn
    def __call__(self, fns):
        paths = [f'{self.path}/{filename}' for filename in fns]
        images = [open_image(image_path) for image_path in paths]
        tensors = [image2tensor(image, augment_fn = self.augment_fn) for image in images]
        return [imagenet_normalize(tensor) for tensor in tensors]

In [10]:
class Labeler():
    def __init__(self):
        df = pd.read_csv('data/train.csv')
        self.fn2label = {}
        for row in df[df.Id != 'new_whale'].itertuples():
            self.fn2label[row.Image] = row.Id
        self.classes = sorted(list(set(list(self.fn2label.values()))))
    def __call__(self, fns):
        labels = [self.fn2label[fn] for fn in fns]
        return [self.classes.index(label) for label in labels] + [1 if labels[0] != labels[1] else 0]

In [11]:
class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(*list(models.resnet50(True).children())[:-2])
        self.head = create_head(4096, 5004, [2048])
        self.ada_concat = AdaptiveConcatPool2d(1)

    def forward(self, ims_a, ims_b):
        cnn_out_a = self.cnn(ims_a)
        out_a = self.head(cnn_out_a)
        
        cnn_out_b = self.cnn(ims_b)
        out_b = self.head(cnn_out_b)

        return out_a, out_b, self.ada_concat(cnn_out_a).squeeze(), self.ada_concat(cnn_out_b).squeeze()

In [12]:
# https://hackernoon.com/facial-similarity-with-siamese-networks-in-pytorch-9642aa9db2f7

class ContrastiveLoss(torch.nn.Module):
    """
    Contrastive loss function.
    Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    """

    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

In [13]:
MARGIN = 60

def cross_entropy_loss(preds, labels_a, labels_b, diff_class_ind):
    return F.cross_entropy(preds[0], labels_a) + F.cross_entropy(preds[1], labels_b)

def contr_loss(preds, labels_a, labels_b, diff_class_ind):
    c_loss = ContrastiveLoss(MARGIN)
    return c_loss(preds[2], preds[3], diff_class_ind.float())

def loss_fn(preds, labels_a, labels_b, diff_class_ind):
    return 10 * cross_entropy_loss(preds, labels_a, labels_b, diff_class_ind) + contr_loss(preds, labels_a, labels_b, diff_class_ind) / 25

In [14]:
def accuracy_mod(preds, labels_a, labels_b, diff_class_ind):
    return 0.5 * accuracy(preds[0], labels_a) + 0.5 * accuracy(preds[1], labels_b)

def map5_mod(preds, labels_a, labels_b, diff_class_ind):
    return 0.5 * map5(preds[0], labels_a) + 0.5 * map5(preds[1], labels_b)

In [15]:
# When I refer to 'whale', I mean a particular image (the file name).

df = pd.read_csv('data/train.csv')
df = df[df.Id != 'new_whale']
images_without_meaningful_bbox_predictions = \
    ['85a95e7a8.jpg', 'b370e1339.jpg', 'b4cb30afd.jpg', 'd4cb9d6e4.jpg', '6a72d84ca.jpg']
df = df[~df.Image.isin(images_without_meaningful_bbox_predictions)]

labeler = Labeler()

def create_basic_dataloader(sz, batch_size, num_workers=12):
    reader = Reader(f'data/train-extracted-{sz}')
    basic_ds = Dataset([*zip(df.Image.tolist(), df.Image.tolist())], reader, labeler)
    return DataLoader(basic_ds, batch_size=batch_size, num_workers=num_workers)

def create_similarity_dict(model, dataloader):
    # Calculating descriptors for each image
    descs = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            ims = batch[0][0].cuda()
            cnn_out = learn.model.cnn(ims)
            descs.append(learn.model.ada_concat(cnn_out).squeeze().detach().cpu())

    descs = torch.cat(descs).cuda()

    # Calculating similarity dict for each image
    dists = {}
    for i, (whale, _) in enumerate(dataloader.items):
        dists[whale] = torch.pairwise_distance(descs[i], descs).cpu().numpy()
    
    return dists

def create_data(sz, dist_dict, batch_size, k=20, num_workers=12, train_on_both_train_and_val=False):
    reader_aug = Reader(f'data/train-extracted-{sz}', augment_fn=augment)
    reader = Reader(f'data/train-extracted-{sz}')
    
    val_fns = list(pd.read_pickle('data/val_fns'))
    val_fns_set = set(val_fns)

    trn_df = df[~df.Image.isin(val_fns)]
    val_df = df[df.Image.isin(val_fns)]
    
    ds_on_which_dists_were_calculated = Dataset([*zip(df.Image.tolist(), df.Image.tolist())], reader, labeler)
    
    uniq_whales = df.Id.unique().tolist() if train_on_both_train_and_val else trn_df.Id.unique().tolist()

    def sample_other_whale():
        candidate_whales = dist_dict[this_whale].argsort() 
        this_whale_class = labeler.fn2label[this_whale]
        candidate_fns = []
        for i in range(200):
            candidate_whale = ds_on_which_dists_were_calculated.items[candidate_whales[i]][0]
            if (candidate_whale not in val_fns_set) and (labeler.fn2label[candidate_whale] != this_whale_class): 
                candidate_fns.append(candidate_whale)
            if len(candidate_fns) == k: break 
        np.random.shuffle(candidate_fns) # randomly pick one from K toughest matches
        return candidate_fns[0]

    def sample_this_whale():
        return this_whale_df.sample(n=1).iloc[0].Image

    train_items = []
    for whale in uniq_whales:
        this_whale_df = trn_df[trn_df.Id == whale]
        other_whale_df = trn_df[trn_df.Id != whale]

        this_whale = sample_this_whale()

        # sampling same whale if possible
        if this_whale_df.shape[0] == 1: # only a single picture of this whale in dataset
            other_whale = sample_other_whale()
            train_items.append([this_whale, other_whale])
        else:
            same_whale = this_whale_df[this_whale_df.Image != this_whale].sample(n=1).iloc[0].Image
            train_items.append([this_whale, same_whale])

        # sampling different whales
        this_whale = sample_this_whale()
        train_items.append([this_whale, sample_other_whale()])
    
    if train_on_both_train_and_val:
        valid_items = list(zip(val_df.Image.values[:batch_size].tolist(), val_df.Image.values[BS:2*batch_size].tolist()))
    else:
        valid_items = list(zip(val_df.Image.values[:1465].tolist(), val_df.Image.values[1465:2930].tolist()))

    train_ds = Dataset(train_items, reader_aug, labeler)
    valid_ds = Dataset(valid_items, reader, labeler)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    valid_dl = DataLoader(valid_ds, batch_size=batch_size, num_workers=num_workers)

    data = DataBunch(train_dl, valid_dl)
    data.train_ds.loss_func = lambda: None
    
    return data

In [16]:
def create_fake_data(): # needed for loading the model
    fake_ds = Dataset([],_,_)
    fake_dl = DataLoader(fake_ds)

    data = DataBunch(fake_dl, fake_dl)
    data.train_ds.loss_func = lambda: None 
    
    return data

In [17]:
%%time

learn = Learner(create_fake_data(), CustomModel(), loss_func=loss_fn, metrics=[accuracy_mod, map5_mod, cross_entropy_loss, contr_loss])
learn = learn.clip_grad()
learn.split((learn.model.cnn[6], learn.model.head))

CPU times: user 1.99 s, sys: 592 ms, total: 2.58 s
Wall time: 2.6 s


I chose to implement a model based on resnet50 that would both classify each of the presented images as well as calculate dissimilarity between image pairs.

Each training example consists of two images, most of them consisting of images of different whales and where possible of images of the same whale. I sample the images in a way as to maintain some class balance and to not favor whales with significantly more images.

The model is presented with images A and B. It first sends the images through the convolution part of resnet50 (pretrained on imagenet). This way we obtain the 2048 feature maps of some dimensionality (the actual dimensionality of feature maps will depend on the size of the input). Once we have those, we run a classifier head on them to predict labels (whale ids) for each of the images.

For each image pair the model outputs label prediction for image A, label prediction for image B, a 4096-length feature vector for image A and a 4096-length feature vector for image B.

These outputs are then used for calculating the loss. I use a custom loss that combines cross entropy with contrastive loss.

Below I generate initial data for the model to train on. Whale pairs are samples based on euclidean distance between the CNN features (after application of adaptive concatenation which doubles their lenght from 2048 to 4096). Controlling the `k` parameter is a proxy for how hard we want the sampled dataset to be. I also add some measure of randomness at multiple points to hopefully keep the datasets diverse while still balanced and challenging.

In the later portions of the training I resample the dataset after each epoch.

Classification output of a model trained in this fashion achieves around 0.86 on [the private LB](https://www.kaggle.com/c/humpback-whale-identification/leaderboard). Using similarity calculations solely, the performance improves to 0.9.

The training procedure looked as followed:
* train the classification model on extracted bounding boxes without dataset construction as in earlier notebooks
* load the weights into the custom model (modifying the state dict and loading of weights which is not shown here)
* train on 224x224 images as below
* train on 448x448 images
* train on 448x448 images only with contrastive loss

For the later parts of the training I generated new datasets every epoch with K as low as 3.

In [21]:
%%time

SZ = 224
NUM_WORKERS = 12
BS = 32

basic_dataloader = create_basic_dataloader(SZ, BS, NUM_WORKERS)
dists = create_similarity_dict(learn.model, basic_dataloader)
data = create_data(SZ, dists, BS)

CPU times: user 1min 23s, sys: 27.7 s, total: 1min 51s
Wall time: 1min 51s


Here I am training from scratch. I first train the classifier head with the rest of the model frozen.

In [23]:
%%time

learn = Learner(data, CustomModel(), loss_func=loss_fn, metrics=[accuracy_mod, map5_mod, cross_entropy_loss, contr_loss])
learn = learn.clip_grad()
learn.split((learn.model.cnn[6], learn.model.head))
learn.freeze()

learn.fit_one_cycle(12, 1e-2)
learn.save(name())

epoch     train_loss  valid_loss  accuracy_mod  map5_mod  cross_entropy_loss  contr_loss
1         148.910690  147.522888  0.055290      0.076900  14.646444           26.461220   
2         123.472168  145.344574  0.082594      0.108106  14.397060           34.348557   
3         85.668175   135.626740  0.125256      0.165592  13.510564           13.028355   
4         53.227573   127.417381  0.187713      0.246024  12.650944           22.698233   
5         37.310898   117.722824  0.256655      0.318282  11.670763           25.379938   
6         30.048853   112.574509  0.309215      0.374215  11.157745           24.926008   
7         22.527187   108.538574  0.356655      0.419767  10.791218           15.660150   
8         17.551012   99.792419   0.403754      0.465836  9.885334            23.477522   
9         14.864764   91.965767   0.449829      0.508680  9.101912            23.665983   
10        12.320655   83.247070   0.477816      0.538737  8.249646            18.765621   
1

I then proceed to training the entire model. I use the one cycle policy and use discriminative fine tuning.

In [24]:
learn.unfreeze()
dists = create_similarity_dict(learn.model, basic_dataloader)
learn.data = create_data(SZ, dists, BS)

max_lr = 1e-3
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(20, lrs)
learn.save(name())

epoch     train_loss  valid_loss  accuracy_mod  map5_mod  cross_entropy_loss  contr_loss
1         26.731331   74.631279   0.519795      0.576769  7.457275            1.462620    
2         21.375389   75.528976   0.516724      0.571519  7.549324            0.893103    
3         20.266861   76.560631   0.497611      0.556661  7.649171            1.723411    
4         18.548521   77.408104   0.483959      0.544317  7.732693            2.029275    
5         17.174431   78.717873   0.479181      0.537947  7.857864            3.480946    
6         16.853291   72.678078   0.521160      0.577253  7.253875            3.482910    
7         15.177959   75.314728   0.501024      0.559454  7.515693            3.944957    
8         13.921166   70.167458   0.518430      0.578083  7.001643            3.775323    
9         13.039845   67.782875   0.534471      0.590421  6.761198            4.272540    
10        10.834149   66.399384   0.544369      0.597173  6.614733            6.301155    
1

I used the results on the validation set to understand what effect changes had on performance, to pick hyperparameters, etc. For this competition, the validation set removed a lot of valuable information from the train set.

As such, to complete the training, I switch to training on the entire train set (without retaining any images for the validation set).

(I could do that through the insights I gained earlier and also because I knew my model would not overfit)

In [26]:
learn.metrics = [] # metrics calculated on the validation set will no longer be useful

In [52]:
max_lr = 5e-4
lrs = [max_lr/100, max_lr/10, max_lr]

for _ in range(5):
    dists = create_similarity_dict(learn.model, basic_dataloader)
    learn.data = create_data(SZ, dists, BS, k=10, train_on_both_train_and_val=True)
    learn.fit(1, lrs)
print(f'Finished training with lr: {max_lr}')

learn.save(name())

epoch     train_loss  valid_loss
1         12.779222   62.867638   
Total time: 02:02
epoch     train_loss  valid_loss
1         12.693471   59.829559   
Total time: 02:03
epoch     train_loss  valid_loss
1         12.260114   61.417843   
Total time: 02:02
epoch     train_loss  valid_loss
1         12.241240   63.250805   
Total time: 02:03
epoch     train_loss  valid_loss
1         12.103703   65.692261   
Total time: 02:03
Finished training with lr: 0.0005
fmnmnrqjyx


The train loss is much higher, but the sampling of whales here is harder.

In [53]:
max_lr = 1e-4
lrs = [max_lr/100, max_lr/10, max_lr]

for _ in range(5):
    dists = create_similarity_dict(learn.model, basic_dataloader)
    learn.data = create_data(SZ, dists, BS, k=7, train_on_both_train_and_val=True)
    learn.fit(1, lrs)
print(f'Finished training with lr: {max_lr}')

learn.save(name())

epoch     train_loss  valid_loss
1         11.932597   61.330284   
Total time: 02:03
epoch     train_loss  valid_loss
1         11.082793   59.027733   
Total time: 02:03
epoch     train_loss  valid_loss
1         10.894583   57.672829   
Total time: 02:03
epoch     train_loss  valid_loss
1         11.322955   58.176491   
Total time: 02:03
epoch     train_loss  valid_loss
1         10.782765   57.838867   
Total time: 02:03
Finished training with lr: 0.0001
azhovmyckz


In [54]:
max_lr = 5e-5
lrs = [max_lr/100, max_lr/10, max_lr]

for _ in range(5):
    dists = create_similarity_dict(learn.model, basic_dataloader)
    learn.data = create_data(SZ, dists, BS, k=3, train_on_both_train_and_val=True)
    learn.fit(1, lrs)
print(f'Finished training with lr: {max_lr}')

learn.save(name())

epoch     train_loss  valid_loss
1         10.938106   57.279255   
Total time: 02:03
epoch     train_loss  valid_loss
1         10.535534   56.972042   
Total time: 02:03
epoch     train_loss  valid_loss
1         10.983456   56.210239   
Total time: 02:03
epoch     train_loss  valid_loss
1         10.421021   57.603001   
Total time: 02:03
epoch     train_loss  valid_loss
1         11.137517   57.158092   
Total time: 02:03
Finished training with lr: 5e-05
nfbjfylcqh


The above is not exactly how I trained, but it hopefully captures the gist of it in a readable way.

At this point I would switch to training on 448x448 crops. I would train on 448x448 crops with the custom loss function (combining cross entropy and contrastive divergence). For the last segment of the training, I used contrastive loss only.

I will not carry out the training on larger images here, let us rather proceed to generating a submission.

## Generating predictions

In [55]:
class Reader():
    def __call__(self, paths):
        images = [open_image(image_path) for image_path in paths]
        tensors = [image2tensor(image) for image in images]
        return [imagenet_normalize(tensor) for tensor in tensors]

train_items = df.Image.apply(lambda fn: f'data/train-extracted-{SZ}/{fn}').tolist()
test_items = list(map(lambda p: str(p), paths_to_files_in(f'data/test-extracted-{SZ}')))

train_and_test_items = train_items + test_items

train_and_test_ds = Dataset([*zip(train_and_test_items)], Reader(), lambda _: 0)
train_and_test_dl = DataLoader(train_and_test_ds, batch_size=BS, num_workers=NUM_WORKERS)

I run the cnn part of the model on all the images in the train and test sets to obtain features (I call them descs for descriptors).

In [56]:
%%time

descs = []
learn.model.eval()
with torch.no_grad():
    for batch in train_and_test_dl:
        ims = batch[0][0].cuda()
        cnn_out = learn.model.cnn(ims)
        descs.append(learn.model.ada_concat(cnn_out).squeeze().detach().cpu())

descs = torch.cat(descs).cuda()

CPU times: user 27.8 s, sys: 14 s, total: 41.8 s
Wall time: 42.4 s


I calculate distances between each image in the test set and all images in the train set.

In [57]:
def path2whale(path):
    return re.search('(\w*.\w*$)', path).group(1)

In [58]:
%%time

dists = {}
for i, path in enumerate(train_and_test_dl.items[15694:]):
    whale = path2whale(path[0])
    dists[whale] = torch.pairwise_distance(descs[i + 15694], descs[:15694]).cpu().numpy()

CPU times: user 17.7 s, sys: 9.37 s, total: 27.1 s
Wall time: 27.1 s


And I generate whale id predictions.

In [59]:
test_fns = [p.name for p in paths_to_files_in(f'data/test-extracted-{SZ}')]

In [68]:
%%time

new_whale_threshold = 47

all_preds = []
for fn in test_fns:
    most_similar = list(dists[fn].argsort())
    preds = []
    
    while len(preds) < 5:
        similar = most_similar.pop(0)
        class_of_similar = labeler.fn2label[path2whale(train_and_test_items[similar])]
        if dists[fn][similar] > new_whale_threshold:
            if 'new_whale' not in preds: preds.append('new_whale')
        if len(preds) < 5:
            if class_of_similar not in preds: preds.append(class_of_similar)
    all_preds.append(preds)

CPU times: user 17.1 s, sys: 4 ms, total: 17.1 s
Wall time: 17.1 s


I tried looking for the best threshold for predicting `new_whale` in a couple of ways. Based on thinking about this and the results I was seeing I came to the conclusion that predicting `new_whale` as first prediction a little more often than 27% of the time (which was the ratio of new whales in the public portion of the test set) should work quite well.

One can alter this by modifying the `new_whale_threshold` above.

In [69]:
np.mean([preds[0] == 'new_whale' for preds in all_preds])

0.3466515893956527

Let's generate the submission.

In [70]:
sub_name = 'res50_similarity'

sub = pd.DataFrame({'Image': test_fns, 'Id': all_preds})
sub.Id = sub.Id.str.join(' ')
sub.to_csv(f'subs/{sub_name}.csv.gz', index=False, compression='gzip')

There was an image missing from the test set (one where I was unable to extract a bounding box) so here I am adding a prediction for it.

In [71]:
sub = pd.read_csv(f'subs/{sub_name}.csv.gz')
sub.append({'Image': '6a72d84ca.jpg', 'Id': 'new_whale'}, ignore_index=True).to_csv(f'subs/{sub_name}.csv.gz', index=False, compression='gzip')

In [72]:
!kaggle competitions submit -c humpback-whale-identification -f subs/{sub_name}.csv.gz -m "{sub_name}"

100%|████████████████████████████████████████| 186k/186k [00:01<00:00, 73.7kB/s]
Successfully submitted to Humpback Whale Identification

The model as trained above achieves 0.84812 on private LB. With a bit more training on 448x448 images the score increased to 0.90813. As I have not spent a lot of time training the model, there is some chance the score would improve further with more training.