In [1]:
import io
import itertools
import os
import pickle
import threading
import time

import bson  # shipped with the pymongo package
import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from torch.autograd import Variable
from torchvision import transforms

from models.resnet import *

## The labels

The block below uses LabelEncoder to create encodings for all labels that I use throughout the entire pipeline.

In [3]:
# Build the label encoder and the (id <-> category) lookup table once,
# then reuse the cached files on every later run.
labelencoder_path = '../data/labelencoder.pkl'
categories_path = '../data/categories.csv'

# both cache files are read below, so rebuild when either one is missing
if not (os.path.isfile(labelencoder_path) and os.path.isfile(categories_path)):

    # create (id <-> class) lookup table
    # the context manager closes train.bson once the single pass is done
    with open('../data/train.bson', 'rb') as bson_file:
        categories = [(d['_id'], d['category_id'])
                      for d in bson.decode_file_iter(bson_file)]
    categories = pd.DataFrame(categories, columns=['id', 'cat'])
    categories.to_csv(categories_path)

    # create labelencoder file
    labelencoder = LabelEncoder()
    labelencoder.fit(categories['cat'].unique().ravel())
    with open(labelencoder_path, 'wb') as f:
        pickle.dump(labelencoder, f)

# if the cache is there -> load
else:
    # NOTE: unpickling can execute arbitrary code; only load a file that
    # this notebook wrote itself.
    with open(labelencoder_path, 'rb') as f:
        labelencoder = pickle.load(f)
    # to_csv above stores the index as the first column; reading it back as
    # the index keeps the columns identical to the freshly built frame
    categories = pd.read_csv(categories_path, index_col=0)

## The batch generator

The data is stored in BSON, the binary format MongoDB uses to store its data. Install pymongo to get access to the bson module. I would have liked to convert the BSON file to .jpg files with the classes as folder structure, but my SSD does not have enough inodes to store all those images. Because of this, we can't load the data shuffled (the BSON iterator reads sequentially).

The batch generator creates an iterator from the train/test.bson file. Then we load images from the iterator, process them, store them in a GPU tensor (to accelerate the process just a little bit, was about 0.002s faster per image). When we have [batch_size] images, we yield them.

In [4]:
def batch_generator(data_path, size=130, batch_size=32, return_labels=True, drop_last=False):
    '''Yield preprocessed GPU batches read sequentially from a BSON file.

    Each document is decoded, every image it holds is pushed through the
    preprocessing pipeline, and the results are collected until `batch_size`
    images are available; that batch is then yielded.

    Relies on the module-level `labelencoder` to map category ids to labels.

    Args:
        data_path: path of the BSON file to read.
        size: passed to `transforms.Scale` and used as height and width when
            reshaping each image to (1, 3, size, size). The reshape requires
            every processed image to hold exactly 3*size*size values.
        batch_size: number of images per yielded batch.
        return_labels: if True yield `(images, labels)`, else only `images`.
        drop_last: if True, discard a trailing batch that holds fewer than
            `batch_size` images (what the previous implementation always
            did). Defaults to False so that no image is silently skipped.

    Yields:
        `Variable` of images, plus a `Variable` of labels when
        `return_labels` is True. Documents without a `category_id` get
        label 0.
    '''

    # preprocessing pipeline (inputs PIL image, outputs processed tensor)
    # NOTE(review): the random flip is also applied when this generator is
    # used for validation/test data - confirm that this is intended.
    process = transforms.Compose([
        transforms.Scale(size), # downscale the image to size[=130]
        transforms.RandomHorizontalFlip(), # random horizontal flip
        transforms.ToTensor(), # transform from PIL Image to Tensor
        lambda x: x.cuda().view(1,3,size,size), # send image to GPU and transform to size (1,3,130,130)
        transforms.Normalize(mean=[.485, .456, .406],
                             std=[.229, .224, .225]) # zero-center image by mean subtraction and std division
        ])

    def _make_batch(images, labels):
        '''Stack the collected per-image tensors into one batch.'''
        # to use tensors with models they have to be transformed to Variable()
        x = Variable(torch.cat(images))
        if return_labels:
            return x, Variable(torch.cat(labels))
        return x

    # per-image GPU tensors of the batch under construction; concatenated
    # once per batch instead of re-copying the whole batch for every image
    images, labels = [], []

    # the context manager closes the file when the generator is exhausted
    # or closed early
    with open(data_path, 'rb') as bson_file:

        # iterate over the documents of the BSON collection
        for item in bson.decode_file_iter(bson_file):

            # labelencoder has all the encodings for the labels, like a lookup table
            # (test.bson has no category, those documents get label 0)
            category = item.get('category_id', '')
            label = int(labelencoder.transform([category])[0]) if category else 0
            label = torch.LongTensor([label]).cuda()

            # one document can hold several images; all share the same label
            for image in item.get('imgs') or []:

                # from binary, to PIL, apply preprocessing pipeline
                images.append(process(Image.open(io.BytesIO(image.get('picture', None)))))
                labels.append(label)

                # once we have batch_size images, yield the batch and start over
                if len(images) == batch_size:
                    yield _make_batch(images, labels)
                    images, labels = [], []

    # hand out the leftover images that did not fill a complete batch
    if images and not drop_last:
        yield _make_batch(images, labels)

In the end we end up with a batch of type Variable, that we can use directly to input in the model:

In [10]:
# Build a generator over the training data and draw a single batch from it.
batch_gen = batch_generator('../data/train.bson', batch_size=32)
x, y = next(batch_gen)

# First the image batch (32 processed images of 130x130x3 pixels),
# then the 32 matching labels.
for batch_part in (x, y):
    print(batch_part.size())

torch.Size([32, 3, 130, 130])
torch.Size([32])


In [13]:
# Time how long it takes to produce one batch.
# Note: every timed call advances batch_gen by one batch.
%timeit next(batch_gen)

58.2 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


It takes about 58 milliseconds to load next [batch_size] images, process and yield them.

## The train phase
PyTorch model training involves some steps. 
* Set the model to train mode (model.train()). This tells layers such as Dropout that they should drop nodes (dropout must be disabled at test time)
* Reset the gradients (optimizer.zero_grad()). This resets all gradients to 0. If this is not done, the gradients get summed every time.
* Calculate the loss. Pass the data through the model (out = model(x)) and calculate the loss (loss_fn(out, y))
* Do a backward pass to calculate the gradients w.r.t. weights (loss.backward())
* Once the gradients are known, optimize the model with these gradients (optimizer.step())
* Reset the gradients again (optimizer.zero_grad())

In [11]:
# helper function
def write_loss(loss, acc, path='exp2_loss.txt'):
    '''Append one "loss, acc" record to a plain-text log file.

    Args:
        loss: loss value, written with 3 decimals.
        acc: accuracy value, written with 2 decimals.
        path: file to append to; defaults to the log file this notebook has
            always used, so existing calls keep their behaviour.
    '''
    record = '{:.3f}, {:.2f}\n'.format(loss, acc)
    # append mode: earlier records (also from previous runs) are kept
    with open(path, 'a') as log_file:
        log_file.write(record)
#---------------------

def train(epoch):
    '''Run the training part of one epoch on the global `batch_gen`.

    Consumes the first `val_split` batches of `batch_gen`; whatever the
    generator still holds afterwards is left untouched. Uses the notebook
    globals `model`, `optimizer`, `crit`, `batch_size`, `accum_iter`,
    `print_iter` and `val_split`.

    Args:
        epoch: zero-based epoch index, only used for the progress line.
    '''

    # running statistics since the last progress print
    c = 0
    train_loss = 0
    train_acc = 0

    # set model to train mode and reset gradients
    model.train()
    optimizer.zero_grad()

    # iterate over training batches
    for batch_idx, (x, y) in enumerate(batch_gen):

        # get batch predictions and loss
        output = model(x)
        loss = crit(output, y)

        # Gradient accumulation: backward() adds to the stored gradients, so
        # stepping only every accum_iter batches gives an effective batch
        # size of accum_iter * batch_size.
        loss.backward()
        # (batch_idx + 1) so that the first step happens after accum_iter
        # batches; testing batch_idx itself would already step on batch 0.
        if (batch_idx + 1) % accum_iter == 0:
            optimizer.step()
            optimizer.zero_grad()

        # accumulate statistics
        _, idx = output.cpu().max(1)
        train_loss += loss.data[0]
        train_acc += accuracy_score(y.cpu().data.numpy(), idx.data.numpy().ravel())
        c += 1

        # print statistics
        if batch_idx % print_iter == 0:

            # get average loss and accuracy
            train_loss /= c
            train_acc /= c

            # save loss and acc to file
            write_loss(train_loss, train_acc)

            # print the statistics
            print('\rEpoch {} [{}/{} ({:.0f}%)] - loss: {:.6f} - acc: {:.3f}'.format(
                epoch+1, batch_idx * batch_size, 7069896, 100. * batch_idx / (7069896//batch_size), 
                train_loss, train_acc), end='')

            # reset stats
            c = 0
            train_loss = 0
            train_acc = 0

        # exit training phase once exactly val_split batches were used
        # (batch_idx is zero-based, hence the + 1)
        if batch_idx + 1 >= val_split:
            return

## The test phase

During the test phase, we do not optimize the model. We simply set the model to evaluation mode (model.eval()), do a forward pass (model(x)) and calculate the loss. Finally, I print the average loss and the accuracy.

In [12]:
def test():
    '''Evaluate the model on the batches that are left in `batch_gen`.

    Meant to be called right after `train()`, which stops early and leaves
    the remaining batches of the global generator for validation. Uses the
    notebook globals `model`, `crit` and `batch_gen`.

    Returns:
        `(average_loss, accuracy)` with accuracy as a fraction in [0, 1],
        or `None` when the generator had no batches left.
    '''

    # init stats
    test_loss = 0
    correct = 0
    n_batches = 0
    n_samples = 0

    # set model to evaluation mode
    model.eval()

    # iterate over validation batches
    for x, y in batch_gen:

        # forward pass plus stat accumulation
        output = model(x)
        test_loss += crit(output, y).data[0]
        pred = output.data.max(1)[1]
        correct += int(pred.eq(y.data.view_as(pred)).cpu().sum())

        # count what was actually seen, so a smaller batch does not skew
        # the accuracy
        n_batches += 1
        n_samples += y.size(0)

    # nothing to evaluate: avoid dividing by zero
    if n_batches == 0:
        print('Validation set: no batches left to evaluate\n')
        return None

    # print validation phase statistics
    test_loss /= n_batches
    accuracy = correct / n_samples
    print('Validation set: Average loss: {:.4f}, Accuracy: {:.0f}%\n'.format(
        test_loss, accuracy*100))

    return test_loss, accuracy

## The model

At the moment I am still working with the ResNet50 model. The model is found in the resnet.py file. I took this file from the PyTorch source code, because I wanted to modify it slightly. I have added .2 dropout for generalization capacity and replaced an average pool layer with adaptive avg pool, so that I could use smaller images than that ResNet was originally designed for.

Because we use pre-trained weights, I do not want to train the whole network. Rather I will train the fully-connected layer, and two layers of convolutional blocks (layer4 and layer3).

In [20]:
# load ResNet50 initialised with pretrained weights (pretrained=True)
model = resnet50(pretrained=True)

# start from a fully frozen network ...
for param in model.parameters():
    param.requires_grad = False

# ... swap in a new classification head (created after the freeze loop,
# so it is not touched by it)
model.fc = nn.Linear(2048, 5270) # 5270 == len(classes)

# ... and make the last two convolutional stages trainable again
for stage in (model.layer4, model.layer3):
    for param in stage.parameters():
        param.requires_grad = True

# send model to GPU (kept as the last expression of the cell)
model.cuda()

ResNet (
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (relu): ReLU (inplace)
  (maxpool): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
  (layer1): Sequential (
    (0): Bottleneck (
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      (relu): ReLU (inplace)
      (downsample): Sequential (
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      )
    )
    (1): Bott

## Hyperparams

Finally, we select some hyperparameters, optimizer algorithm and loss function.

In [21]:
# --- run configuration ---
batch_size = 32        # images per batch
learning_rate = 1e-4   # step size for SGD
epochs = 3             # passes over the data
accum_iter = 2         # train() steps the optimizer once per accum_iter batches
print_iter = 10        # train() logs its statistics every print_iter batches
num_classes = len(labelencoder.classes_)

# The first 90% of the batches are used for training.
# NOTE(review): 7069896 is presumably the size of the training set, but
# batch_generator emits one sample per image rather than per document -
# confirm that this number counts images.
total_batches = 7069896 // batch_size
val_split = round(0.9 * total_batches)

# Categorical cross-entropy as loss function
crit = nn.CrossEntropyLoss()

# Stochastic gradient descent with momentum and weight decay;
# only parameters that are not frozen are handed to the optimizer
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=learning_rate,
                      momentum=0.9, weight_decay=5e-4)

## Putting it together

In [22]:
# Full schedule: for every epoch run the train phase, then the test phase,
# then store a checkpoint of the weights.
checkpoint_pattern = './resnet50_{}-epoch_finetune-fc-lyr3-lyr4.pth'

for e in range(epochs):
    # a generator can only be consumed once, so build a new one per epoch
    batch_gen = batch_generator('../data/train.bson', batch_size=batch_size)

    train(e)
    test()

    # checkpoints are numbered from 1
    torch.save(model.state_dict(), checkpoint_pattern.format(e + 1))



KeyboardInterrupt: 