In [1]:
#
# This code is based on an assignment from the excellent Deep Learning course at https://dlcourse.ai/
#
import PIL
import os
from tqdm.notebook import tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, SubsetRandomSampler, Sampler
from torchvision import transforms

from onnx import onnx_pb
from onnx_coreml import convert

In [2]:
class HotdogOrNotDataset(Dataset):
    """Image dataset that loads files from a folder and labels them by filename.

    A file whose name starts with one of ``HOTDOG_PREFIXES`` gets label 1
    (hot dog); every other file gets label 0 (not a hot dog).

    Args:
        folder: directory containing the image files.
        transform: optional callable applied to the loaded PIL image.

    Each item is a tuple ``(img, y, img_id)`` where ``img`` is the (optionally
    transformed) image, ``y`` is the 0/1 label and ``img_id`` is the filename.
    """

    # We will use more variants of hot dogs, not just classical ones
    HOTDOG_PREFIXES = ("chili-dog", "frankfurter", "hotdog")

    def __init__(self, folder, transform=None):
        self.transform = transform
        self.folder = folder
        # assumption: the folder contains only normal files, no subfolders.
        # os.listdir() returns entries in arbitrary order, so sort them: the
        # index -> file mapping (and therefore any seeded train/validation
        # split built on indices) is then the same on every machine and run.
        self.filelist = sorted(os.listdir(folder))

    def __len__(self):
        return len(self.filelist)

    def __getitem__(self, index):
        filename = self.filelist[index]
        # Image.open() is lazy and keeps the file handle open; convert() forces
        # the pixel data to be read, so the handle can be closed right away.
        # Converting to RGB also guarantees 3 channels, so grayscale or RGBA
        # files do not break transforms that expect 3-channel input.
        with PIL.Image.open(os.path.join(self.folder, filename)) as source:
            img = source.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        # 1 = this is a hot dog, 0 = NOT a hot dog
        y = 1 if filename.startswith(self.HOTDOG_PREFIXES) else 0
        img_id = filename

        return img, y, img_id

# Training pipeline: random flips, a slight colour jitter and a random rotation
# distort every image so that the model trains better. The result is resized to
# 224x224, converted to a tensor and normalized per channel.
train_dataset = HotdogOrNotDataset(
    folder="train_images/",
    transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ColorJitter(saturation=.05, hue=.05),
        transforms.RandomRotation(degrees=25, resample=PIL.Image.BILINEAR),
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]),
)

# Test pipeline: no distortions, but images still have to be resized to 224x224
# and normalized with the same per-channel mean and standard deviation.
test_dataset = HotdogOrNotDataset(
    folder="test_images/",
    transform=transforms.Compose([
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]),
)

In [3]:
def generate_loaders(train_dataset, split = 0.2, batch_size = 64, seed = 0):
    """Split a dataset into a training part and a validation part.

    Train loader is used to train model.
    Validation loader is used to estimate how well we are doing during the training.
    (A separate test loader is used AFTER the training is complete to see how
    well the model was trained; it is not created here.)

    Args:
        train_dataset: dataset to split; must support ``len()`` and indexing.
        split: fraction of the samples (0..1) that goes to the validation part.
        batch_size: batch size used by both loaders.
        seed: seed for the shuffle that decides which samples go to which part.

    Returns:
        ``(train_loader, val_loader, num_batches)`` where ``num_batches`` is the
        number of batches ``train_loader`` yields per epoch.

    Raises:
        ValueError: if ``split`` is outside the range [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))

    data_size = len(train_dataset)
    indices = list(range(data_size))
    # A private RandomState produces the same permutation as
    # np.random.seed(seed) + np.random.shuffle(), but does not reset the
    # process-wide NumPy random state as a side effect.
    np.random.RandomState(seed).shuffle(indices)
    val_split = int(np.floor(split * data_size))

    val_indices, train_indices = indices[:val_split], indices[val_split:]

    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                             sampler=val_sampler)
    # The loader does not drop the last, partially filled batch, so ask it for
    # the real batch count instead of flooring the division by batch_size.
    num_batches = len(train_loader)
    return train_loader, val_loader, num_batches

In [4]:
def train_model(model, train_dataset, loss, optimizer, num_epochs, scheduler=None):
    """Train model for a specified number of epochs.

    The dataset is split into a training and a validation part with
    ``generate_loaders``. After every epoch the average training loss, the
    training accuracy and the validation accuracy are printed and recorded.

    Args:
        model: the network to train; it must already be on its target device.
        train_dataset: dataset yielding ``(image, label, id)`` items.
        loss: callable ``loss(prediction, target)`` returning a scalar tensor.
        optimizer: optimizer that updates the parameters of ``model``.
        num_epochs: number of passes over the training part.
        scheduler: optional learning-rate scheduler, stepped once per epoch.

    Returns:
        ``(loss_history, train_history, val_history)``: three lists with one
        float per epoch.

    Raises:
        ValueError: if the training part of the split contains no batches.
    """
    loss_history = []
    train_history = []
    val_history = []
    # NOTE: `batch_size` is not a parameter of this function; it is read from
    # the notebook's global namespace and must be defined before the call.
    train_loader, val_loader, _ = generate_loaders(train_dataset, batch_size=batch_size)
    if len(train_loader) == 0:
        raise ValueError("training split is empty; nothing to train on")

    # Send the batches to wherever the model's parameters live, instead of
    # depending on a global `device` variable.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        model.train() # Enter train mode
        loss_accum = 0.0
        correct_samples = 0
        total_samples = 0
        batches_seen = 0

        for x, y, _ in tqdm(train_loader, total=len(train_loader)):
            x_gpu = x.to(device)
            y_gpu = y.to(device)
            prediction = model(x_gpu)
            loss_value = loss(prediction, y_gpu)

            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()

            _, indices = torch.max(prediction, 1)
            # .item() turns the results into plain Python numbers, so the
            # running totals do not hold on to tensors attached to the
            # autograd graph of every batch.
            correct_samples += torch.sum(indices == y_gpu).item()
            total_samples += y.shape[0]
            loss_accum += loss_value.item()
            batches_seen += 1

        if scheduler is not None:
            scheduler.step()

        # Average over the number of batches actually processed (dividing by
        # the last loop index would be off by one).
        ave_loss = loss_accum / batches_seen
        train_accuracy = float(correct_samples) / total_samples

        # Never use gradient calculations if we don't train the model.
        with torch.no_grad():
            # calculate accuracy on validation dataset
            val_accuracy = compute_accuracy(model, val_loader)

        # keep the history of loss and accuracy in case we'll want to see how the training goes
        loss_history.append(float(ave_loss))
        train_history.append(train_accuracy)
        val_history.append(val_accuracy)

        print('Average loss: %f, Train accuracy: %f, Val accuracy: %f' % (ave_loss, train_accuracy, val_accuracy))

    return loss_history, train_history, val_history
        
def compute_accuracy(model, loader):
    """Compute accuracy of the model using data from loader.

    The model is switched to evaluation mode and is left in that mode when the
    function returns.

    Args:
        model: network mapping a batch of inputs to per-class scores.
        loader: iterable of ``(x, y, ids)`` batches; ``ids`` is ignored.

    Returns:
        Fraction of samples whose highest-scoring class equals the label, as a
        float. NaN is returned when the loader yields no samples.
    """
    model.eval() # Evaluation mode

    # Send the batches to wherever the model's parameters live, instead of
    # depending on a global `device` variable; a model without parameters is
    # evaluated on the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    correct_samples = 0
    total_samples = 0
    # Never use gradient calculations if we don't train the model.
    with torch.no_grad():
        for x, y, _ in loader:
            x_dev = x.to(target_device)
            y_dev = y.to(target_device)
            # calculate predictions for the batch
            prediction = model(x_dev)
            _, indices = torch.max(prediction, 1)
            # sum correct predictions
            correct_samples += torch.sum(indices == y_dev).item()
            total_samples += y.shape[0]

    if total_samples == 0:
        # Accuracy over zero samples is undefined; report NaN rather than
        # failing with a division by zero.
        return float('nan')

    # calculate accuracy across all batches
    return float(correct_samples) / total_samples

In [5]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (much more slowly) on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# We will use pre-trained model from Torch model zoo
model = models.mobilenet_v2(pretrained=True)

In [6]:
# Swap the last classifier layer for one with only 2 outputs:
# hot dog or not a hot dog
num_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(in_features=num_features, out_features=2)

# No layers are frozen, because both the model and the training dataset are small

# Move the model onto `device`
model = model.to(device)

In [7]:
# Lower this to 40 or less on a GPU with only 4GB of memory
batch_size = 80
# Cross-entropy is the standard loss function for classification
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=1e-2,
    momentum=0.9,
    weight_decay=1e-03,
)
# Every scheduler step multiplies the learning rate by 0.25
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.25)

# 3 epochs are enough for our tiny dataset
loss_history, train_history, val_history = train_model(
    model, train_dataset, loss, optimizer, num_epochs=3, scheduler=scheduler
)

HBox(children=(FloatProgress(value=0.0, max=47.0), HTML(value='')))


Average loss: 0.263595, Train accuracy: 0.879718, Val accuracy: 0.893478


HBox(children=(FloatProgress(value=0.0, max=47.0), HTML(value='')))


Average loss: 0.140342, Train accuracy: 0.942981, Val accuracy: 0.948913


HBox(children=(FloatProgress(value=0.0, max=47.0), HTML(value='')))


Average loss: 0.102030, Train accuracy: 0.961444, Val accuracy: 0.953261


In [8]:
# Suppress all warnings from this point on
import warnings
warnings.filterwarnings(action='ignore')

# Write the whole model object to a file, so it can be loaded again later if needed
torch.save(obj=model, f='my_mobilenet_v2.pth')

In [9]:
# Export the model to a file in ONNX format
onnx_file = 'mobilenet_v2-2.onnx'
# Random tensor with the shape of a single 3-channel 224x224 input; it is
# handed to the exporter as the example input
dummy_input = torch.randn(1, 3, 224, 224, device=device)
# One name for the actual input followed by ten 'learned_<n>' names
input_names = ['actual_input_1']
input_names.extend('learned_%d' % idx for idx in range(10))
output_names = ['output1']
torch.onnx.export(
    model,
    dummy_input,
    onnx_file,
    verbose=False,
    input_names=input_names,
    output_names=output_names,
)

In [10]:
# convert ONNX to CoreML, so it can run on iOS
model_proto = onnx_pb.ModelProto()
# Read the serialized ONNX model inside a `with` block so that the file handle
# is closed as soon as the bytes have been parsed.
with open(onnx_file, 'rb') as model_file:
    model_proto.ParseFromString(model_file.read())
# NOTE(review): the ONNX export in this notebook names its only output
# 'output1', so 'outputImage' does not match any output -- confirm whether
# image_output_names is needed here at all.
coreml_model = convert(model_proto, image_input_names=['actual_input_1'], image_output_names=['outputImage'])
coreml_model.save('coreml_output-2')

1/151: Converting Node Type Conv
2/151: Converting Node Type BatchNormalization
3/151: Converting Node Type Clip
4/151: Converting Node Type Conv
5/151: Converting Node Type BatchNormalization
6/151: Converting Node Type Clip
7/151: Converting Node Type Conv
8/151: Converting Node Type BatchNormalization
9/151: Converting Node Type Conv
10/151: Converting Node Type BatchNormalization
11/151: Converting Node Type Clip
12/151: Converting Node Type Conv
13/151: Converting Node Type BatchNormalization
14/151: Converting Node Type Clip
15/151: Converting Node Type Conv
16/151: Converting Node Type BatchNormalization
17/151: Converting Node Type Conv
18/151: Converting Node Type BatchNormalization
19/151: Converting Node Type Clip
20/151: Converting Node Type Conv
21/151: Converting Node Type BatchNormalization
22/151: Converting Node Type Clip
23/151: Converting Node Type Conv
24/151: Converting Node Type BatchNormalization
25/151: Converting Node Type Add
26/151: Converting Node Type Conv
