In [None]:
###############################################
# Functions used for preprocessing input images
###############################################

import torchvision.transforms as transforms

# Pre-processing applied to every image after it has been loaded and before it is
# passed to the network: resize, conversion to a tensor, then per-channel
# normalisation (towards mean = 0 and standard-deviation = 1).

# The dataset is quite small, so data augmentation is included as well, in the
# form of a RandomHorizontalFlip.
# These transformations are performed at each epoch.

_preprocessing_steps = [
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
data_transforms = transforms.Compose(_preprocessing_steps)


###############################################
# Main of the program
###############################################

import argparse
import os
import numpy as np
import pandas as pd

pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 600)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
from torch.autograd import Variable
from tqdm import tqdm
import PIL.Image as Image
os.system("! pip install cnn_finetune")
from cnn_finetune import make_model
import xgboost as xgb

import gc
gc.enable()


# Training settings
data_dir = "../input/train-images-recvis18/train_images/train_images"
seed = 1
batch_size = 32
log_interval = 5  # print the training loss every `log_interval` mini-batches
experiment = "../working/experiment"
test_dir = "../input/mva-recvis-2018/bird_dataset/bird_dataset/test_images/mistery_category"
momentum = 0.9
nepoch = 10
nclasses = 20
lr = 0.01

use_cuda = torch.cuda.is_available()

# Seed both random number generators used by the notebook: torch (weight
# initialisation, samplers) and NumPy (np.random.permutation builds the
# train/validation split). Seeding torch alone left the split different on
# every run.
torch.manual_seed(seed)
np.random.seed(seed)

# Create experiment folder (exist_ok avoids the check-then-create race and
# makes the cell safe to re-run)
os.makedirs(experiment, exist_ok=True)

# 1. Define functions used for fitting models

In [None]:
###################################################
# Functions used to run models and make predictions
###################################################

#########################################################
# Train and validation functions for CNN trained with SGD
#########################################################

def train(epoch, model, train_loader, ntrain, batch_size):
    """Train ``model`` for one epoch with mini-batch gradient descent.

    Relies on the notebook-level globals ``optimizer`` (must wrap the
    parameters of ``model``), ``use_cuda`` and ``log_interval``.

    Args:
        epoch: epoch number, only used in the progress messages.
        model: network to train; it is updated in place.
        train_loader: iterable yielding ``(data, target)`` mini-batches.
        ntrain: number of training samples, only used in the progress messages.
        batch_size: mini-batch size, only used in the progress messages.

    Returns:
        The same ``model`` object that was passed in.
    """
    model.train()

    # Built once instead of once per batch. The default reduction averages the
    # loss over the batch, which is what 'elementwise_mean' used to request.
    criterion = torch.nn.CrossEntropyLoss()

    # Number of mini-batches per epoch, rounded up (-(-a // b) is integer
    # ceil-division) and never zero so the percentage below cannot divide by
    # zero when ntrain < batch_size. Plain ints replace np.int, which was
    # removed from NumPy.
    nbatches = max(1, -(-int(ntrain) // int(batch_size)))

    for batch_idx, (data, target) in enumerate(train_loader):
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), ntrain,
                100. * batch_idx / nbatches, loss.item()))
    return model

def validation(model, val_loader, nval):
    """Evaluate ``model`` on the validation set and print loss and accuracy.

    Relies on the notebook-level global ``use_cuda``.

    Args:
        model: network to evaluate; it is switched to eval mode.
        val_loader: iterable yielding ``(data, target)`` mini-batches.
        nval: number of validation samples, used as the denominator of both
            the average loss and the accuracy.

    Returns:
        The accuracy in percent as a ``numpy.float64`` scalar.
    """
    model.eval()
    validation_loss = 0.
    correct = 0

    # The losses are *summed* over samples so that dividing by nval below gives
    # the true per-sample average. Adding per-batch means and then dividing by
    # the sample count under-reported the loss by roughly the batch size.
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data, target in val_loader:
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            validation_loss += criterion(output, target).item()

            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            # .item() keeps `correct` a plain int so it prints as "3/4"
            # rather than as a tensor repr
            correct += pred.eq(target.view_as(pred)).sum().item()

    validation_loss /= nval
    accuracy = np.float64(100. * correct / nval)
    print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        validation_loss, correct, nval, accuracy))
    return accuracy
    

###################################################################
# Train function for classifier trained with batch gradient descent
###################################################################


def _extract_features(model, loader):
    """Run ``model`` over every batch of ``loader`` and collect the results.

    Returns a ``(features, targets)`` pair of numpy arrays in which each model
    output has been flattened to one row per sample.
    """
    batches_data, batches_target = [], []
    # Only forward passes are needed, so no autograd graph is built.
    with torch.no_grad():
        for data, target in loader:
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            batches_data.append(output.view(output.size(0), -1).detach().cpu().numpy())
            batches_target.append(target.cpu().numpy())
    return np.concatenate(batches_data, axis=0), np.concatenate(batches_target, axis=0)


def train_batch(model, classifier, classifier_label, train_loader, val_loader, nval):
    """Fit a classifier on features produced by ``model``, using the whole train set at once.

    The network is used purely as a fixed feature extractor: its outputs on
    the train and validation loaders are gathered into numpy arrays, on which
    the classifier is trained and evaluated.

    Relies on the notebook-level globals ``use_cuda`` and ``nclasses``.

    Args:
        model: network whose flattened outputs are used as features. It is
            switched to eval mode.
        classifier: previously trained xgboost Booster to continue training
            from, or ``None`` to start from scratch.
        classifier_label: name of the classifier; only ``"xgboost"`` is
            implemented.
        train_loader: iterable yielding ``(data, target)`` training batches.
        val_loader: iterable yielding ``(data, target)`` validation batches.
        nval: number of validation samples, denominator of the accuracy.

    Returns:
        ``(classifier, accuracy)``: the trained Booster and the validation
        accuracy in percent.

    Raises:
        ValueError: if ``classifier_label`` is not ``"xgboost"``.
    """
    # Fail fast: previously an unknown label went through the whole feature
    # extraction and then crashed on an unbound `accuracy` at the return.
    if classifier_label != "xgboost":
        raise ValueError(
            "Unsupported classifier_label %r: only 'xgboost' is implemented" % (classifier_label,))

    # Eval mode: in train mode dropout would randomise the features and
    # BatchNorm would both use batch statistics and overwrite its running
    # averages, so the features would not match the ones computed at
    # prediction time (where the model is in eval mode).
    model.eval()

    data_train, target_train = _extract_features(model, train_loader)
    data_val, target_val = _extract_features(model, val_loader)

    print("Whole train and validation loaded for classifier\n")

    xgb_train = xgb.DMatrix(data_train, label=target_train)
    xgb_val = xgb.DMatrix(data_val, label=target_val)

    # NOTE(review): 'n_estimators' and 'silent' are kept as in the original
    # notebook; with the native xgb.train API the number of rounds is driven
    # by num_boost_round below -- confirm whether these two entries have any
    # effect with the installed xgboost version.
    params = {
        # Parameters that we are going to tune.
        'booster': 'gbtree',
        'n_estimators': 200,
        'max_depth': 15,
        'min_child_weight': 1,
        'learning_rate': .1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': 1,
        # Other parameters
        'objective': 'multi:softmax',
        'num_class': nclasses,
        'eval_metric': 'merror',
        'silent': 1
    }

    num_boost_round = 35
    early_stopping_rounds = 5

    classifier = xgb.train(
        params,
        xgb_train,
        num_boost_round=num_boost_round,
        evals=[(xgb_val, "val")],
        early_stopping_rounds=early_stopping_rounds,
        xgb_model=classifier  # Continue training from a previous Booster, if any
    )

    pred_val = classifier.predict(xgb_val)
    correct = (pred_val == target_val).sum()
    accuracy = 100. * correct / nval

    print('\nValidation set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, nval, accuracy))

    return (classifier, accuracy)
    
####################################
# Functions used to make predictions
####################################

def pil_loader(path):
    """Read the image file at ``path`` and return it converted to RGB.

    The file is opened explicitly and handed to PIL as a file object to avoid
    a ResourceWarning (https://github.com/python-pillow/Pillow/issues/835).
    """
    with open(path, 'rb') as stream:
        with Image.open(stream) as picture:
            rgb_picture = picture.convert('RGB')
    return rgb_picture
            
def make_predictions(model, test_dir, outfile, classifier=None, transform=None):
    """Predict a category for every ``.jpg`` image in ``test_dir`` and write a CSV.

    The CSV has the header ``Id,Category`` and one row per image, where ``Id``
    is the file name without its extension. Rows are written in sorted file
    name order so that files produced by different calls line up row by row.

    Relies on the notebook-level globals ``use_cuda`` and ``data_transforms``.
    The function does not change the mode of ``model``; the callers in this
    notebook call ``model.eval()`` before invoking it.

    Args:
        model: network applied to each pre-processed image.
        test_dir: directory containing the test images.
        outfile: path of the CSV file to write.
        classifier: optional classifier applied to the output of ``model``.
            When ``None`` the class with the highest model output is used. An
            xgboost Booster receives the flattened output as a DMatrix; any
            other object receives the raw model output through ``predict``.
        transform: optional pre-processing callable. Defaults to
            ``data_transforms`` without its RandomHorizontalFlip step, so that
            predictions are deterministic.
    """
    if transform is None:
        # Random augmentation belongs to training only: at inference it would
        # make the prediction for an image depend on a coin flip.
        transform = transforms.Compose([
            step for step in data_transforms.transforms
            if not isinstance(step, transforms.RandomHorizontalFlip)
        ])

    print("Making predictions on the test set...")
    # `with` closes the file even if a prediction raises half-way through.
    with open(outfile, "w") as output_file:
        output_file.write("Id,Category\n")
        with torch.no_grad():  # inference only, no autograd graph needed
            for f in sorted(os.listdir(test_dir)):
                stem, extension = os.path.splitext(f)
                if extension.lower() != '.jpg':
                    continue

                # unsqueeze(0) adds the batch dimension expected by the model
                data = transform(pil_loader(os.path.join(test_dir, f))).unsqueeze(0)
                if use_cuda:
                    data = data.cuda()

                output = model(data)
                if classifier is None:
                    pred = output.max(1, keepdim=True)[1]
                else:
                    if isinstance(classifier, xgb.core.Booster):
                        output = xgb.DMatrix(output.view(output.size(0), -1).cpu().numpy())
                    pred = classifier.predict(output)

                # pred holds a single element (tensor or numpy array); make
                # it a plain int before formatting.
                output_file.write("%s,%d\n" % (stem, int(pred.item())))

    print("Succesfully wrote " + outfile + ', you can upload this file to the kaggle competition website\n')

# 2. Cross-validation settings

In [None]:
# Creates random indices that will be used to cross-validate generalization error.
# The loader built here is only used to read the size of the dataset.
data_loader = torch.utils.data.DataLoader(datasets.ImageFolder(data_dir, transform=data_transforms),
    batch_size=batch_size, shuffle=True, num_workers=0)

ndata = len(data_loader.dataset)
nfold = 10
# Floor division replaces np.int(ndata/nfold): np.int was removed from NumPy.
nval = ndata // nfold
ntrain = ndata - nval
if nval == 0:
    # validation() divides by nval, so an empty fold cannot be evaluated
    raise ValueError("Dataset too small: %d images cannot be split into %d folds" % (ndata, nfold))

# Generate splits: fold k validates on the k-th slice of nval shuffled indices
# and trains on all remaining indices. When ndata is not a multiple of nfold
# the trailing indices are never used for validation.
rand_indices = np.random.permutation(ndata)
list_val_indices = [rand_indices[fold * nval:(fold + 1) * nval] for fold in range(nfold)]
list_train_indices = [np.setdiff1d(rand_indices, val_indices).tolist() for val_indices in list_val_indices]

# A DataLoader that is given a sampler must keep shuffle=False. SubsetRandomSampler
# shuffles its indices by itself, so the training batches are still drawn in
# random order. The validation set is shuffled too, which has no effect on the
# validation metrics.

# Only the first fold is used as the train/validation split.
train_indices = list_train_indices[0]
val_indices = list_val_indices[0]

train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_indices)
val_sampler = torch.utils.data.sampler.SubsetRandomSampler(val_indices)

# 3. Run models

## 3.1 Pretrained + default models

In [None]:
#####################################################
# Classifiers used as last layer of pretrained models
#####################################################

def nn_1(in_features, num_classes):
    """Build a classification head with one hidden layer.

    Args:
        in_features: size of the feature vector fed to the head.
        num_classes: number of output classes.

    Returns:
        ``nn.Sequential`` of Linear(in_features, 4096) -> ReLU ->
        Linear(4096, num_classes).
    """
    return nn.Sequential(
        nn.Linear(in_features, 4096),
        nn.ReLU(inplace=True),
        # Use the argument: the global `nclasses` was used here before, which
        # silently ignored the num_classes requested by the caller.
        nn.Linear(4096, num_classes),
    )

In [None]:
# Let's use only train/val split to estimate quality of the model
# First train complete CNN with pretrained models
# Parameters of last layers of pretrained models are retrained

classifier_label = "default"
pretrained_models = {}
validation_results = pd.DataFrame(columns=["pretrained", "classifier", "lr", "epoch", "accuracy"])

# The transforms are applied each time an image is fetched, so one dataset and
# one pair of loaders already provide freshly augmented images at every epoch;
# there is no need to rebuild them inside the epoch loop.
dataset = datasets.ImageFolder(data_dir, transform=data_transforms)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

for pretrained in ["se_resnext50_32x4d", "inceptionresnetv2", "inception_v3", "densenet121", "se_resnet50"]:
    # Per-model output folder, created under the same root the CSV files are
    # written to (it used to be created relative to the working directory).
    model_dir = os.path.join(experiment, pretrained)
    os.makedirs(model_dir, exist_ok=True)

    for classifier_label in ["default"]:
        if classifier_label == "default":
            model = make_model(pretrained, num_classes=nclasses, pretrained=True, input_size=(224, 224))
        else:
            # The label names a classifier factory defined in the notebook
            # (e.g. nn_1); look it up by name instead of eval()-ing the string.
            model = make_model(pretrained, num_classes=nclasses, pretrained=True, input_size=(224, 224),
                               classifier_factory=globals()[classifier_label])

        # Because CUDA memory is limited, we can't update all parameters of pretrained models.
        # The first child of the model is indexed below to reach its layers.
        backbone = list(model.children())[0]
        if pretrained == "densenet121":
            # Leave the weights of the first 2 layers unchanged, as well as
            # the first 3 of the 6 dense layers of the first _DenseBlock
            frozen_modules = [backbone[0], backbone[1]] + [backbone[4][i] for i in range(3)]
        elif pretrained == "inception_v3":
            # First 7 layers of inception_v3 are BasicConv2d and MaxPool2D
            # Next 4 are InceptionA layers
            frozen_modules = [backbone[i] for i in range(11)]
        elif pretrained == "inceptionresnetv2":
            # First 15 layers, only update params of last one
            frozen_modules = [backbone[i] for i in range(14)]
        else:
            # se_resnet50 (and se_resnext50_32x4d): all parameters are updated
            frozen_modules = []

        for module in frozen_modules:
            for param in module.parameters():
                param.requires_grad = False

        print("#"*40)
        print("Model specifications")
        print("Pretrained: %s" % pretrained)
        print("Classifier: %s" % classifier_label)
        print("SGD learning rate: %s" % lr)
        if use_cuda:
            print('Using GPU')
            model.cuda()
        else:
            print('Using CPU')
        print("#"*40)
        print("\n")

        # Kept as a notebook-level name: train() reads `optimizer` as a global.
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

        for epoch in range(1, nepoch + 1):
            # Train the model on the selected epoch on the train dataset and then compute validation accuracy on val dataset
            train(epoch, model, train_loader, ntrain, batch_size) # Use mini-batches of size batch_size
            accuracy = validation(model, val_loader, nval)

            row = {"pretrained": pretrained, "classifier": classifier_label, "lr": lr, "epoch": epoch,
                   "accuracy": float(accuracy)}
            # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
            validation_results = pd.concat([validation_results, pd.DataFrame([row])], ignore_index=True)
            validation_results.to_csv(os.path.join(experiment, "validation_results.csv"))

            # Make predictions on test images (the model already lives on the
            # right device; eval mode is required for inference)
            model.eval()

            outfile = os.path.join(model_dir, "kaggle_%s_%s_epoch%d.csv" % (pretrained, classifier_label, epoch))
            make_predictions(model, test_dir, outfile)

        # Save model with weights updated from training
        pretrained_models[pretrained] = model

## 3.2 Other classifiers

In [None]:
# The classifier is fitted once per model, on features of the CNN as it stands
# after its last training epoch; result rows and CSV names are tagged with that
# epoch number. It is set explicitly here instead of relying on the loop
# variable left behind by the training cell.
epoch = nepoch

for pretrained, model in pretrained_models.items():
    model = list(model.children())[0] # Retrieve pretrained model and discard last layers used for classification
    model_dir = os.path.join(experiment, pretrained)
    os.makedirs(model_dir, exist_ok=True)

    for classifier_label in ["xgboost"]:
        print("#"*40)
        print("Model specifications")
        print("Pretrained: %s" % pretrained)
        print("Classifier: %s" % classifier_label)
        print("#"*40)
        print("\n")

        # Start from scratch: no previously trained classifier to continue from
        classifier = None

        dataset = datasets.ImageFolder(data_dir, transform=data_transforms)
        train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

        # Fit the classifier on the CNN features and compute its validation accuracy
        classifier, accuracy = train_batch(model, classifier, classifier_label, train_loader, val_loader, nval)
        # lr = -1 marks rows for which the SGD learning rate does not apply
        row = {"pretrained": pretrained, "classifier": classifier_label, "lr": -1, "epoch": epoch,
               "accuracy": float(accuracy)}
        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
        validation_results = pd.concat([validation_results, pd.DataFrame([row])], ignore_index=True)
        validation_results.to_csv(os.path.join(experiment, "validation_results.csv"))

        # Make predictions on test images
        model.eval()
        if use_cuda:
            model.cuda()

        outfile = os.path.join(model_dir, "kaggle_%s_%s_epoch%d.csv" % (pretrained, classifier_label, epoch))
        make_predictions(model, test_dir, outfile, classifier)

## 3.3 Ensemble technique: MajorityVote Classifier

In [None]:
# Display the validation accuracy recorded for every (pretrained, classifier, epoch) run.
validation_results

In [None]:
from scipy.stats import mode

######################################################
# Code that combines the predictions of the different models
# using a majority vote to pick the final class
######################################################

path = "../working/experiment/"
threshold = 80  # minimum validation accuracy (in %) for a run to take part in the vote

# Read output files: the image ids are taken from one of the prediction files
vals = validation_results
output = pd.read_csv(path + "densenet121/kaggle_densenet121_default_epoch1.csv", usecols=["Id"])

# Select predictions of models having validation performance above threshold.
# One column per selected run, aligned on the rows of `output`.
outputs_model = pd.DataFrame(index=output.index)

# One entry per (pretrained, classifier, epoch); the first recorded accuracy
# is used when a combination appears more than once.
runs = vals.drop_duplicates(subset=["pretrained", "classifier", "epoch"])
for pretrained, classifier, epoch, accuracy in zip(
        runs["pretrained"], runs["classifier"], runs["epoch"], runs["accuracy"]):
    if float(accuracy) < threshold:
        # Skip before touching the disk: only selected runs need their CSV
        continue

    epoch = int(epoch)
    name = "Category_%s_%s_%s" % (pretrained, classifier, epoch)
    outfile = "%s/kaggle_%s_%s_epoch%d.csv" % (pretrained, pretrained, classifier, epoch)
    out = pd.read_csv(path + outfile)
    outputs_model[name] = out["Category"]

if outputs_model.shape[1] == 0:
    raise ValueError(
        "No model reached a validation accuracy of %s%%: nothing to combine" % threshold)

# Majority vote per image. Series.mode() returns the most frequent values in
# ascending order, so ties go to the smallest category. This does not depend
# on the return shape of scipy.stats.mode, which changed between SciPy
# versions and broke the former `mode(x)[0][0]` indexing.
output["Category"] = outputs_model.apply(lambda votes: votes.mode().iloc[0], axis=1).astype(int)
output.to_csv("output.csv", index=False)