In [1]:
import os
import random
import time
import copy
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from time import sleep

import torch
import cv2
import timm
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from catalyst.data.sampler import BalanceClassSampler
from ranger import Ranger
#https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import albumentations as A
from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, ShiftScaleRotate, CenterCrop, Resize
)
from albumentations.pytorch import ToTensorV2

import logging
logging.getLogger().setLevel(logging.INFO)

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Seed every random number generator used by the notebook.
SEED = 55555

# Python level: hash seed variable and the stdlib RNG
os.environ["PYTHONHASHSEED"] = f"{SEED}"
random.seed(SEED)

# NumPy and PyTorch (CPU and CUDA) generators
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# disable cuDNN autotuning and request deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Run configuration: everything a reader might want to tune lives here.
DATA_PATH = r"./data/"  # directory scanned for the .jpg images and .txt label files

IMAGE_SIZE = 250  # images are resized to IMAGE_SIZE x IMAGE_SIZE
BATCH_SIZE = 24
GRADIENT_ACCUMULATION_STEPS = 5  # the optimizer steps once per this many batches
LEARNING_RATE = 5e-4
EPOCHS = 100
N_CLASS = 2
LAMBDA = 30  # multiplier applied to the classification loss in the combined loss


In [4]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    logging.info(f"Running on {torch.cuda.get_device_name()}")
else:
    logging.info("Running on a CPU")

INFO:root:Running on GeForce GTX 1060 6GB


In [5]:
content_list = []
labels_list = []

# Images and labels are collected into two parallel lists and later paired purely
# by position. os.listdir() returns entries in arbitrary, filesystem-dependent
# order, so the listing is sorted to make the traversal (and therefore the pairing
# and the seeded splits) deterministic.
# NOTE(review): this still assumes every image has a label file that sorts into the
# same relative position (e.g. identical file stems) - confirm against the data.
for image in tqdm_notebook(sorted(os.listdir(DATA_PATH))):
    file_path = os.path.join(DATA_PATH, image)
    if image.endswith(".jpg"):
        content = cv2.imread(file_path)
        if content is None:
            # cv2.imread reports an unreadable file by returning None; failing here
            # keeps the two lists from silently drifting out of alignment
            raise IOError(f"Could not read image file: {file_path}")
        # OpenCV decodes to BGR channel order; convert to RGB
        content = cv2.cvtColor(content, cv2.COLOR_BGR2RGB)
        content_list.append(content)
    elif image.endswith(".txt"):
        with open(file_path, 'r') as f:
            labels = f.read()
        labels = np.array(labels.split(" "), dtype=int)
        # the first value is remapped: 1 -> 0, anything else -> 1
        labels[0] = 0 if labels[0] == 1 else 1
        # rotate left by one so the first value ends up last
        # (downstream code reads columns 0-3 as the box and column 4 as the class)
        labels = np.roll(labels, -1)
        labels_list.append(labels)

if len(content_list) != len(labels_list):
    logging.warning(
        f"Found {len(content_list)} images but {len(labels_list)} label files; "
        "positional pairing of images and labels may be misaligned"
    )


HBox(children=(FloatProgress(value=0.0, max=6770.0), HTML(value='')))




In [6]:
# Build an (N, 2) object array: column 0 holds the image array, column 1 its label vector.
# The array is filled element by element because np.array() on ragged nested
# sequences (images of different sizes) raises ValueError on NumPy >= 1.24 instead
# of producing an object array.
sample_pairs = list(zip(content_list, labels_list))
dataset = np.empty((len(sample_pairs), 2), dtype=object)
for row, (content, labels) in enumerate(sample_pairs):
    dataset[row, 0] = content
    dataset[row, 1] = labels


In [7]:
# First hold out 10% of all samples for testing, then 20% of the remainder for
# validation; both splits shuffle with the same fixed seed.
train_dataset, test_dataset = train_test_split(
    dataset, test_size=0.1, shuffle=True, random_state=SEED
)
train_dataset, valid_dataset = train_test_split(
    train_dataset, test_size=0.2, shuffle=True, random_state=SEED
)
# last entry of every training label vector (passed to the balanced sampler later)
train_clf_labels = [sample_label[-1] for sample_label in train_dataset[:, 1]]

In [8]:
# custom dataset class that augments data during training
# imagenet normalization is used for efficientnet fine-tuning

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over ``(image, label)`` pairs with optional augmentation.

    Args:
        dataset: indexable collection whose items expose the image at position 0
            and the label vector at position 1.
        transforms: optional callable invoked as ``transforms(image=..., bboxes=[label])``
            that returns a mapping with ``"image"`` and ``"bboxes"`` entries.
    """

    def __init__(self, dataset, transforms=None):
        super().__init__()
        self.data = dataset
        self.transforms = transforms

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (optionally transformed) image and label at ``index``.

        Without transforms the raw image array and label are returned; with
        transforms the label is the first transformed bounding box as a tensor.

        Raises:
            IndexError: if the transform returns no bounding boxes.
        """
        image, label = self.data[index][0], self.data[index][1]
        # The dimensionality check must come first: a 2-D grayscale image that is
        # exactly 4 pixels wide would otherwise be mistaken for a 4-channel image
        # and lose a column before being stacked.
        if image.ndim == 2:
            # replicate the single channel three times for possible greyscales
            image = np.stack((image,) * 3, axis=-1)
        elif image.shape[-1] == 4:
            # removing alpha channel if present
            image = image[..., :3]

        if self.transforms is not None:
            # the label is wrapped into a list because the transform works on a
            # list of boxes
            transformed = self.transforms(image=image, bboxes=[label])

            image = transformed["image"]
            boxes = transformed["bboxes"]
            if len(boxes) == 0:
                raise IndexError(
                    f"transforms returned no bounding boxes for sample {index}"
                )
            label = torch.Tensor(boxes)[0]

        return image, label

# Training-time pipeline: resize, random geometric / colour / noise augmentation,
# normalization and conversion to a tensor. The label is passed through as a
# pascal_voc bounding box so geometric transforms are applied to it as well.
transform = Compose([
    Resize(width=IMAGE_SIZE, height=IMAGE_SIZE),
    HorizontalFlip(p=0.4),
    ShiftScaleRotate(p=0.3),
    MedianBlur(blur_limit=7, always_apply=False, p=0.3),
    IAAAdditiveGaussianNoise(scale=(0, 0.15*255), p=0.5),
    HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.4),
    RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.5),
    # in this implementation imagenet normalization is used
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
    Cutout(p=0.4),
    ToTensorV2(p=1.0),
], p=1.0, bbox_params=A.BboxParams(format='pascal_voc'))

# Evaluation pipeline: only resize and normalization, no random augmentation
# (no TTA is implemented in this solution).
test_transform = Compose([
    Resize(width=IMAGE_SIZE, height=IMAGE_SIZE),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
    ToTensorV2(p=1.0),
], p=1.0, bbox_params=A.BboxParams(format='pascal_voc'))


# NOTE: the *_dataset names are rebound from raw arrays to Dataset wrappers here, so
# this cell must not be re-run without re-running the split that precedes it.
train_dataset = Dataset(train_dataset, transforms=transform)
# Validation loss decides which checkpoint is kept, so the validation set uses the
# deterministic evaluation pipeline; random augmentation would make that metric
# noisy and not comparable between epochs.
valid_dataset = Dataset(valid_dataset, transforms=test_transform)
test_dataset = Dataset(test_dataset, transforms=test_transform)

In [9]:
# Training batches come from a class-balancing sampler ("upsampling" mode) to
# minimize the harmful effects of the dataset not being fully balanced.
# Unbalanced alternative: sampler=RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=BalanceClassSampler(labels=train_clf_labels, mode="upsampling"),
)

# Validation and test samples are visited sequentially.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    sampler=SequentialSampler(valid_dataset),
)

# The test loader yields one sample per batch.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    sampler=SequentialSampler(test_dataset),
)

In [10]:
# class EFN_Classifier(nn.Module):
#     def __init__(self, model_arch, n_class, pretrained=True):
#         super().__init__()
#         self.model = timm.create_model(model_arch, pretrained=pretrained)
#         n_features = self.model.classifier.in_features
#         self.model.classifier = nn.Linear(n_features, 2048)

#         self.fc1 = nn.Linear(2048, 1536)
#         self.fc2 = nn.Linear(2048, 512)

#         self.dropout = nn.Dropout(0.35)

#         self.fc_bbox = nn.Linear(1536, 4)
#         self.fc_clf = nn.Linear(512, n_class)
        
#     def forward(self, x):
#         x = self.model(x)
#         x = self.dropout(x)

#         x_bbox = self.fc1(x)
#         x_bbox = self.dropout(x_bbox)

#         x = self.fc2(x)
#         x = self.dropout(x)

#         x_bbox = self.fc_bbox(x_bbox)
#         x = self.fc_clf(x)

#         return x, x_bbox

# net = EFN_Classifier("tf_efficientnet_b1_ns", N_CLASS).to(device)
# Notes on the disabled EFN_Classifier above (it is not the model trained below):
# EfficientNet: https://arxiv.org/abs/1905.11946
# It fine-tunes a pretrained EfficientNet for joint classification and bounding-box regression.
# In that implementation, no weights are frozen;
# ideally, the batchnorm layers could be frozen for a marginal training speed increase.

In [11]:
class EFN_Simple_Classifier(nn.Module):
    """Plain convolutional network with a classification head and a box-regression head.

    Seven blocks of convolution -> batch norm -> Mish -> 2x2 average pooling
    (dropout after every block except the first) produce a 2048-channel feature
    map. It is pooled to one value per channel and fed to two heads:

    * ``fc_clf``: 2048 -> ``n_class`` class logits
    * ``fc1`` -> ``fc_bbox``: 2048 -> 2048 -> 4 box coordinates

    Args:
        n_class: number of output classes of the classification head.
    """

    def __init__(self, n_class):
        super(EFN_Simple_Classifier, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=2, padding=0)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, padding=2)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, padding=2)
        self.bn4 = nn.BatchNorm2d(256)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=5, padding=2)
        self.bn5 = nn.BatchNorm2d(512)
        self.conv6 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=5, padding=2)
        self.bn6 = nn.BatchNorm2d(1024)
        self.conv7 = nn.Conv2d(in_channels=1024, out_channels=2048, kernel_size=5, padding=2)
        self.bn7 = nn.BatchNorm2d(2048)

        self.pool = nn.AvgPool2d(2)

        self.fc1 = nn.Linear(2048, 2048)

        self.dropout = nn.Dropout(0.15)

        self.fc_bbox = nn.Linear(2048, 4)
        self.fc_clf = nn.Linear(2048, n_class)

    def mish(self, x):
        """Mish activation: ``x * tanh(softplus(x))``."""
        return x * torch.tanh(F.softplus(x))

    def forward(self, x):
        """Compute class logits and box coordinates for a batch of images.

        Args:
            x: image batch of shape ``(batch, 3, height, width)``.

        Returns:
            Tuple ``(x_clf, x_bbox)`` of shapes ``(batch, n_class)`` and ``(batch, 4)``.
        """
        # the first block has no dropout
        x = self.pool(self.mish(self.bn1(self.conv1(x))))
        later_blocks = (
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
            (self.conv5, self.bn5),
            (self.conv6, self.bn6),
            (self.conv7, self.bn7),
        )
        for conv, bn in later_blocks:
            x = self.dropout(self.pool(self.mish(bn(conv(x)))))

        # Global average pooling makes the flattened vector 2048 wide for any
        # input size that survives the seven poolings. For the 250x250 images
        # used in this notebook the map is already 1x1 here, so this is an
        # identity operation; it adds no parameters, so saved checkpoints still load.
        x = F.adaptive_avg_pool2d(x, 1)
        x = torch.flatten(x, 1)

        x_clf = self.fc_clf(x)
        # NOTE(review): fc1 and fc_bbox are applied back to back without an
        # activation in between - confirm this is intended.
        x_bbox = self.fc_bbox(self.fc1(x))

        return x_clf, x_bbox

# instantiate the model with N_CLASS outputs and move it to the selected device
net = EFN_Simple_Classifier(N_CLASS).to(device)

In [12]:
# Losses: mean squared error for the box coordinates, cross-entropy for the class logits.
loss_function = nn.MSELoss()
clf_loss_function = nn.CrossEntropyLoss()

# Disabled alternative optimizer with the same hyperparameters:
# optimizer = Ranger(net.parameters(), lr=LEARNING_RATE, weight_decay = 0.999, betas = (0.9, 0.999))
# NOTE(review): weight_decay=0.999 is unusually large for AdamW - confirm it is intended.
optimizer = optim.AdamW(
    net.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.999),
    weight_decay=0.999,
)

# Cosine schedule with a half-period of a quarter of the epochs, decaying to
# 0.01% of the base learning rate.
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=EPOCHS * 0.25,
    eta_min=LEARNING_RATE * 0.0001,
)
# NOTE(review): the scheduler's epoch counter is overridden to EPOCHS here - confirm intent.
scheduler.last_epoch = EPOCHS

# gradient scaler used for the mixed-precision backward pass
scaler = GradScaler()

def training_loop():
    """Train ``net`` for ``EPOCHS`` epochs and checkpoint the best validation loss.

    Every epoch runs one training pass over ``train_dataloader`` and one
    evaluation pass over ``valid_dataloader`` through ``forward_pass`` and logs
    the mean accuracy, loss and mIoU of both. Whenever the mean validation loss
    improves on the best value seen so far, the model's ``state_dict`` is written
    to ``best.pth``.

    Returns:
        None. Progress is reported through ``logging``.
    """

    def _mean(values):
        # average as a plain float; accepts lists of floats or of 0-dim tensors
        return float(sum(values) / len(values))

    # The best weights are persisted to best.pth on every improvement, so no
    # additional in-memory copy of the state dict is kept.
    best_loss = float("inf")

    for epoch in range(EPOCHS):
        # The scheduler is stepped for every epoch index above 25% of EPOCHS.
        # NOTE(review): the original inline note said "cosine anneal the last 25%
        # of epochs", which does not match this condition - confirm which is intended.
        if epoch > 0.25 * EPOCHS:
            scheduler.step()
        logging.info(f"Epoch {epoch+1}")

        logging.info("Training")
        train_losses, train_accuracies, train_miou = forward_pass(train_dataloader, train = True)

        logging.info("Validating")
        val_losses, val_accuracies, val_miou = forward_pass(valid_dataloader)

        epoch_val_loss = _mean(val_losses)

        logging.info(f"Training accuracy:   {_mean(train_accuracies):.2f} | Training loss: {_mean(train_losses):.2f} | Training mIoU: {_mean(train_miou):.2f}")
        logging.info(f"Validation accuracy: {_mean(val_accuracies):.2f} | Validation loss: {epoch_val_loss:.2f} | Validation mIoU: {_mean(val_miou):.2f}")

        if epoch_val_loss < best_loss:
            # remember the old record before overwriting it so the log line
            # reports the value that was actually beaten
            previous_best = best_loss
            best_loss = epoch_val_loss
            torch.save(net.state_dict(), "best.pth")
            logging.info(f"Saving with loss of {epoch_val_loss}, improved over previous {previous_best}")

In [13]:
def bbox_iou(true_boxes, pred_boxes):
    """Mean intersection-over-union between paired ground-truth and predicted boxes.

    Boxes are ``(x_min, y_min, x_max, y_max)``. Pairs that do not overlap, or whose
    union area is not positive, contribute an IoU of 0 to the mean.

    Args:
        true_boxes: tensor (or nested sequence) of shape ``(n, 4)``.
        pred_boxes: tensor (or nested sequence) of shape ``(n, 4)``.

    Returns:
        0-dim float tensor on the CPU holding the mean IoU over all pairs
        (``0.0`` when there are no pairs).
    """
    # The metric is not differentiated, so work on detached float32 CPU copies;
    # this also accepts half-precision predictions produced under autocast.
    true_boxes = torch.as_tensor(true_boxes).detach().float().cpu().reshape(-1, 4)
    pred_boxes = torch.as_tensor(pred_boxes).detach().float().cpu().reshape(-1, 4)

    # pair boxes up to the shorter of the two inputs
    n_pairs = min(len(true_boxes), len(pred_boxes))
    if n_pairs == 0:
        return torch.tensor(0.0)
    true_boxes, pred_boxes = true_boxes[:n_pairs], pred_boxes[:n_pairs]

    # corners of the intersection rectangle
    x_left = torch.max(true_boxes[:, 0], pred_boxes[:, 0])
    y_top = torch.max(true_boxes[:, 1], pred_boxes[:, 1])
    x_right = torch.min(true_boxes[:, 2], pred_boxes[:, 2])
    y_bottom = torch.min(true_boxes[:, 3], pred_boxes[:, 3])

    # clamping makes disjoint pairs yield zero overlap instead of aborting the batch
    overlap = (x_right - x_left).clamp(min=0) * (y_bottom - y_top).clamp(min=0)

    true_area = (true_boxes[:, 2] - true_boxes[:, 0]) * (true_boxes[:, 3] - true_boxes[:, 1])
    pred_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
    union = true_area + pred_area - overlap

    # degenerate pairs (union <= 0) count as 0 rather than dividing by zero
    iou = torch.where(union > 0, overlap / union, torch.zeros_like(union))

    return iou.mean()

def draw_boxes(images, bboxes, labels):
    """Show each image of a batch with its box and class name in an OpenCV window.

    Args:
        images: batch of channel-first image tensors.
        bboxes: batch of ``(x_min, y_min, x_max, y_max)`` box tensors.
        labels: batch of per-class score tensors; the arg-max selects the class
            name (0 -> "Cat", 1 -> "Dog").

    Every image is displayed for about one second in a window named "test".
    """
    label_dict = {0 : "Cat", 1 : "Dog"}

    for image, bbox, label in zip(images, bboxes, labels):
        cv2.destroyAllWindows()
        # detach() so tensors that still carry gradients can be converted to NumPy
        image = image.detach().float().cpu().numpy()
        bbox = bbox.detach().float().cpu().numpy()
        label = torch.argmax(label).cpu().item()

        # channel-first -> channel-last for OpenCV
        image = np.rollaxis(image, 0, 3)
        # stretch the value range to 0..255; a constant image has zero range,
        # so fall back to 1 to avoid dividing by zero
        value_range = image.max() - image.min()
        if value_range == 0:
            value_range = 1
        image = ((image - image.min()) * (1/value_range * 255)).astype('uint8')
        image = cv2.UMat(image)

        # OpenCV drawing functions expect integer pixel coordinates
        x_min, y_min, x_max, y_max = (int(round(float(v))) for v in bbox[:4])

        cv2.rectangle(image,
                      (x_min, y_min),
                      (x_max, y_max),
                      (0, 255, 0),
                      thickness=2)

        # anchor the class name at the bottom-left corner of the box
        # (the x coordinate must come from x_min, not from a y value)
        cv2.putText(image, f"{label_dict[label]}", (x_min, y_max), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA)
        cv2.imshow("test", image)
        cv2.waitKey(1)
        sleep(1)


def forward_pass(dataloader, draw=False, train=False):
    """Run one pass over ``dataloader`` in training or evaluation mode.

    Args:
        dataloader: yields ``(images, labels)`` batches. Label columns 0-3 are
            used as the box regression target and column 4 as the class index.
        draw: when True, every batch is visualised with ``draw_boxes``.
        train: when True the model is put in training mode and updated with
            mixed precision and gradient accumulation; otherwise it is only
            evaluated without gradient tracking.

    Returns:
        Tuple ``(losses, accuracies, miou)`` of three lists with one entry per
        batch: the combined loss as a Python float, the classification accuracy
        in ``[0, 1]`` and the value returned by ``bbox_iou``.
    """
    if train:
        net.train()
        # drop gradients possibly left over from an interrupted earlier pass
        optimizer.zero_grad()
    else:
        net.eval()

    losses = []
    accuracies = []
    miou = []
    n_batches = len(dataloader)

    for step, batch in enumerate(dataloader):
        inputs = batch[0].to(device).float()
        labels = batch[1].to(device).float()

        # splitting labels for separate loss calculation
        bbox_labels = labels[:, :4]
        clf_labels = labels[:, 4].long()

        # gradients are only tracked while training; fp16 autocast is used to
        # speed up the forward pass and reduce memory consumption
        with torch.set_grad_enabled(train):
            with autocast():
                label_outputs, bbox_outputs = net(inputs)
                bbox_loss = loss_function(bbox_outputs, bbox_labels)
                clf_loss = clf_loss_function(label_outputs, clf_labels)
                loss = torch.mean(bbox_loss + clf_loss*LAMBDA)

        if train:
            # backward runs outside the autocast region
            scaler.scale(loss).backward()

        if draw:
            draw_boxes(inputs, bbox_outputs.detach(), label_outputs.detach())

        predicted = label_outputs.detach().argmax(dim=1)
        acc = (predicted == clf_labels).sum().item() / len(clf_labels)
        iou = bbox_iou(bbox_labels, bbox_outputs.detach())

        miou.append(iou)
        # store a plain float: keeping the loss tensor itself would hold on to
        # the autograd graph of every batch for the rest of the epoch
        losses.append(loss.item())
        accuracies.append(acc)

        # gradient accumulation to train with a bigger effective batch size with
        # less memory use; the last batch also triggers a step so a trailing
        # partial window is not carried over into the next pass
        window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        last_batch = (step + 1) == n_batches
        if train and (window_complete or last_batch):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            logging.info(f"Step {step} of {n_batches},\t"\
                            f"Accuracy: {sum(accuracies)/len(accuracies):.2f},\t"\
                            f"mIoU: {sum(miou)/len(miou):.2f},\t"\
                            f"Loss: {sum(losses)/len(losses):.2f}")

    return losses, accuracies, miou


In [14]:
# run the training schedule; the best-scoring weights are written to best.pth
training_loop()

INFO:root:Epoch 1
INFO:root:Training
INFO:root:Step 4 of 140,	Accuracy: 0.41,	mIoU: 0.00,	Loss: 16737.21
INFO:root:Step 9 of 140,	Accuracy: 0.45,	mIoU: 0.00,	Loss: 16785.74
INFO:root:Step 14 of 140,	Accuracy: 0.48,	mIoU: 0.00,	Loss: 16825.61
INFO:root:Step 19 of 140,	Accuracy: 0.48,	mIoU: 0.00,	Loss: 16895.94
INFO:root:Step 24 of 140,	Accuracy: 0.48,	mIoU: 0.00,	Loss: 16940.50
INFO:root:Step 29 of 140,	Accuracy: 0.49,	mIoU: 0.00,	Loss: 16785.81
INFO:root:Step 34 of 140,	Accuracy: 0.50,	mIoU: 0.00,	Loss: 16704.66
INFO:root:Step 39 of 140,	Accuracy: 0.49,	mIoU: 0.00,	Loss: 16763.71
INFO:root:Step 44 of 140,	Accuracy: 0.50,	mIoU: 0.00,	Loss: 16624.22
INFO:root:Step 49 of 140,	Accuracy: 0.52,	mIoU: 0.00,	Loss: 16389.11
INFO:root:Step 54 of 140,	Accuracy: 0.52,	mIoU: 0.00,	Loss: 16232.79
INFO:root:Step 59 of 140,	Accuracy: 0.52,	mIoU: 0.00,	Loss: 15990.52
INFO:root:Step 64 of 140,	Accuracy: 0.52,	mIoU: 0.00,	Loss: 15700.86
INFO:root:Step 69 of 140,	Accuracy: 0.52,	mIoU: 0.00,	Loss: 15389.06

KeyboardInterrupt: 

In [15]:
# Restore the checkpoint saved as best.pth. map_location remaps the stored tensors
# onto the device selected for this session, so a checkpoint written on a GPU also
# loads on a CPU-only machine.
net.load_state_dict(torch.load("best.pth", map_location=device))

<All keys matched successfully>

In [16]:
# perf_counter() is a monotonic high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start = time.perf_counter()

test_losses, test_accuracies, test_miou = forward_pass(test_dataloader, draw=False, train=False)
total_time = time.perf_counter() - start
# test_dataloader uses batch_size=1, so the time per batch is the time per image;
# it covers the whole pass (data loading, transforms and metrics), not only the model
logging.info(f"Average inference time is: {total_time/len(test_dataloader):.3f}")
logging.info(f"Test accuracy: {sum(test_accuracies)/len(test_accuracies):.2f} | Test loss: {sum(test_losses)/len(test_losses):.2f} | Test mIoU: {sum(test_miou)/len(test_miou):.2f}")

INFO:root:Average inference time is: 0.047
INFO:root:Test accuracy: 0.83 | Test loss: 455.46 | Test mIoU: 0.58


In [17]:

INFO:root:Average inference time is: 0.063
INFO:root:Test accuracy: 0.99 | Test loss: 62.80 | Test mIoU: 0.82

SyntaxError: invalid syntax (<ipython-input-17-ef58aeec6500>, line 1)