# **Human Protein Multi Label Image Classification**

This notebook is using the dataset available at [Zero to GANs - Human Protein Classification](https://www.kaggle.com/c/jovian-pytorch-z2g).

In [1]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, random_split, DataLoader
from PIL import Image
import torchvision.models as models
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from sklearn.metrics import f1_score
import torch.nn.functional as F
import torch.nn as nn
from torchvision.utils import make_grid
%matplotlib inline

In [2]:
#TODO: change this for cloud/local process
DATA_DIR = "../input/jovian-pytorch-z2g/Human protein atlas"

TRAIN_DIR = "../input/jovian-pytorch-z2g/Human protein atlas/train"
TEST_DIR = "../input/jovian-pytorch-z2g/Human protein atlas/test"

TRAIN_CSV = "../input/jovian-pytorch-z2g/Human protein atlas/train.csv"
TEST_CSV = "../input/jovian-pytorch-z2g/submission.csv"

In [3]:
train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

Unnamed: 0,Image,Label
0,19567,9
1,29993,6 4
2,17186,1 4
3,29600,6 2
4,701,3 4


In [4]:
labels = {
    0: 'Mitochondria',
    1: 'Nuclear bodies',
    2: 'Nucleoli',
    3: 'Golgi apparatus',
    4: 'Nucleoplasm',
    5: 'Nucleoli fibrillar center',
    6: 'Cytosol',
    7: 'Plasma membrane',
    8: 'Centrosome',
    9: 'Nuclear speckles'
}

In [5]:
def encode_label(label):
    # create a tensor containing ten 0s
    target = torch.zeros(10)
    # we will turn our label into string and then loop through it
    for l in str(label).split(' '):
        # we take respective index in the label and turn the respective position in the tensor from 0 into 1
        target[int(l)] = 1.
    return target

def decode_target(target, text_labels=False, threshold=0.5):
    # initiate an empty result
    result = []
    # loop through the target vector and its index
    for i, x in enumerate(target):
        # if the probability value is greater or equal to the threshold
        if (x >= threshold):
            # if it is a text_label
            if text_labels:
                # append the name of the label and the id
                result.append(f"{labels[i]} ({str(i)})")
            else:
                # if not, or if it is just label id then we append the index
                result.append(str(i))
    return ' '.join(result)
    

In [6]:
class HumanProteinDataset(Dataset):
    # we will instantiate our class by specifying the csv_file, the root directory, and the optional transform param
    def __init__(self, csv_file, root_dir, transform=None):
        self.df = pd.read_csv(csv_file)
        self.transform = transform
        self.root_dir = root_dir
    
    # this part is pretty self-explanatory
    def __len__(self):
        return len(self.df)    
    
    # this method accepts an index as a parameter
    def __getitem__(self, idx):
        # we locate the row by its index
        row = self.df.loc[idx]
        
        # separate image_id and image_label
        img_id, img_label = row['Image'], row['Label']
        
        # choosing the image file name
        img_fname = self.root_dir + "/" + str(img_id) + ".png"
        
        # open the image using PIL library
        img = Image.open(img_fname)
        
        # if we have specified our transform parameter in the beginning
        if self.transform:
            # transform the image accordingly
            img = self.transform(img)
        # the function will open the image and encode the image label
        return img, encode_label(img_label)

In [7]:
# specify image transforms for augmentation during training
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.4),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0),
    transforms.RandomAffine(degrees=20, translate=(0.1, 0.1), scale=(0.8, 1.2),
                            shear=None, resample=False, fillcolor=(255, 255, 255)),
    transforms.ToTensor()
])

val_transform = transforms.Compose([transforms.ToTensor()])
dataset = HumanProteinDataset(TRAIN_CSV, TRAIN_DIR, transform=train_transform)

In [8]:
def show_sample(img, target, invert=True):
    if invert:
        plt.imshow(1 - img.permute((1, 2, 0)))
    else:
        plt.imshow(img.permute(1, 2, 0))
    print('Labels:', decode_target(target, text_labels=True))

In [9]:
# setting seed to the number of life
torch.manual_seed(42)

<torch._C.Generator at 0x7fa554020c50>

In [10]:
ratio = 0.05
val_size = int(ratio * len(dataset))
train_size = len(dataset) - val_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])
len(train_ds), len(val_ds)

(18275, 961)

In [11]:
def show_batch(dl, invert=True):
    for images, labels in dl:
        fig, ax = plt.subplots(figsize=(16, 8))
        ax.set_xticks([]); ax.set_yticks([])
        data = 1-images if invert else images
        ax.imshow(make_grid(data, nrow=16).permute(1, 2, 0))
        break

In [12]:
def F_score(output, label, threshold=0.5, beta=1):
    prob = output > threshold
    label = label > threshold

    TP = (prob & label).sum(1).float()
    TN = ((~prob) & (~label)).sum(1).float()
    FP = (prob & (~label)).sum(1).float()
    FN = ((~prob) & label).sum(1).float()

    precision = torch.mean(TP / (TP + FP + 1e-12))
    recall = torch.mean(TP / (TP + FN + 1e-12))
    F2 = (1 + beta**2) * precision * recall / (beta**2 * precision + recall + 1e-12)
    return F2.mean(0)

In [13]:
class MultilabelImageClassificationBase(nn.Module):
    def training_step(self, batch):
        images, targets = batch 
        out = self(images)                      
        loss = F.binary_cross_entropy(out, targets)      
        return loss
    
    def validation_step(self, batch):
        images, targets = batch 
        out = self(images)                           # Generate predictions
        loss = F.binary_cross_entropy(out, targets)  # Calculate loss
        score = F_score(out, targets)
        return {'val_loss': loss.detach(), 'val_score': score.detach() }
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_scores = [x['val_score'] for x in outputs]
        epoch_score = torch.stack(batch_scores).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_score': epoch_score.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_score: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_score']))

In [14]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [15]:
device = get_default_device()
device

device(type='cuda')

In [16]:
def try_batch(dl):
    for images, labels in dl:
        print('images.shape:', images.shape)
        out = model(images)
        print('out.shape:', out.shape)
        print('out[0]:', out[0])
        break
# try_batch(train_dl)

## Training the model

In [17]:
from tqdm.notebook import tqdm

In [18]:
from tqdm.notebook import tqdm

@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD, schedule_func=None):
    torch.cuda.empty_cache()
    history = []
    optimizer = opt_func(model.parameters(), lr)
    scheduler = schedule_func(optimizer, mode='max', factor=0.1, patience=2, verbose=True, threshold=0.01, threshold_mode='rel', cooldown=0, min_lr=1e-8, eps=1e-08)
    for epoch in range(epochs):
        # Training Phase 
        model.train()
        train_losses = []
        for batch in tqdm(train_loader):
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
        if scheduler:
            scheduler.step(result['val_score'])
    return history

## Different models

In [19]:
class ProteinCnnModel_MobileNetv2(MultilabelImageClassificationBase):
    def __init__(self):
        super().__init__()
        # Use a pretrained model
        self.network = models.mobilenet_v2(pretrained=True)
        # Replace last layer
        # num_ftrs = self.last_channel
        num_ftrs = self.network.classifier._modules['1'].in_features
        self.network.classifier._modules['1'] = nn.Linear(num_ftrs, 10)
    
    def forward(self, xb):
        return torch.sigmoid(self.network(xb))

In [20]:
model = to_device(ProteinCnnModel_MobileNetv2(), device)

batch_size = 32

train_dl = DeviceDataLoader(DataLoader(train_ds, batch_size, shuffle=True, num_workers=1, pin_memory=True), device)
val_dl = DeviceDataLoader(DataLoader(val_ds, batch_size*2, num_workers=1, pin_memory=True), device)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/checkpoints/mobilenet_v2-b0353104.pth


HBox(children=(FloatProgress(value=0.0, max=14212972.0), HTML(value='')))




In [21]:
num_epochs = 40
lr = 1e-3
opt_func = torch.optim.Adam
schedule_func = torch.optim.lr_scheduler.ReduceLROnPlateau

history = fit(num_epochs, lr, model, train_dl, val_dl, opt_func, schedule_func)

HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [0], train_loss: 0.2726, val_loss: 0.2592, val_score: 0.5544


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [1], train_loss: 0.2336, val_loss: 0.2634, val_score: 0.6054


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [2], train_loss: 0.2199, val_loss: 0.2434, val_score: 0.5931


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [3], train_loss: 0.2160, val_loss: 0.2407, val_score: 0.6241


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [4], train_loss: 0.2069, val_loss: 0.2332, val_score: 0.6314


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [5], train_loss: 0.2035, val_loss: 0.2212, val_score: 0.6618


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [6], train_loss: 0.1999, val_loss: 0.2066, val_score: 0.6371


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [7], train_loss: 0.1965, val_loss: 0.2184, val_score: 0.6614


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [8], train_loss: 0.1957, val_loss: 0.2214, val_score: 0.6543
Epoch     9: reducing learning rate of group 0 to 1.0000e-04.


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [9], train_loss: 0.1786, val_loss: 0.1872, val_score: 0.7048


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [10], train_loss: 0.1722, val_loss: 0.1904, val_score: 0.7112


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [11], train_loss: 0.1705, val_loss: 0.1791, val_score: 0.7164


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [12], train_loss: 0.1682, val_loss: 0.1836, val_score: 0.7119


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [13], train_loss: 0.1671, val_loss: 0.1815, val_score: 0.7208


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [14], train_loss: 0.1665, val_loss: 0.1814, val_score: 0.7116
Epoch    15: reducing learning rate of group 0 to 1.0000e-05.


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [15], train_loss: 0.1641, val_loss: 0.1728, val_score: 0.7245


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [16], train_loss: 0.1636, val_loss: 0.1849, val_score: 0.7147


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [17], train_loss: 0.1623, val_loss: 0.1772, val_score: 0.7168


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [18], train_loss: 0.1630, val_loss: 0.1864, val_score: 0.7184
Epoch    19: reducing learning rate of group 0 to 1.0000e-06.


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [19], train_loss: 0.1620, val_loss: 0.1771, val_score: 0.7163


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [20], train_loss: 0.1622, val_loss: 0.1727, val_score: 0.7221


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [21], train_loss: 0.1613, val_loss: 0.1837, val_score: 0.7119
Epoch    22: reducing learning rate of group 0 to 1.0000e-07.


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))



Epoch [25], train_loss: 0.1627, val_loss: 0.1804, val_score: 0.7173


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [26], train_loss: 0.1617, val_loss: 0.1869, val_score: 0.7212


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [27], train_loss: 0.1624, val_loss: 0.1747, val_score: 0.7165


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [30], train_loss: 0.1620, val_loss: 0.1719, val_score: 0.7179


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [33], train_loss: 0.1614, val_loss: 0.1892, val_score: 0.7177


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [35], train_loss: 0.1617, val_loss: 0.1814, val_score: 0.7062


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))


Epoch [36], train_loss: 0.1618, val_loss: 0.1815, val_score: 0.7231


HBox(children=(FloatProgress(value=0.0, max=572.0), HTML(value='')))

In [22]:
test_dataset = HumanProteinDataset(TEST_CSV, TEST_DIR, transform=val_transform)

## Creating a submission file

In [23]:
test_dl = DeviceDataLoader(DataLoader(test_dataset, batch_size, num_workers=2, pin_memory=True), device)

In [24]:
@torch.no_grad()
def predict_dl(dl, model):
    torch.cuda.empty_cache()
    batch_probs = []
    for xb, _ in tqdm(dl):
        probs = model(xb)
        batch_probs.append(probs.cpu().detach())
    batch_probs = torch.cat(batch_probs)
    return [decode_target(x) for x in batch_probs]

In [25]:
test_preds = predict_dl(test_dl, model)

HBox(children=(FloatProgress(value=0.0, max=258.0), HTML(value='')))




Let us know create a submission file with these predictions

In [26]:
submission_df = pd.read_csv(TEST_CSV)
submission_df.Label = test_preds
submission_df.head()

Unnamed: 0,Image,Label
0,24117,4
1,15322,1 4
2,14546,6
3,8079,0
4,13192,3 4


We can now save it batck to CSV, and download the file from the sidebar (check the output folder)

In [27]:
sub_fname = 'MovbileNet2_submission_v3.csv'

In [28]:
submission_df.to_csv(sub_fname, index=False)

You can now upload this submission file here: https://www.kaggle.com/c/jovian-pytorch-z2g/submit

## Save to Jovian

In [29]:
# !pip install jovian --upgrade

In [30]:
# import jovian

In [31]:
# jovian.commit(project='zerogans-protein-competition')

You can also use the "Save Version" button on Kaggle itself, to save a copy on your Kaggle profile.