In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Human Protein Classification with Fastai v1

In [2]:
#!conda uninstall --force jpeg libtiff -y

In [3]:
#!conda install -c conda-forge libjpeg-turbo

In [4]:
#!CC="cc -mavx2" pip install --no-cache-dir -U --force-reinstall --no-binary :all: --compile pillow-simd

In [5]:
import os
from pathlib import Path
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from fastai import *
from fastai.vision import *

from utils import open_4_channel
from dataset import ImageMulti4Channel
from resnet import Resnet4Channel

from sklearn.metrics import f1_score

In [6]:
# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
# NOTE(review): only NumPy is seeded here; torch and Python's `random` are not — confirm whether that matters.
np.random.seed(21)

In [7]:
path = Path('data/')

In [8]:
# Load the training labels (columns Id and Target) from the data directory
# and preview the first rows.
df = pd.read_csv(path/'train.csv')
df.head()

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [9]:
# Load the labels of the external images (same Id/Target layout) and preview them.
# Note: this path is relative to the working directory, not to `path`.
df_external = pd.read_csv('external/external.csv')
df_external.head()

Unnamed: 0,Id,Target
0,10580_1610_C1_1,13 25 0 2 21
1,10580_1610_C1_2,13 25 0 2 21
2,10580_1756_B1_1,13 25 0 2 21
3,10580_1756_B1_2,13 25 0 2 21
4,10580_1758_B1_1,13 25 0 2 21


In [10]:
# Stack the competition rows first and the external rows after them, with a
# fresh 0..N-1 index. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in pandas 2.0; ignore_index=True is equivalent to
# the former .reset_index(drop=True).
df_complete = pd.concat([df, df_external], ignore_index=True)

In [11]:
len(df), len(df_external), len(df_complete)

(31072, 74606, 105678)

In [12]:
# Copy every image Id (competition + external) into a fresh Series and show
# how many there are.
fns = pd.Series(list(df_complete.Id))
len(fns)

105678

In [13]:
# Image ids, per-image label lists (Target split on single spaces) and the
# class names '0'..'27', all in df_complete row order.
fns = pd.Series(list(df_complete.Id))
labels = [target.split(' ') for target in df_complete.Target]
classes = list(map(str, range(28)))

In [14]:
# Boolean mask over df_complete rows: True for the first len(df) rows (the
# competition images), False for the external rows stacked after them.
n_competition = len(df)
trn_mask = [i < n_competition for i in range(len(df_complete))]

len(trn_mask), trn_mask.count(True), trn_mask.count(False)

(105678, 31072, 74606)

In [15]:
# Build two datasets from the ids, labels and the boolean mask. Per the counts
# displayed below, the mask's True rows (competition images) end up in trn_ds
# and the False rows (external images) in val_ds.
# NOTE(review): both sets are read from the 'train' folder under `path` — confirm
# the external images are stored there too.
trn_ds, val_ds = ImageMulti4Channel.from_folder_validx(path, 'train', trn_mask, fns, labels, classes=classes)
len(trn_ds), len(val_ds)

(31072, 74606)

In [16]:
# Read the sample submission (one row per test image) through `path`, like
# train.csv, so the data directory is defined in exactly one place.
df_test = pd.read_csv(path/'sample_submission.csv')
df_test.head()

Unnamed: 0,Id,Predicted
0,00008af0-bad0-11e8-b2b8-ac1f6b6435d0,0
1,0000a892-bacf-11e8-b2b8-ac1f6b6435d0,0
2,0006faa6-bac7-11e8-b2b7-ac1f6b6435d0,0
3,0008baca-bad7-11e8-b2b9-ac1f6b6435d0,0
4,000cce7e-bad4-11e8-b2b8-ac1f6b6435d0,0


In [17]:
# Ids of the test images, in sample-submission order.
test_ids = list(df_test.Id)

In [18]:
len(df_complete) + len(df_test)

117380

In [20]:
# Dataset over the 'test' folder. Every id is given the placeholder label '0'
# with a single placeholder class; valid_pct=0 is passed and the second return
# value is discarded.
test_ds,_ = ImageMulti4Channel.from_folder(
    path, 'test', pd.Series(test_ids),[['0'] for _ in range(len(test_ids))], valid_pct=0, classes=['0'])


In [23]:
# Statistics handed to .normalize() when the DataBunch is built.
# Presumably (per-channel means, per-channel stds) for the 4 image channels — TODO confirm.
stats_train = ([0.0574, 0.0475, 0.0305, 0.0894], [0.0873, 0.0721, 0.0766, 0.1303])

In [24]:
def resnet50(pretrained=True):
    """Return a depth-50 `Resnet4Channel`, forwarding the `pretrained` flag."""
    model = Resnet4Channel(encoder_depth=50, pretrained=pretrained)
    return model

In [25]:
def _resnet_split(m:nn.Module): return (m[0][6],m[1])

In [26]:
class FocalLoss(nn.Module):
    """Focal loss computed on raw logits against same-shaped 0/1 targets.

    Each element's binary cross-entropy (with logits) is scaled by
    ``exp(gamma * logsigmoid(-x * (2t - 1)))``, which shrinks the contribution
    of elements the model already gets right. The per-element losses are
    summed over dim 1 and averaged over the remaining elements.
    """

    def __init__(self, gamma=2): #gamma=4 working pretty bad
        super().__init__()
        self.gamma = gamma

    def forward(self, input, target):
        """Return the scalar focal loss.

        Raises:
            ValueError: if `input` and `target` do not have the same size.
        """
        if target.size() != input.size():
            raise ValueError("Target size ({}) must be the same as input size ({})"
                             .format(target.size(), input.size()))

        # Binary cross-entropy with logits; subtracting the clamped value
        # before exponentiating keeps exp() from overflowing.
        clamped = (-input).clamp(min=0)
        log_term = ((-clamped).exp() + (-input - clamped).exp()).log()
        bce = input - input * target + clamped + log_term

        # Map targets {0, 1} -> {-1, +1}; the modulating factor is computed
        # in log space and exponentiated.
        signed_target = target * 2.0 - 1.0
        log_modulator = F.logsigmoid(-input * signed_target)
        focal = (log_modulator * self.gamma).exp() * bce

        return focal.sum(dim=1).mean()

In [27]:
# F1 metric: `fbeta` (from the star imports above) with beta fixed to 1 and a
# decision threshold of 0.4.
f1 = partial(fbeta, thresh=0.4, beta=1) #thresh=weights

def acc(preds,targs,th=0.0):
    """Element-wise accuracy: fraction of entries where `preds > th` equals `targs`.

    Both sides are cast to int before comparing; the result is a 0-dim float tensor.
    """
    hard_preds = preds.gt(th).int()
    matches = hard_preds.eq(targs.int())
    return matches.float().mean()

In [28]:
# Turn on cuDNN benchmark mode, which lets cuDNN try several convolution
# algorithms and keep the fastest one.
torch.backends.cudnn.benchmark = True
# NOTE(review): src_size is not referenced in the visible cells — confirm it is still needed.
src_size = 1024

In [29]:
# Wrap the three datasets in a DataBunch (empty transform lists, size=512,
# batch size 16, 16 workers), then normalise with the precomputed statistics.
data = ImageDataBunch.create(
    trn_ds, val_ds, test_ds=test_ds, path=path, bs=16,
    ds_tfms=([], []), num_workers=16, size=512,
)
data = data.normalize(stats_train)

In [30]:
# Create the learner: `resnet50` (the 4-channel wrapper defined above) as the
# architecture, `_resnet_split` to define the layer groups, f1 and acc as the
# reported metrics, and the ShowGraph callback attached.
learn = create_cnn(
    data,
    resnet50,
    ps=[0.5, 0.5],
    cut=-2,
    split_on=_resnet_split,
    path=path,
    metrics=[f1,acc],
    callback_fns=[ShowGraph] #, BnFreeze
)

In [31]:
# Swap the learner's loss for the focal loss defined above (default gamma=2).
learn.loss_func = FocalLoss()

In [32]:
# Split the model into layer groups with `_resnet_split`.
# NOTE(review): the same split_on was already passed to create_cnn — this call may be redundant; confirm.
learn.split(split_on=_resnet_split)

In [33]:
# Gradient clipping at 1.0. fastai v1's Learner never reads a bare `clip`
# attribute (that was the fastai 0.7 mechanism), so the assignment alone
# silently disables clipping. Register the GradientClipping callback instead;
# callback_fns entries are instantiated with the learner on the next fit call.
# The attribute is kept in case other code reads it.
learn.clip = 1.0 #gradient clipping
learn.callback_fns.append(partial(GradientClipping, clip=learn.clip))

In [34]:
learn.layer_groups

[Sequential(
   (0): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
   (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (2): ReLU(inplace)
   (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
   (4): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
   (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (7): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (8): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
   (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (10): ReLU(inplace)
   (11): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
   (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (13): Conv2d(256, 64, kernel_

In [35]:
# Free unreferenced objects, then list the checkpoint files that the
# prediction loop below will load.
gc.collect()
models_path = path/'models/best_models'
os.listdir(models_path)

['res50_folding.pth', 'res50_old_aug.pth', 'res50_oversampling.pth']

In [36]:
# For every saved checkpoint, run test-time augmentation over the train,
# validation (external) and test splits and dump the raw predictions to CSV.
# NOTE(review): if the training DataLoader shuffles or drops the last batch
# (the usual default), rows in *_train.csv will not line up with `df` order,
# and the targets returned in `y` are not saved — confirm before joining on Id.
out_dir = 'prediction_whole_set'
os.makedirs(out_dir, exist_ok=True)  # np.savetxt does not create missing directories

for model in sorted(os.listdir(models_path)):  # sorted: os.listdir order is arbitrary
    model_name, ext = os.path.splitext(model)
    if ext != '.pth':
        continue  # skip anything that is not a checkpoint (stray files, sub-directories)
    print(model)
    learn.load('best_models/' + model_name)
    learn.model.eval()

    preds_train, y = learn.TTA(ds_type=DatasetType.Train)
    preds_train_np = preds_train.numpy()
    np.savetxt(os.path.join(out_dir, model_name + '_train.csv'), preds_train_np, delimiter=",")

    preds_valid, y = learn.TTA(ds_type=DatasetType.Valid)
    preds_valid_np = preds_valid.numpy()
    np.savetxt(os.path.join(out_dir, model_name + '_external.csv'), preds_valid_np, delimiter=",")

    preds_test, y_t = learn.TTA(ds_type=DatasetType.Test)
    preds_test_np = preds_test.numpy()
    np.savetxt(os.path.join(out_dir, model_name + '_test.csv'), preds_test_np, delimiter=",")

    gc.collect()

KeyboardInterrupt: 