In [1]:
# Identifier for this run, stored as a string.
# NOTE(review): not referenced in the visible cells - presumably tags outputs further down; confirm.
midx = "5"

In [2]:
import socket
import timeit
import time
from datetime import datetime
import os
import glob
from collections import OrderedDict
import numpy as np
import pandas as pd
import pickle
import gc
import cv2
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
import seaborn as sns
sns.set_style("white")
import random
import PIL
import pathlib
import pathlib

import torch
from torch.autograd import Variable
import torch.optim as optim
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.utils import make_grid
from torch import nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau, StepLR
from torch.utils.data.sampler import WeightedRandomSampler
import torchvision

import albumentations as A

from skimage.exposure import histogram, equalize_hist, equalize_adapthist
from skimage.morphology import dilation, remove_small_objects, remove_small_holes, label

import pretrainedmodels
from xception import xception

from tensorboardX import SummaryWriter

from scipy.special import logit
from sklearn.metrics import jaccard_similarity_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

import imgaug as ia
from imgaug import augmenters as iaa
import multiprocessing
import threading

from dataloaders import utils
from dataloaders import custom_transforms as tr

# from losses import CombinedLoss, BCELoss2d
import lovasz_losses as L

In [3]:
# Base directory for this notebook.
directory = './'

# Three image edge lengths; all equal here, so the padding below works out to zero.
ori_size, up_size, image_size = 512, 512, 512

# OpenCV interpolation flag. Other options tried at some point:
# cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4.
interp = cv2.INTER_AREA

# Total padding needed to go from up_size to image_size, split into a leading
# ("min") and trailing ("max") part. Both axes use the same expression, so they
# are assigned together.
y_pad = x_pad = image_size - up_size
y_min_pad = x_min_pad = int(y_pad / 2)
y_max_pad = x_max_pad = y_pad - y_min_pad

print(ori_size, up_size, image_size)

512 512 512


In [4]:
# Input locations, relative to the notebook's working directory.
PATH_TO_TRAIN = './train/'
PATH_TO_TEST = './test/'
PATH_TO_EXTERNAL2 = './external_data2/'
PATH_TO_EXTERNAL3 = './external_data3/'
PATH_TO_EXTERNAL4 = './external_data4/'
PATH_TO_EXTERNAL5 = './external_data5/'
PATH_TO_TARGET = './train.csv'
PATH_TO_TARGETXX = './HPAv18Y.csv'
PATH_TO_SUB = './sample_submission.csv'

# Class index -> class name; the position in this list is the class index.
LABEL_MAP = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Peroxisomes",
    "Endosomes",
    "Lysosomes",
    "Intermediate filaments",
    "Actin filaments",
    "Focal adhesion sites",
    "Microtubules",
    "Microtubule ends",
    "Cytokinetic bridge",
    "Mitotic spindle",
    "Microtubule organizing center",
    "Centrosome",
    "Lipid droplets",
    "Plasma membrane",
    "Cell junctions",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Cytoplasmic bodies",
    "Rods & rings",
]))

# Inverse lookup: class name -> class index.
LOC_MAP = {name: idx for idx, name in LABEL_MAP.items()}

In [5]:
# Label table for the external images (from Tomomi), indexed by image Id.
dxx = pd.read_csv(PATH_TO_TARGETXX, index_col=None).set_index('Id')
# Keep only the rows flagged GotYellow == 1.
dxx = dxx.loc[dxx['GotYellow'] == 1]
print(dxx.head())
print(dxx.shape)

                              Target  GotYellow
Id                                             
ENSG00000000003_4109_24_H11_1     25          1
ENSG00000000003_4109_24_H11_2     25          1
ENSG00000000003_4109_23_H11_1     25          1
ENSG00000000003_4109_23_H11_2     25          1
ENSG00000000003_4109_25_H11_1     25          1
(77444, 2)


In [6]:
# The dataloader bombed out at iteration 63914 on an earlier run, so the table
# was sliced to process it in pieces. The slices below are currently disabled,
# which means the full table is used; re-enable one to limit the rows.
# dxx = dxx.iloc[:50000]
# dxx = dxx.iloc[50000:]
# dxx = dxx.iloc[37154:]
print(dxx.shape)

(77444, 2)


In [7]:
def image_histogram_equalization(image, number_bins=256):
    """Histogram-equalize an array and rescale the result to the 0..255 range.

    Adapted from
    http://www.janeriksolem.net/2009/06/histogram-equalization-with-python-and.html

    Args:
        image: numpy array of any shape; every element takes part in one histogram.
        number_bins: number of histogram bins.

    Returns:
        Float array with the same shape as ``image`` holding the equalized values.
    """
    values = image.flatten()

    # Density histogram -> cumulative distribution, scaled so its last entry is 255.
    density, edges = np.histogram(values, number_bins, density=True)
    cdf = np.cumsum(density)
    cdf = 255 * cdf / cdf[-1]

    # Map each value through the CDF, interpolating linearly between left bin edges.
    equalized = np.interp(values, edges[:-1], cdf)
    return equalized.reshape(image.shape)

def equalize(arr):
    """Histogram-equalize every slice along the last axis of ``arr`` independently.

    The input is copied to float first, so the caller's array is not modified.
    The last slice is processed like the others: the alpha slot is normally left
    alone, but here it carries the yellow band.

    Returns:
        Float array of the same shape with each last-axis slice equalized.
    """
    result = arr.astype('float')
    n_channels = result.shape[-1]
    for channel in range(n_channels):
        result[..., channel] = image_histogram_equalization(result[..., channel])
    return result

def normalize(arr, q=0.01):
    """Percentile-stretch every slice along the last axis of ``arr`` to 0..255.

    For each slice, values are clipped to the [q, 1-q] quantile range and then
    rescaled linearly so the low quantile maps to 0 and the high quantile to 255.
    A slice whose two quantiles coincide is set to zero. The last slice is
    processed too: the alpha slot is normally left alone, but here it carries
    the yellow band.

    Args:
        arr: numpy array; the last axis indexes the slices.
        q: quantile fraction clipped at each end.

    Returns:
        Float copy of ``arr`` with each slice stretched; the input is untouched.
    """
    out = arr.astype('float')
    low_pct = 100*q
    high_pct = 100*(1.0-q)
    for ch in range(out.shape[-1]):
        band = out[..., ch]
        lo = np.percentile(band, low_pct)
        hi = np.percentile(band, high_pct)
        if lo != hi:
            out[..., ch] = 255.*(np.clip(band, lo, hi) - lo)/(hi - lo)
        else:
            # Flat slice: nothing to stretch.
            out[..., ch] = 0.
    return out

def standardize(arr, per_channel=False):
    """Shift and scale ``arr`` to zero mean and (approximately) unit standard deviation.

    The previous version computed the standardized values inside its loop but
    never stored them, so it returned an unmodified float copy of the input.
    This version returns the standardized data.

    Args:
        arr: numpy array; the last axis indexes the channels.
        per_channel: if False (default), use the mean and standard deviation of
            the whole array, as the original expression did. If True, standardize
            each slice along the last axis with its own statistics.

    Returns:
        Float array of the same shape as ``arr``; the input is not modified.
    """
    arr = arr.astype('float')
    # The small constant keeps the division finite when the data is constant.
    if per_channel:
        for i in range(arr.shape[-1]):
            band = arr[..., i]
            arr[..., i] = (band - band.mean())/(band.std() + 1e-6)
        return arr
    # Statistics are taken once, before any value changes.
    return (arr - arr.mean())/(arr.std() + 1e-6)



class MultiBandMultiLabelDataset(Dataset):
    """Dataset that builds one multi-band image per row of a label table.

    Each row's ``Id`` is joined to ``base_path`` to form a file stem. Unless
    ``external == 1``, the four files ``<stem><band><suffix>`` (one per entry of
    ``BANDS_NAMES``) are merged into a single 4-band PIL image in 'RGBA' mode.
    With ``external == 1`` a single file ``<stem><suffix>`` is opened instead.
    In train mode the row's ``Target`` (space-separated integers) is returned as
    a list of ints.

    Args:
        images_df: DataFrame whose ``reset_index()`` yields an ``Id`` column, and
            a ``Target`` column when ``train_mode`` is True.
        base_path: directory (str or ``pathlib.Path``) that holds the image files.
        image_transform: optional callable applied to the image last.
        augmentator: optional callable invoked as ``augmentator(image=X)['image']``.
        train_mode: when True, targets are loaded and one-hot encoded in
            ``collate_func``; when False, targets are ``None``.
        external: selects the file layout. 2 means '.jpg' band files converted to
            single-band "L"; 1 means one '.png' file per image; anything else
            means '.png' band files.
        use_cache: when True (default), every loaded (image, target) pair is kept
            in ``self.cache``. One entry is retained per distinct index for the
            life of the dataset, so memory grows with the number of items
            visited; pass False for single-pass jobs over large tables.
    """

    # Per-band file-name suffixes, in the order the bands are merged.
    BANDS_NAMES = ['_red','_green','_blue','_yellow']

    def __init__(self, images_df,
                 base_path,
                 image_transform=None,
                 augmentator=None,
                 train_mode=True,
                 external=0,
                 use_cache=True
                ):
        if not isinstance(base_path, pathlib.Path):
            base_path = pathlib.Path(base_path)

        self.images_df = images_df.reset_index()
        self.image_transform = image_transform
        self.augmentator = augmentator
        # Store full path stems (base_path / Id) in place of the bare ids.
        self.images_df['Id'] = self.images_df['Id'].apply(lambda x: base_path / x)
        self.mlb = MultiLabelBinarizer(classes=list(LABEL_MAP.keys()))
        self.train_mode = train_mode
        self.external = external
        self.suffix = '.jpg' if self.external == 2 else '.png'
        self.use_cache = use_cache
        self.cache = {}

    def __len__(self):
        """Number of rows in the label table."""
        return len(self.images_df)

    def __getitem__(self, index):
        """Return ``(image, target)`` for ``index``.

        The un-augmented image and target are served from the cache when
        present; augmentation and the image transform run on every call.
        """
        if isinstance(index, torch.Tensor):
            index = index.item()

        if index in self.cache:
            X, y = self.cache[index]
        else:
            X = self._load_multiband_image(index)
            y = self._load_multilabel_target(index) if self.train_mode else None
            if self.use_cache:
                self.cache[index] = (X, y)

        if self.augmentator is not None:
            # NOTE(review): X is a PIL image at this point; an earlier variant
            # wrapped it in np.array first - confirm the augmentator accepts PIL input.
            X = self.augmentator(image=X)['image']

        if self.image_transform is not None:
            X = self.image_transform(X)

        return X, y

    def _load_multiband_image(self, index):
        """Load the image for row ``index`` and resize it to image_size x image_size.

        Returns:
            A PIL image. For the band-file layouts it is a 4-band 'RGBA' image
            (red, green, blue, yellow in the R, G, B, A slots); for
            ``external == 1`` it keeps the mode of the file on disk.
        """
        row = self.images_df.iloc[index]
        stem = str(row.Id.absolute())

        if self.external == 1:
            # This branch used to bind a different name than the one resized and
            # returned below, so it raised before returning.
            with PIL.Image.open(stem + self.suffix) as handle:
                image = handle.copy()
        else:
            image_bands = []
            for band_name in self.BANDS_NAMES:
                with PIL.Image.open(stem + band_name + self.suffix) as handle:
                    if self.external == 2:
                        band = handle.convert("L")
                    else:
                        # copy() reads the pixel data so the file can be closed here.
                        band = handle.copy()
                image_bands.append(band)

            # Pretend it is an RGBA image to carry four channels.
            image = PIL.Image.merge('RGBA', bands=image_bands)

        # LANCZOS is the filter formerly exposed as ANTIALIAS, which newer Pillow
        # releases no longer provide.
        return image.resize((image_size, image_size), PIL.Image.LANCZOS)

    def _load_multilabel_target(self, index):
        """Return the row's ``Target`` as a list of ints.

        The value is converted to text and split on whitespace, so a target that
        pandas parsed as a single number still works. If any token is not an
        integer (for example a missing value), an empty list is returned.
        """
        target = self.images_df.iloc[index].Target
        try:
            return [int(token) for token in str(target).split()]
        except ValueError:
            return []

    def collate_func(self, batch):
        """Collate ``(image, target)`` pairs into a batch.

        Returns:
            ``(images, labels)`` where ``images`` is ``torch.stack`` of the
            per-item images and ``labels`` is a FloatTensor of one-hot rows over
            the ``LABEL_MAP`` classes in train mode, otherwise ``None``.
        """
        labels = None
        images = [item[0] for item in batch]

        if self.train_mode:
            targets = [item[1] for item in batch]
            labels_one_hot = self.mlb.fit_transform(targets)
            labels = torch.FloatTensor(labels_one_hot)

        return torch.stack(images), labels


In [8]:
# Mean / std handed to A.Normalize below.
# NOTE(review): three entries each, while the dataset class in this notebook
# merges four bands - confirm these pipelines are only used on 3-channel input.
imean = (0.08069, 0.05258, 0.05487)
istd = (0.13704, 0.10145, 0.15313)

# Keyword arguments shared by both pipelines' final normalization step.
_normalize_kwargs = dict(mean=imean, std=istd, max_pixel_value=255.)

# Training: right-angle rotation, optional flip, shift/scale/rotate, then normalize.
# (A.Rotate, A.RandomBrightness and A.RandomContrast were tried and are disabled.)
train_aug = A.Compose([
    A.RandomRotate90(p=1),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(p=0.9),
    A.Normalize(**_normalize_kwargs),
])

# Test: normalization only.
test_aug = A.Compose([
    A.Normalize(**_normalize_kwargs),
])


In [9]:
# Both torchvision pipelines currently contain ToTensor only. Resize, random
# flips, RandomAffine and a 4-channel Normalize(mean=[0.456]*4, std=[0.224]*4)
# were tried earlier and are disabled.
composed_transforms_train = transforms.Compose([transforms.ToTensor()])

composed_transforms_test = transforms.Compose([transforms.ToTensor()])

In [10]:
# Root folder for saved outputs (a __file__-based root is not usable in a notebook).
save_dir_root = './'

# CUDA device index; a negative value skips device selection in the setup cell.
gpu_id = 0

# Small constant and threshold; their consumers are not in the visible cells.
eps = 1e-5
thresh = 0.1

# Run a garbage-collection pass; as the cell's last expression its return value is displayed.
gc.collect()

7

In [11]:
from os import listdir
from os.path import isfile, join

# NOTE(review): fold is not read in the visible cells - confirm its meaning downstream.
fold = -1

# Select the configured CUDA device (skipped for a negative gpu_id), then
# release cached GPU memory.
if gpu_id >= 0:
    print('Using GPU: {} '.format(gpu_id))
    torch.cuda.set_device(device=gpu_id)

torch.cuda.empty_cache()

# Names of the regular files directly inside the external image folder.
file_list_x = list(filter(
    lambda name: isfile(join(PATH_TO_EXTERNAL2, name)),
    listdir(PATH_TO_EXTERNAL2),
))
print(file_list_x[:15],len(file_list_x))

Using GPU: 0 
['ENSG00000000460_24451_224_G2_1_blue.jpg', 'ENSG00000000003_4109_24_H11_1_red.jpg', 'ENSG00000000003_4109_23_H11_1_blue.jpg', 'ENSG00000000003_4109_24_H11_1_blue.jpg', 'ENSG00000000003_4109_24_H11_2_blue.jpg', 'ENSG00000000003_4109_23_H11_1_red.jpg', 'ENSG00000000003_4109_24_H11_1_yellow.jpg', 'ENSG00000000003_4109_23_H11_1_yellow.jpg', 'ENSG00000000003_4109_24_H11_2_red.jpg', 'ENSG00000000003_4109_23_H11_1_green.jpg', 'ENSG00000000003_4109_24_H11_2_green.jpg', 'ENSG00000000003_4109_24_H11_1_green.jpg', 'ENSG00000000003_4109_24_H11_2_yellow.jpg', 'ENSG00000000003_4109_23_H11_2_red.jpg', 'ENSG00000000003_4109_23_H11_2_blue.jpg'] 311022


In [12]:
# Dataset over the external images listed in dxx. external=2 selects the '.jpg'
# band files; no augmentation is applied, only the test transform.
db_xx = MultiBandMultiLabelDataset(
    dxx,
    base_path=PATH_TO_EXTERNAL2,
    image_transform=composed_transforms_test,
    external=2,
)

# One image per batch, in table order, loaded by a single worker process.
xxloader = DataLoader(
    db_xx,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    collate_fn=db_xx.collate_func,
)

In [13]:
# Kept from the original cell; nothing in the loop below appends to them.
id_list = []
im_list = []
y_list = []

# Saving fails if the destination folder is missing, so create it up front.
os.makedirs(PATH_TO_EXTERNAL5, exist_ok=True)

# Write every merged 4-band image to PATH_TO_EXTERNAL5 as a single RGBA PNG.
# xxloader is built with batch_size=1 and shuffle=False, so batch i corresponds
# to row i of db_xx.images_df.
for i, (im, y) in enumerate(xxloader):
    # Take the file name from the stored path instead of slicing off a fixed
    # number of characters, so this keeps working if the source folder changes.
    image_name = pathlib.Path(db_xx.images_df.Id[i]).name
    out_stem = PATH_TO_EXTERNAL5 + image_name

    # (1, C, H, W) batch -> (H, W, C) array scaled back to 0..255.
    # presumably the loader yields floats in [0, 1] (ToTensor) - TODO confirm
    pixels = im.cpu().detach().numpy()[0].transpose(1, 2, 0) * 255
    # Round before the integer cast so float error cannot drop a value by one.
    pixels = np.rint(pixels).astype('uint8')

    PIL.Image.fromarray(pixels, 'RGBA').save(out_stem + '.png', "PNG")

    if i % 1000 == 0: print(i, out_stem)

0 ./external_data5/ENSG00000000003_4109_24_H11_1
1000 ./external_data5/ENSG00000008405_18762_1035_D8_2
2000 ./external_data5/ENSG00000023608_72276_1400_G3_6
3000 ./external_data5/ENSG00000047644_39814_451_G6_1
4000 ./external_data5/ENSG00000060339_48513_798_G7_2
5000 ./external_data5/ENSG00000067365_74173_1457_E5_2
6000 ./external_data5/ENSG00000073282_6288_1596_E11_3
7000 ./external_data5/ENSG00000078304_27553_251_B10_2
8000 ./external_data5/ENSG00000083828_30306_1313_F6_1
9000 ./external_data5/ENSG00000088854_67418_1263_C6_2
10000 ./external_data5/ENSG00000093000_48328_1137_H7_1
11000 ./external_data5/ENSG00000100162_42404_692_D4_5
12000 ./external_data5/ENSG00000100883_48977_749_D12_2
13000 ./external_data5/ENSG00000102119_2029_673_G6_2
14000 ./external_data5/ENSG00000103591_40174_424_F7_1
15000 ./external_data5/ENSG00000105197_48843_736_B12_1
16000 ./external_data5/ENSG00000106236_58320_1742_F8_22_cr5804950d083f7
17000 ./external_data5/ENSG00000108175_45144_1125_C8_3
18000 ./extern