In [1]:
!ls /kaggle/input/pyvips-python-and-deb-package
 # install the deb packages
!dpkg -i --force-depends /kaggle/input/pyvips-python-and-deb-package/linux_packages/archives/*.deb
 # install the python wrapper
!pip install pyvips -f /kaggle/input/pyvips-python-and-deb-package/python_packages/ --no-index


linux_packages	python_packages
Selecting previously unselected package autoconf.
(Reading database ... 107763 files and directories currently installed.)
Preparing to unpack .../autoconf_2.69-11.1_all.deb ...
Unpacking autoconf (2.69-11.1) ...
Selecting previously unselected package automake.
Preparing to unpack .../automake_1%3a1.16.1-4ubuntu6_all.deb ...
Unpacking automake (1:1.16.1-4ubuntu6) ...
Selecting previously unselected package autopoint.
Preparing to unpack .../autopoint_0.19.8.1-10build1_all.deb ...
Unpacking autopoint (0.19.8.1-10build1) ...
Selecting previously unselected package autotools-dev.
Preparing to unpack .../autotools-dev_20180224.1_all.deb ...
Unpacking autotools-dev (20180224.1) ...
Selecting previously unselected package bzip2-doc.
Preparing to unpack .../bzip2-doc_1.0.8-2_all.deb ...
Unpacking bzip2-doc (1.0.8-2) ...
Selecting previously unselected package debhelper.
Preparing to unpack .../debhelper_12.10ubuntu1_all.deb ...
Unpacking debh

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import matplotlib.pyplot as plt
import os
import pyvips
import time
from tqdm import tqdm
import random
from PIL import Image
Image.MAX_IMAGE_PIXELS = None

from scipy.special import softmax


from torch import optim 
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score, accuracy_score
    
# Read the train, test and sample-submission CSV files of the competition dataset.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/UBC-OCEAN/train.csv',
        '/kaggle/input/UBC-OCEAN/test.csv',
        '/kaggle/input/UBC-OCEAN/sample_submission.csv',
    )
)

In [3]:
def extract_tiles_PIL(image_path,
                        patch_size=300, 
                        num_patches=100,
                        scale = 1.,
                        threshold_black_pixel=10*3,
                        threshold_black_background_ratio=0.1, 
                        threshold_variability=0.15):
    """Cut up to `num_patches` square patches out of an image opened with PIL.

    The image is divided into a grid of `patch_size` x `patch_size` cells
    (partial cells at the right/bottom edge are ignored). Grid columns and
    rows are visited in a shuffled order (the same shuffled row order is used
    for every column), and a cell is kept only if it passes both filters:

    * fewer than `threshold_black_background_ratio * patch_size**2` pixels
      have a channel sum below `threshold_black_pixel`;
    * the coefficient of variation (std / mean) of the per-pixel channel sums
      is above `threshold_variability`.

    Args:
        image_path: Path of the image. The file name without extension must
            be an integer; it is returned as the image id.
        patch_size: Edge length of a grid cell in pixels.
        num_patches: Stop once this many patches have been collected.
        scale: If smaller than 1, every returned patch is resized to
            `int(scale * patch_size)` pixels per side.
        threshold_black_pixel: Channel-sum below which a pixel counts as black.
        threshold_black_background_ratio: Maximum tolerated fraction of black
            pixels in a patch.
        threshold_variability: Minimum std/mean ratio of the channel sums.

    Returns:
        Tuple `(images, image_id)`: the list of accepted PIL patches and the
        integer id parsed from the file name. When no patch passes the
        filters, the list holds a single tile made by resizing the whole
        image to the output patch size, and 'no patch found' is printed.

    Raises:
        ValueError: If the file name (without extension) is not an integer.
    """
    images = []
    name, _ = os.path.splitext(os.path.basename(image_path))
    # Edge length of everything that is returned; the fallback tile uses the
    # same size as accepted patches so callers always get uniform tiles.
    out_size = int(scale*patch_size) if scale < 1 else patch_size

    with Image.open(image_path) as image:
        n_patches_h = int(np.floor(image.width/patch_size))
        n_patches_v = int(np.floor(image.height/patch_size))

        grid_h = np.arange(n_patches_h)
        random.shuffle(grid_h)

        grid_v = np.arange(n_patches_v)
        random.shuffle(grid_v)

        for i in grid_h:
            if len(images) == num_patches:
                break
            for j in grid_v:
                left, top = int(i) * patch_size, int(j) * patch_size
                patch = image.crop((left, top, left + patch_size, top + patch_size))

                # Per-pixel sum over the channel axis, used as a brightness proxy.
                bw = np.sum(patch, axis=2)
                num_black_pixels = np.sum(bw < threshold_black_pixel)

                mean_bw = np.mean(bw)
                variability = np.std(bw)/mean_bw if mean_bw > 0 else 0

                # Discard patches with too much black background ...
                few_black_pixels = num_black_pixels < threshold_black_background_ratio*patch_size**2
                # ... and patches that are too uniform.
                enough_texture = variability > threshold_variability

                if few_black_pixels and enough_texture:
                    if scale < 1:
                        patch = patch.resize((out_size, out_size))
                    images.append(patch)
                    if len(images) == num_patches:
                        break

        if not images:
            # Nothing passed the filters (or the image is smaller than one
            # patch): fall back to the whole image squeezed into one tile.
            # Image.resize takes the target size as a single (width, height) tuple.
            images.append(image.resize((out_size, out_size)))
            print('no patch found')

    return images, int(name)

In [4]:
def extract_tiles_jirka(image_path, 
                        patch_size=1200,
                        num_patches=100, 
                        scale=0.25, 
                        drop_thr = 0.6,
                        white_thr = 240):
    """Sample non-empty square tiles from an image read with pyvips.

    The image is covered by a grid of `patch_size` cells which are visited in
    random order. Cells that run past the right/bottom edge are zero-padded to
    full size; pixels whose channels are all zero are then painted white. A
    tile is skipped when at least `drop_thr` of its pixels have a channel mean
    above `white_thr`. Kept tiles are resized to `int(patch_size * scale)`
    pixels per side with Lanczos resampling.

    Args:
        image_path: Path of the image; the file name without extension must be
            an integer and is returned as the image id.
        patch_size: Edge length of a grid cell in pixels.
        num_patches: Maximum number of tiles when an int; any other value is
            multiplied by the number of grid cells and truncated to an int.
        scale: Resize factor applied to each kept tile.
        drop_thr: Fraction of near-white pixels from which a tile is dropped.
        white_thr: Channel-mean above which a pixel counts as background.

    Returns:
        Tuple `(images, image_id)` with the list of PIL tiles and the integer id.
    """
    slide_id = os.path.splitext(os.path.basename(image_path))[0]
    slide = pyvips.Image.new_from_file(image_path)
    side = patch_size

    # Top-left corners of the grid cells, row by row.
    corners = [(top, left)
               for top in range(0, slide.height, side)
               for left in range(0, slide.width, side)]
    if not isinstance(num_patches, int):
        # a non-int is interpreted as a fraction of all grid cells
        num_patches = int(len(corners) * num_patches)
    random.shuffle(corners)

    out_size = (int(patch_size * scale), int(patch_size * scale))
    tiles = []
    for top, left in corners:
        # https://libvips.github.io/pyvips/vimage.html#pyvips.Image.crop
        region = slide.crop(left, top,
                            min(side, slide.width - left),
                            min(side, slide.height - top))
        tile = region.numpy()[..., :3]

        if tile.shape[:2] != (side, side):
            # edge cell: embed the partial crop in a zero-filled full-size tile
            padded_shape = (side, side) if tile.ndim == 2 else (side, side, tile.shape[2])
            padded = np.zeros(padded_shape, dtype=tile.dtype)
            padded[:tile.shape[0], :tile.shape[1], ...] = tile
            tile = padded

        # pure-black pixels (including the padding) are treated as white background
        tile[np.sum(tile, axis=2) == 0, :] = 255
        background = np.mean(tile, axis=2) > white_thr
        if np.sum(background) >= background.size * drop_thr:
            continue

        tiles.append(Image.fromarray(tile).resize(out_size, Image.LANCZOS))
        # checked after appending because skipped cells do not count
        if len(tiles) >= num_patches:
            break

    return tiles, int(slide_id)

In [5]:
# Collect the test images and order them from the smallest file to the largest.
im_paths = glob.glob('/kaggle/input/UBC-OCEAN/test_images/*.png')
im_sizes = list(map(os.path.getsize, im_paths))
df = (
    pd.DataFrame({'paths': im_paths, 'sizes': im_sizes})
    .sort_values('sizes', ascending=True)
    .reset_index()  # the pre-sort position is kept in an 'index' column
)
file_paths = df['paths']

# File sizes in bytes, in the same (sorted) order as file_paths.
sizes = [os.path.getsize(path) for path in file_paths]

# Split the sorted files into consecutive batches: each batch takes as many
# files as keep the running total below max_size bytes, but never fewer than
# one. start_list[k] is the first file index of batch k and its last entry
# equals len(sizes); num_files_list[k] is the number of files in batch k.
max_size = 3e9
start_list = [0]
num_files_list = []
start = 0
while start < len(sizes):
    within_budget = np.cumsum(sizes[start:]) < max_size
    num_files = np.sum(within_budget)
    num_files_list.append(num_files if num_files > 0 else 1)
    start += num_files_list[-1]
    start_list.append(start)
    

from concurrent.futures import ThreadPoolExecutor

t0 = time.time()

# NOTE(review): `task` is not defined anywhere in the visible part of this
# notebook; it must come from another cell, otherwise this loop raises
# NameError. Confirm before relying on this cell.
# NOTE(review): `start_list[:1]` limits the loop to the first batch only; if
# every batch should be processed this probably wants `start_list[:-1]`.

# start_list holds one trailing end marker, so the batch count is one less.
n_batches = len(start_list) - 1

# Pair each batch start with the start of the next batch; when there are no
# files the zip is empty and the loop simply does not run.
for i, (start, stop) in enumerate(zip(start_list[:1], start_list[1:])):
    print()
    num_files = stop-start
    print(start, num_files)
    if num_files >1:
        print(f'Batch {i}/{n_batches}: Multi')
        with ThreadPoolExecutor(max_workers=4) as executor:
            # a separate loop variable keeps the batch index `i` intact
            futures = [executor.submit(task, file_idx)
                       for file_idx in range(start, start+num_files)]
        # Leaving the with-block waits for all tasks; result() re-raises any
        # exception that occurred inside a worker.
        for future in futures:
            future.result()

    else:
        print(f'Batch {i}/{n_batches}: Single')
        task(start)
    print(f'{start+num_files} files processed ({np.sum(sizes[:start+num_files])/1e9} GB)')
    print(f'This took {(time.time()-t0)/60} min')


img_dir = '/kaggle/working/tiles'
#df = pd.read_csv('/kaggle/input/UBC-OCEAN/test.csv')

# Index the tile files on disk. The image id is parsed as the integer in
# front of the first underscore of each file name.
tile_names = os.listdir(img_dir)
fpaths = [os.path.join(img_dir, tile_name) for tile_name in tile_names]
image_ids = [int(tile_name.split('_')[0]) for tile_name in tile_names]

df_tiles = pd.DataFrame({'image_id': image_ids, 'fpath': fpaths})
print(df_tiles.head())

#df_tiles['tile_number'] = [int(f.split('.')[0].split('_')[1]) for f in fpaths]
#df_tiles = df_tiles[df_tiles['tile_number']<tiles_per_image]
print(len(df_tiles))

In [6]:
# One Hot encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Fit the label encoders on the training labels.
# NOTE(review): this rebinds the module-level name `df`, which an earlier cell
# used for the table of test image paths and sizes.
df = pd.read_csv('/kaggle/input/UBC-OCEAN/train.csv')
labels = df['label']

# class name -> integer code, kept as a single column for the one-hot encoder
l_encoder = LabelEncoder().fit(labels)
l_encoded = l_encoder.transform(labels).reshape(-1, 1)

# integer code -> dense one-hot row
oh_encoder = OneHotEncoder(sparse_output=False).fit(l_encoded)
oh_encoded = oh_encoder.transform(l_encoded)

def oh_encoding(label):
    """One-hot encode a sequence of class names.

    The names are first mapped to integer codes with the module-level
    `l_encoder`, then expanded with the module-level `oh_encoder`.

    Args:
        label: Sequence of class names accepted by `l_encoder.transform`.

    Returns:
        A torch tensor built from the encoder output, one row per input label.
    """
    class_ids = l_encoder.transform(label)
    # the one-hot encoder expects a 2-D column of codes
    code_column = class_ids.reshape(len(class_ids), 1)
    return torch.from_numpy(oh_encoder.transform(code_column))

def oh_encoding_inv(probs):
    """Return the class name of the highest score in `probs`.

    The position of the maximum (taken over the flattened input, as
    `np.argmax` does without an axis) is mapped back to a class name with the
    module-level `l_encoder`.
    """
    best_code = np.argmax(probs)
    decoded = l_encoder.inverse_transform([best_code])
    return decoded[0]

In [7]:
class TileDataSet(Dataset):
    """Dataset over the tiles extracted from a single image.

    All tiles are extracted eagerly in the constructor from
    `file_paths[idx]` (a module-level collection of image paths) and kept in
    memory as PIL images.

    Args:
        idx: Position of the image in the module-level `file_paths`.
        transform: Optional callable applied to each tile on access. When it
            is None the tile is returned unchanged.
        mode: 'jirka' to tile with `extract_tiles_jirka`, 'chris' to tile
            with `extract_tiles_PIL`.

    Raises:
        ValueError: If `mode` is not one of the supported values.
    """
    def __init__(self, idx=0, transform=None, mode='jirka'):
        # NOTE(review): stores whatever the module-level `df` is at
        # construction time; nothing in this class reads it.
        self.df = df
        self.transform = transform
        self.mode = mode
        if self.mode == 'jirka':
            self.images, self.image_id = extract_tiles_jirka(file_paths[idx],
                                                        patch_size=1200, 
                                                        num_patches=20,
                                                        scale=0.25,
                                                        drop_thr = 0.6,
                                                        white_thr = 240)
        elif self.mode == 'chris':
            # lift PIL's pixel-count limit so very large images can be opened
            Image.MAX_IMAGE_PIXELS = None
            self.images, self.image_id = extract_tiles_PIL(file_paths[idx],
                                                        patch_size=300, 
                                                        num_patches=100,
                                                        scale = 1.,
                                                        threshold_black_pixel=10*3,
                                                        threshold_black_background_ratio=0.1, 
                                                        threshold_variability=0.15)
        else:
            # Fail here instead of with an AttributeError on the first
            # len()/indexing call, where the cause would be much less obvious.
            raise ValueError(f"unknown mode {mode!r}; expected 'jirka' or 'chris'")
        
    def __len__(self):
        """Number of tiles extracted from the image."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return tile `idx`, passed through `transform` when one was given."""
        im = self.images[idx]
        if self.transform is not None:
            im = self.transform(im)
        return im

In [8]:
from torchvision import transforms
import multiprocessing as mproc
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# Per-channel normalisation constants applied after conversion to a tensor
# (these are the values commonly quoted for ImageNet-pretrained backbones).
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Inference-time preprocessing: PIL image -> tensor -> normalised tensor.
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std)
    #transforms.Normalize(clr_mean, clr_std),  # custom
])


In [9]:
class GeM(nn.Module):
    """Generalized-mean pooling over the last two dimensions.

    Computes `(mean(clamp(x, eps) ** p)) ** (1 / p)` across the full spatial
    extent of the input, leaving a 1x1 map per channel. The exponent `p` is a
    learnable parameter initialised to the given value.
    """
    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(torch.ones(1) * p)

    def forward(self, x):
        """Pool `x` with the module's own exponent and epsilon."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Generalized mean of `x` over its last two dimensions."""
        height, width = x.size(-2), x.size(-1)
        # clamp first so the power is never taken of values below eps
        powered = x.clamp(min=eps).pow(p)
        averaged = F.avg_pool2d(powered, (height, width))
        return averaged.pow(1. / p)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f'{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})'

In [10]:
class UBCModel(nn.Module):
    """Backbone created by timm, followed by GeM pooling and a linear head.

    The backbone's own `classifier` and `global_pool` are replaced with
    identities, so it is expected to hand its feature maps through unchanged;
    they are pooled by `GeM`, flattened, and projected to `num_classes` outputs.

    Args:
        model_name: Name passed to `timm.create_model`.
        num_classes: Size of the output layer.
        pretrained: Forwarded to `timm.create_model`.
    """
    def __init__(self, model_name, num_classes, pretrained=True):
        super().__init__()
        # NOTE(review): `timm` is not imported in the visible part of this
        # notebook; constructing the model directly needs it in scope.
        backbone = timm.create_model(model_name, pretrained=pretrained)

        # remember the feature width before the original head is removed
        feature_dim = backbone.classifier.in_features
        backbone.classifier = nn.Identity()
        backbone.global_pool = nn.Identity()

        self.model = backbone
        self.pooling = GeM()
        self.linear = nn.Linear(feature_dim, num_classes)
        # kept as an attribute; forward() does not apply it
        self.softmax = nn.Softmax(dim=1)

    def forward(self, images):
        """Return the output of the linear head for a batch of images."""
        feature_maps = self.model(images)
        pooled = self.pooling(feature_maps).flatten(1)
        return self.linear(pooled)

    
#model = UBCModel(CONFIG['model_name'], CONFIG['num_classes'], CONFIG['pretrained'])
#model.to(CONFIG['device']);
#torch.save(model, 'model_initial_weights.pt')

In [11]:
cuda = False
import torch

# Earlier runs loaded these files instead:
#model = torch.load('/kaggle/input/training-with-tiles/model_initial_weights.pt')
#ckpt = torch.load('/kaggle/input/training-with-tiles/state_dict_epoch_50.pt',  map_location=torch.device('cpu'))

# Everything is mapped onto the CPU while loading.
# NOTE(review): torch.load unpickles the files, which can execute arbitrary
# code; only load checkpoints from a trusted source.
cpu_device = torch.device('cpu')

# The first file holds the saved model object itself; the second one is a
# checkpoint dict whose 'model_state_dict' entry carries the trained weights.
model = torch.load(
    '/kaggle/input/training-with-tiles-gem-pooling/model_initial_weights.pt',
    map_location=cpu_device,
)
ckpt = torch.load(
    '/kaggle/input/training-with-tiles-gem-pooling/state_dict_epoch_11.pt',
    map_location=cpu_device,
)

model.load_state_dict(ckpt['model_state_dict'])
model.eval();

In [12]:
predictions = []
image_ids = []
def infer_single_image(idx):
    """Predict one label for the image at position `idx` of `file_paths`.

    The image is tiled with `TileDataSet` (mode 'jirka'), every tile is run
    through the module-level `model`, the per-tile outputs are summed and
    passed through a softmax. The most likely class is returned when its
    probability is above 0.5; otherwise the prediction is 'Other'.

    An image that yields no tiles at all also gets 'Other', because there is
    no model output to base a class on.

    Args:
        idx: Position of the image in the module-level `file_paths`.

    Returns:
        Tuple `(image_id, prediction)`.
    """
    tile_dataset = TileDataSet(idx=idx, transform=val_transform, mode='jirka')
    if len(tile_dataset) == 0:
        # Without this guard the empty sum collapses to a scalar whose softmax
        # is 1.0, which would silently select the first class.
        return tile_dataset.image_id, 'Other'

    tile_dataloader = DataLoader(tile_dataset,
                                    batch_size=5,
                                    shuffle=False)    
    outputs = []
    # inference only: do not record the autograd graph
    with torch.no_grad():
        for img in tile_dataloader:
            if cuda:
                img = img.to(device='cuda')
            output = model(img).detach()

            if cuda:
                output = output.to('cpu')
                torch.cuda.empty_cache()
            outputs += output.tolist()

    # sum the outputs over all tiles, then normalise once
    probs = softmax(np.sum(np.array(outputs), axis=0))
    if np.max(probs)>0.5:
        prediction = oh_encoding_inv(probs)
    else:
        prediction = 'Other'    
    return tile_dataset.image_id, prediction


In [13]:
from tqdm.auto import tqdm
from joblib import Parallel, delayed

num_files = len(file_paths)

# Run the per-image inference over all files with joblib, using every
# available core (n_jobs=-1). Despite its name, `images` receives the list of
# (image_id, prediction) tuples returned by infer_single_image.
t0 = time.time()
jobs = (delayed(infer_single_image)(file_idx)
        for file_idx in tqdm(range(num_files)))
images = Parallel(n_jobs=-1)(jobs)

print(f'Processed {num_files} files, {np.sum(sizes[:num_files])/1e9:.3f} GB')
t1 = time.time()
print(f'This took {(t1-t0)/60} min')

  0%|          | 0/1 [00:00<?, ?it/s]

Processed 1 files, 0.650 GB
This took 0.5137820442517599 min


# One row per processed image, built from the (image_id, label) tuples in `images`.
prediction_columns = ['image_id', 'label']
df_predictions = pd.DataFrame(images, columns=prediction_columns)

def get_label(image_id):
    """Return the training label recorded for `image_id`.

    The label is taken as the first column of `df_train` after `image_id`.

    Raises:
        KeyError: If `image_id` does not occur in `df_train`.
    """
    row = df_train.set_index('image_id').loc[image_id]
    # The row is labelled by column name, so fetch the first entry by position
    # explicitly; plain `row[0]` depends on a deprecated positional fallback.
    return row.iloc[0]


from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Attach the known training label to every prediction and score the result.
# NOTE(review): get_label raises KeyError for ids missing from df_train, so
# this only works when the processed images are part of the training table.
df_predictions['label_true'] = df_predictions['image_id'].apply(get_label)
pred, true = df_predictions['label'], df_predictions['label_true']

for metric_name, metric in (('accuracy', accuracy_score),
                            ('balanced accuracy', balanced_accuracy_score)):
    print(metric_name, metric(true, pred))

In [14]:
# Rebuild the frame from the raw (image_id, label) tuples, so columns added to
# an earlier df_predictions are not carried over, then write it without the
# row index.
submission_columns = ['image_id', 'label']
df_predictions = pd.DataFrame(images, columns=submission_columns)
df_predictions.to_csv('submission.csv', index=False)

In [15]:

#df_sub = pd.read_csv('/kaggle/input/UBC-OCEAN/sample_submission.csv')
#df_sub['image_id'] = image_ids
#df_sub['label'] = predicted_labels
#df_sub.to_csv('submission.csv')

#image_ids = df_sub['image_id']
#predicted_labels = ['HGSC' for i in range(len(df_sub))]

#df = pd.DataFrame({'image_id':image_ids, 'label':predicted_labels})
#df.to_csv('submission.csv', index = False)
#df.head()

In [16]:
! head submission.csv

image_id,label
41,HGSC
