## In this notebook...

### we train the prototypical network for  

In [2]:
import os 
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
#os.environ['CUDA_VISIBLE_DEVICES']='3'
os.environ['OMP_NUM_THREADS'] = '6'
os.environ['MKL_NUM_THREADS'] = '6'
#os.environ['http_proxy'] = "http://proxy.hcm.fpt.vn:80/"
#os.environ['https_proxy'] = "https://proxy.hcm.fpt.vn:80/"

In [3]:
from datetime import datetime

In [4]:
import numpy as np                                    # Array, Linear Algebra
from torch.utils.data.dataset import random_split     # spliting inTrain Val
import pandas as pd                                   # handling CSV
import os                                             # For File handling
import random                                         # Choosing from images dataset
import time                                           # timing Epochs  
from tqdm.notebook import tqdm                        # Testing
from os.path import join                              # File Handling
from torchvision import transforms                    # Data Aug
import torch                                          # Framework
from PIL import Image                                 # Loading Image
from torch.utils.data import Dataset, DataLoader      # Dataset
import torch.nn.functional as F                       # Function
import json                                           # Loading Metadat
from PIL import  ImageOps                             # Data Aug 
from PIL.Image import open as openIm                  # Image Handling
import matplotlib.pyplot  as plt                      # Ploting Image
import cv2
from sklearn.metrics import f1_score
import seaborn as sns

Include the following line to import the functions from few_shot

In [5]:
import sys
sys.path.append('./few_shot/')


In [6]:
"""
Reproduce Omniglot results of Snell et al Prototypical networks.
"""
from torch.optim import Adam
from torch.utils.data import DataLoader
import argparse

from few_shot.datasets import OmniglotDataset, MiniImageNet
from few_shot.models import get_few_shot_encoder
from few_shot.core import NShotTaskSampler, EvaluateFewShot, prepare_nshot_task
from few_shot.proto import proto_net_episode
from few_shot.train import fit
from few_shot.callbacks import *
from few_shot.utils import setup_dirs
from config import PATH


In [7]:
'''
import vision stuff
'''
import torchvision
import torch.nn as nn

In [8]:
#base_dir = '/bigdata/user/hieunt124/kaggle/herbarium/'
base_dir = 'D:/Data/Kaggle_HerbariumChallenge2020'
train_dir = base_dir + '/nybg2020/train/'
test_dir = base_dir + '/nybg2020/test/'
metadata_file = 'metadata.json'

In [9]:

with open(train_dir + metadata_file, encoding = "ISO-8859-1") as json_file:
    train_metadata = json.load(json_file)

train_img = pd.DataFrame(train_metadata['images'])
train_label = pd.DataFrame(train_metadata['annotations'])
train_df = (pd.merge(train_label, train_img
                    #, left_on='image_id'
                    , on='id'
                    , how='left')
            .drop(['image_id', 'license', 'region_id'], axis=1)
            .sort_values(by=['category_id'])
           )
train_df.head()

Unnamed: 0,category_id,id,file_name,height,width
76407,0,626762,images/000/00/626762.jpg,1000,681
601590,0,72077,images/000/00/72077.jpg,1000,681
76408,0,818271,images/000/00/818271.jpg,1000,681
556748,0,495523,images/000/00/495523.jpg,1000,681
335261,0,437000,images/000/00/437000.jpg,1000,681


In [10]:
train_df.rename(columns={'category_id': 'class_id'
                        , 'file_name': 'filepath'
                        }, inplace=True)

Here we define the training parameters. For now, we can keep them as is, just to check the pipeline works.

In [19]:
setup_dirs()
assert torch.cuda.is_available()
device = torch.device('cuda')
torch.backends.cudnn.benchmark = True


##############
# Parameters #
##############
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='miniImageNet')
parser.add_argument('--distance', default='l2')
parser.add_argument('--n-train', default=5, type=int)
parser.add_argument('--n-test', default=5, type=int)
parser.add_argument('--k-train', default=20, type=int)
parser.add_argument('--k-test', default=5, type=int)
parser.add_argument('--q-train', default=15, type=int)
parser.add_argument('--q-test', default=1, type=int)
args = parser.parse_args('')

In [20]:
args.dataset = 'Herbarium'
args.q_train = 1
args.q_test = 1
args.n_train = 1
args.n_test = 1
args.k_train = 30
args.k_test = 10


Here is where we are to make the most changes, as we'll feed a new Herbarium dataset into the model.

For training, we'll omit the classes with only 1 sample, which from value_counts include 3 classes. The remaining classes contain at least 2 samples so we can still perform multi-way, 1-shot 1-query training.

In [13]:
import albumentations as A
def load_rgb_image(image_file):
    '''
    load image file in RGB format
    '''
    img = cv2.imread(str(image_file))
    try:
        #img = img.astype('uint8')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except Exception as error:
        print(error)
        print(image_file)
        print(img.shape)
    return img

def get_augmentations(re_size=300#224
                      , crop_size=300#224
                      , train=True
                     ):
    '''
    get image augmentations from albumentations
    '''
    augs = [A.Resize(height=re_size, width=re_size)]
    if train:
        augs.extend([A.RandomCrop(height=crop_size, width=crop_size)
                     , A.ShiftScaleRotate(shift_limit=.1, scale_limit=.3, rotate_limit=30, p=.75)
                     #, A.RandomBrightnessContrast(brightness_limit=.5, contrast_limit=.5, p=.5)
                     , A.RandomBrightnessContrast(brightness_limit=.3, contrast_limit=.3, p=.15)
                     #, A.Blur(.5)
                     , A.Cutout(max_h_size=crop_size//12, max_w_size=crop_size//12, p=.75)
                    ])
    else:
        augs.extend([A.CenterCrop(height=crop_size, width=crop_size)])
    
    # A.Normalize uses Imagenet stats by default
    return A.Compose(augs + [A.Normalize()])

In [14]:
class myHerbariumDataset(Dataset
                        ):
    def __init__(self, df_image
                         , train=True
                         , base_folder='/bigdata/user/hieunt124/kaggle/herbarium/'):
        if (train):
            df_image.index = df_image['id']

        self.df = df_image ## dataframe of all image annotations
        self.datasetid_to_filepath = df_image.to_dict()['filepath']  ## get file path from image id
        self.datasetid_to_class_id = df_image.to_dict()['class_id']  ## get class id from image id

        self.classes = self.df['class_id'].unique() ## list of labels
        self.base_folder = base_folder
        self.loader = lambda x: load_rgb_image(base_folder + x)  ## loader function for the image
        self.transform = get_augmentations(train=train) ## transform the image
        self.to_tensor = transforms.ToTensor()  ## transform image to Torch tensor
        
        self.fastai_transforms = get_transforms(flip_vert=True)
        self.fastai_loader = lambda x: get_fastai_img(base_folder + x, self.fastai_transforms)
        
    def __getitem__(self, item):
        '''
        input item id, output the image and its label
        '''
        
        #image = self.loader(self.datasetid_to_filepath[item])
        #image = self.transform(image=image)['image']
        #image = self.to_tensor(image)
        image = self.fastai_loader(self.datasetid_to_filepath[item])
        label = self.datasetid_to_class_id[item]
        
        return image, label
        
        
    def __len__(self):
        '''
        returns size of dataset
        '''
        return len(self.df)
    
    def num_classes(self):
        '''
        returns number of classes
        '''
        return len(self.classes)
    
    def classes_value_counts(self):
        '''
        return number of samples per class
        '''
        return self.df.Label.value_counts().reset_index()

In [15]:
from fastai.vision import get_transforms, open_image, imagenet_stats
fastai_transforms = get_transforms(flip_vert=True)
def get_fastai_img(path, fastai_transforms
                   , img_size=256):
    '''
    '''
    fastai_img = open_image(path)
    fastai_img = fastai_img.apply_tfms(*(fastai_transforms))
    fastai_img = fastai_img.resize(img_size).px
    fastai_img = transforms.Normalize(mean=imagenet_stats[0], std=imagenet_stats[1])(fastai_img)
    
    return fastai_img

In [16]:
from torch.distributions.beta import Beta
def generate_mixup_samples(samples, labels, beta_distribution):
    '''
    generate mixup samples from 1 batch of samples, labels 
    and a pre-defined beta_distribution
    '''
    temp_batch_size = len(labels)
    beta_weights = beta_distribution.sample((temp_batch_size,))
    shuffled_index = torch.randperm(temp_batch_size)
    samples_shuffled = samples[shuffled_index]
    labels_shuffled = labels[shuffled_index]
    
    samples_mixup = torch.stack([beta_weights[i] * samples[i] 
                                 + (1 - beta_weights[i]) * samples_shuffled[i]
                                 for i in range(temp_batch_size)])
    labels_mixup = torch.stack([labels.float()
                                , labels_shuffled.float()
                                , beta_weights 
                               ])
    return samples_mixup, labels_mixup

class DataLoader_mixup(DataLoader):
    '''
    wrapper for DataLoader for mixup training:
    for each batch, generate mixup samples
    '''
    def __init__(self, **kwargs
                ):
        super().__init__(**kwargs)
        #self.dl = dl
        self.beta_distribution = Beta(.4,.4)
        
    #def __len__(self):
    #    return len(self)

    def __iter__(self):
        #batches = iter(self)
        for samples, labels in self:
            yield (generate_mixup_samples(samples, labels
                                          , beta_distribution=self.beta_distribution)
                  )

class CrossEntropyLoss_mixup(nn.Module):
    '''
    wrapper of CrossEntropyLoss for mixup
    using mixup while training and usual loss when evaluating
    '''
    
    def __init__(self, loss, reduction='mean'):
        super().__init__()
        self.loss_function = loss
        self.reduction = reduction
        setattr(self.loss_function, 'reduction', 'none')
        
    def forward(self, output, labels_mixup):
        if len(labels_mixup.shape) != 1:
            # mixup training mode
            labels = labels_mixup[0,:].long()
            labels_shuffled = labels_mixup[1,:].long()
            beta_weights = labels_mixup[2,:]
            
            loss = (beta_weights * self.loss_function(output, labels)
                   + (1-beta_weights) * self.loss_function(output, labels_shuffled)
                   )
            
        else:
            # evaluation mode, return loss as usual
            loss = self.loss_function(output, labels_mixup)
        
        if self.reduction == 'mean': 
            loss = loss.mean()
        elif self.reduction == 'sum':
            loss = loss.sum()
        return loss

We'll extract the classes that only have 1 sample here, they'll be excluded from training.

In [21]:
label_counts = pd.DataFrame(train_df.class_id.value_counts().reset_index()
                           )
label_counts.columns = ['class_id','count_samples']
single_categories = label_counts[label_counts['count_samples'] < 2].class_id.values
quarter_categories = label_counts[label_counts['count_samples'] > 4].class_id.values


In [28]:
high_sample_categories = label_counts[label_counts['count_samples'] > 200].class_id.values

Having defined the Dataset class, we can then define the corresponding data loaders. The number of episodes is taken to be roughly the number of total classes divided by number of classes sampled for each episode.

In [25]:
from sklearn.model_selection import train_test_split
_, val_subset = train_test_split(train_df[train_df.class_id.isin(quarter_categories)]
                                 , test_size=.005 #originally .05
                                )
val_subset.shape

(5002, 5)

Just making sure that all classes in val set has at least 2 samples...

In [19]:
val_classes = val_subset.class_id.value_counts()[:8000].index.tolist()
val_subset = val_subset[val_subset.class_id.isin(val_classes)]

In [20]:
train_filter = ((train_df.class_id.isin(single_categories))
               | (train_df.id.isin(val_subset.id))
               )
train_subset = train_df[-train_filter]
train_subset
train_subset.shape

(986384, 5)

In [21]:
evaluation_episodes = 50#1000
episodes_per_epoch = 800

if args.dataset == 'omniglot':
    n_epochs = 40
    dataset_class = OmniglotDataset
    num_input_channels = 1
    drop_lr_every = 20
elif args.dataset == 'miniImageNet':
    n_epochs = 35 #120
    dataset_class = MiniImageNet
    num_input_channels = 3
    drop_lr_every = 40
else:
    n_epochs = 48 #120
    #dataset_class = MiniImageNet
    num_input_channels = 3
    drop_lr_every = 24
    #raise(ValueError, 'Unsupported dataset')

param_str = f'{args.dataset}_nt={args.n_train}_kt={args.k_train}_qt={args.q_train}_' \
            f'nv={args.n_test}_kv={args.k_test}_qv={args.q_test}_seresnet101'

print(param_str)

###################
# Create datasets #
###################
#background = dataset_class('background')
background = myHerbariumDataset(train_subset
                                , train=True
                                , base_folder=train_dir 
                               )
background_taskloader = DataLoader(
    background,
    batch_sampler=NShotTaskSampler(background, episodes_per_epoch, args.n_train, args.k_train, args.q_train),
    num_workers=0
)
#background_taskloader = DataLoader_mixup(background_taskloader, Beta(.4,.4))
evaluation = myHerbariumDataset(val_subset#.reset_index(drop=True)
                                , train=True
                                , base_folder=train_dir 
                               )
evaluation_taskloader = DataLoader(
    evaluation,
    batch_sampler=NShotTaskSampler(evaluation, episodes_per_epoch, args.n_test, args.k_test, args.q_test),
    num_workers=0
)



Herbarium_nt=1_kt=30_qt=1_nv=1_kv=10_qv=1_seresnet101


For now, we use the model architecture predefined as in few_shot.

In [22]:
from fastai.layers import AdaptiveConcatPool2d, Flatten
from pretrainedmodels import se_resnet101

In [23]:
def ResnetProtoTypeNet():
    
    def my_head(input_size, hidden_units, output_size):
        return nn.Sequential(AdaptiveConcatPool2d()
                                        , Flatten()
                                        , nn.BatchNorm1d(num_features=2 * input_size)
                                        , nn.Dropout(p=.25)
                                        , nn.Linear(in_features=2 * input_size, out_features=hidden_units, bias=True)
                                        , nn.ReLU(inplace=True)
                                        , nn.BatchNorm1d(num_features=hidden_units)
                                        , nn.Dropout(p=.5)
                                        , nn.Linear(in_features=hidden_units, out_features=output_size, bias=True)
                                        
                                       )

    #arch = torchvision.models.resnet101(pretrained=False)
    #arch = se_resnet101(pretrained=None)
    arch = list(arch.children())
    arch.pop(-1)
    arch.pop(-1)
    temp_arch = nn.Sequential(nn.Sequential(*arch))
    temp_children = list(temp_arch.children())
    temp_children.append(my_head(2048, 512, 200))
    model = nn.Sequential(*temp_children)
    
    model_dir = '/bigdata/user/hieunt124/kaggle/herbarium/nybg2020/train/models/'
    model_file = 'herbarium-seresnet101-weights.pth'
    weights = torch.load(model_dir + model_file)

    model.load_state_dict(weights['state_dict'])
    
    temp_head = list(model.children())[-1]
    # temp_head = nn.Sequential(*list(temp_head.children())[:-2])
    temp_head = nn.Sequential(*list(temp_head.children())[:2])
    temp_arch = nn.Sequential(nn.Sequential(*list(model.children())[:-1]))
    model = nn.Sequential(temp_arch, temp_head)
    
    return model
    #return temp_arch

In [24]:

#########
# Model #
#########
#model = get_few_shot_encoder(num_input_channels)
#model.to(device, dtype=torch.double)
model = ResnetProtoTypeNet()
model.to(device, dtype=torch.float)

Sequential(
  (0): Sequential(
    (0): Sequential(
      (0): Sequential(
        (0): Sequential(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu1): ReLU(inplace=True)
          (pool): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
        )
        (1): Sequential(
          (0): SEResNetBottleneck(
            (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn3): BatchNorm2d(256, eps=1e-

With the ADAM optimizer and Negative log-likelihood loss function:

In [25]:
#for i, param in enumerate(list(model.parameters())[:-9]):
    #print(i)
    #param.require_grad=False

In [26]:

############
# Training #
############
print(f'Training Prototypical network on {args.dataset}...')
optimiser = Adam(model.parameters(), lr=1e-4)
#loss_fn = torch.nn.NLLLoss().cuda()
loss_fn = torch.nn.CrossEntropyLoss()
#loss_fn = CrossEntropyLoss_mixup(loss_fn)

def lr_schedule(epoch, lr):
    # Drop lr every 2000 episodes
    if epoch % drop_lr_every == 0:
        return lr * 0.6
    else:
        return lr


Training Prototypical network on Herbarium...


In [27]:

callbacks = [
    EvaluateFewShot(
        eval_fn=proto_net_episode,
        num_tasks=evaluation_episodes,
        n_shot=args.n_test,
        k_way=args.k_test,
        q_queries=args.q_test,
        taskloader=evaluation_taskloader,
        prepare_batch=prepare_nshot_task(args.n_test, args.k_test, args.q_test),
        distance=args.distance
    ),
    ModelCheckpoint(
        filepath=PATH + f'/models/proto_nets/{param_str}.pth',
        monitor=f'val_{args.n_test}-shot_{args.k_test}-way_acc'
    ),
    LearningRateScheduler(schedule=lr_schedule),
    CSVLogger(PATH + f'/logs/proto_nets/{param_str}.csv'),
]

In [28]:
n_epochs=1

In [None]:

fit(
    model,
    optimiser,
    loss_fn,
    epochs=n_epochs,
    dataloader=background_taskloader,
    prepare_batch=prepare_nshot_task(args.n_train, args.k_train, args.q_train),
    callbacks=callbacks,
    metrics=['categorical_accuracy'],
    fit_function=proto_net_episode,
    fit_function_kwargs={'n_shot': args.n_train, 'k_way': args.k_train, 'q_queries': args.q_train, 'train': True,
                         'distance': args.distance}
)

Epoch 1:   0%|          | 0/800 [00:00<?, ?it/s]

Begin training...


Epoch 1: 100%|██████████| 800/800 [2:17:57<00:00, 10.35s/it, loss=222, categorical_accuracy=0.7]     
Epoch 2: 100%|██████████| 800/800 [1:55:59<00:00,  8.70s/it, loss=10.3, categorical_accuracy=0.525] 
Epoch 3: 100%|██████████| 800/800 [1:59:52<00:00,  8.99s/it, loss=2.16, categorical_accuracy=0.457] 
Epoch 4: 100%|██████████| 800/800 [2:11:30<00:00,  9.86s/it, loss=1.61, categorical_accuracy=0.527] 
Epoch 5: 100%|██████████| 800/800 [2:12:35<00:00,  9.94s/it, loss=1.38, categorical_accuracy=0.585] 
Epoch 6: 100%|██████████| 800/800 [2:03:14<00:00,  9.24s/it, loss=1.21, categorical_accuracy=0.631] 
Epoch 7: 100%|██████████| 800/800 [1:53:59<00:00,  8.55s/it, loss=1.1, categorical_accuracy=0.663]  
Epoch 8: 100%|██████████| 800/800 [2:06:28<00:00,  9.49s/it, loss=1.03, categorical_accuracy=0.687]   
Epoch 9: 100%|██████████| 800/800 [2:04:36<00:00,  9.35s/it, loss=0.966, categorical_accuracy=0.709] 
Epoch 10: 100%|██████████| 800/800 [2:12:18<00:00,  9.92s/it, loss=0.9, categorical_acc

In [None]:
stop

Having trained the model, let's try to test the model on the test set. For this, we'll create dataloaders for both train and test sets. We use the train dataloader to compute the prototypes for each class. We then determine the class of samples in the test set by taking the class with the minimal distance.

In [None]:
os.listdir('/bigdata/user/hieunt124/submodules/few_shot/models/proto_nets/')

In [None]:
model_file = '/bigdata/user/hieunt124/submodules/few_shot/models/proto_nets/Herbarium_nt=1_kt=60_qt=1_nv=1_kv=10_qv=1_resnet_epoch30_cat790.pth'

In [None]:
model.load_state_dict(torch.load(model_file))

We re-create the dataset object, this time with all the labels

Feel free to jack up batchsize a bit if the GPU can handle it

In [None]:
#support_df = train_df.reset_index(drop=True)
support_df = train_df[train_df.class_id.isin(np.arange(600))].reset_index(drop=True)
support_dataset = myHerbariumDataset(support_df
                                     , train=False
                                     , base_folder=train_dir)
support_loader = DataLoader(support_dataset, batch_size=256
                            , shuffle=False
                            , pin_memory=True
                           )

In [None]:
test_df = support_df.head(4096).copy()
test_dataset = myHerbariumDataset(test_df
                                  , train=False
                                  , base_folder=train_dir
                                 )
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False
                         , num_workers=0
                         , pin_memory=True
                        )

In [None]:
#class PrototypeSupportLoader()

As we loop through the dataloader, we save the embeddings and corresponding labels. The dataset is large so it's no quite viable to load all of the embeddings into a matrix, I'm thinking of how we can update the class prototype on the fly instead. So at any point, hopefully, we'll just need: 32k * 12k + 64 * 12k numbers, as opposed to 1m * 12k which is exhausting. We'll also need to keep track of how many samples have been used so far for each class.

Better yet, we can pre-sort the support dataset by class then write a custom dataloader which draws batch size according to how many samples there are for each class. This way, we won't have to compute loops within loops and can compute prototypes class by class.

To this end, we'll define a Prototype class which will take as input the pretrained model, the annotations dataframe and the (custom) dataloader based on which it computes the class prototypes. 

When evaluating, this Prototype object takes as input the evaluation dataloader, then computes the embeddings for each sample in this val/test set. To save memory, for each sample, we'll save only the top-1 or top-5 least distances and the respective predicted labels.

Yes, there's gonna be a shit ton of distances to compute. 

I wonder if there's any merit in creating a ClassLoader and loop through that?

In [None]:
class Prototypes():
    def __init__(self, model, df, support_loader, device=torch.device('cuda')):
        self.df = df
        self.n_classes = self.df['class_id'].nunique()
        self.support_loader = support_loader
        self.model = model
        self.classes = self.df['class_id'].unique().tolist()
        self.device=device
        self.model.to(device)
        self._get_prototypes()
        
    
    def _get_prototypes(self):
        '''
        compute class prototypes and store in self.class_prototypes 
        corresponding to class index
        '''
        #print('Computing prototypes...')
        self.model.eval()
        with torch.no_grad():
            for batch_index, (X, y) in (enumerate(self.support_loader)):
                X, y = X.to(device, non_blocking=True), list(y)
                X_embeddings = self.model.float()(X)#.cpu().detach().numpy()
                #print(X_embeddings.shape[-1])
                if batch_index == 0:
                    
                    # this matrix will hold the prototypes
                    
                    #self.class_prototypes = np.zeros((self.n_classes,X_embeddings.shape[-1]))
                    self.class_prototypes = torch.zeros((self.n_classes,X_embeddings.shape[-1])).to(self.device)
                                                 
                    # this array will hold the item tally for each class, 
                    # this will also be updated on the fly
                    #class_items_count = np.zeros(self.n_classes)
                    class_items_count = torch.zeros(self.n_classes).to(self.device)
                
                for i, label in enumerate(y):
                    
                    label_index = self.classes.index(label)
                    temp_item_count = class_items_count[label_index]
                    #print(type(self.class_prototypes), type(label_index), type(X_embeddings), type(temp_item_count))
                    self.class_prototypes[label_index] = (self.class_prototypes[label_index] * temp_item_count 
                                                    + X_embeddings[i]) / (temp_item_count+1)
                    class_items_count[label_index]+=1
                
    def _get_embeddings(self, images):
        '''
        compute embeddings from input images
        '''
        #X_embeddings = self.model(images.float()).cpu().detach().numpy()
        X_embeddings = self.model(images.float())
        return X_embeddings
    
    def _get_predictions(self, embeddings
                         , softmax=True
                         , normalized_softmax=True
                        ):
        '''
        compute pairwise distances from samples to each of the classes 
        and retain minimum distances to determine predicted classes
        
        '''
        #preds = np.zeros((embeddings.shape[0], self.n_classes))
        #for i, embedding in enumerate(embeddings):
        
        #for i, prototype in (enumerate(self.class_prototypes)):
            #preds[:,i] = np.linalg.norm((embeddings - prototype), axis=1)
             #preds[:,i] =                            
        
        preds = pairwise_euclidean_distance(embeddings, self.class_prototypes)
        '''
        if softmax:
            if normalized_softmax:
                preds /= np.max([1.0, np.abs(preds.mean())])
            preds = np_softmax(preds)
        '''
        return preds
    def predict(self, test_loader
                , proba=True
                , **kwargs):
        '''
        predict 
        '''
        preds_list = []
        self.model.eval()
        
        with torch.no_grad():
            for batch_index, (X_test, dummy_target) in tqdm(enumerate(test_loader)):
                #time_elapsed = []
                #time_elapsed.append(datetime.now())
                X_test = X_test.to(device, non_blocking=True)
                #time_elapsed.append(datetime.now())
                X_embeddings = self._get_embeddings(X_test)
                #time_elapsed.append(datetime.now())
                temp_preds = self._get_predictions(X_embeddings)
                #time_elapsed.append(datetime.now())
                preds_list.append(temp_preds)
                #time_elapsed.append(datetime.now())
                #print(time_elapsed)
        if (proba):
            #return torch.cat(preds_list)
            return np.concatenate(preds_list)
        else:
            
            return ([self.classes[x] for x in torch.argmin(torch.cat(preds_list), axis=1)]
                   , np.min(torch.cat(preds_list).cpu().detach().numpy(), axis=1)
                   )
            '''
            return ([self.classes[x] for x in np.argmin(np.concatenate(preds_list), axis=1)]
                    #np.argmin(np.concatenate(preds_list), axis=1)
                    , np.min(np.concatenate(preds_list), axis=1)
                   )
            '''

In [None]:
def pairwise_euclidean_distance(x, y):
    '''
    compute pairwise euclidean distance with torch tensors
    '''
    if not type(x) is torch.Tensor: x = torch.Tensor(x)
    if not type(y) is torch.Tensor: y = torch.Tensor(y)
        
    n_x = x.shape[0]
    n_y = y.shape[0]
    distances = (
                    x.cuda().unsqueeze(1).expand(n_x, n_y, -1) -
                    y.cuda().unsqueeze(0).expand(n_x, n_y, -1)
            ).pow(2).sum(dim=2)
    return distances#.cpu().detach().numpy()

In [None]:

meh = test_df.to_dict()['filepath']  ## get file path from image id
#test_df.to_dict()['class_id']

In [None]:

%time herbarium_prototypes = Prototypes(model, support_df, support_loader)

In [None]:
130000/128

In [None]:
%time predictions = herbarium_prototypes.predict(test_loader, proba=False)

In [None]:
sns.distplot(herbarium_prototypes.class_prototypes[:,0].cpu().detach().numpy())

I suppose we can relieve some of memory load by computing prototypes on batches of classes at a time. For one batch of classes, we get a minimal distance and its corresponding class. By the end, we get say 50 candidates of (min distance, class) and just take the minimal of those. We won't be able to get top-5 accuracy this way but I'm struggling to find ways to lighten the memory load.

In [None]:
support_dataset.num_classes()

In [None]:
n_class_batch = 50
np_distance = np.zeros((test_df.shape[0], n_class_batch))
np_class = np.zeros((test_df.shape[0], n_class_batch))
batch_index = 0
for class_batch in (np.array_split(np.arange(train_df.class_id.nunique()), n_class_batch)):
    print(batch_index)
    temp_support_df = train_df[train_df.class_id.isin(class_batch)].reset_index(drop=True)
    support_dataset = myHerbariumDataset(temp_support_df
                                         , train=False
                                         , base_folder=train_dir)
    support_loader = DataLoader(support_dataset, batch_size=256
                                , shuffle=False
                                , num_workers=0
                               )
    herbarium_prototypes = Prototypes(model, temp_support_df, support_loader)
    temp_class, temp_distance = herbarium_prototypes.predict(test_loader=test_loader, proba=False)
    np_class[:, batch_index] = temp_class
    np_distance[:, batch_index] = temp_distance
    batch_index+=1
    
    

### Actual test data

In [None]:
with open(test_dir + metadata_file, encoding = "ISO-8859-1") as json_file:
    test_metadata = json.load(json_file)

test_df = pd.DataFrame(test_metadata['images'])
test_df['class_id'] = 0
test_df.rename(columns={'file_name': 'filepath'}, inplace=True)
test_df.sort_values('id', inplace=True)
test_df.reset_index(inplace=True)
test_df.head()

In [None]:
test_df = test_df.head(60000)

In [None]:
support_df = train_df.reset_index(drop=True)
support_dataset = myHerbariumDataset(support_df
                                     , train=False
                                     , base_folder=train_dir)
support_loader = DataLoader(support_dataset, batch_size=256
                            , shuffle=False
                            , num_workers=2
                           )
test_dataset = myHerbariumDataset(test_df, train=False, base_folder=test_dir
                                  
                                 )
test_loader = DataLoader(test_dataset, batch_size=128
                         , shuffle=False
                         , num_workers=0
                         , pin_memory=True
                        )

In [None]:
herbarium_prototypes = Prototypes(model, support_df, support_loader)
test_df['Predicted'] = herbarium_prototypes.predict(test_loader, proba=False)

In [None]:
test_df.rename(columns={'id':'Id'}, inplace=True)
test_df[['Id','Predicted']].to_csv(base_dir + 'submission_0.csv', index=False)

### Draft

In [None]:
stopa

In [None]:
# kaggle competitions submit -c herbarium-2020-fgvc7 -f submission.csv -m "Message"

In [None]:
meh_preds = np.zeros((4,5))

In [None]:
meh_preds[:,0] = np.linalg.norm(meh, axis=1)
meh_preds

In [None]:
np.linalg.norm(meh, axis=1)

In [None]:
meh

In [None]:
model.eval()
with torch.no_grad():
    for batch_index, (X, y) in tqdm(enumerate(support_loader)):
        X, y = X.to(device), list(y)
        X_embeddings = model.float()(X).cpu().detach().numpy()
        
        if batch_index == 0:
            # this matrix will hold the prototypes
            class_prototypes = np.zeros((train_df.class_id.nunique()
                                         ,X_embeddings.shape[-1]))
            # this array will hold the item tally for each class, 
            # this will also be updated on the fly
            class_items_count = np.zeros(train_df.class_id.nunique())
        for i, label in enumerate(y):
            temp_item_count = class_items_count[label]
            class_prototypes[label] = (class_prototypes[label] * temp_item_count  + X_embeddings[i]) / (temp_item_count+1)
            class_items_count[label]+=1
            

In [None]:
import seaborn as sns

In [None]:
model.eval()
#model.to('cpu')
with torch.no_grad():
    X_embeddings = model.float()(X).detach().numpy()

In [None]:
X_embeddings.shape[-1]

Probably better to store a list of embeddings with corresponding list of classes, rather than storing a gigantic matrix of embeddings

In [None]:
meh = np.random.randn(len(temp_loader.dataset), X_embeddings.shape[-1])
meh.shape

We can finally produce a sample test prediction.

stop

In [None]:
from tqdm import tqdm

In [None]:
for stuff in tqdm((background_taskloader)):
    image, label = stuff

In [None]:
meh = background.df.to_dict()['filepath']

In [None]:
train_df[-train_df.class_id.isin(single_categories)].head()

In [None]:
meh

In [None]:
len(meh)

In [None]:
meh[meh.id == 383760]

In [None]:
background.datasetid_to_filepath[383760]

In [None]:
background.loader

In [None]:
train_df[train_df.class_id == 6143]

In [None]:
meh = load_rgb_image(train_dir + train_df[train_df.id == 383760].filepath.item())

In [None]:
plt.imshow(meh)