In [2]:
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader


## Augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

import cv2


In [3]:
class HappyWhaleTestDataset(Dataset):
    def __init__(self, df_with_arrays, transforms=None):
        self.df = df_with_arrays
        self.transforms = transforms
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        img = self.df[index]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        if self.transforms:
            img = self.transforms(image=img)["image"]
            
        return {
            'image': img
        }

In [4]:
q = cv2.imread(r'resources\defaultPhoto.jpg')
q = cv2.cvtColor(q, cv2.COLOR_BGR2RGB)
print(q.shape)


(1454, 926, 3)


In [5]:
CONFIG = {
    'img_size': 448,
    'seed': 22,
    'n_fold': 5,
    'train_batch_size': 32,
    'test_batch_size': 64,
    'num_classes': 15587,
    'patches_size': 32,
    'device': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
}

data_transforms = {
    "train": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.ShiftScaleRotate(shift_limit=0.1, 
                           scale_limit=0.15, 
                           rotate_limit=60, 
                           p=0.5),
        A.HueSaturationValue(
                hue_shift_limit=0.2, 
                sat_shift_limit=0.2, 
                val_shift_limit=0.2, 
                p=0.5
            ),
        A.RandomBrightnessContrast(
                brightness_limit=(-0.1,0.1), 
                contrast_limit=(-0.1, 0.1), 
                p=0.5
            ),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.),
    
    "test": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.)
}

In [6]:
image = list(cv2.imread(r'resources\defaultPhoto.jpg'))
test_dataset = HappyWhaleTestDataset(image)
test_loader = DataLoader(test_dataset, batch_size=1)

In [7]:
test_dataset.__getitem__(0)

{'image': array([[[121, 121, 121],
         [109, 109, 109],
         [ 99,  99,  99]],
 
        [[122, 122, 122],
         [110, 110, 110],
         [100, 100, 100]],
 
        [[122, 122, 122],
         [110, 110, 110],
         [100, 100, 100]],
 
        ...,
 
        [[112, 112, 112],
         [105, 105, 105],
         [ 86,  86,  86]],
 
        [[112, 112, 112],
         [105, 105, 105],
         [ 86,  86,  86]],
 
        [[112, 112, 112],
         [105, 105, 105],
         [ 86,  86,  86]]], dtype=uint8)}

In [8]:
def img_to_patch(x, patch_size, flatten_channels=True):
    """
    Inputs:
        x - torch.Tensor representing the image of shape [B, C, H, W]
        patch_size - Number of pixels per dimension of the patches (integer)
        flatten_channels - If True, the patches will be returned in a flattened format
                           as a feature vector instead of a image grid.
    """
    B, C, H, W = x.shape
    x = x.reshape(B, C, H//patch_size, patch_size, W//patch_size, patch_size)
    x = x.permute(0, 2, 4, 1, 3, 5) # [B, H', W', C, p_H, p_W]
    x = x.flatten(1,2)              # [B, H'*W', C, p_H, p_W]
    if flatten_channels:
        x = x.flatten(2,4)          # [B, H'*W', C*p_H*p_W]
    return x

In [9]:
class AttentionBlock(nn.Module):

    def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0):
        """
        Inputs:
            embed_dim - Dimensionality of input and attention feature vectors
            hidden_dim - Dimensionality of hidden layer in feed-forward network
                         (usually 2-4x larger than embed_dim)
            num_heads - Number of heads to use in the Multi-Head Attention block
            dropout - Amount of dropout to apply in the feed-forward network
        """
        super().__init__()

        self.layer_norm_1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads,
                                          dropout=dropout)
        self.layer_norm_2 = nn.LayerNorm(embed_dim)
        self.linear = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )


    def forward(self, x):
        inp_x = self.layer_norm_1(x)
        x = x + self.attn(inp_x, inp_x, inp_x)[0]
        x = x + self.linear(self.layer_norm_2(x))
        return x

In [10]:
class VisionTransformer(nn.Module):

    def __init__(self, embed_dim, hidden_dim, num_channels, num_heads, num_layers, num_classes, patch_size, num_patches, dropout=0.0):
        """
        Inputs:
            embed_dim - Dimensionality of the input feature vectors to the Transformer
            hidden_dim - Dimensionality of the hidden layer in the feed-forward networks
                         within the Transformer
            num_channels - Number of channels of the input (3 for RGB)
            num_heads - Number of heads to use in the Multi-Head Attention block
            num_layers - Number of layers to use in the Transformer
            num_classes - Number of classes to predict
            patch_size - Number of pixels that the patches have per dimension
            num_patches - Maximum number of patches an image can have
            dropout - Amount of dropout to apply in the feed-forward network and
                      on the input encoding
        """
        super().__init__()

        self.patch_size = patch_size

        # Layers/Networks
        self.input_layer = nn.Linear(num_channels*(patch_size**2), embed_dim)
        self.transformer = nn.Sequential(*[AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)])
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, num_classes)
        )
        self.dropout = nn.Dropout(dropout)

        # Parameters/Embeddings
        self.cls_token = nn.Parameter(torch.randn(1,1,embed_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1,1+num_patches,embed_dim))


    def forward(self, x):
        # Preprocess input
        x = img_to_patch(x, self.patch_size)
        B, T, _ = x.shape
        x = self.input_layer(x)

        # Add CLS token and positional encoding
        cls_token = self.cls_token.repeat(B, 1, 1)
        x = torch.cat([cls_token, x], dim=1)
        x = x + self.pos_embedding[:,:T+1]

        # Apply Transforrmer
        x = self.dropout(x)
        x = x.transpose(0, 1)
        x = self.transformer(x)

        # Perform classification prediction
        cls = x[0]
        out = self.mlp_head(cls)
        return out


In [11]:
model = VisionTransformer(**{
        'embed_dim': 784,
        'hidden_dim': 1568,
        'num_heads': 8,
        'num_layers': 6,
        'patch_size': 32,
        'num_channels': 3,
        'num_patches': 196,
        'num_classes': 15587,
        'dropout': 0.2
    }
)
model.to('cpu')


VisionTransformer(
  (input_layer): Linear(in_features=3072, out_features=784, bias=True)
  (transformer): Sequential(
    (0): AttentionBlock(
      (layer_norm_1): LayerNorm((784,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=784, out_features=784, bias=True)
      )
      (layer_norm_2): LayerNorm((784,), eps=1e-05, elementwise_affine=True)
      (linear): Sequential(
        (0): Linear(in_features=784, out_features=1568, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.2, inplace=False)
        (3): Linear(in_features=1568, out_features=784, bias=True)
        (4): Dropout(p=0.2, inplace=False)
      )
    )
    (1): AttentionBlock(
      (layer_norm_1): LayerNorm((784,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=784, out_features=784, bias=True)
      )
      (layer_norm_2): L

In [13]:
model.load_state_dict(torch.load(r'models\model-e15.pt', map_location=torch.device('cpu'))['model_state_dict'])
model.eval()

VisionTransformer(
  (input_layer): Linear(in_features=3072, out_features=784, bias=True)
  (transformer): Sequential(
    (0): AttentionBlock(
      (layer_norm_1): LayerNorm((784,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=784, out_features=784, bias=True)
      )
      (layer_norm_2): LayerNorm((784,), eps=1e-05, elementwise_affine=True)
      (linear): Sequential(
        (0): Linear(in_features=784, out_features=1568, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.2, inplace=False)
        (3): Linear(in_features=1568, out_features=784, bias=True)
        (4): Dropout(p=0.2, inplace=False)
      )
    )
    (1): AttentionBlock(
      (layer_norm_1): LayerNorm((784,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=784, out_features=784, bias=True)
      )
      (layer_norm_2): L

In [15]:
image = list(cv2.imread(r'resources\defaultPhoto.jpg'))
test_dataset = HappyWhaleTestDataset(image, transforms=data_transforms['test'])
test_loader = DataLoader(test_dataset, batch_size=1, num_workers=0, shuffle=False)

In [16]:
next(iter(test_loader))

{'image': tensor([[[[-0.0458, -0.0458, -0.0458,  ..., -0.4226, -0.4226, -0.4226],
           [-0.0287, -0.0287, -0.0287,  ..., -0.4054, -0.4054, -0.4054],
           [ 0.0056,  0.0056,  0.0056,  ..., -0.3712, -0.3712, -0.3712],
           ...,
           [-0.1657, -0.1657, -0.1657,  ..., -0.6281, -0.6281, -0.6281],
           [-0.1999, -0.1999, -0.1999,  ..., -0.6281, -0.6281, -0.6281],
           [-0.1999, -0.1999, -0.1999,  ..., -0.6452, -0.6452, -0.6452]],
 
          [[ 0.0826,  0.0826,  0.0826,  ..., -0.3025, -0.3025, -0.3025],
           [ 0.1001,  0.1001,  0.1001,  ..., -0.2850, -0.2850, -0.2850],
           [ 0.1352,  0.1352,  0.1352,  ..., -0.2500, -0.2500, -0.2500],
           ...,
           [-0.0399, -0.0399, -0.0399,  ..., -0.5126, -0.5126, -0.5126],
           [-0.0749, -0.0749, -0.0749,  ..., -0.5126, -0.5126, -0.5126],
           [-0.0749, -0.0749, -0.0749,  ..., -0.5301, -0.5301, -0.5301]],
 
          [[ 0.3045,  0.3045,  0.3045,  ..., -0.0790, -0.0790, -0.0790],
    

In [27]:
def inference_nn(dataloader):
    with torch.no_grad():
        batch = next(iter(test_loader))
        x_batch = batch['image']
        x_batch = x_batch.to('cpu')
        y_test_pred = model(x_batch)
        y_test_pred = torch.softmax(y_test_pred, dim = 1)
        y_pred_probs, y_pred_tags = torch.topk(y_test_pred, 5, dim = 1)
        y_pred_probs = y_pred_probs.cpu().numpy()
        y_pred_tags = y_pred_tags.cpu().numpy()
        return y_pred_tags[0], y_pred_probs[0]
        

In [30]:
tags, probs = inference_nn(next(iter(test_loader)))
print(tags)
print(probs)

[14752  1343  6445  5847  4584]
[0.02046323 0.01243929 0.01217838 0.01214379 0.01046708]


In [31]:
import pandas as pd

db = pd.read_csv(r'resources/database.csv')

In [38]:
db[db['individual_id'] == 14752]['species'].mode()

0    bottlenose_dolphin
Name: species, dtype: object

In [57]:
len(probs)

5

In [64]:
animals = []
for i in tags:
        animals.append(db[db['individual_id'] == i]['species'].mode()[0])
popular_animal = db[db['individual_id'] == tags[0]]['species'].mode()
x = pd.DataFrame({'ID': animals, 'Prob': probs}, index=[i for i in range(5)])

In [66]:
x

Unnamed: 0,ID,Prob
0,bottlenose_dolphin,0.020463
1,bottlenose_dolphin,0.012439
2,bottlenose_dolphin,0.012178
3,bottlenose_dolphin,0.012144
4,bottlenose_dolphin,0.010467


In [45]:
tags[0]

14752