# Model Exploration

## Installing dependencies

In [1]:
DEPENDENCIES = [
    'tf-slim==1.1.0',
    'numpy==1.21.6',
    'pandas==1.3.5',
    'seaborn',
    'torch==1.11.0',
    'torchvision==0.12.0',
    'matplotlib==3.5.3',
    'opencv-python==4.5.4.60',
    'sklearn==0.0.post1',
    'skorch==0.12.1',
    'tqdm',
    'requests',
    'plotly==5.11.0',
    'scikit-image==0.19.3',
]

In [2]:
import subprocess
import typing as tp
import re

def install_dependencies(dependencies: tp.List[str], show_progress: bool = True) -> tp.Tuple[tp.List[str], tp.List[Exception]]:
    emit = print if show_progress else lambda x: None

    resolved_dependencies, errors = [], []
    for dependency in dependencies:
        emit(f'Installing "{dependency}"...')

        try:
            subprocess.run(["pip", "install", "--root-user-action=ignore", dependency], stdout=subprocess.DEVNULL)
            
            if '==' in dependency:
                dependency = re.search('(.+)==.+', dependency).group(1)

            if '@' in dependency:
                dependency = re.search('(.+) @ .+', dependency).group(1)
            
            pip_freeze = subprocess.Popen(("pip", "freeze"), stdout=subprocess.PIPE)
            output = subprocess.check_output(("grep", "-E", f"^({dependency}==)|({dependency} @).+$"), stdin=pip_freeze.stdout)
            resolved_dependencies.append(output.decode().strip())
        except subprocess.CalledProcessError as e:
            errors.append(e)
    
    return resolved_dependencies, errors

In [3]:
install_dependencies(DEPENDENCIES)

Installing "tf-slim==1.1.0"...
Installing "numpy==1.21.6"...
Installing "pandas==1.3.5"...
Installing "seaborn"...
Installing "torch==1.11.0+cpu"...


ERROR: Could not find a version that satisfies the requirement torch==1.11.0+cpu (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1)
ERROR: No matching distribution found for torch==1.11.0+cpu


Installing "torchvision==0.12.0+cpu"...


ERROR: Could not find a version that satisfies the requirement torchvision==0.12.0+cpu (from versions: 0.1.6, 0.1.7, 0.1.8, 0.1.9, 0.2.0, 0.2.1, 0.2.2, 0.2.2.post2, 0.2.2.post3, 0.3.0, 0.4.0, 0.4.1, 0.4.2, 0.5.0, 0.6.0, 0.6.1, 0.7.0, 0.8.0, 0.8.1, 0.8.2, 0.9.0, 0.9.1, 0.10.0, 0.10.1, 0.11.0, 0.11.1, 0.11.2, 0.11.3, 0.12.0, 0.13.0, 0.13.1, 0.14.0, 0.14.1)
ERROR: No matching distribution found for torchvision==0.12.0+cpu


Installing "matplotlib==3.5.3"...
Installing "opencv-python==4.5.4.60"...
Installing "sklearn==0.0.post1"...
Installing "skorch==0.12.1"...
Installing "tqdm"...
Installing "requests"...
Installing "plotly==5.11.0"...
Installing "scikit-image==0.19.3"...


(['tf-slim==1.1.0',
  'numpy==1.21.6',
  'pandas==1.3.5',
  'seaborn @ file:///home/conda/feedstock_root/build_artifacts/seaborn-split_1629095986539/work',
  'torch @ file:///tmp/torch/torch-1.11.0-cp37-cp37m-linux_x86_64.whl',
  'torchvision @ file:///tmp/torch/torchvision-0.12.0-cp37-cp37m-linux_x86_64.whl',
  'matplotlib==3.5.3',
  'opencv-python==4.5.4.60',
  'sklearn==0.0.post1',
  'skorch==0.12.1',
  'tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1649051611147/work',
  'requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1656534056640/work',
  'plotly==5.11.0',
  'scikit-image==0.19.3'],
 [])

## Seeding RNGs

Achieving reproducibility in our results, requires initializing (also known as `seeding`) the random number generators (RNG) utilized by our dependencies. In order to do so, we designate a `RANDOM_SEED` number, namely `1234`, and we use it to initialize the following RNGs:

- `numpy` (`np.random.seed`)
- `random` (`random.seed`)
- `torch (CPU)` (`torch.manual_seed`)
- `torch (GPU)` (`torch.cuda.manual_seed`)

The aforementioned RNGs are utilized by `torch`, `numpy` as well as `sklearn` in order to generate random numbers. `random.seed` corresponds to the python standard library RNG. We are seeding each and every one of them in order to cover any possible edge cases, wherein third party code utilizes any of them unbeknownst to us. Lastly, `PYTHONHASHSEED` controls the hashing of str, bytes and datetime objects. More specifically (as stated in the official `Python` documentation):

_"If this variable is not set or set to random, a random value is used to seed the hashes of str, bytes and datetime objects..."_

In [31]:
import os
import torch
import numpy as np
import random

RANDOM_SEED = 1234

if RANDOM_SEED is not None:
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed(RANDOM_SEED)
    os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)

## Loading the dataset

In [32]:
from pathlib import Path

BASE_DIR = Path.cwd()
INPUT_DIR = Path("/") / "kaggle" / "input"
DATA_DIR = INPUT_DIR / "planets-dataset" / "planet" / "planet" # https://www.kaggle.com/datasets/nikitarom/planets-dataset

TRAIN_SAMPLES_DIR = DATA_DIR / 'train-jpg'
TRAIN_LABELS_FILE = DATA_DIR / 'train_classes.csv' 

We define our custom Dataset class to manipulate batches of data between RAM and Disk more easily. Some point of attentions:

- __init__: we pass the dataframe along with the target, the transformation, the file path and is_train flag. It is important to distinguish the training phase from the testing phase because we use test augmentation. Test augmentation (TTA) is helpful to diversify our training dataset and build a more robust model. It is applied on each image for each batch, meaning that is doesn't increase the length of our training dataset per say, but it transforms each image randomly during execution time.
- __getitem__: we define what the dataset return upon iteration. It needs to load both image and target. collate_fn: we use this function within the following DataLoader instance. It corresponds to the batch manipulation. This is were transform is called. We also proceed to train and test augmentation there.
- collate_fn: we use this function within the following DataLoader instance. It corresponds to the batch manipulation. This is were transform is called. We also proceed to train and test augmentation there.

In [33]:
from torch.utils.data import Dataset
import pandas as pd
import cv2
import torch
import numpy.typing as ntp

Transform = tp.Callable[[torch.Tensor], torch.Tensor]

class AmazonDataset(Dataset):
    def __init__(self, dataset_dir: Path, image_names: tp.List[str], tags: tp.List[tp.List[int]], transform: tp.Optional[Transform] = None) -> None:
        super().__init__()

        self.dataset_dir = dataset_dir
        self.image_names = image_names
        self.tags = tags
        self.transform = transform

    def __len__(self) -> int:
        return len(self.image_names)

    def __getitem__(self, idx: int) -> tp.Tuple[ntp.NDArray[np.float_], ntp.NDArray[np.int_]]:
        image = self.load_image(idx)
        tags = self.load_tags(idx)
        
        return image, tags

    def load_tags(self, idx: int) -> torch.Tensor:
        tags = self.tags[idx]
        tags = torch.as_tensor(tags)
        tags = tags.float()
        
        return tags
    
    def load_image(self, idx: int) -> torch.Tensor:
        image_name = self.image_names[idx]
        filename = f'{image_name}.jpg'
        filepath = self.dataset_dir / filename

        image = cv2.imread(str(filepath))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = torch.tensor(image)
        image = image.permute(2, 0, 1)
        image = self.transform(image)
        image = image.float()
        
        return image

In [34]:
import torchvision.transforms as T

transform_train = T.Compose([
    T.ToPILImage(),
    T.Resize(224),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    T.RandomHorizontalFlip(),
    T.RandomRotation(180)
])

transform_val = T.Compose([
    T.ToPILImage(),
    T.Resize(224),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    T.RandomHorizontalFlip(),
    T.RandomRotation(180)
])

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader


def create_datasets(
    dataset_dir: Path,
    classes_filepath: Path,
    batch_size: int = 64,
    test_size: float = 0.2,
    shuffle: bool = False,
    limit: tp.Optional[int] = None
) -> tp.Tuple[DataLoader, DataLoader, MultiLabelBinarizer]:
    df = pd.read_csv(classes_filepath)
    
    if limit is not None:
        df = df.head(limit)
    
    df.tags = np.char.split(df.tags.values.astype(str))
    
    df_train, df_val = train_test_split(df, test_size=test_size, shuffle=shuffle)

    encoder = MultiLabelBinarizer()
    tags_train = encoder.fit_transform(df_train.tags)
    tags_val = encoder.transform(df_val.tags)
    
    dataset_train = AmazonDataset(dataset_dir, df_train.image_name.to_numpy(), tags_train, transform_train)
    dataset_val = AmazonDataset(dataset_dir, df_val.image_name.to_numpy(), tags_val, transform_val)

    dataloader_train = DataLoader(
      dataset_train,
      batch_size=batch_size,
      shuffle=True,
    )

    dataloader_val = DataLoader(
      dataset_val,
      batch_size=batch_size,
      shuffle=True,
    )

    return dataloader_train, dataloader_val, encoder

In [36]:
dataloader_train, dataloader_val, encoder = create_datasets(TRAIN_SAMPLES_DIR, TRAIN_LABELS_FILE)

In [37]:
print(f'Training set: {len(dataloader_train)}, Validation set: {len(dataloader_val)}')

Training set: 506, Validation set: 127


For optimal performances, resnet18 need input shape that are multiple of 32 and in our case we have input of size 256. From 256, the closest multiple of 32 is 224.

Therefore, we rescale our input data using this multiple, and we also normalize our dataset based on resnet pretrained mean and standard deviation intensity values. ToTensor() is useful to normalize our image values from 0-255 range to 0-1 range.

In [38]:
from torch import nn
from torchvision import models

class ResNet(nn.Module):
    def __init__(self, freeze: bool = True, dropout: float = 0.2):
        super().__init__()

        self.resnet18 = models.resnet18(pretrained=True)
        for parameter in self.resnet18.parameters():
            parameter.require_grad = not freeze
        
        self.resnet18.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))

        self.resnet18.fc = nn.Sequential(
          nn.Flatten(),
          nn.Linear(512, 128), # 512 for resnet18 or 2048 for resnet 50
          nn.ReLU(inplace=True),
          nn.Dropout(dropout),
          nn.Linear(128, 17),
          nn.Sigmoid()
        )

    def forward(self, batch: torch.Tensor) -> torch.Tensor:
        return self.resnet18(batch)

    @classmethod
    def from_device(cls, *args, device_id="cpu", **kwargs) -> tp.Tuple["ResNet", torch.device]:
        # We firstly initialize an instance of our model
        model = cls(*args, **kwargs)

        # If the cuda backend is available then change the device type to GPU
        if torch.cuda.is_available():
            device_id = "cuda:0"
            # Given that there are multiple GPUs available wrap the model
            # in `nn.DataParallel` in order to take advantage of them
            if torch.cuda.device_count() > 1:
                model = nn.DataParallel(model)

        # Retrieve the `torch.device` corresponding to `device_id`
        # and transfer the model to it
        device = torch.device(device_id)

        return model.to(device), device

    @classmethod
    def from_file(cls, filename: Path, *args, device_id: str = "cpu", **kwargs) -> tp.Tuple["ResNet", torch.device]:
        # Firstly initialize the model and retrieve the device wherein it is located
        model, device = cls.from_device(*args, device_id=device_id, **kwargs)

        # Load the model state from the supplied file
        # and dynamically remap it to the device at hand using the `map_location`
        model.load_state_dict(torch.load(filename, map_location=device))

        return model, device

In [39]:
from datetime import datetime

class EarlyStoppingStrategy(object):
    def __init__(self, 
        tolerance: int = 5,
        min_delta: float = 0,
        checkpoint_dir: tp.Optional[Path] = None,
    ):
        self.tolerance = tolerance
        self.min_delta = min_delta
        self.checkpoint_dir = checkpoint_dir

        self.best_validation_loss = float('inf')
        self.counter = 0

    def __call__(self, validation_loss, model):
        best_validation_loss = self.best_validation_loss
        self.best_validation_loss = min(self.best_validation_loss, validation_loss)

        if best_validation_loss - validation_loss < self.min_delta:
            # if validation loss value at hand is not at least `min_delta`
            # smaller than the so far smallest validation loss
            # increment the tolerance counter by 1
            # If the counter exceeds the specified tolerance level we should
            # halt the training procedure
            self.counter += 1
            if self.counter > self.tolerance:
                return True
        else:
            # Otherwise (meaning the validation loss has decreased considerably)
            # reset the tolerance counter and persist the model state
            self.counter = 0

            if self.checkpoint_dir is not None:
                filename = f'{model.__class__.__name__}_{datetime.now().strftime("%d_%m_%Y_%H_%M_%S_%f")}.pkl'

                torch.save(model.state_dict(), self.checkpoint_dir / filename)

        return False

In [40]:
import time

from torch import nn, optim
from tqdm.notebook import tqdm

def train(
    model: nn.Module,
    device: torch.device,
    train_loader: DataLoader,
    validation_loader: DataLoader,
    early_stopping_strategy: tp.Optional[EarlyStoppingStrategy] = None,
    n_epochs: int = 100,
    weight_decay: float = 0.0,
    lr=0.001,
    eps=1e-08,
) -> tp.Tuple[int, ...]:
    # Define a method to retrieve the tqdm progress bar postfix data
    batch_index, train_losses, validation_losses = 0, [0], [0]
    def get_postfix():
        return {
            'train': f'{train_losses[-1]:.3f}',
            'validation': f'{validation_losses[-1]:.3f}',
            'batch': f'{batch_index:02d} / {len(train_loader):02d}'
        }

    if early_stopping_strategy is not None:
        def get_postfix():
            return {
                'train': f'{train_losses[-1]:.3f}',
                'validation': f'{validation_losses[-1]:.3f}',
                'tolerance': f'{early_stopping_strategy.counter}/{early_stopping_strategy.tolerance}',
                'batch': f'{batch_index:02d} / {len(train_loader):02d}'
            }

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay, lr=lr, eps=eps)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, gamma=0.1, milestones=[0.25 * n_epochs, 0.5 * n_epochs, 0.75 * n_epochs])

    epochs_progress_bar = tqdm(range(n_epochs), desc='epochs', position=0)

    timestamp = time.time()
    for _ in epochs_progress_bar:
        # We set the model to training mode
        model.train()

        train_loss = 0
        for batch_index, (batch_X, batch_y) in enumerate(train_loader):
            # Transfer the data to the available device backend (CPU/GPU)
            batch_X, batch_y, = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()

            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()

            optimizer.step()

            train_loss += loss.item() / len(train_loader)

            epochs_progress_bar.set_postfix(**get_postfix())

        train_losses.append(train_loss)

        epochs_progress_bar.set_postfix(**get_postfix())

        scheduler.step()
        
        # We set the model to evaluation mode
        model.eval()
        with torch.no_grad():
            # Turn of gradient calculation
            validation_loss = 0
            for batch_X, batch_y in validation_loader:
                batch_X, batch_y, = batch_X.to(device), batch_y.to(device)

                output = model(batch_X)
                loss = criterion(output, batch_y)

                validation_loss += loss.item() / len(validation_loader)

            validation_losses.append(validation_loss)

            epochs_progress_bar.set_postfix(**get_postfix())
            
        # Invoke the early stopping strategy with the current validation loss
        # in order to increment/reset its internal tolerance counter
        # and determine whether or not to halt the training process
        if early_stopping_strategy is not None:
            if early_stopping_strategy(validation_losses[-1], model):
                break
    
    return time.time() - timestamp, np.array(train_losses[1:]), np.array(validation_losses[1:])

In [None]:
checkpoint_dir = Path.cwd() / 'models'
checkpoint_dir.mkdir(parents=True, exist_ok=True)

early_stopping_strategy = EarlyStoppingStrategy(
    tolerance=10,
    min_delta=0.005,
    checkpoint_dir=checkpoint_dir
)

model, device = ResNet.from_device()

_, train_losses, validation_losses = train(
    model, device,
    dataloader_train, dataloader_val,
    early_stopping_strategy=early_stopping_strategy, n_epochs=100, weight_decay=0.05, lr=1e-4
)

epochs:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
import plotly.graph_objects as go

def learning_curves(
    train_losses,
    validation_losses,
    title: str = 'Loss per Epoch',
    label_x: str = 'Epochs',
    label_y: str = 'Loss',
) -> None:
    epochs = np.arange(max(len(train_losses), len(validation_losses)))

    go.Figure(data=[
            go.Scatter(name='Training', x=epochs, y=train_losses, mode='lines'),
            go.Scatter(name='Validation', x=epochs, y=validation_losses, mode='lines'),
    ]).update_layout(title=title, xaxis_title=label_x, yaxis_title=label_y).show()

In [None]:
learning_curves(train_losses, validation_losses)

## Evaluating our ResNet architecture

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def evaluate(model: nn.Module, device: torch.device, data_loader: DataLoader):
    model.eval()

    correct = 0
    ground_truth_labels  = []
    predicted_labels = []
    with torch.no_grad():
        for batch_X, batch_y, batch_lengths in tqdmf(data_loader, desc='batches'):
            batch_X, batch_y, batch_lengths = batch_X.to(device), batch_y.to(device), batch_lengths.to(device)

            output = model(batch_X, batch_lengths)
            
            _, predictions = torch.max(output, 1)

            predicted_labels.extend(predictions.detach().cpu().numpy())
            ground_truth_labels.extend(batch_y.detach().cpu().numpy())
            
            correct += (predictions == batch_y).sum().item()

    classes = np.unique(ground_truth_labels)

    clf_report = classification_report(ground_truth_labels, predicted_labels, labels=classes, output_dict = True)
    precision = precision_score(ground_truth_labels, predicted_labels, average='micro')
    recall = recall_score(ground_truth_labels, predicted_labels, average='micro')
    f1 = f1_score(ground_truth_labels, predicted_labels, average='micro')
    
    micro_metrics = [precision ,recall ,f1, float(len(predicted_labels))]
    clf_report_df = pd.DataFrame.from_dict(clf_report)
    clf_report_df['micro avg'] = micro_metrics

    return clf_report_df