# Model Exploration

## Installing dependencies

In [1]:
# Requirement strings handed one by one to `install_dependencies` (only when the
# notebook detects a /kaggle directory). Most entries are pinned with `==`;
# 'seaborn', 'tqdm' and 'requests' are left unpinned.
# NOTE(review): the '+cpu' suffix on torch/torchvision presumably selects CPU-only
# builds, which may require an extra package index — confirm.
# NOTE(review): 'sklearn' looks like the PyPI placeholder project rather than
# 'scikit-learn' itself — confirm which package is intended.
DEPENDENCIES = [
    'tf-slim==1.1.0',
    'numpy==1.21.6',
    'pandas==1.3.5',
    'seaborn',
    'torch==1.11.0+cpu',
    'torchvision==0.12.0+cpu',
    'matplotlib==3.5.3',
    'opencv-python==4.5.4.60',
    'sklearn==0.0.post1',
    'skorch==0.12.1',
    'tqdm',
    'requests',
    'plotly==5.11.0',
    'scikit-image==0.19.3',
]

In [2]:
import subprocess
import typing as tp
import re

def install_dependencies(dependencies: tp.List[str], show_progress: bool = True) -> tp.Tuple[tp.List[str], tp.List[Exception]]:
    """Install each requirement with pip and report what ended up installed.

    Parameters
    ----------
    dependencies:
        Requirement strings such as ``'numpy==1.21.6'``, ``'tqdm'`` or
        ``'name @ url'``.
    show_progress:
        When true, print one ``Installing "<requirement>"...`` line per entry.

    Returns
    -------
    (resolved_dependencies, errors)
        ``resolved_dependencies`` holds the matching ``pip freeze`` line(s) for
        every requirement that installed successfully; ``errors`` holds one
        ``subprocess.CalledProcessError`` per requirement that either failed to
        install or could not be found in the ``pip freeze`` output afterwards.
    """
    # Local import so this cell does not rely on an import made elsewhere.
    import sys

    def canonical(name: str) -> str:
        # Compare project names case-insensitively and treat '-', '_' and '.' alike,
        # because pip may report a name spelled differently from how it was requested.
        return re.sub(r'[-_.]+', '-', name.strip()).lower()

    emit = print if show_progress else (lambda message: None)

    # Run pip through the current interpreter so packages land in the kernel's environment.
    pip = [sys.executable, "-m", "pip"]
    freeze_command = [*pip, "freeze"]

    resolved_dependencies, errors = [], []
    for dependency in dependencies:
        emit(f'Installing "{dependency}"...')

        try:
            # check=True turns a failed installation into CalledProcessError, which is
            # what the except clause below is meant to collect.
            subprocess.run(
                [*pip, "install", "--root-user-action=ignore", dependency],
                stdout=subprocess.DEVNULL,
                check=True,
            )

            # Strip the '==<version>' or '@ <url>' part to obtain the bare project name.
            requested = canonical(re.split(r'\s*(?:==|@)\s*', dependency, maxsplit=1)[0])

            frozen = subprocess.run(freeze_command, stdout=subprocess.PIPE, check=True).stdout.decode()

            matches = []
            for line in frozen.splitlines():
                parsed = re.match(r'^\s*([^=@\s]+)\s*(?:==|@)', line)
                if parsed is not None and canonical(parsed.group(1)) == requested:
                    matches.append(line.strip())

            if not matches:
                # Mirror the previous behaviour of a non-matching `grep` (exit status 1).
                raise subprocess.CalledProcessError(1, freeze_command, output=frozen)

            resolved_dependencies.append('\n'.join(matches))
        except subprocess.CalledProcessError as e:
            errors.append(e)

    return resolved_dependencies, errors

In [3]:
from pathlib import Path

# A top-level "/kaggle" directory is taken as the sign that we are running in Kaggle;
# in any other environment this cell installs nothing.
if Path("/", "kaggle").is_dir():
    install_dependencies(DEPENDENCIES)

Installing "tf-slim==1.1.0"...
Installing "numpy==1.21.6"...
Installing "pandas==1.3.5"...
Installing "seaborn"...
Installing "torch==1.11.0+cpu"...
Installing "torchvision==0.12.0+cpu"...
Installing "matplotlib==3.5.3"...
Installing "opencv-python==4.5.4.60"...
Installing "sklearn==0.0.post1"...
Installing "skorch==0.12.1"...
Installing "tqdm"...
Installing "requests"...
Installing "plotly==5.11.0"...
Installing "scikit-image==0.19.3"...


## Loading the dataset

In [4]:
from pathlib import Path

# Current working directory: fallback data root and destination of the metrics CSV files.
BASE_DIR = Path.cwd()

# Use /kaggle/input when it exists (running in Kaggle); otherwise fall back to ./data.
INPUT_DIR = Path("/", "kaggle", "input")
if not INPUT_DIR.is_dir():
    INPUT_DIR = BASE_DIR.joinpath('data')

# Dataset: https://www.kaggle.com/datasets/nikitarom/planets-dataset
DATA_DIR = INPUT_DIR.joinpath("planets-dataset", "planet", "planet")

# Training images and their tags.
TRAIN_SAMPLES_DIR, TRAIN_LABELS_FILE = DATA_DIR / 'train-jpg', DATA_DIR / 'train_classes.csv'

# Test images (two directories) and the submission template.
TEST_SAMPLES_DIR = DATA_DIR / 'test-jpg'
TEST_SAMPLES_DIR_ADDITIONAL = INPUT_DIR.joinpath("planets-dataset", 'test-jpg-additional')
TEST_LABELS_FILE = DATA_DIR / 'sample_submission.csv'

In [5]:
import numpy as np
import numpy.typing as ntp
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog
import pandas as pd

class Dataset(object):
    """Index-addressable collection of images that are read from disk on access.

    Parameters
    ----------
    filepaths:
        One image path per sample.
    labels:
        One label array per sample, aligned with ``filepaths``.
    dimensions:
        Optional target shape passed to ``skimage.transform.resize``. When
        ``None`` the image is used exactly as returned by ``imread``.
    feature_vector:
        When true, ``__getitem__`` returns a HOG descriptor of the image;
        otherwise it returns the flattened pixel array.
    """

    def __init__(
        self,
        filepaths: tp.List[Path],
        labels: tp.List[ntp.NDArray[np.int_]],
        dimensions: tp.Optional[tp.Tuple[int, int]] = None,
        feature_vector: bool = True
    ):
        self.filepaths = filepaths
        self.labels = labels

        # Always define the attribute (possibly as None) so reading it never raises.
        self.dimensions = dimensions
        if dimensions is None:
            # No target shape requested: leave the image untouched.
            self.resize = lambda img: img
        else:
            self.resize = lambda img: resize(img, dimensions)

        self.feature_vector = feature_vector

    @classmethod
    def _extract_features(cls, image: ntp.NDArray[np.float64]) -> ntp.NDArray[np.float64]:
        """Return the HOG descriptor of ``image`` (8 orientations, 4x4-pixel cells,
        4x4-cell blocks, L2-Hys block normalisation, channels on axis 2)."""
        return hog(image, orientations=8, pixels_per_cell=(4, 4), cells_per_block=(4, 4), block_norm='L2-Hys', channel_axis=2)

    def _load_image(self, filepath: Path) -> ntp.NDArray[np.float64]:
        """Read the image at ``filepath`` and apply the configured resize step."""
        # NOTE(review): without `dimensions` the array keeps whatever dtype `imread`
        # produced, so the float annotation may not hold in that case — confirm.
        image = imread(filepath)
        image = self.resize(image)

        return image

    def __len__(self) -> int:
        """Number of samples, i.e. the number of file paths."""
        return len(self.filepaths)

    def __getitem__(self, index) -> tp.Tuple[ntp.NDArray[np.float64], ntp.NDArray[np.int_]]:
        """Load sample ``index`` and return it as a ``(sample, labels)`` tuple."""
        filepath, labels = self.filepaths[index], self.labels[index]

        sample = self._load_image(filepath)

        if self.feature_vector:
            sample = self._extract_features(sample)
        else:
            sample = sample.flatten()

        return sample, labels

In [6]:
import multiprocessing

import joblib

class DataLoader:
    """Iterate over an indexable dataset in batches.

    Parameters
    ----------
    dataset:
        Object supporting ``len()`` and integer indexing.
    batch_size:
        Number of samples per yielded batch (the last batch may be smaller).
    shuffle:
        Shuffle the sample order (with ``np.random.shuffle``) at the start of
        every iteration.
    n_workers:
        Number of joblib workers used to load samples; a negative value means
        one per CPU and ``0`` loads sequentially in the current process.
    prefetch_factor:
        Number of batches whose samples are loaded together in a single call.
    collate_fn:
        Optional callable applied to the list of samples of each batch.
    pin_memory:
        When true, every sample is loaded once in the constructor and served
        from memory afterwards.

    Raises
    ------
    ValueError
        If ``batch_size`` or ``prefetch_factor`` is smaller than 1.
    """

    def __init__(self, dataset, batch_size: int = 64, shuffle: bool = True, n_workers: int = -1, prefetch_factor: int = 2, collate_fn: tp.Optional[tp.Callable[[tp.Any], tp.Any]] = None, pin_memory: bool = False):
        if batch_size < 1:
            raise ValueError(f"'batch_size' must be a positive integer, got {batch_size}")
        if prefetch_factor < 1:
            raise ValueError(f"'prefetch_factor' must be a positive integer, got {prefetch_factor}")

        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.n_workers = n_workers
        if self.n_workers < 0:
            self.n_workers = multiprocessing.cpu_count()

        self.prefetch_factor = prefetch_factor
        self.collate_fn = collate_fn

        self.pin_memory = pin_memory

        self._indices = np.arange(0, len(self.dataset))
        if self.pin_memory:
            # Loaded in dataset order, before any shuffling, so `_samples[i]` is sample i.
            self._samples = self._load_batch(self._indices)

    def _load_batch(self, indices):
        """Return the samples at ``indices`` as a list, in the given order."""
        if hasattr(self, '_samples'):
            # Samples were pinned in memory by the constructor.
            return [self._samples[index] for index in indices]

        if self.n_workers == 0:
            return [self.dataset[index] for index in indices]

        def wrapper(index):
            return self.dataset[index]

        return list(joblib.Parallel(n_jobs=self.n_workers)(joblib.delayed(wrapper)(index) for index in indices))

    def __len__(self):
        """Number of batches produced by one full iteration."""
        # Ceiling division: a trailing partial batch counts, an empty dataset yields none.
        return -(-len(self.dataset) // self.batch_size)

    def __iter__(self):
        """Yield batches (after ``collate_fn``, if any) covering every sample once."""
        if self.shuffle:
            np.random.shuffle(self._indices)

        chunk_size = self.batch_size * self.prefetch_factor
        for chunk_start in range(0, len(self._indices), chunk_size):
            # Load `prefetch_factor` batches worth of samples in one call, then hand
            # them out batch by batch; the yielded batches are unchanged.
            prefetched = self._load_batch(self._indices[chunk_start:chunk_start + chunk_size])

            for batch_start in range(0, len(prefetched), self.batch_size):
                results = prefetched[batch_start:batch_start + self.batch_size]

                if self.collate_fn is not None:
                    results = self.collate_fn(results)

                yield results

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

def create_train_val_split(dataset_dir, classes_filepath, dimensions=None, feature_vector=False, val_ratio=0.2, batch_size=64, n_workers=-1, pin_memory=False, return_arrays=False, limit=None):
    """Split the labelled images into a training and a validation part.

    Parameters
    ----------
    dataset_dir:
        Directory containing ``<image_name>.jpg`` files.
    classes_filepath:
        CSV file with an ``image_name`` column and a whitespace-separated
        ``tags`` column.
    dimensions, feature_vector:
        Forwarded to ``Dataset``.
    val_ratio:
        Fraction of the rows assigned to the validation part.
    batch_size, n_workers, pin_memory:
        Forwarded to ``DataLoader``.
    return_arrays:
        When true, iterate both loaders completely and return stacked arrays
        instead of the loaders themselves.
    limit:
        Maximum number of rows sampled (without replacement) from the CSV;
        ``None`` keeps every row. Values above the number of rows are clamped.

    Returns
    -------
    ``(encoder, X_train, y_train, X_val, y_val)`` when ``return_arrays`` is true,
    otherwise ``(encoder, dataloader_train, dataloader_val)``.
    """
    def collate_fn(results):
        # Turn a list of (sample, labels) pairs into two stacked arrays.
        X, y = zip(*results)

        return np.vstack(X), np.vstack(y)

    def build_loader(frame, labels):
        # One Dataset/DataLoader pair for the rows of `frame`.
        filepaths = [dataset_dir / f'{image_name}.jpg' for image_name in frame['image_name'].values.tolist()]
        dataset = Dataset(filepaths, labels, dimensions=dimensions, feature_vector=feature_vector)

        return DataLoader(dataset, batch_size=batch_size, n_workers=n_workers, prefetch_factor=10, collate_fn=collate_fn, pin_memory=pin_memory)

    def materialize(dataloader):
        # Exhaust the loader and stack all of its batches row-wise.
        batches_X, batches_y = [], []
        for batch_X, batch_y in dataloader:
            batches_X.append(batch_X)
            batches_y.append(batch_y)

        return np.vstack(batches_X), np.vstack(batches_y)

    df = pd.read_csv(classes_filepath)
    if limit is not None:
        # Sampling more rows than exist without replacement raises, so clamp the request.
        df = df.sample(n=min(limit, len(df)), replace=False)

    # Each row's tag string becomes a list of individual tags.
    df = df.assign(tags=df['tags'].astype(str).str.split())

    train_indices, val_indices = train_test_split(np.arange(len(df)), test_size=val_ratio, shuffle=True)

    df_train, df_val = df.iloc[train_indices], df.iloc[val_indices]

    # The encoder only learns the tags present in the training rows; tags that occur
    # solely in the validation rows are ignored by `transform` (with a warning).
    encoder = MultiLabelBinarizer()
    labels_train = encoder.fit_transform(df_train.tags)
    labels_val = encoder.transform(df_val.tags)

    # Both loaders are built before either is iterated, training first.
    dataloader_train = build_loader(df_train, labels_train)
    dataloader_val = build_loader(df_val, labels_val)

    if return_arrays:
        X_train, y_train = materialize(dataloader_train)
        X_val, y_val = materialize(dataloader_val)

        return encoder, X_train, y_train, X_val, y_val

    return encoder, dataloader_train, dataloader_val

## Structuring our models

In [8]:
from sklearn.base import BaseEstimator
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.multioutput import MultiOutputClassifier
from enum import IntEnum, auto

import numpy as np
from sklearn.pipeline import Pipeline

class OnlinePipeline(Pipeline):
    """Pipeline that can additionally be trained incrementally, one batch at a time."""

    def partial_fit(self, X, y=None, classes=None):
        """Run ``partial_fit`` on every step with the batch ``(X, y)``.

        Every step except the last one is updated and then used to transform
        the batch for the next step; the last step also receives ``classes``.
        Returns ``self``.
        """
        final_position = len(self.steps) - 1

        for position, (_, estimator) in enumerate(self.steps):
            if position == final_position:
                estimator.partial_fit(X, y, classes=classes)
                break

            estimator.partial_fit(X, y)
            X = estimator.transform(X)

        return self

class Reduction(IntEnum):
    """Dimensionality-reduction strategies selectable for a classifier pipeline.

    ``create_multioutput_classifier`` builds a transformer for
    ``DICTIONARY_LEARNING``, ``PCA`` and ``KMEANS``; it raises ``ValueError``
    for ``SLICE``.
    """
    SLICE = 1
    DICTIONARY_LEARNING = 2
    PCA = 3
    KMEANS = 4
    
def create_multioutput_classifier(clf: BaseEstimator, reduction: tp.Optional[Reduction] = None, n_components: tp.Optional[int] = 1024) -> Pipeline:
    """Wrap ``clf`` in an ``OnlinePipeline`` for multi-output classification.

    The pipeline always starts with a ``StandardScaler`` and ends with
    ``MultiOutputClassifier(estimator=clf)``. When ``reduction`` is given, a
    matching transformer configured with ``n_components`` is placed in between.

    Raises
    ------
    AssertionError
        If ``reduction`` is given while ``n_components`` is ``None``.
    ValueError
        If ``reduction`` has no transformer associated with it.
    """
    reduction_steps = []

    if reduction is not None:
        assert n_components is not None, "'n_components' has not been specified"

        if reduction == Reduction.DICTIONARY_LEARNING:
            reducer = MiniBatchDictionaryLearning(n_components=n_components)
        elif reduction == Reduction.PCA:
            reducer = IncrementalPCA(n_components=n_components)
        elif reduction == Reduction.KMEANS:
            reducer = MiniBatchKMeans(n_clusters=n_components)
        else:
            raise ValueError(f'{reduction} is not supported')

        reduction_steps.append(('transformer', reducer))

    return OnlinePipeline([
        ('scaler', StandardScaler()),
        *reduction_steps,
        ('model', MultiOutputClassifier(estimator=clf)),
    ])

## Evaluating our models

In [9]:
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, Perceptron

def get_clfs(**kwargs):
    """Return the candidate pipelines keyed by classifier name.

    Every keyword argument is forwarded to ``create_multioutput_classifier``.
    """
    candidates = [
        ('SGDClassifier', SGDClassifier(max_iter=1000, tol=1e-3)),
        ('PassiveAggressiveClassifier', PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)),
    ]

    return {
        label: create_multioutput_classifier(estimator, **kwargs)
        for label, estimator in candidates
    }

In [10]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, fbeta_score
from functools import partial
from tqdm.notebook import tqdm

def get_scorers() -> tp.List[tp.Tuple[str, tp.Callable[[ntp.NDArray[np.int_], ntp.NDArray[np.int_]], np.float_]]]:
    """Return the ``(name, scorer)`` pairs used to evaluate predictions.

    Each scorer is called as ``scorer(y_true, y_pred)``. The list holds F1 and
    F2 with micro, macro and samples averaging, followed by accuracy and
    macro-averaged precision and recall.
    """
    averages = ('micro', 'macro', 'samples')

    scorers = [
        (f'F1 ({average})', partial(f1_score, average=average, zero_division=0))
        for average in averages
    ]
    scorers.extend(
        (f'F2 ({average})', partial(fbeta_score, beta=2, average=average, zero_division=0))
        for average in averages
    )
    scorers.append(('Accuracy', accuracy_score))
    scorers.append(('Precision', partial(precision_score, average='macro', zero_division=0)))
    scorers.append(('Recall', partial(recall_score, average='macro', zero_division=0)))

    return scorers

def evaluate(
    clf: Pipeline, name: str,
    X_train: ntp.NDArray[np.float_], y_train: ntp.NDArray[np.int_],
    X_val: ntp.NDArray[np.float_], y_val: ntp.NDArray[np.int_]
) -> pd.DataFrame:
    """Fit ``clf`` on the training data and score its validation predictions.

    Returns a one-row DataFrame indexed by ``name`` with one column per scorer
    returned by ``get_scorers``.
    """
    scorers = get_scorers()

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    metric_names = [metric for metric, _ in scorers]
    metric_values = [score_fn(y_val, y_pred) for _, score_fn in scorers]

    return pd.DataFrame([metric_values], index=[name], columns=metric_names)

In [11]:
def collect_results(
    X_train: ntp.NDArray[np.float64], y_train: ntp.NDArray[np.int_],
    X_val: ntp.NDArray[np.float64], y_val: ntp.NDArray[np.int_],
    dimensions=None, feature_vector=False, limit=None, **kwargs
):
    """Evaluate every candidate classifier and store the metrics as a CSV file.

    Parameters
    ----------
    X_train, y_train, X_val, y_val:
        Training and validation arrays handed to ``evaluate``.
    dimensions, feature_vector, limit:
        Only used to build the name of the CSV file.
    **kwargs:
        Forwarded to ``get_clfs`` and also included in the file name.

    Returns
    -------
    (clfs, df)
        The classifiers by name and a DataFrame with one row per classifier
        that could be evaluated. A classifier whose evaluation raises is
        reported on stdout and skipped; if all of them fail, ``df`` has the
        metric columns but no rows.
    """
    clfs = get_clfs(**kwargs)

    frames, progress_bar = [], tqdm(clfs.items(), position=0)
    for name, clf in progress_bar:
        progress_bar.set_description(name)

        try:
            frames.append(evaluate(clf, name, X_train, y_train, X_val, y_val))
        except Exception as e:
            # Best effort: one failing classifier must not abort the others.
            print(f'Unexpected error: {e}')

    if frames:
        df = pd.concat(frames)
    else:
        # Nothing could be evaluated: return an empty table instead of failing on None.
        df = pd.DataFrame(columns=[metric for metric, _ in get_scorers()])

    identifiers = dict(
        # Both components of the shape, not the first one twice.
        dimensions=f'{dimensions[0]}_{dimensions[1]}' if dimensions is not None else 'None',
        feature_vector=feature_vector,
        limit=limit
    )

    suffixes = '_'.join(f'{key}_{value}' for key, value in { **identifiers, **kwargs }.items()).lower()
    filename = f'metrics_{suffixes}.csv'
    # NOTE(review): index=False leaves the classifier names (the index) out of the file — confirm this is intended.
    df.to_csv(BASE_DIR / filename, index=False)

    return clfs, df

Achieving reproducible results requires initializing (also known as `seeding`) the random number generators (RNGs) used by our dependencies. To do so, we designate a `RANDOM_SEED`, namely `1234`, and use it to initialize the following RNGs:

- `numpy` (`np.random.seed`)
- `random` (`random.seed`)
- `torch (CPU)` (`torch.manual_seed`)
- `torch (GPU)` (`torch.cuda.manual_seed`)

The aforementioned RNGs are used by `torch`, `numpy` and `sklearn` to generate random numbers, while `random.seed` seeds the RNG of the Python standard library. We seed every one of them in order to cover edge cases in which third-party code uses one of them unbeknownst to us. Lastly, `PYTHONHASHSEED` controls the hashing of str, bytes and datetime objects. More specifically (as stated in the official `Python` documentation):

_"If this variable is not set or set to random, a random value is used to seed the hashes of str, bytes and datetime objects..."_

In [12]:
import os
import torch
import numpy as np
import random

# Single seed shared by every random number generator seeded below.
RANDOM_SEED = 1234

if RANDOM_SEED is not None:
    # NOTE(review): PYTHONHASHSEED is presumably only read when an interpreter starts,
    # so this assignment would affect processes launched afterwards — confirm.
    os.environ["PYTHONHASHSEED"] = f"{RANDOM_SEED}"

    # Python standard library and NumPy global generators.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    # PyTorch generators (CPU and GPU).
    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed(RANDOM_SEED)

In [15]:
# Number of rows sampled from the labels file for the experiments below.
LIMIT = 64 * 1 # Set to None to train on the entire dataset

In [17]:
# Try every combination of input representation (raw pixels vs. HOG features) and
# reduction strategy, keeping the combination with the highest 'F1 (samples)' score.
configuration, best_score = {}, float('-inf')
for feature_vector in tqdm([False, True], position=-2, leave=False):
    # The arrays only depend on the representation, so they are built once per outer iteration.
    _, X_train, y_train, X_val, y_val = create_train_val_split(
        TRAIN_SAMPLES_DIR, TRAIN_LABELS_FILE,
        dimensions=None, feature_vector=feature_vector, limit=LIMIT, pin_memory=True, return_arrays=True
    )

    for reduction in tqdm([None, Reduction.PCA, Reduction.KMEANS, Reduction.DICTIONARY_LEARNING], position=-1, leave=False):
        df = collect_results(
            X_train, y_train, X_val, y_val,
            dimensions=None, feature_vector=feature_vector, limit=LIMIT,
            reduction=reduction, n_components=32
        )[1]

        score = df['F1 (samples)'].max()
        if not (score > best_score):
            continue

        # Remember which classifier (row label) achieved the new best score.
        configuration = {
            'feature_vector': feature_vector,
            'reduction': reduction,
            'model': df['F1 (samples)'].idxmax(),
        }
        best_score = score

  0%|          | 0/2 [00:00<?, ?it/s]

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]