# Model Exploration

## Installing dependencies

In [1]:
# Requirement specifiers that are handed, one at a time, to `pip install`.
# Most entries are pinned with `==`; 'seaborn', 'tqdm' and 'requests' are left
# unpinned, so whichever version the environment resolves is used.
# NOTE(review): 'sklearn' is presumably the PyPI alias of scikit-learn — confirm
# whether 'scikit-learn' should be pinned directly instead.
DEPENDENCIES = [
    'tf-slim==1.1.0',
    'numpy==1.21.6',
    'pandas==1.3.5',
    'seaborn',
    'torch==1.11.0+cpu',
    'torchvision==0.12.0+cpu',
    'matplotlib==3.5.3',
    'opencv-python==4.5.4.60',
    'sklearn==0.0.post1',
    'skorch==0.12.1',
    'tqdm',
    'requests',
    'plotly==5.11.0',
    'scikit-image==0.19.3',
]

In [2]:
import re
import subprocess
import sys
import typing as tp

def install_dependencies(dependencies: tp.List[str], show_progress: bool = True) -> tp.Tuple[tp.List[str], tp.List[Exception]]:
    """Install requirements with pip and report what ended up installed.

    Each requirement is installed on its own, then looked up in the output of
    ``pip freeze`` so the caller can see the exact resolved entry.

    Args:
        dependencies: Requirement specifiers such as ``'numpy==1.21.6'``,
            ``'tqdm'`` or ``'pkg @ <url>'``.
        show_progress: When true, print one line per requirement before
            installing it.

    Returns:
        A ``(resolved_dependencies, errors)`` tuple. ``resolved_dependencies``
        holds the matching ``pip freeze`` line of every requirement that was
        installed and found. ``errors`` holds a ``subprocess.CalledProcessError``
        for every pip invocation that exited with a non-zero status and a
        ``LookupError`` for every requirement that could not be found in the
        ``pip freeze`` output afterwards.
    """
    emit = print if show_progress else (lambda _message: None)

    # Run pip through the current interpreter so packages land in the
    # environment this process is actually using.
    pip = [sys.executable, "-m", "pip"]

    def canonical(name: str) -> str:
        # Treat runs of '-', '_' and '.' as equivalent and ignore case, so that
        # e.g. 'tf-slim' and 'tf_slim' compare equal.
        return re.sub(r"[-_.]+", "-", name).lower()

    resolved_dependencies, errors = [], []
    for dependency in dependencies:
        emit(f'Installing "{dependency}"...')

        try:
            # check=True is what turns a failed install into CalledProcessError;
            # without it the failure would go unnoticed.
            subprocess.run(
                [*pip, "install", "--root-user-action=ignore", dependency],
                stdout=subprocess.DEVNULL,
                check=True,
            )
            frozen = subprocess.check_output([*pip, "freeze"]).decode()
        except subprocess.CalledProcessError as e:
            errors.append(e)
            continue

        # The distribution name is the leading identifier of the specifier,
        # whatever follows it ('==1.0', ' @ url', nothing at all, ...).
        requested = re.match(r"\s*([A-Za-z0-9][A-Za-z0-9._-]*)", dependency)
        wanted = canonical(requested.group(1)) if requested else None

        match = None
        for line in frozen.splitlines():
            entry = re.match(r"([A-Za-z0-9][A-Za-z0-9._-]*)\s*(?:==|@)", line)
            # Compare whole names: a prefix or suffix overlap is not a match.
            if entry and wanted is not None and canonical(entry.group(1)) == wanted:
                match = line.strip()
                break

        if match is None:
            errors.append(LookupError(f'"{dependency}" is not listed by `pip freeze` after installation'))
        else:
            resolved_dependencies.append(match)

    return resolved_dependencies, errors

In [3]:
# Install everything listed in DEPENDENCIES; the cell output is the
# (resolved_dependencies, errors) tuple returned by the helper.
install_dependencies(DEPENDENCIES)

Installing "tf-slim==1.1.0"...
Installing "numpy==1.21.6"...
Installing "pandas==1.3.5"...
Installing "seaborn"...
Installing "torch==1.11.0+cpu"...
Installing "torchvision==0.12.0+cpu"...
Installing "matplotlib==3.5.3"...
Installing "opencv-python==4.5.4.60"...
Installing "sklearn==0.0.post1"...
Installing "skorch==0.12.1"...
Installing "tqdm"...
Installing "requests"...
Installing "plotly==5.11.0"...
Installing "scikit-image==0.19.3"...


(['tf-slim==1.1.0',
  'numpy==1.21.6',
  'pandas==1.3.5',
  'seaborn @ file:///home/conda/feedstock_root/build_artifacts/seaborn-split_1629095986539/work',
  'torch==1.11.0+cpu',
  'torchvision==0.12.0+cpu',
  'matplotlib==3.5.3',
  'opencv-python==4.5.4.60',
  'sklearn==0.0.post1',
  'skorch==0.12.1',
  'tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1649051611147/work',
  'requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1656534056640/work',
  'plotly==5.11.0',
  'scikit-image==0.19.3'],
 [])

## Loading the dataset

In [4]:
from pathlib import Path

# Working directory of the notebook; generated artefacts are written here.
BASE_DIR = Path.cwd()

# Root under which the input datasets are mounted.
INPUT_DIR = Path("/", "kaggle", "input")

# Dataset source: https://www.kaggle.com/datasets/nikitarom/planets-dataset
DATA_DIR = INPUT_DIR.joinpath("planets-dataset", "planet", "planet")

# Training images and the CSV that maps each image name to its tags.
TRAIN_SAMPLES_DIR = DATA_DIR.joinpath('train-jpg')
TRAIN_LABELS_FILE = DATA_DIR.joinpath('train_classes.csv')

In [28]:
import joblib
import matplotlib.image
import numpy as np
import numpy.typing as ntp
from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray
from skimage.feature import hog, local_binary_pattern
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm.notebook import tqdm

def extract_features(image: ntp.NDArray[np.float_]) -> ntp.NDArray[np.float_]:
    """Turn an image into a flat feature vector.

    Only a HOG descriptor is computed (8 orientations, 4x4-pixel cells,
    4x4-cell blocks, 'L2-Hys' block normalisation, channels on axis 2). The
    descriptor is still passed through ``np.hstack`` so that further feature
    groups can be appended to the list later.
    """
    hog_descriptor = hog(
        image,
        orientations=8,
        pixels_per_cell=(4, 4),
        cells_per_block=(4, 4),
        block_norm='L2-Hys',
        channel_axis=2,
    )

    # A local-binary-pattern descriptor of the grayscale image used to be
    # experimented with here; it is currently not part of the feature vector.
    feature_groups = [hog_descriptor]
    return np.hstack(feature_groups)

def load_image(
    filepath: Path,
    dimensions: tp.Optional[tp.Tuple[tp.Optional[int], tp.Optional[int]]] = None,
    feature_vector: bool = True
) -> ntp.NDArray[np.generic]:
    """Read an image from disk and return it as a 1-D array.

    Args:
        filepath: Location of the image file.
        dimensions: Optional pair of upper bounds. The first entry caps array
            axis 0 and the second caps array axis 1 of the loaded image. A
            ``None`` entry leaves that axis untouched. The image is only ever
            shrunk: a bound larger than the current size has no effect.
        feature_vector: When true, return the output of ``extract_features``;
            otherwise return the flattened pixel values.

    Returns:
        A one-dimensional array: either the extracted features or the raw
        (possibly resized) pixels.
    """
    image = imread(filepath)

    if dimensions is not None:
        max_axis0, max_axis1 = dimensions
        size_axis0, size_axis1 = image.shape[0], image.shape[1]

        # Each bound is checked against None on its own; previously the second
        # bound was guarded by the first one's check and crashed when None.
        target_axis0 = size_axis0 if max_axis0 is None else min(max_axis0, size_axis0)
        target_axis1 = size_axis1 if max_axis1 is None else min(max_axis1, size_axis1)

        if (target_axis0, target_axis1) != (size_axis0, size_axis1):
            # NOTE(review): skimage's `resize` presumably returns floating point
            # data, so resized and untouched images may differ in dtype/scale —
            # confirm this is acceptable for the downstream features.
            image = resize(image, (target_axis0, target_axis1))

    if feature_vector:
        return extract_features(image)

    return image.flatten()

def load_data(
    dataset_dir: Path,
    classes_filepath: Path,
    limit: tp.Optional[int] = None,
    n_jobs: int = 8,
    **kwargs
) -> tp.Tuple[ntp.NDArray[np.float64], ntp.NDArray[np.int_], int]:
    """Load images and their multi-label targets.

    Args:
        dataset_dir: Directory that contains one ``<image_name>.jpg`` per row
            of the classes file.
        classes_filepath: CSV file with an ``image_name`` column and a ``tags``
            column holding whitespace-separated tags.
        limit: When given, only the first ``limit`` rows are loaded (same
            semantics as slicing with ``[:limit]``).
        n_jobs: Number of parallel joblib workers used to load the images.
        **kwargs: Forwarded unchanged to ``load_image`` for every sample.

    Returns:
        ``(X, y, n_classes)`` where ``X`` stacks one ``load_image`` result per
        row, ``y`` is the binary indicator matrix of the tags and ``n_classes``
        is the number of distinct tags found in the whole classes file.

    Raises:
        ValueError: If no rows are selected.
    """
    df = pd.read_csv(classes_filepath)
    df["tags"] = np.char.split(df["tags"].values.astype(str))

    # Fit on every row, not only the selected ones, so the label columns do not
    # depend on `limit`.
    encoder = MultiLabelBinarizer().fit(df["tags"])

    # Select first, then materialise: avoids walking rows that are discarded.
    selected = df if limit is None else df.iloc[:limit]
    if len(selected) == 0:
        raise ValueError("No samples selected: the classes file is empty or `limit` excludes every row")

    filepaths = [dataset_dir / f'{image_name}.jpg' for image_name in selected["image_name"]]

    # Only the image decoding and feature extraction is worth parallelising;
    # the labels are encoded in one vectorised call below.
    images = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(load_image)(filepath, **kwargs) for filepath in tqdm(filepaths)
    )

    X = np.vstack([image.reshape(1, *image.shape) for image in images])
    y = encoder.transform(selected["tags"])

    return X, y, len(encoder.classes_)

In [None]:
# Load at most the first 10,000 training samples as feature vectors, capping
# both image axes at 64; `n_classes` is the number of distinct tags.
X, y, n_classes = load_data(TRAIN_SAMPLES_DIR, TRAIN_LABELS_FILE, limit=10_000, dimensions=(64, 64), feature_vector=True)

  0%|          | 0/10000 [00:00<?, ?it/s]

Achieving reproducible results requires initializing (also known as *seeding*) the random number generators (RNGs) used by our dependencies. To do so, we designate a `RANDOM_SEED`, namely `1234`, and use it to initialize the following RNGs:

- `numpy` (`np.random.seed`)
- `random` (`random.seed`)
- `torch (CPU)` (`torch.manual_seed`)
- `torch (GPU)` (`torch.cuda.manual_seed`)

The aforementioned RNGs are used by `torch`, `numpy` and `sklearn` to generate random numbers; `random.seed` seeds the RNG of the Python standard library. We seed every one of them to cover edge cases in which third-party code relies on any of them without our knowledge. Lastly, `PYTHONHASHSEED` controls the hashing of str, bytes and datetime objects. Note that Python reads this variable when the interpreter starts, so setting it from inside the notebook only affects processes launched afterwards (e.g. worker processes), not the hash seed of the already running kernel. As stated in the official `Python` documentation:

_"If this variable is not set or set to random, a random value is used to seed the hashes of str, bytes and datetime objects..."_

In [None]:
import os
import torch
import numpy as np
import random

# Single seed shared by every random number generator below; set it to None to
# leave all generators unseeded.
RANDOM_SEED = 1234

if RANDOM_SEED is not None:
    # The seeding calls are independent of each other, so they are applied in
    # one pass: numpy, the standard library, torch on CPU and torch on GPU.
    for seed_rng in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(RANDOM_SEED)

    # NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    # start-up, so this would affect child processes rather than the running
    # kernel — confirm.
    os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. The split is made on row indices
# so that X and y are sliced with exactly the same selection.
# The seed is passed explicitly so that re-running this cell on its own
# reproduces the same split instead of depending on how much of the global
# numpy random state has been consumed so far.
train_indices, val_indices = train_test_split(
    np.arange(X.shape[0]),
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)

X_train, y_train = X[train_indices], y[train_indices]
X_val, y_val = X[val_indices], y[val_indices]

In [None]:
# Sanity check: show the shapes of the training and validation feature matrices.
print(f'Training set: {X_train.shape}, Validation set: {X_val.shape}')

In [None]:
from sklearn.decomposition import PCA

# Fit a 1024-component PCA on the training split only; it is used below to
# inspect how much variance the leading components explain.
pca = PCA(n_components=1024)
pca.fit(X_train)

In [None]:
import plotly.graph_objects as go

# Share of the variance explained by the first k principal components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

go.Figure(data=go.Scatter(
    # k runs from 1 to the number of components inclusive, so that x has exactly
    # one value per entry of y (the previous range stopped one short).
    x=np.arange(1, len(cumulative_variance) + 1),
    y=cumulative_variance,
    fill='tonexty',
)).update_layout(
    title="Principal Component Analysis",
    xaxis_title="Number of Components",
    yaxis_title="Explained Variance"
).show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier

def create_pipeline(
    clf: BaseEstimator,
    n_components: tp.Optional[int] = 1024
) -> Pipeline:
    """Assemble the preprocessing + model pipeline for ``clf``.

    The pipeline always ends with a ``StandardScaler`` ('scaler') followed by
    ``clf`` ('model'). When ``n_components`` is not ``None`` a ``PCA`` step
    ('transformer') with that many components is placed in front of them.
    """
    steps = []

    # Dimensionality reduction is optional and, when present, runs first.
    if n_components is not None:
        steps.append(('transformer', PCA(n_components=n_components)))

    steps.append(('scaler', StandardScaler()))
    steps.append(('model', clf))

    return Pipeline(steps)

def create_multioutput_classifier(clf: BaseEstimator, n_components: tp.Optional[int] = 128) -> Pipeline:
    """Wrap ``clf`` in a ``MultiOutputClassifier`` and build its pipeline.

    ``n_components`` is forwarded to ``create_pipeline``; note that its default
    here is 128.
    """
    wrapped_clf = MultiOutputClassifier(estimator=clf)
    return create_pipeline(wrapped_clf, n_components=n_components)

In [None]:
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Estimators that are wrapped in a MultiOutputClassifier before being put into
# a pipeline.
wrapped_estimators = {
    'Gradient Boosting': HistGradientBoostingClassifier(),
    'SGD': SGDClassifier(max_iter=1000, tol=1e-3),
    'SVM': SVC(),
}

# Estimators that are put into a pipeline as they are.
direct_estimators = {
    'Random Forest': RandomForestClassifier(),
    'K-NN': KNeighborsClassifier(),
    'Ridge Regression': RidgeClassifierCV(),
}

# Candidate models keyed by display name; the insertion order is the order in
# which they are later fitted and reported.
clfs = {
    **{name: create_multioutput_classifier(estimator) for name, estimator in wrapped_estimators.items()},
    **{name: create_pipeline(estimator) for name, estimator in direct_estimators.items()},
}

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from functools import partial

def get_scorers() -> tp.List[tp.Tuple[str, tp.Callable[[ntp.NDArray[np.int_], ntp.NDArray[np.int_]], np.float_]]]:
    """Return the ``(display name, scoring function)`` pairs used for evaluation.

    Every scoring function is called as ``scorer(y_true, y_pred)``. The
    averaged metrics are pre-configured with ``zero_division=0``.
    """
    micro_options = {'average': 'micro', 'zero_division': 0}
    macro_options = {'average': 'macro', 'zero_division': 0}

    scorers = []
    scorers.append(('F1 (micro)', partial(f1_score, **micro_options)))
    scorers.append(('F1 (macro)', partial(f1_score, **macro_options)))
    scorers.append(('Accuracy', accuracy_score))
    scorers.append(('Precision', partial(precision_score, **macro_options)))
    scorers.append(('Recall', partial(recall_score, **macro_options)))
    return scorers

def evaluate(clfs: tp.Mapping[str, Pipeline]) -> pd.DataFrame:
    """Fit every classifier and score it on the validation split.

    The data is taken from the module-level ``X_train``, ``y_train``, ``X_val``
    and ``y_val`` variables.

    Args:
        clfs: Mapping from a display name to the pipeline to fit and evaluate.

    Returns:
        A DataFrame with one row per classifier (the display name is the index)
        and one column per scorer returned by ``get_scorers``.
    """
    scorers = get_scorers()

    progress_bar, scores = tqdm(list(clfs.items())), {}
    for name, clf in progress_bar:
        progress_bar.set_description(f'Fitting {name} classifier...')
        clf.fit(X_train, y_train)

        progress_bar.set_description(f'Evaluating {name} classifier...')
        y_pred = clf.predict(X_val)

        # One value per scorer, in the same order as the column names below.
        scores[name] = [scorer(y_val, y_pred) for _, scorer in scorers]

    return pd.DataFrame.from_dict(scores, columns=[name for name, _ in scorers], orient='index')

In [None]:
# The classifier names live in the index of the frame returned by `evaluate`,
# so the index has to be written out; otherwise the rows of the CSV cannot be
# attributed to a model.
evaluate(clfs).to_csv(BASE_DIR / 'metrics.csv', index_label='Classifier')