In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import cv2
import matplotlib.pyplot as plt

In [None]:
# Number of training images to keep when subsampling for quick experiments.
# A falsy value (e.g. 0 or None) makes the notebook use every loaded training image.
SAMPLE = 5000

# Data preparation

We start by loading the images into memory.

In [None]:
from tmle.dataloaders import ImageFoldersDataset

Images in both the training and test sets have different, irregular sizes. Thus, we use transforms which first resize each image so that its shorter side is `256` px and then randomly crop it to the target dimension of `224x224` px. After that, the images are converted to `torch.Tensor`s.

In [None]:
from torchvision import transforms

# Preprocessing used while estimating the per-channel statistics:
# resize, take a random 224x224 crop, then convert the image to a tensor.
simple_transform = transforms.Compose(
    [transforms.Resize(256), transforms.RandomCrop(224), transforms.ToTensor()]
)

Then, we load the images in mini-batches and calculate the mean and standard deviation of each RGB channel. The obtained values will be used to normalize the inputs passed to the `SVM` and the `CNN`.

In [None]:
# Training images, preprocessed with the un-normalized transform.
dataset = ImageFoldersDataset(path_to_data='../data/cpu/train', transform=simple_transform)

Notice that even for small images (i.e. `(224, 224, 3)`), calculating the mean of each RGB channel for a dataset of reasonable size (i.e. `17k` images) would require operating on vectors of size `17000 * 224 * 224 * 3`. To speed up the computation, we perform the calculations over mini-batches.

In [None]:
%%time
from collections import defaultdict

means, stds = defaultdict(list), defaultdict(list)
counter = 0
for data in dataset.loader(batch_size=170):
    images, _ = data
    for channel in [0, 1, 2]:
        means[channel].append(images[:, channel, :, :].mean().item())
        stds[channel].append(images[:, channel, :, :].std().item())
    counter += 1
    if counter % 10 == 0:
        print('Mean calculated for {n} batches'.format(n=counter / (17000 / 170)))

In [None]:
# Collapse the per-batch statistics into a single value per RGB channel by averaging.
means = [np.mean(means[idx]) for idx in range(3)]
stds = [np.mean(stds[idx]) for idx in range(3)]

In [None]:
# Same resize / random-crop / to-tensor steps, now followed by per-channel
# normalization with the statistics estimated from the training images.
simple_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=means, std=stds),
    ]
)

In [None]:
# Load the normalized training set once, then optionally keep a random subset.
dataset_normalized = ImageFoldersDataset(
    path_to_data='../data/cpu/train/',
    transform=simple_transform
)
X_train, y_train = dataset_normalized.load_all_images()
if SAMPLE:
    # Draw distinct indices (sampling WITHOUT replacement). Sampling with
    # replacement would duplicate images, and duplicates may later land in both
    # the training and validation folds of cross-validation, inflating scores.
    # The sample size is capped so SAMPLE may exceed the number of images.
    sample_size = min(SAMPLE, len(X_train))
    random_sample_idx = np.random.choice(len(X_train), size=sample_size, replace=False)
    X_train, y_train = X_train[random_sample_idx], y_train[random_sample_idx]

***Important note about reproducibility***.

Completely reproducible results are not guaranteed across PyTorch releases, individual commits or different platforms. Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds.

However, in order to make computations deterministic for a specific problem on one specific platform and PyTorch release, there are a couple of steps to take.

Several pseudorandom number generators are involved (PyTorch's CPU and CUDA generators, NumPy's generator and Python's built-in one), which we had to seed manually to make runs reproducible. We implemented `tmle.dataloaders.ImageFoldersDataset` so that it sets the seeds as follows:

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

# Shallow classifier

Limit the dataset for testing purposes.

    idx = np.random.randint(low=0, high=17000, size=17000)
    X_train, y_train = images[idx], labels[idx]

    del(images, labels)

In [None]:
from tmle.transformers import HOGTransformer

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC

# Baseline shallow classifier: HOG features fed directly into a linear SVM.
pipeline = Pipeline(
    steps=[
        ('hog', HOGTransformer(orientations=9, pixels_per_cell=(12, 12), cells_per_block=(1, 1))),
        ('svm', LinearSVC()),
    ]
)

In [None]:
%%time
# Fit the baseline HOG + LinearSVC pipeline on the training data.
pipeline.fit(X_train, y_train)

In [None]:
%%time
# Predict on the same data the pipeline was fitted on, so metrics computed from
# these predictions describe the training fit rather than generalization.
y_train_preds = pipeline.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, balanced_accuracy_score

# Plain and class-balanced accuracy of the baseline pipeline on the training data.
print(
    'Accuracy: {acc:.5f}. Balanced accuracy: {bal_acc:.5f}'.format_map({
        'acc': accuracy_score(y_train, y_train_preds),
        'bal_acc': balanced_accuracy_score(y_train, y_train_preds),
    })
)

In [None]:
# Text summary of per-class classification metrics for the training predictions.
print(classification_report(y_train, y_train_preds))

In [None]:
# Evaluate on the held-out test set with a deterministic transform. A fixed
# center crop replaces the random crop used for training, so the reported test
# scores do not depend on the state of the random number generator.
# Normalization reuses the statistics estimated from the training images.
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds)
])
test_dataset = ImageFoldersDataset(
    path_to_data='../data/cpu/test/',
    transform=test_transform
)
X_test, y_test = test_dataset.load_all_images()
y_test_preds = pipeline.predict(X_test)

In [None]:
# Plain and class-balanced accuracy of the baseline pipeline on the test data.
print(
    'Accuracy: {acc:.5f}. Balanced accuracy: {bal_acc:.5f}'.format_map({
        'acc': accuracy_score(y_test, y_test_preds),
        'bal_acc': balanced_accuracy_score(y_test, y_test_preds),
    })
)

# Hyperparameters tuning

We start by defining a `Pipeline` which will be fed with parameters sampled from the *hyperparameter space*. Our `Pipeline` consists of three steps:

* `HOGTransformer`: implements method of transforming images into features vector based on histograms of oriented gradients,
* `PCA`: reduces the dimensionality of the feature vector output by `HOGTransformer`. In some scenarios the final feature vector may have so many dimensions that it is hard to train on using a single-CPU machine,
* `LinearSVC`: classifies the images. It scales well in terms of both the number of instances and the number of features. We will be optimizing the value of the *regularization* parameter and experimenting with different loss functions.

In [None]:
# Pipeline template (HOG -> PCA -> linear SVM) used for the hyperparameter search.
pipe = Pipeline(
    steps=[
        ('hog', HOGTransformer()),
        ('pca', PCA()),
        ('svm', LinearSVC(max_iter=50000)),
    ]
)

## Define hyperparameters space

Our definition of the *hyperparameter space* encourages the `TPE` algorithm to suggest `Pipeline`s which differ not only in terms of the classifier, but also in terms of the operations applied to the data in the *preprocessing* stage.

In [None]:
import hyperopt
from hyperopt import tpe, fmin, hp, Trials, STATUS_OK

# Search space keyed by `<pipeline step>__<parameter>`, so each sampled value
# is addressed to the matching step of the pipeline.
space = {
    'hog__orientations': hp.choice('orientations', [9, 12, 18]),
    'hog__pixels_per_cell': hp.choice('pixels_per_cell', [(8, 8), (12, 12), (24, 24)]),
    'hog__cells_per_block': hp.choice('cells_per_block', [(1, 1), (2, 2), (4, 4)]),
    'hog__block_norm': hp.choice('block_norm', ['L1', 'L2-Hys']),
    'pca__n_components': hp.choice('n_components', np.arange(50, 550, 50)),
    'svm__loss': hp.choice('loss', ['hinge', 'squared_hinge']),
    'svm__C': hp.uniform('C', 0.001, 5),
}

## Conduct experiments

We use:

* `sklearn.model_selection.StratifiedKFold` because of class imbalance present in the training set (we set the `n_splits` to `3`),
* `sklearn.metrics.balanced_accuracy_score` to measure the performance of a given classifier on both the training and validation sets. We minimize an objective whose returned score is calculated as `1 - balanced_accuracy_score(validation_set)`.

In [None]:
from tmle.model_selection import ClassifierOptimizer
from sklearn.metrics import balanced_accuracy_score

# Set up the optimizer with the pipeline template, the search space and the scoring metric.
clf_optim = ClassifierOptimizer(classifier=pipe, space=space, metric=balanced_accuracy_score)

# Run the search on the training data; only 5 evaluations are requested here.
clf_optim.find_best_params(
    X_train, y_train,
    experiments_path='../experiments/',
    experiments_name='shallow_clf_tpe_pipeline',
    max_evals=5,
)