# Experiment 04: Amazon Planet

This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). Here we use a pretrained ResNet50 model to generate the features from the dataset.

In [1]:
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from xgboost import XGBClassifier

from libs.planet_kaggle import *
from libs.timer import Timer

% matplotlib inline
% load_ext autoreload
% autoreload 2


Using TensorFlow backend.


In [2]:
# Seed handed to both gradient-boosting models further down.
random_seed = 42

# Locations of the label CSV and of the train / validation JPEG folders,
# all resolved relative to the dataset root.
PLANET_KAGGLE_ROOT = '/mnt/data/planet'
PLANET_KAGGLE_LABEL_CSV = os.path.join(PLANET_KAGGLE_ROOT, 'train_v2.csv')
PLANET_KAGGLE_JPEG_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg')
PLANET_KAGGLE_JPEG_VAL_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'validate-jpg')

# Fail fast, before any expensive work, if one of the expected paths is missing.
for required_path in (PLANET_KAGGLE_ROOT,
                      PLANET_KAGGLE_JPEG_DIR,
                      PLANET_KAGGLE_LABEL_CSV,
                      PLANET_KAGGLE_JPEG_VAL_DIR):
    assert os.path.exists(required_path)

Below we can see that for each image name there are a number of tags describing the scene.

In [3]:
# Read the label CSV into a DataFrame and preview its first five rows.
labels_df = pd.read_csv(filepath_or_buffer=PLANET_KAGGLE_LABEL_CSV)
labels_df.head(n=5)

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


We need to convert the tags to one-hot encoded labels.

In [4]:
# labels_from is not defined in this notebook (presumably it arrives via the
# star import from libs.planet_kaggle and returns the distinct tags found in
# labels_df -- TODO confirm against the helper's source).
label_list = labels_from(labels_df)

In [5]:
# enrich_with_feature_encoding is not defined in this notebook (presumably it
# adds the one-hot tag encoding to the frame -- TODO confirm).
# NOTE(review): labels_df is rebound to its own transformed value, so
# re-running this cell feeds an already-enriched frame back into the helper;
# verify the helper tolerates that or re-run from the read_csv cell.
labels_df = enrich_with_feature_encoding(labels_df)

In [6]:
# multi_label_dict is indexed by image name further down when building
# y_train / y_val. to_multi_label_dict is not defined in this notebook
# (presumably it maps each image name to its label vector -- TODO confirm).
multi_label_dict = to_multi_label_dict(labels_df)

In [7]:
# Number of '*.jpg' entries in the training and validation folders; these
# counts drive the index ranges used for featurisation below.
# get_file_count is not defined in this notebook (presumably it counts the
# paths matching the glob pattern -- TODO confirm).
nb_train_samples = get_file_count(os.path.join(PLANET_KAGGLE_JPEG_DIR, '*.jpg'))
nb_validation_samples = get_file_count(os.path.join(PLANET_KAGGLE_JPEG_VAL_DIR, '*.jpg'))

Here we use the ResNet50 model available from Keras to extract the features from the images

In [8]:
# include_top=False builds ResNet50 without its final fully-connected
# classification layer, so the network is used as a feature extractor.
# No `weights` argument is given, so the Keras default applies (presumably the
# ImageNet-pretrained weights mentioned in the introduction -- confirm for the
# installed Keras version).
model = ResNet50(include_top=False)

In [9]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items.

    Slices start at offsets 0, n, 2n, ...; the last slice is shorter when
    ``len(l)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(l), n)
    yield from (l[offset:offset + n] for offset in offsets)

In [10]:
def read_images(filepath, filenames):
    """Load a batch of JPEG images as a single preprocessed array.

    Each entry of ``filenames`` (given without extension) is read from
    ``filepath`` as ``<name>.jpg``, resized to 224x224, converted to an array,
    given a leading axis of length one and passed through ``preprocess_input``.
    The per-image arrays are then concatenated along that leading axis.
    """
    def load_single(name):
        # One image -> one preprocessed array with a leading axis of size 1.
        jpeg_path = os.path.join(filepath, name + '.jpg')
        loaded = image.load_img(jpeg_path, target_size=(224, 224))
        pixels = np.expand_dims(image.img_to_array(loaded), axis=0)
        return preprocess_input(pixels)

    return np.concatenate([load_single(name) for name in filenames])

In [11]:
def featurise_images(model, filepath, nameformat, num_iter, batch_size=32):
    """Use a DL model to featurise images, one batch at a time.

    Args:
        model: object exposing ``predict_on_batch(batch)``; called once per batch.
        filepath: directory passed to ``read_images``.
        nameformat: format string; ``nameformat.format(index)`` gives the image
            name (without extension) for each index.
        num_iter: iterable of indices to featurise; materialised into a list.
        batch_size: maximum number of images per batch.

    Returns:
        ``(features, img_names)``: a NumPy array holding one feature entry per
        image and the list of formatted image names, in matching order.
    """
    features = list()
    img_names = list()
    num_list = list(num_iter)
    # tqdm expects an integer total. np.ceil returns a float, which rendered
    # as "1094/1094.0" in the progress bar.
    num_batches = int(np.ceil(len(num_list) / batch_size))

    for num_chunk in tqdm(chunks(num_list, batch_size), total=num_batches):
        filenames = [nameformat.format(index) for index in num_chunk]
        batch_images = read_images(filepath, filenames)
        img_names.extend(filenames)
        batch_features = model.predict_on_batch(batch_images)
        # Drop singleton axes but always keep the leading batch axis. A bare
        # squeeze() also removes the batch axis when the last batch holds one
        # image, and extend() would then append the individual feature values
        # instead of one feature vector.
        kept_shape = (batch_features.shape[0],) + tuple(
            dim for dim in batch_features.shape[1:] if dim != 1)
        features.extend(batch_features.reshape(kept_shape))
    return np.array(features), img_names

Below we extract the features using the ResNet model

In [12]:
# Featurise the training images named train_0 ... train_{nb_train_samples - 1}.
train_features, train_names = featurise_images(
    model=model,
    filepath=PLANET_KAGGLE_JPEG_DIR,
    nameformat='train_{}',
    num_iter=range(nb_train_samples),
)

100%|██████████| 1094/1094.0 [09:03<00:00,  1.88it/s]


In [13]:
# Featurise the validation images. They keep the 'train_{}' naming scheme and
# their indices continue directly after the training range.
first_validation_index = nb_train_samples
last_validation_index = nb_train_samples + nb_validation_samples
validation_features, validation_names = featurise_images(
    model=model,
    filepath=PLANET_KAGGLE_JPEG_VAL_DIR,
    nameformat='train_{}',
    num_iter=range(first_validation_index, last_validation_index),
)

100%|██████████| 172/172.0 [01:25<00:00,  2.27it/s]


In [14]:
# Build the target arrays: look up each image's labels by name, in the same
# order as the rows of the corresponding feature array.
train_label_rows = [multi_label_dict[img_name] for img_name in train_names]
validation_label_rows = [multi_label_dict[img_name] for img_name in validation_names]
y_train = np.array(train_label_rows)
y_val = np.array(validation_label_rows)

## XGBoost vs LightGBM benchmark

We will compare both libraries on speed and performance.

In [15]:
# XGBoost side of the benchmark. Every argument is spelled out so the
# configuration is explicit; the seed comes from the config cell.
xgb_model = XGBClassifier(
    # tree size and boosting schedule
    max_depth=7,
    learning_rate=0.1,
    n_estimators=300,
    silent=True,
    objective='binary:logistic',
    nthread=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    # row / column sampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    # regularisation and class weighting
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    seed=random_seed,
    missing=None,
)

We will use a one-vs-rest approach: one binary classifier is trained per tag, and each classifier is responsible for determining whether its assigned tag applies to the image.

In [20]:
def train_and_validate(model, train_features, train_labels, validation_features, use_proba=False):
    """Fit ``model`` once per label column (one-vs-rest) and predict on the validation set.

    The same estimator object is re-fitted for every column of ``train_labels``,
    and each fit and each prediction is timed separately.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``, plus
            ``predict_proba(X)`` when ``use_proba`` is True.
        train_features: training feature array passed to ``model.fit``.
        train_labels: 2-D array indexed as ``train_labels[:, class_i]``, one
            column per class.
        validation_features: array whose ``shape[0]`` rows are predicted.
        use_proba: when False (default, the original behaviour) the output of
            ``model.predict`` is stored. When True, column 1 of
            ``model.predict_proba`` is stored instead (assumes the estimator
            orders its classes as [0, 1], as scikit-learn style binary
            classifiers do -- confirm for other estimators).

    Returns:
        ``(y_val_pred, time_results)``: an array of shape
        ``(validation_features.shape[0], n_classes)`` with one prediction column
        per class, and a dict with lists ``'train_time'`` and ``'test_time'``
        holding one timing per class.
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    for class_i in tqdm(range(n_classes)):
        with Timer() as t:
            model.fit(train_features, train_labels[:, class_i])
        time_results['train_time'].append(t.interval)

        with Timer() as t:
            if use_proba:
                # Scores in [0, 1] are what a downstream threshold needs.
                y_val_pred[:, class_i] = model.predict_proba(validation_features)[:, 1]
            else:
                # NOTE(review): for classifiers, predict() yields hard class
                # labels, so thresholding this output at 0.1 afterwards cannot
                # change any decision. Pass use_proba=True to threshold scores.
                y_val_pred[:, class_i] = model.predict(validation_features)
        time_results['test_time'].append(t.interval)

    return y_val_pred, time_results

In [21]:
def _sample_averaged(score_fn):
    """Wrap a scorer so that it is always called with ``average='samples'``."""
    def scorer(y_true, y_pred):
        return score_fn(y_true, y_pred, average='samples')
    return scorer


# Display name -> callable(y_true, y_pred) for every metric reported below.
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': _sample_averaged(precision_score),
    'Recall': _sample_averaged(recall_score),
    'F1': _sample_averaged(f1_score),
}

def classification_metrics(metrics, y_true, y_pred):
    """Evaluate every metric in ``metrics`` on ``(y_true, y_pred)``.

    Args:
        metrics: mapping of metric name to a callable taking ``(y_true, y_pred)``.
        y_true: ground-truth labels, passed through to each callable.
        y_pred: predicted labels, passed through to each callable.

    Returns:
        dict mapping each metric name to the value its callable returned.
    """
    scores = {}
    for metric_name, metric in metrics.items():
        scores[metric_name] = metric(y_true, y_pred)
    return scores

In [22]:
# Collects timings and metrics per library, keyed by a short model name.
results_dict = {}

In [30]:
# One-vs-rest training and validation with XGBoost (long-running cell).
y_val_pred, timing_results = train_and_validate(
    model=xgb_model,
    train_features=train_features,
    train_labels=y_train,
    validation_features=validation_features,
)

100%|██████████| 17/17 [36:26<00:00, 107.94s/it]


In [31]:
# Summarise the XGBoost run: total fit time, total predict time and the
# classification metrics on the validation set.
# NOTE(review): y_val_pred comes from model.predict inside train_and_validate;
# if those are already hard 0/1 labels, the 0.1 threshold cannot change them.
xgb_train_time = np.sum(timing_results['train_time'])
xgb_test_time = np.sum(timing_results['test_time'])
xgb_performance = classification_metrics(
    metrics_dict,
    y_val,
    threshold_prediction(y_val_pred, threshold=0.1),
)
results_dict['xgb'] = {
    'train_time': xgb_train_time,
    'test_time': xgb_test_time,
    'performance': xgb_performance,
}

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [32]:
# LightGBM side of the benchmark. 2**7 - 1 = 127 leaves (presumably chosen to
# mirror max_depth=7 of the XGBoost model -- confirm); learning rate, number of
# estimators and seed match the XGBoost configuration above.
lgbm_num_leaves = 2 ** 7 - 1
lgbm_model = LGBMClassifier(
    num_leaves=lgbm_num_leaves,
    learning_rate=0.1,
    n_estimators=300,
    silent=True,
    seed=random_seed,
)

In [33]:
# One-vs-rest training and validation with LightGBM (long-running cell).
# This rebinds y_val_pred and timing_results, replacing the XGBoost values.
y_val_pred, timing_results = train_and_validate(
    model=lgbm_model,
    train_features=train_features,
    train_labels=y_train,
    validation_features=validation_features,
)

100%|██████████| 17/17 [10:55<00:00, 25.50s/it]


In [34]:
# Summarise the LightGBM run: total fit time, total predict time and the
# classification metrics on the validation set.
# NOTE(review): y_val_pred comes from model.predict inside train_and_validate;
# if those are already hard 0/1 labels, the 0.1 threshold cannot change them.
lgbm_train_time = np.sum(timing_results['train_time'])
lgbm_test_time = np.sum(timing_results['test_time'])
lgbm_performance = classification_metrics(
    metrics_dict,
    y_val,
    threshold_prediction(y_val_pred, threshold=0.1),
)
results_dict['lgbm'] = {
    'train_time': lgbm_train_time,
    'test_time': lgbm_test_time,
    'performance': lgbm_performance,
}

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [35]:
# Results: pretty-print timings and metrics for both libraries, keys sorted.
results_json = json.dumps(results_dict, indent=4, sort_keys=True)
print(results_json)

{
    "lgbm": {
        "performance": {
            "Accuracy": 0.6243840116809637,
            "F1": 0.8962781636444713,
            "Precision": 0.9267527963914166,
            "Recall": 0.8840336262265447
        },
        "test_time": 7.74908040324226,
        "train_time": 647.7121447487734
    },
    "xgb": {
        "performance": {
            "Accuracy": 0.6252965869684249,
            "F1": 0.8959568265280986,
            "Precision": 0.9265072701831234,
            "Recall": 0.883163855065662
        },
        "test_time": 1.5602951580658555,
        "train_time": 2186.1748406253755
    }
}
