# Experiment 04: Amazon Planet

This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). Here we use a pretrained ResNet50 model to generate the features from the dataset.

In [2]:
import json
import os
import sys
import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources
import seaborn
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from xgboost import XGBClassifier

from libs.loaders import load_planet_kaggle
from libs.planet_kaggle import (read_images, chunks, labels_from, enrich_with_feature_encoding, 
                                get_file_count, to_multi_label_dict, threshold_prediction)
from libs.timer import Timer
from libs.utils import get_number_processors

print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))

warnings.filterwarnings("ignore", category=DeprecationWarning) 
% matplotlib inline
% load_ext autoreload
% autoreload 2

System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Seed handed to every model built below (random_state / seed arguments).
random_seed = 42

In [4]:
# Sets MOUNT_POINT in the kernel's environment.
# NOTE(review): presumably read by the libs data loaders to locate the dataset — confirm.
%env MOUNT_POINT=/datadrive

env: MOUNT_POINT=/datadrive


The images are loaded and featurised using a pretrained ResNet50 model available from Keras.

In [6]:
# Load the train/test feature matrices and label arrays. The saved log output of
# this cell shows it reading the labels and featurising the training and
# validation images (roughly 11 minutes in total in the recorded run).
X_train, y_train, X_test, y_test = load_planet_kaggle()

INFO:libs.loaders:Reading in labels
Featurising training images: 100%|██████████| 1094/1094.0 [09:29<00:00,  1.85it/s]
Featurising validation images: 100%|██████████| 172/172.0 [01:26<00:00,  2.42it/s]


## XGBoost vs LightGBM benchmark

We will compare both libraries on speed and performance.

In [7]:
# Worker count passed to every model below (n_jobs / nthread).
number_processors = get_number_processors()

We will use a one-vs-rest approach: a separate binary classifier is trained for each tag and is responsible for determining whether that tag applies to the image.

In [8]:
def train_and_validate(model, train_features, train_labels, validation_features):
    """Fit ``model`` one-vs-rest on each label column and score the validation set.

    The same estimator instance is refitted once per column of ``train_labels``;
    after each fit its scores for ``validation_features`` are stored in the
    matching output column.

    Scores are the positive-class probabilities from ``predict_proba`` when the
    model provides it, so that a decision threshold applied by the caller is
    meaningful. Hard labels from ``predict`` would make any threshold strictly
    between 0 and 1 a no-op. Models without ``predict_proba`` fall back to
    ``predict``.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict_proba(X)`` or
            ``predict(X)``.
        train_features: training feature matrix.
        train_labels: 2-D array indexed as ``[:, class_i]``, one column per class.
        validation_features: feature matrix to score; ``shape[0]`` gives the
            number of output rows.

    Returns:
        Tuple ``(y_val_pred, time_results)``. ``y_val_pred`` is a float array of
        shape ``(validation_features.shape[0], n_classes)``. ``time_results`` is
        a ``defaultdict(list)`` with one ``Timer`` interval per class under the
        keys ``'train_time'`` and ``'test_time'``.
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    # Resolved once: the capability does not change between refits.
    has_proba = hasattr(model, 'predict_proba')
    for class_i in tqdm(range(n_classes)):
        with Timer() as t:
            model.fit(train_features, train_labels[:, class_i])
        time_results['train_time'].append(t.interval)

        with Timer() as t:
            if has_proba:
                # Column 1 holds the probability of the positive class.
                y_val_pred[:, class_i] = model.predict_proba(validation_features)[:, 1]
            else:
                y_val_pred[:, class_i] = model.predict(validation_features)
        time_results['test_time'].append(t.interval)

    return y_val_pred, time_results

In [9]:
def _samples_averaged(score_fn):
    """Return a scorer that calls ``score_fn`` with ``average='samples'``."""
    def scorer(y_true, y_pred):
        return score_fn(y_true, y_pred, average='samples')
    return scorer


# Metric name -> callable(y_true, y_pred) used to score each benchmark run.
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': _samples_averaged(precision_score),
    'Recall': _samples_averaged(recall_score),
    'F1': _samples_averaged(f1_score),
}

def classification_metrics(metrics, y_true, y_pred):
    """Evaluate every scorer in ``metrics`` on the given labels and predictions.

    Args:
        metrics: mapping of metric name to a callable taking ``(y_true, y_pred)``.
        y_true: ground-truth labels, passed to each scorer unchanged.
        y_pred: predicted labels, passed to each scorer unchanged.

    Returns:
        Dict mapping each metric name to the value its scorer returned.
    """
    scores = {}
    for label, score_fn in metrics.items():
        scores[label] = score_fn(y_true, y_pred)
    return scores

In [10]:
# Collects timing and performance per model, keyed by model name.
results_dict = {}

In [11]:
# XGBoost baseline: trees limited by depth (max_depth=7).
xgb_params = dict(
    max_depth=7,
    learning_rate=0.1,
    scale_pos_weight=2,
    n_estimators=300,
    gamma=0.1,
    min_child_weight=1,
    reg_lambda=1,
    subsample=1,
    n_jobs=number_processors,
    random_state=random_seed,
)
xgb_model = XGBClassifier(**xgb_params)

In [12]:
# One-vs-rest run with the XGBoost baseline; y_pred and timing_results are
# reused (overwritten) by the later runs.
y_pred, timing_results = train_and_validate(xgb_model, X_train, y_train, X_test)

100%|██████████| 17/17 [33:21<00:00, 99.61s/it] 


In [13]:
# Summarise the XGBoost run: total fit/predict time over all classes plus the
# classification metrics on the thresholded predictions.
results_dict['xgb'] = dict(
    train_time=np.sum(timing_results['train_time']),
    test_time=np.sum(timing_results['test_time']),
    performance=classification_metrics(
        metrics_dict,
        y_test,
        threshold_prediction(y_pred, threshold=0.1),
    ),
)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [14]:
# XGBoost with tree_method='hist' and loss-guided growth: same boosting settings
# as the baseline, but trees are capped by leaf count (max_leaves=2**7) and
# max_depth is set to 0.
xgb_hist_params = dict(
    max_depth=0,
    learning_rate=0.1,
    scale_pos_weight=2,
    n_estimators=300,
    gamma=0.1,
    min_child_weight=1,
    reg_lambda=1,
    subsample=1,
    max_leaves=2**7,
    grow_policy='lossguide',
    tree_method='hist',
    n_jobs=number_processors,
    random_state=random_seed,
)
xgb_hist_model = XGBClassifier(**xgb_hist_params)

In [None]:
# One-vs-rest run with the histogram-based XGBoost model; overwrites y_pred and
# timing_results from the previous run.
# NOTE(review): this cell has no execution count and its saved output stops at
# 0/17 — re-run it before trusting the xgb_hist results computed from y_pred.
y_pred, timing_results = train_and_validate(xgb_hist_model, X_train, y_train, X_test)

  0%|          | 0/17 [00:00<?, ?it/s]

In [18]:
# Summarise the histogram-based XGBoost run: total fit/predict time over all
# classes plus the classification metrics on the thresholded predictions.
results_dict['xgb_hist'] = dict(
    train_time=np.sum(timing_results['train_time']),
    test_time=np.sum(timing_results['test_time']),
    performance=classification_metrics(
        metrics_dict,
        y_test,
        threshold_prediction(y_pred, threshold=0.1),
    ),
)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [20]:
# LightGBM counterpart: num_leaves=2**7 matches the max_leaves used for the
# histogram-based XGBoost model, and min_split_gain takes the value XGBoost
# received as gamma.
lgbm_params = dict(
    num_leaves=2**7,
    learning_rate=0.1,
    scale_pos_weight=2,
    n_estimators=300,
    min_split_gain=0.1,
    min_child_weight=1,
    reg_lambda=1,
    subsample=1,
    nthread=number_processors,
    seed=random_seed,
)
lgbm_model = LGBMClassifier(**lgbm_params)

In [21]:
# One-vs-rest run with the LightGBM model; overwrites y_pred and timing_results
# from the previous run.
y_pred, timing_results = train_and_validate(lgbm_model, X_train, y_train, X_test)

100%|██████████| 17/17 [11:51<00:00, 18.41s/it]


In [None]:
# Summarise the LightGBM run: total fit/predict time over all classes plus the
# classification metrics on the thresholded predictions.
results_dict['lgbm'] = dict(
    train_time=np.sum(timing_results['train_time']),
    test_time=np.sum(timing_results['test_time']),
    performance=classification_metrics(
        metrics_dict,
        y_test,
        threshold_prediction(y_pred, threshold=0.1),
    ),
)

In [None]:
# Results
# Pretty-print the collected timings and metrics as JSON. default=float converts
# NumPy scalar values that the json module cannot serialise natively.
print(json.dumps(results_dict, indent=4, sort_keys=True, default=float))