# Experiment 04: Amazon Planet

This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). Here we use a pretrained ResNet50 model to generate the features from the dataset.

For details of the virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 1](01_airline.ipynb).

In [2]:
import sys
from collections import defaultdict
import numpy as np
import pkg_resources
from libs.loaders import load_planet_kaggle
from libs.planet_kaggle import threshold_prediction
from libs.timer import Timer
from libs.utils import get_number_processors
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Print the interpreter version and the installed XGBoost/LightGBM versions
# so the benchmark numbers below can be tied to a specific environment.
print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))


System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2


In [3]:
# Fixed seed for the experiment.
# NOTE(review): no cell visible in this notebook passes random_seed to the
# loaders or the boosters -- confirm whether it is meant to be used.
random_seed=42

In [4]:
%env MOUNT_POINT=/datadrive

env: MOUNT_POINT=/datadrive


The images are loaded and featurised using a pretrained ResNet50 model available from Keras

In [5]:
# Load the train/test features and labels via the project loader.
# The captured output below shows it featurising the images, which takes
# roughly 40 minutes in this run.
X_train, y_train, X_test, y_test = load_planet_kaggle()

Featurising training images: 100%|██████████| 1094/1094.0 [34:11<00:00,  1.73s/it]
Featurising validation images: 100%|██████████| 172/172.0 [05:21<00:00,  1.46s/it]


## XGBoost vs LightGBM benchmark

We will compare both libraries on speed and performance.

In [6]:
# Processor count reported by the project helper; it is passed as `nthread`
# in the XGBoost and LightGBM parameter dictionaries below.
number_processors = get_number_processors()
print("Number of processors: ", number_processors)

Number of processors:  24


We will use a one-vs-rest approach, so each classifier is responsible for determining whether its assigned tag applies to the image.

In [7]:
def train_and_validate_xgboost(params, train_features, train_labels, validation_features, num_boost_round=300):
    """Train one XGBoost booster per label column and predict on the validation set.

    Each column of ``train_labels`` is treated as an independent binary
    problem (one-vs-rest): a fresh booster is trained on that column and its
    predictions are stored in the matching column of the result.

    Args:
        params: Parameter dictionary forwarded unchanged to ``xgb.train``.
        train_features: Training feature matrix accepted by ``xgb.DMatrix``.
        train_labels: 2-D array indexed as ``train_labels[:, class_i]``, one
            column per class.
        validation_features: Validation feature matrix; its ``shape[0]`` sets
            the number of prediction rows.
        num_boost_round: Number of boosting rounds for every per-class model.

    Returns:
        A tuple ``(y_val_pred, time_results)``. ``y_val_pred`` is an array of
        shape ``(validation_features.shape[0], n_classes)`` holding the raw
        booster predictions. ``time_results`` is a ``defaultdict(list)`` with
        keys ``'train_time'`` and ``'test_time'``, each holding one
        ``Timer.interval`` value per class.
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    # The validation matrix is identical for every class, so build it once
    # instead of re-creating it on each iteration.
    dtest = xgb.DMatrix(data=validation_features)
    for class_i in tqdm(range(n_classes)):
        # Only the label column changes between classes.
        dtrain = xgb.DMatrix(data=train_features, label=train_labels[:, class_i])
        # DMatrix construction is deliberately kept outside the timed regions.
        with Timer() as t:
            model = xgb.train(params, dtrain, num_boost_round=num_boost_round)
        time_results['train_time'].append(t.interval)

        with Timer() as t:
            y_val_pred[:, class_i] = model.predict(dtest)
        time_results['test_time'].append(t.interval)

    return y_val_pred, time_results

In [8]:
def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round=300):
    """Train one LightGBM booster per label column and predict on the validation set.

    Each column of ``train_labels`` is treated as an independent binary
    problem (one-vs-rest): a fresh booster is trained on that column and its
    predictions are stored in the matching column of the result.

    Args:
        params: Parameter dictionary forwarded unchanged to ``lgb.train``.
        train_features: Training feature matrix (NumPy array or DataFrame).
        train_labels: 2-D label matrix (NumPy array or DataFrame), one column
            per class.
        validation_features: Validation feature matrix passed to
            ``Booster.predict``; its ``shape[0]`` sets the number of
            prediction rows.
        num_boost_round: Number of boosting rounds for every per-class model.

    Returns:
        A tuple ``(y_val_pred, time_results)``. ``y_val_pred`` is an array of
        shape ``(validation_features.shape[0], n_classes)`` holding the raw
        booster predictions. ``time_results`` is a ``defaultdict(list)`` with
        keys ``'train_time'`` and ``'test_time'``, each holding one
        ``Timer.interval`` value per class.
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    # The original mixed NumPy-style indexing (`[:, i]`) with the pandas
    # `.values` attribute, which fails for ndarrays (no `.values`) and for
    # DataFrames (no `[:, i]`). Normalise both inputs to ndarrays once.
    features = np.asarray(train_features)
    labels = np.asarray(train_labels)
    for class_i in tqdm(range(n_classes)):
        # A column slice of a 2-D array is strided; hand LightGBM a
        # contiguous 1-D label vector.
        class_labels = np.ascontiguousarray(labels[:, class_i])
        dtrain = lgb.Dataset(features, class_labels, free_raw_data=False)
        # Dataset object creation is kept outside the timed regions.
        with Timer() as t:
            model = lgb.train(params, dtrain, num_boost_round=num_boost_round)
        time_results['train_time'].append(t.interval)

        with Timer() as t:
            y_val_pred[:, class_i] = model.predict(validation_features)
        time_results['test_time'].append(t.interval)

    return y_val_pred, time_results

In [9]:
def _samples_averaged(score_fn):
    """Return a two-argument scorer that calls ``score_fn`` with average='samples'."""
    return lambda y_true, y_pred: score_fn(y_true, y_pred, average='samples')


# Metric name -> callable(y_true, y_pred) used to score the multi-label
# predictions. Precision, recall and F1 are computed with average='samples'.
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': _samples_averaged(precision_score),
    'Recall': _samples_averaged(recall_score),
    'F1': _samples_averaged(f1_score),
}

def classification_metrics(metrics, y_true, y_pred):
    """Evaluate every metric in ``metrics`` on the same labels and predictions.

    Args:
        metrics: Mapping of metric name to a callable taking
            ``(y_true, y_pred)``.
        y_true: Ground-truth labels, passed as the first argument to each metric.
        y_pred: Predicted labels, passed as the second argument to each metric.

    Returns:
        A dict mapping each metric name to the value its callable returned.
    """
    scores = {}
    for name, score_fn in metrics.items():
        scores[name] = score_fn(y_true, y_pred)
    return scores

In [10]:
# Collects timing and classification metrics for each benchmarked model,
# keyed by a short model name.
results_dict = {}

In [11]:
# Parameters for the XGBoost run that leaves `tree_method` unset.
# Fix: the L2 regularisation key was misspelled 'reg_lamda', so the setting
# never reached XGBoost under its real name ('reg_lambda').
xgb_params = {'max_depth':8, 
              'objective':'binary:logistic', 
              'min_child_weight':1, 
              'eta':0.1, 
              'colsample_bytree':0.80,
              'scale_pos_weight':2, 
              'gamma':0.1, 
              'reg_lambda':1, 
              'subsample':1,
              'nthread':number_processors
             }

In [12]:
# Train and evaluate one XGBoost model per label column using xgb_params
# (the captured output below shows 17 classes taking about 28 minutes).
y_pred, timing_results = train_and_validate_xgboost(xgb_params, X_train, y_train, X_test)

100%|██████████| 17/17 [28:39<00:00, 83.42s/it] 


In [13]:
# Store the XGBoost totals: per-class timings are summed, and the
# predictions are passed through threshold_prediction (threshold=0.1)
# before being scored against y_test.
results_dict['xgb'] = dict(
    train_time=np.sum(timing_results['train_time']),
    test_time=np.sum(timing_results['test_time']),
    performance=classification_metrics(
        metrics_dict,
        y_test,
        threshold_prediction(y_pred, threshold=0.1),
    ),
)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [13]:
# Parameters for the histogram-based XGBoost run: leaf-wise growth
# ('lossguide') capped at 255 leaves, with max_depth=0 (no depth limit).
# Fix: the L2 regularisation key was misspelled 'reg_lamda', so the setting
# never reached XGBoost under its real name ('reg_lambda').
xgb_hist_params = {'max_depth':0, 
                  'objective':'binary:logistic', 
                  'min_child_weight':1, 
                  'eta':0.1, 
                  'colsample_bytree':0.80,
                  'scale_pos_weight':2, 
                  'gamma':0.1, 
                  'reg_lambda':1, 
                  'subsample':1,
                  'tree_method':'hist', 
                  'max_leaves':255, 
                  'grow_policy':'lossguide',
                  'nthread':number_processors
                 }

In [None]:
# Repeat the per-label training with the histogram-based parameters
# (tree_method='hist'); y_pred and timing_results are overwritten.
y_pred, timing_results = train_and_validate_xgboost(xgb_hist_params, X_train, y_train, X_test)

 12%|█▏        | 2/17 [32:25<4:11:22, 1005.53s/it]

In [None]:
# Store the XGBoost-hist totals: per-class timings are summed, and the
# predictions are passed through threshold_prediction (threshold=0.1)
# before being scored against y_test.
results_dict['xgb_hist'] = dict(
    train_time=np.sum(timing_results['train_time']),
    test_time=np.sum(timing_results['test_time']),
    performance=classification_metrics(
        metrics_dict,
        y_test,
        threshold_prediction(y_pred, threshold=0.1),
    ),
)

In [None]:
# LightGBM parameters for the benchmark run.
# NOTE(review): scale_pos_weight is 1 here but 2 in both XGBoost parameter
# sets, and num_leaves is 256 versus max_leaves=255 for xgb_hist -- confirm
# whether these differences are intentional for the comparison.
lgbm_params = dict(
    num_leaves=2**8,
    learning_rate=0.1,
    scale_pos_weight=1,
    min_split_gain=0.1,
    min_child_weight=1,
    reg_lambda=1,
    subsample=1,
    objective='binary',
    task='train',
    nthread=number_processors,
)

In [None]:
# Train and evaluate one LightGBM model per label column.
# Fix: the cell called the undefined `train_and_validate` with the undefined
# `lgbm_model`; the LightGBM helper and parameter dict are named
# `train_and_validate_lightgbm` and `lgbm_params`.
y_pred, timing_results = train_and_validate_lightgbm(lgbm_params, X_train, y_train, X_test)

In [None]:
# Store the LightGBM totals: per-class timings are summed, and the
# predictions are passed through threshold_prediction (threshold=0.1)
# before being scored against y_test.
results_dict['lgbm'] = dict(
    train_time=np.sum(timing_results['train_time']),
    test_time=np.sum(timing_results['test_time']),
    performance=classification_metrics(
        metrics_dict,
        y_test,
        threshold_prediction(y_pred, threshold=0.1),
    ),
)

In [None]:
# Results
# `json` is not imported by the notebook's import cell, so import it here;
# without it this cell raises NameError.
import json

# Pretty-print the collected timings and metrics with keys sorted.
print(json.dumps(results_dict, indent=4, sort_keys=True))