# Experiment 04: Amazon Planet

This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). Here we use a pretrained ResNet50 model to generate the features from the dataset.

In [24]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.resnet50 import ResNet50
import seaborn
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score
from collections import defaultdict
import pkg_resources
from libs.planet_kaggle import read_images, chunks, labels_from, enrich_with_feature_encoding, get_file_count, to_multi_label_dict
from libs.timer import Timer
from libs.utils import get_number_processors
import warnings

# Record the interpreter and library versions this benchmark was run with.
print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))

# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
% matplotlib inline
% load_ext autoreload
% autoreload 2

System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Seed passed to the boosted-tree models further down.
random_seed = 42

# Dataset layout: everything lives under a single root directory.
PLANET_KAGGLE_ROOT = '/mnt/data/planet'
PLANET_KAGGLE_LABEL_CSV = os.path.join(PLANET_KAGGLE_ROOT, 'train_v2.csv')
PLANET_KAGGLE_JPEG_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg')
PLANET_KAGGLE_JPEG_VAL_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'validate-jpg')

# Fail fast if any of the expected inputs is missing.
assert all(
    os.path.exists(required_path)
    for required_path in (
        PLANET_KAGGLE_ROOT,
        PLANET_KAGGLE_JPEG_DIR,
        PLANET_KAGGLE_LABEL_CSV,
        PLANET_KAGGLE_JPEG_VAL_DIR,
    )
)

Below we can see that for each image name there are a number of tags describing the scene.

In [5]:
# Load the label CSV (one row per image name with its space-separated tags)
# and preview the first rows.
labels_df = pd.read_csv(PLANET_KAGGLE_LABEL_CSV)
labels_df.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


We need to convert the tags to one-hot encoded labels.

In [6]:
# labels_from is a project helper; presumably it returns the distinct tags found
# in labels_df — confirm.
# NOTE(review): label_list is not referenced again in the visible notebook.
label_list = labels_from(labels_df)

In [7]:
# Project helper; presumably adds one indicator column per tag to labels_df — confirm.
labels_df = enrich_with_feature_encoding(labels_df)

In [8]:
# Lookup keyed by image name (it is indexed with image names when building
# y_train / y_val); presumably each value is that image's label vector — confirm.
multi_label_dict = to_multi_label_dict(labels_df)

In [9]:
# Sample counts for each split; get_file_count is a project helper that is given
# a '*.jpg' glob pattern for the directory (presumably it counts the matches — confirm).
nb_train_samples = get_file_count(os.path.join(PLANET_KAGGLE_JPEG_DIR, '*.jpg'))
nb_validation_samples = get_file_count(os.path.join(PLANET_KAGGLE_JPEG_VAL_DIR, '*.jpg'))

Here we use the ResNet50 model available from Keras to extract the features from the images

In [10]:
# include_top=False leaves out the final fully-connected classification layer,
# so the network is used here purely as a feature extractor.
model = ResNet50(include_top=False)

  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect.getargspec(cls.from_config)
  arg_spec = inspect

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [13]:
def featurise_images(model, filepath, nameformat, num_iter, batch_size=32):
    """Use a DL model to featurise images.

    Parameters
    ----------
    model
        Object exposing ``predict_on_batch`` (the Keras ResNet50 in this notebook).
    filepath
        Directory passed to ``read_images`` together with the file names.
    nameformat
        Format string; ``nameformat.format(index)`` produces each image name.
    num_iter
        Iterable of the indices to featurise.
    batch_size
        Number of images sent to the model per batch.

    Returns
    -------
    tuple
        ``(features, img_names)``: an array with one entry per image and the
        list of image names in the same order.
    """
    features = list()
    img_names = list()
    num_list = list(num_iter)
    # tqdm prints `total` as given, so keep it an int (np.ceil returns a float,
    # which rendered the progress bar as "1265/1265.0").
    num_batches = int(np.ceil(len(num_list) / batch_size))

    for num_chunk in tqdm(chunks(num_list, batch_size), total=num_batches):
        filenames = [nameformat.format(index) for index in num_chunk]
        batch_images = read_images(filepath, filenames)
        img_names.extend(filenames)
        batch_features = np.asarray(model.predict_on_batch(batch_images))
        # Drop singleton axes but always keep the leading batch axis. A bare
        # squeeze() would also remove the batch axis when a batch holds a
        # single image, and extend() would then append individual scalars
        # instead of one feature vector.
        kept_dims = tuple(dim for dim in batch_features.shape[1:] if dim != 1)
        features.extend(batch_features.reshape((batch_features.shape[0],) + kept_dims))
    return np.array(features), img_names

Below we extract the features using the ResNet model

In [14]:
# Featurise the training images, named train_0 ... train_{nb_train_samples - 1},
# and report how many feature rows / names were produced.
train_features, train_names = featurise_images(model, 
                                        PLANET_KAGGLE_JPEG_DIR, 
                                        'train_{}', 
                                        range(nb_train_samples))
print(train_features.shape)
print(len(train_names))

100%|██████████| 1265/1265.0 [10:28<00:00,  1.79it/s]


In [26]:
# Featurise the validation images. They use the same 'train_{}' naming and
# continue the numbering straight after the training images.
# NOTE(review): the recorded output of this cell shows 0 iterations and a (0,)
# feature array, i.e. the index range was empty in that run — check that
# nb_validation_samples is non-zero (validate-jpg populated) before benchmarking.
validation_features, validation_names = featurise_images(model, 
                                                  PLANET_KAGGLE_JPEG_VAL_DIR, 
                                                  'train_{}', 
                                                  range(nb_train_samples, nb_train_samples+nb_validation_samples))
print(validation_features.shape)
print(len(validation_names))


0it [00:00, ?it/s]

(0,)
0


[A
[A

In [16]:
# Prepare data: look up the label entry of every featurised image, in the same
# order as the feature rows, and stack them into arrays.
y_train = np.array([multi_label_dict[name] for name in train_names])
y_val = np.array([multi_label_dict[name] for name in validation_names])

## XGBoost vs LightGBM benchmark

We will compare both libraries on speed and performance.

In [17]:
# Value passed as n_jobs / nthread to the classifiers below
# (get_number_processors is a project helper).
number_processors = get_number_processors()

We will use a one-vs-rest approach: each classifier is responsible for determining whether its assigned tag applies to the image.

In [18]:
def train_and_validate(model, train_features, train_labels, validation_features):
    """Fit ``model`` once per label column (one-vs-rest) and score the validation set.

    Parameters
    ----------
    model
        Classifier exposing ``fit`` and ``predict`` (and ideally
        ``predict_proba``). The same instance is refitted for every label.
    train_features
        Training feature matrix passed to ``model.fit``.
    train_labels
        2-D array; column ``i`` holds the binary target for label ``i``.
    validation_features
        Validation feature matrix; its first dimension sizes the output.

    Returns
    -------
    tuple
        ``(y_val_pred, time_results)`` where ``y_val_pred`` has shape
        ``(n_validation, n_classes)`` and ``time_results`` maps
        ``'train_time'`` / ``'test_time'`` to per-label lists of durations
        taken from ``Timer.interval``.
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    # The scores returned here are thresholded afterwards (at 0.1 in this
    # notebook), which is only meaningful on probabilities: hard 0/1 labels
    # from predict() are unchanged by any threshold between 0 and 1.
    has_proba = hasattr(model, 'predict_proba')
    for class_i in tqdm(range(n_classes)):
        with Timer() as t:
            model.fit(train_features, train_labels[:, class_i])
        time_results['train_time'].append(t.interval)

        with Timer() as t:
            if has_proba:
                # Column 1 of a binary classifier's predict_proba output is
                # the probability of the positive class.
                y_val_pred[:, class_i] = model.predict_proba(validation_features)[:, 1]
            else:
                y_val_pred[:, class_i] = model.predict(validation_features)
        time_results['test_time'].append(t.interval)

    return y_val_pred, time_results

In [19]:
# Scorers used to evaluate predictions; each one is called as metric(y_true, y_pred).
# Precision, recall and F1 are averaged per sample (average='samples').
metrics_dict = dict(
    Accuracy=accuracy_score,
    Precision=lambda y_true, y_pred: precision_score(y_true, y_pred, average='samples'),
    Recall=lambda y_true, y_pred: recall_score(y_true, y_pred, average='samples'),
    F1=lambda y_true, y_pred: f1_score(y_true, y_pred, average='samples'),
)

def classification_metrics(metrics, y_true, y_pred):
    """Evaluate every scorer in ``metrics`` on ``(y_true, y_pred)``.

    ``metrics`` maps a metric name to a callable taking ``(y_true, y_pred)``;
    the result maps each name to that callable's return value.
    """
    scores = {}
    for metric_name, metric in metrics.items():
        scores[metric_name] = metric(y_true, y_pred)
    return scores

In [20]:
# Collects timings and metrics per model, keyed by a short model name.
results_dict = dict()

In [21]:
# Baseline XGBoost classifier: trees limited to depth 7, 300 boosting rounds,
# no tree_method override.
xgb_model = XGBClassifier(max_depth=7, 
                         learning_rate=0.1, 
                         scale_pos_weight=2,
                         n_estimators=300,
                         gamma=0.1,
                         min_child_weight=1,
                         reg_lambda=1,
                         subsample=1,
                         n_jobs=number_processors,
                         random_state=random_seed) 

In [23]:
# Sanity check before training. The recorded output of this cell is (0,),
# i.e. no validation features had been extracted in that run.
validation_features.shape

(0,)

In [None]:
# Train and score the baseline XGBoost model, one label column at a time.
y_val_pred, timing_results = train_and_validate(xgb_model, train_features, y_train, validation_features)

In [None]:
# Total train/test time over all labels, plus metrics on the thresholded predictions.
# NOTE(review): threshold_prediction is neither imported nor defined in the
# visible notebook — confirm where it comes from before running this cell.
results_dict['xgb']={
    'train_time': np.sum(timing_results['train_time']),
    'test_time': np.sum(timing_results['test_time']),
    'performance': classification_metrics(metrics_dict, 
                                          y_val, 
                                          threshold_prediction(y_val_pred, threshold=0.1)) 
}

In [None]:
# XGBoost with the histogram tree method and loss-guided growth. max_depth=0
# is paired with max_leaves=2**7, the same leaf budget as num_leaves in the
# LightGBM model; the other hyper-parameters match xgb_model.
xgb_hist_model = XGBClassifier(max_depth=0, 
                              learning_rate=0.1, 
                              scale_pos_weight=2,
                              n_estimators=300,
                              gamma=0.1,
                              min_child_weight=1,
                              reg_lambda=1,
                              subsample=1,
                              max_leaves=2**7,
                              grow_policy='lossguide',
                              tree_method='hist',
                              n_jobs=number_processors,
                              random_state=random_seed)

In [None]:
# Train and score the histogram-based XGBoost model, one label column at a time.
y_val_pred, timing_results = train_and_validate(xgb_hist_model, train_features, y_train, validation_features)

In [None]:
# Total train/test time over all labels, plus metrics on the thresholded predictions.
# NOTE(review): threshold_prediction is neither imported nor defined in the
# visible notebook — confirm where it comes from before running this cell.
results_dict['xgb_hist']={
    'train_time': np.sum(timing_results['train_time']),
    'test_time': np.sum(timing_results['test_time']),
    'performance': classification_metrics(metrics_dict, 
                                          y_val, 
                                          threshold_prediction(y_val_pred, threshold=0.1)) 
}

In [None]:
# LightGBM classifier configured to mirror the XGBoost models: 2**7 leaves,
# 300 boosting rounds, min_split_gain in place of gamma.
# The call was missing its closing parenthesis, which made the cell a SyntaxError.
lgbm_model = LGBMClassifier(num_leaves=2**7, 
                           learning_rate=0.1, 
                           scale_pos_weight=2,
                           n_estimators=300,
                           min_split_gain=0.1,
                           min_child_weight=1,
                           reg_lambda=1,
                           subsample=1,
                           nthread=number_processors,
                           seed=random_seed)

In [None]:
# Train and score the LightGBM model, one label column at a time.
y_val_pred, timing_results = train_and_validate(lgbm_model, train_features, y_train, validation_features)

In [None]:
# Total train/test time over all labels, plus metrics on the thresholded predictions.
# NOTE(review): threshold_prediction is neither imported nor defined in the
# visible notebook — confirm where it comes from before running this cell.
results_dict['lgbm']={
    'train_time': np.sum(timing_results['train_time']),
    'test_time': np.sum(timing_results['test_time']),
    'performance': classification_metrics(metrics_dict, 
                                          y_val, 
                                          threshold_prediction(y_val_pred, threshold=0.1)) 
}

In [None]:
# Results
# json is not among the notebook's imports, so the original cell raised
# NameError; import it here so the cell runs on its own.
import json

print(json.dumps(results_dict, indent=4, sort_keys=True))