In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.resnet50 import ResNet50
import seaborn
import matplotlib.pyplot as plt

from data_utils import *
from modeling import *

% matplotlib inline
% load_ext autoreload
% autoreload 2


Using TensorFlow backend.


 multilabel_classes: Dictionary, mapping filepaths to numpy array of (multiple) classes 
        (e.g.  {'folder/some_image.jpg': np.array([ 0.,  1.,  0.,  1.]),...})

In [3]:
# Seed handed to the gradient-boosting models further down.
random_seed = 42

# Dataset layout: everything lives under a single root directory.
PLANET_KAGGLE_ROOT = '/mnt/data/planet'
PLANET_KAGGLE_LABEL_CSV = os.path.join(PLANET_KAGGLE_ROOT, 'train_v2.csv')
PLANET_KAGGLE_JPEG_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'train-jpg')
PLANET_KAGGLE_JPEG_VAL_DIR = os.path.join(PLANET_KAGGLE_ROOT, 'validate-jpg')

# Fail fast (AssertionError) if any expected path is missing; checked in the
# same order as before and stopping at the first missing one.
assert all(
    os.path.exists(required_path)
    for required_path in (
        PLANET_KAGGLE_ROOT,
        PLANET_KAGGLE_JPEG_DIR,
        PLANET_KAGGLE_LABEL_CSV,
        PLANET_KAGGLE_JPEG_VAL_DIR,
    )
)

In [4]:
# Load the training labels from the CSV configured above.
labels_df = pd.read_csv(PLANET_KAGGLE_LABEL_CSV)
# Preview the first rows as the cell output.
labels_df.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [5]:
# labels_from is not defined in this notebook; it is expected to come from one of
# the star-imports (data_utils / modeling). Presumably it collects the distinct
# tags found in labels_df — TODO confirm.
label_list = labels_from(labels_df)

In [6]:
# NOTE(review): this rebinds labels_df to the result of enrich_with_feature_encoding,
# so re-running the cell passes the already-enriched frame back into the helper.
labels_df = enrich_with_feature_encoding(labels_df)

In [7]:
# Lookup used further down to build y_train / y_val; it is indexed there with the
# image names returned by featurise_images, so its keys must match those names.
multi_label_dict = to_multi_label_dict(labels_df)

In [8]:
# get_file_count is an imported helper; presumably it counts the files matching the
# glob pattern — TODO confirm. The two counts define the index ranges that are later
# passed to featurise_images for the training and validation folders.
nb_train_samples = get_file_count(os.path.join(PLANET_KAGGLE_JPEG_DIR, '*.jpg'))
nb_validation_samples = get_file_count(os.path.join(PLANET_KAGGLE_JPEG_VAL_DIR, '*.jpg'))

In [9]:
# ResNet50 built without its top (classification) layers; it is only used through
# predict_on_batch in featurise_images to turn images into feature vectors.
model = ResNet50(include_top=False)

In [None]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    ``l`` must support ``len()`` and slicing.
    """
    slice_starts = range(0, len(l), n)
    for begin in slice_starts:
        end = begin + n
        yield l[begin:end]

In [None]:
def read_images(filepath, filenames):
    """Load and preprocess a batch of JPEG images.

    Each entry of ``filenames`` is an image name *without* extension; the file
    read is ``<filepath>/<name>.jpg``. Every image is resized to 224x224,
    converted to an array, given a leading batch axis and passed through
    ``preprocess_input``. The per-image arrays are concatenated along that
    leading axis and returned as a single array.
    """
    def _load_single(name):
        # Build the on-disk path; the '.jpg' suffix is added here.
        img_path = os.path.join(filepath, name+'.jpg')
        pixels = image.img_to_array(image.load_img(img_path, target_size=(224, 224)))
        # Add the batch axis before preprocessing, as one-image batches.
        return preprocess_input(pixels[np.newaxis, ...])

    return np.concatenate([_load_single(name) for name in filenames])

In [None]:
def featurise_images(model, filepath, nameformat, num_iter, batch_size=32):
    """Use a DL model to featurise images in batches.

    Parameters
    ----------
    model : object exposing ``predict_on_batch(batch)``.
    filepath : directory that contains the images.
    nameformat : format string with one ``{}`` placeholder that is filled with
        each index from ``num_iter``. The result is handed to ``read_images``,
        which appends ``'.jpg'`` itself, so the format should produce the image
        name without extension.
    num_iter : iterable of indices identifying the images to process.
    batch_size : number of images sent to the model per call.

    Returns
    -------
    (features, img_names) : ``features`` is an array with one flattened feature
        row per image; ``img_names`` is the list of formatted names in the same
        order.
    """
    features = list()
    img_names = list()
    num_list = list(num_iter)
    # tqdm expects an integer total; float() keeps this a true division even
    # under Python 2 integer-division rules.
    num_batches = int(np.ceil(len(num_list) / float(batch_size)))

    for num_chunk in tqdm(chunks(num_list, batch_size), total=num_batches):
        filenames = [nameformat.format(index) for index in num_chunk]
        # read_images returns a single array holding the whole batch.
        batch_images = read_images(filepath, filenames)
        batch_features = np.asarray(model.predict_on_batch(batch_images))
        # Flatten each image's features but always keep the batch axis: a bare
        # squeeze() would also drop it for a one-image batch (e.g. the last,
        # partial batch) and extend() would then add scalars instead of a row.
        features.extend(batch_features.reshape(len(filenames), -1))
        img_names.extend(filenames)
    return np.array(features), img_names

In [11]:
# The name format carries no extension: read_images appends '.jpg' itself, so
# 'train_{}.jpg' would make it look for 'train_0.jpg.jpg'.
train_features, train_names = featurise_images(model,
                                               PLANET_KAGGLE_JPEG_DIR,
                                               'train_{}',
                                               range(nb_train_samples))

In [13]:
# Validation images continue the training numbering, hence the offset range.
# The name format carries no extension: read_images appends '.jpg' itself, so
# 'train_{}.jpg' would make it look for 'train_N.jpg.jpg'.
validation_features, validation_names = featurise_images(model,
                                                         PLANET_KAGGLE_JPEG_VAL_DIR,
                                                         'train_{}',
                                                         range(nb_train_samples, nb_train_samples+nb_validation_samples))

In [29]:
# Prepare data
y_train = np.array([multi_label_dict[name] for name in train_names])
y_val = np.array([multi_label_dict[name] for name in validation_names])

In [34]:
# XGBoost binary classifier; train_and_validate refits this one instance once
# per label column.
xgb_model = XGBClassifier(
    max_depth=7,
    learning_rate=0.1,
    n_estimators=300,
    silent=True,
    objective='binary:logistic',
    nthread=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    seed=random_seed,
    missing=None,
)

In [35]:
def train_and_validate(model, train_features, train_labels, validation_features):
    """Fit ``model`` once per label column and collect validation probabilities.

    ``train_labels`` is a 2-D array with one column per label. For every column
    the same ``model`` instance is refitted on ``train_features`` and its
    ``predict_proba`` output for ``validation_features`` is read at column 1.

    Returns an array of shape ``(n_validation_rows, n_label_columns)`` holding
    those probabilities.
    """
    num_val_rows = validation_features.shape[0]
    num_labels = train_labels.shape[1]
    y_val_pred = np.zeros((num_val_rows, num_labels))
    for label_idx in tqdm(range(num_labels)):
        label_column = train_labels[:, label_idx]
        model.fit(train_features, label_column)
        # Column 1 of predict_proba is kept for this label.
        positive_proba = model.predict_proba(validation_features)[:, 1]
        y_val_pred[:, label_idx] = positive_proba
    return y_val_pred

In [None]:
# Maps a display name to a metric callable invoked as metric(y_true, y_pred).
# None of these callables is imported by name in this notebook; they are expected
# to arrive through the star-imports (data_utils / modeling).
# NOTE(review): if accuracy/precision/recall/f1 are scikit-learn's functions, the
# precision/recall/F1 defaults (average='binary') reject multilabel targets —
# confirm they are wrappers that handle the multilabel y_val used below.
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
    'F2':f2_pred
}

def classification_metrics(metrics, y_true, y_pred):
    """Evaluate every metric in ``metrics`` on ``(y_true, y_pred)``.

    ``metrics`` maps a name to a callable taking ``(y_true, y_pred)``. Returns a
    dict with the same names mapped to each callable's result.
    """
    # .items() exists on both Python 2 and Python 3; .iteritems() is Python 2 only.
    return {metric_name: metric(y_true, y_pred) for metric_name, metric in metrics.items()}

In [32]:
# Accumulates one entry per model ('xgb', 'lgbm') with its timing and metrics.
results_dict = dict()

In [36]:
# Timer is expected to come from the star-imports; t.interval is read in the next
# cell to record how long the per-label XGBoost fits took.
with Timer() as t:
    y_val_pred = train_and_validate(xgb_model, train_features, y_train, validation_features)

100%|██████████| 17/17 [36:34<00:00, 109.04s/it]


In [37]:
# Store the XGBoost timing and validation metrics. y_val_pred and t are rebound by
# the LightGBM run below, so this cell must run before that one.
# threshold_prediction is an imported helper; presumably it binarises the
# probabilities at the given cut-off — TODO confirm.
results_dict['xgb']={
    'time': t.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_val, 
                                          threshold_prediction(y_val_pred, threshold=0.1)) 
}

In [38]:
# LightGBM counterpart of xgb_model with the same learning rate, number of
# estimators and seed; num_leaves evaluates to 127.
lgbm_model = LGBMClassifier(
    num_leaves=2 ** 7 - 1,
    learning_rate=0.1,
    n_estimators=300,
    silent=True,
    seed=random_seed,
)

In [39]:
# Same timed train/validate loop as for XGBoost; this rebinds both t and y_val_pred.
with Timer() as t:
    y_val_pred = train_and_validate(lgbm_model, train_features, y_train, validation_features)

100%|██████████| 17/17 [11:14<00:00, 26.75s/it]


In [40]:
# Store the LightGBM timing and validation metrics, using the same 0.1 threshold as
# the XGBoost entry so the two results are comparable.
results_dict['lgbm']={
    'time': t.interval,
    'performance': classification_metrics(metrics_dict, 
                                          y_val, 
                                          threshold_prediction(y_val_pred, threshold=0.1)) 
}

In [41]:
# NOTE(review): the saved output of this cell shows an 'f2' key per model, while the
# cells above store 'time' and 'performance' — the output looks stale; re-run to
# refresh it.
print(results_dict)

{'lgbm': {'f2': 0.91587689477618017, 'time': '664.159'}, 'xgb': {'f2': 0.91455112015089102, 'time': '2159.710'}}
