In [1]:
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.utils import shuffle

import logging
import logging.config
import os

# Root logger: every record of level DEBUG and above is appended to LOG_FILE.
LOG_FILE = 'bayesian_opt.log' ##### change the log file name!
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# Re-running this cell must not stack a second FileHandler on the root logger
# (each extra handler would write every message one more time), so the handler
# already attached for LOG_FILE is reused when there is one.
fh = next((h for h in logger.handlers
           if isinstance(h, logging.FileHandler)
           and h.baseFilename == os.path.abspath(LOG_FILE)), None)
if fh is None:
    fh = logging.FileHandler(LOG_FILE)
    logger.addHandler(fh)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

from sklearn.metrics import *
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.metrics import precision_score as user_acc
from sklearn.metrics import recall_score as prod_acc
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

from madmex.validation import validate, pprint_val_dict

import warnings
import random

from skopt import gp_minimize
from skopt.space import Real, Integer

In [2]:
# Install a blanket 'ignore' filter: no category or module is given, so all
# Python warnings are silenced from this point on.
warnings.filterwarnings('ignore')

In [3]:
# Absolute path of the pickled training dataset that is loaded in the next cell.
ds_file = '/shared_volume/datacube/training_objects/training_mex_landsat_2017_002_L7L8_1415.pkl'

In [4]:
# NOTE: pickle.load can execute arbitrary code while unpickling, so ds_file
# must only ever point at a trusted file.
# training_dataset is indexed further down as [0] (independent variables, X)
# and [1] (labels, y).
with open(ds_file, 'rb') as f:
    training_dataset = pickle.load(f)

In [5]:
training_dataset[1].shape

(4327748,)

In [6]:
# Number of samples per class in the full dataset. Class numbers correspond to
# the id field of the madmex_tag table, not to its numeric_code field!
pxpcl = np.array([np.count_nonzero(training_dataset[1] == cl) for cl in range(1, 32)])

In [7]:
pxpcl

array([  3075, 391632, 360178,  41292,  66594,  44811, 141600,  22293,
        76187,   2410, 390418, 119866, 561611,  22691,  86128,  43360,
       136050,  12731,   8686, 213173,   1576,  20744,   1906,  41933,
        15784,  40880, 523000, 763402,  56011,  43334,  74392])

In [8]:
# modification of remove_outliers method to also give as output the outliers, in case we want to explore them
from sklearn.ensemble import IsolationForest
from madmex.util.numpy import groupby

def remove_outliers(X, y, n_estimators=101, max_samples='auto', contamination=0.25,
                    bootstrap=True, n_jobs=-1, **kwargs):
    """Split a labelled dataset into inliers and outliers, class by class.

    The rows of ``X`` are grouped by their label in ``y`` and a separate
    ``sklearn.ensemble.IsolationForest`` is fitted on every group. Rows the
    forest predicts as ``1`` are kept as inliers; all remaining rows are
    returned separately as outliers so that they can be explored.

    Args:
        X (np.ndarray): Array of independent variables of shape (n,m)
        y (np.ndarray): Array of dependent variable of shape (n,)
        n_estimators: Forwarded to ``IsolationForest(n_estimators=...)``
        max_samples: Forwarded to ``IsolationForest(max_samples=...)``;
            controls how many observations are drawn to fit each estimator
        contamination (float): The amount of contamination of the data set,
            i.e. the proportion of outliers in the data set. Used when
            fitting to define the threshold on the decision function.
        bootstrap: Forwarded to ``IsolationForest(bootstrap=...)``
        n_jobs: Forwarded to ``IsolationForest(n_jobs=...)``
        **kwargs: Arguments passed to ``sklearn.ensemble.IsolationForest``

    Example:
        >>> from sklearn.datasets import make_classification
        >>> X, y = make_classification(n_samples=10000, n_features=10,
        ...                            n_classes=5, n_informative=6)
        >>> X_in, y_in, X_out, y_out = remove_outliers(X, y)
        >>> print('Input shape:', X.shape, 'Inliers:', X_in.shape,
        ...       'Outliers:', X_out.shape)

    Return:
        tuple: ``(X, y, Xo, yo)`` where ``X, y`` are the inlier rows with
        their labels and ``Xo, yo`` are the outlier rows with their labels.
        Rows are concatenated group after group and both label arrays are
        of dtype ``np.int16``.
    """
    inlier_rows, inlier_labels = [], []
    outlier_rows, outlier_labels = [], []
    # groupby comes from madmex.util.numpy; every item it yields is used as
    # (label, rows carrying that label).
    for g in groupby(X, y):
        label, X_group = g[0], g[1]
        isolation_forest = IsolationForest(n_estimators=n_estimators,
                                           max_samples=max_samples,
                                           contamination=contamination,
                                           bootstrap=bootstrap,
                                           n_jobs=n_jobs,
                                           **kwargs)
        isolation_forest.fit(X_group)
        # Only a prediction of exactly 1 counts as inlier; anything else is
        # treated as an outlier.
        is_inlier = isolation_forest.predict(X_group) == 1
        X_in = X_group[is_inlier, :]
        X_out = X_group[~is_inlier, :]
        inlier_rows.append(X_in)
        outlier_rows.append(X_out)
        # One label per retained row, stored as int16 like the original code.
        inlier_labels.append(np.full(X_in.shape[0], label, dtype=np.int16))
        outlier_labels.append(np.full(X_out.shape[0], label, dtype=np.int16))
    # Concatenate the per-group pieces into the four returned arrays
    return (np.concatenate(inlier_rows), np.concatenate(inlier_labels),
            np.concatenate(outlier_rows), np.concatenate(outlier_labels))

In [9]:
# X, y is the data without outliers and Xo, yo are the outliers.
# remove_outliers is called with its defaults, i.e. contamination=0.25, which
# is why about a quarter of every class ends up in Xo, yo (see counts below).
X, y, Xo, yo = remove_outliers(training_dataset[0],training_dataset[1])

In [10]:
# Samples per class (ids 1..31) that remain after outlier removal.
pxpcl_ro = np.array([np.count_nonzero(y == cl) for cl in range(1, 32)])

In [11]:
# Samples per class (ids 1..31) that were flagged as outliers.
pxpcl_o = np.array([np.count_nonzero(yo == cl) for cl in range(1, 32)])

In [12]:
pxpcl

array([  3075, 391632, 360178,  41292,  66594,  44811, 141600,  22293,
        76187,   2410, 390418, 119866, 561611,  22691,  86128,  43360,
       136050,  12731,   8686, 213173,   1576,  20744,   1906,  41933,
        15784,  40880, 523000, 763402,  56011,  43334,  74392])

In [13]:
pxpcl_ro

array([  2306, 293724, 270133,  30969,  49945,  33608, 106200,  16720,
        57140,   1807, 292813,  89899, 421208,  17018,  64596,  32520,
       102037,   9548,   6514, 159880,   1182,  15558,   1429,  31450,
        11838,  30660, 392250, 572551,  42008,  32500,  55794])

In [14]:
pxpcl_o

array([   769,  97908,  90045,  10323,  16649,  11203,  35400,   5573,
        19047,    603,  97605,  29967, 140403,   5673,  21532,  10840,
        34013,   3183,   2172,  53293,    394,   5186,    477,  10483,
         3946,  10220, 130750, 190851,  14003,  10834,  18598])

#### Split dataset
To obtain a subsample of X, y we take 20000 samples per class; if a class has fewer than 20000 samples, all of its data is taken. We then shuffle the data, because X, y is ordered by class, which is not convenient for training.

In [15]:
# Row positions in y for every class id 1..31 (list entry k holds class k + 1).
indexes = [np.flatnonzero(y == cl) for cl in range(1, 32)]

In [16]:
len(indexes[0])

2306

In [17]:
def shuffled(l):
    """Return a new list holding the items of ``l`` in random order.

    ``l`` itself is not modified. The resulting order depends on the state of
    the ``random`` module, so seed it beforehand for reproducible results.

    Args:
        l (iterable): Items to shuffle.

    Returns:
        list: Shuffled copy of the items of ``l``.
    """
    # list(l) copies the items directly; the previous side-effecting
    # comprehension built an extra throwaway list of None values.
    o = list(l)
    random.shuffle(o)
    return o

In [18]:
# Fixed seed so that the per-class subsample is reproducible.
random.seed(35473234)
# Keep at most 20000 shuffled row positions per class; slicing a list that is
# shorter than 20000 simply returns all of it.
new_indexes = [shuffled(indexes[cl])[:20000] for cl in range(31)]

In [19]:
len(new_indexes[21])

15558

In [20]:
# Flatten the per-class index lists into one integer array. Concatenating the
# whole list (instead of naming entries 0..30 one by one) keeps this cell
# correct if the number of classes ever changes.
merged_new_indexes = np.array(np.concatenate(new_indexes), int)

In [21]:
len(merged_new_indexes)

503920

In [22]:
# Rows of X selected by the subsampled positions (all columns kept).
new_X = X[merged_new_indexes,:]

In [23]:
# Labels of the subsampled rows, taken with the same positions as new_X.
new_y = y[merged_new_indexes]

In [24]:
new_X.shape

(503920, 15)

In [25]:
new_y.shape

(503920,)

In [26]:
# We split the new subset into training and test datasets, where the test dataset is the 20%.
# No stratify= argument is passed, so the split is not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=546842346)

In [27]:
# Shuffle training rows and labels together with a fixed seed; X_shuff, y_shuff
# are the arrays the model is fitted on in the objective function below.
X_shuff, y_shuff = shuffle(X_train,y_train,random_state=658432434)

In [28]:
# Samples per class (ids 1..31) in the final training dataset.
pxpcl_tr = np.array([np.count_nonzero(y_train == cl) for cl in range(1, 32)])

In [29]:
pxpcl_tr

array([ 1847, 16082, 15960, 16015, 16092, 15900, 15984, 13385, 15998,
        1460, 16002, 16040, 15970, 13560, 15989, 15894, 16007,  7615,
        5134, 16080,   954, 12512,  1174, 15989,  9442, 15950, 15992,
       16015, 16129, 15951, 16014])

### New hyperparameters with Bayesian optimization for LightGBM

In [30]:
# Search space handed to gp_minimize. The ORDER of the dimensions matters:
# BayesianOptimization reads each sampled point positionally (values[0] ..
# values[4]) in exactly this order.
search_space = [Real(0.1, 0.5, name='reg_lambda'),
                Real(0.1, 0.5, name='reg_alpha'),
                Integer(500, 1500, name='n_estimators'),
                Integer(5, 25, name='max_depth'),
                Real(0.001,0.1, name='learning_rate')]

In [31]:
def _log_scores(split_name, y_true, y_pred, y_prob, labels):
    """Print and log accuracy and log loss of one data split; return the accuracy."""
    acc = accuracy_score(y_true, y_pred)
    # labels= pins the class order of the probability columns, so the loss is
    # still defined when a class happens to be absent from y_true.
    loss = log_loss(y_true, y_prob, labels=labels)
    print("{} accuracy: {}".format(split_name, acc))
    logger.debug("{} accuracy: {}".format(split_name, acc))
    print("{} log loss: {}".format(split_name, loss))
    logger.debug("{} log loss: {}".format(split_name, loss))
    return acc


def BayesianOptimization(values):
    """Objective function for ``gp_minimize``: fit LightGBM and score it.

    A ``LGBMClassifier`` is trained on ``X_shuff, y_shuff`` with the
    hyperparameters in ``values``. Accuracy and log loss are printed and
    logged for the test split (``X_test, y_test``) and for the training
    split.

    Args:
        values (sequence): One point of ``search_space``, read positionally
            as ``reg_lambda, reg_alpha, n_estimators, max_depth,
            learning_rate``.

    Returns:
        float: Negative test accuracy, so that minimising this function
        maximises accuracy.

    NOTE(review): the optimised score is measured on ``X_test``, so the test
    split also drives model selection; a separate validation split would be
    needed for an unbiased final estimate.
    """
    params = {'reg_lambda': values[0], 'reg_alpha': values[1], 'random_state': 234567943, 'n_jobs': 5,
              'n_estimators': values[2], 'max_depth': values[3], 'learning_rate': values[4]}

    print('\nTesting next set of paramaters...', params)
    logger.debug('\nTesting next set of paramaters... {}'.format(params))

    modeloLGB = LGBMClassifier(**params)
    modeloLGB.fit(X_shuff, y_shuff)

    # Each metric is computed once and reused for both print and log; the
    # former classification_report calls were dropped because their result
    # was discarded (and their arguments were passed in swapped order).
    acc = _log_scores("Test", y_test,
                      modeloLGB.predict(X_test),
                      modeloLGB.predict_proba(X_test),
                      modeloLGB.classes_)
    _log_scores("Training", y_shuff,
                modeloLGB.predict(X_shuff),
                modeloLGB.predict_proba(X_shuff),
                modeloLGB.classes_)

    return -acc

In [None]:
# Gaussian-process search over search_space that minimises BayesianOptimization
# (i.e. maximises test accuracy). The first 10 points are random
# (n_random_starts=10); n_calls is not passed, so skopt's default number of
# evaluations applies, and every evaluation trains a full LightGBM model.
res_gp = gp_minimize(BayesianOptimization, search_space, random_state=23471416, n_jobs=5, verbose=True, n_random_starts=10)

Iteration No: 1 started. Evaluating function at random point.

Testing next set of paramaters... {'reg_lambda': 0.25514521380251476, 'reg_alpha': 0.24878301265635247, 'random_state': 234567943, 'n_jobs': 5, 'n_estimators': 1288, 'max_depth': 17, 'learning_rate': 0.04637138312542715}
