In [1]:
#%matplotlib inline
#import matplotlib.pyplot as plt
#from matplotlib.ticker import NullFormatter
import numpy as np
import pickle
#from sklearn.manifold import TSNE
#from imblearn.over_sampling import SMOTE
#from time import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.utils import shuffle

import logging
import logging.config
logger = logging.getLogger()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('randomsearch.log') ##### change the log file name!
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(formatter)
logger.addHandler(fh)

from sklearn.metrics import *
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.metrics import precision_score as user_acc
from sklearn.metrics import recall_score as prod_acc
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
#from scipy import interp

from madmex.validation import validate, pprint_val_dict

import warnings
import random

In [2]:
warnings.filterwarnings('ignore')

In [3]:
ds_file = '/shared_volume/datacube/training_objects/training_mex_landsat_2017_002_L7L8_1415.pkl'

In [4]:
with open(ds_file, 'rb') as f:
    training_dataset = pickle.load(f)

In [5]:
training_dataset[1].shape

(4327748,)

In [6]:
# Data per class. Classes number correspond to id field in madmex_tag table, not to numeric_code filed!!
pxpcl = np.array([training_dataset[1][training_dataset[1] == cl].size for cl in range(1,32)])

In [7]:
pxpcl

array([  3075, 391632, 360178,  41292,  66594,  44811, 141600,  22293,
        76187,   2410, 390418, 119866, 561611,  22691,  86128,  43360,
       136050,  12731,   8686, 213173,   1576,  20744,   1906,  41933,
        15784,  40880, 523000, 763402,  56011,  43334,  74392])

In [8]:
# modification of remove_outliers method to also give as output the outliers, in case we want to explore them
from sklearn.ensemble import IsolationForest
from madmex.util.numpy import groupby

def remove_outliers(X, y, n_estimators=101, max_samples='auto', contamination=0.25,
                        bootstrap=True, n_jobs=-1, **kwargs):
        """Performs outliers detection and removal using Isolation Forest anomaly score
        Args:
            X (np.ndarray): Array of independent variables of shape (n,m)
            y (np.ndarray): Array of dependent variable of shape (n,)
            contamination (float): The amount of contamination of the data set,
                i.e. the proportion of outliers in the data set. Used when
                fitting to define the threshold on the decision function.
            max_sample (float): Proportion of observations to draw from X to fit
                each estimator
            **kwargs: Arguments passed to ``sklearn.ensemble.IsolationForest``
        Example:
            >>> from sklearn.datasets import make_classification
            >>> from madmex.modeling import BaseModel
            >>> X, y = make_classification(n_samples=10000, n_features=10,
            >>>                            n_classes=5, n_informative=6)
            >>> X_clean, y_clean = BaseModel.remove_outliers(X, y)
            >>> print('Input shape:', X.shape, 'Output shape:', X_clean.shape)
        Return:
            tuple: Tuple of filtered X and y arrays (X, y)
        """
        # Split X
        grouped = groupby(X, y)
        X_list = []
        y_list = []
        Xo_list = []
        yo_list = []
        for g in grouped:
            isolation_forest = IsolationForest(n_estimators=n_estimators,
                                               max_samples=max_samples,
                                               contamination=contamination,
                                               bootstrap=bootstrap,
                                               n_jobs=n_jobs,
                                               **kwargs)
            isolation_forest.fit(g[1])
            is_inlier = isolation_forest.predict(g[1])
            is_outlier = np.where(is_inlier == 1, False, True)
            is_inlier = np.where(is_inlier == 1, True, False)
            X_out = g[1][is_inlier,:]
            X_list.append(X_out)
            Xo_out = g[1][is_outlier,:]
            Xo_list.append(Xo_out)

            y_out = np.empty_like(X_out[:,0], dtype=np.int16)
            y_out[:] = g[0]
            y_list.append(y_out)
            yo_out = np.empty_like(Xo_out[:,0], dtype=np.int16)
            yo_out[:] = g[0]
            yo_list.append(yo_out)
        # Concatenate returned arrays
        X = np.concatenate(X_list)
        y = np.concatenate(y_list)
        Xo = np.concatenate(Xo_list)
        yo = np.concatenate(yo_list)
        return (X, y, Xo, yo)

In [9]:
# X, y is the data without outliers and Xo, yo are the outliers
X, y, Xo, yo = remove_outliers(training_dataset[0],training_dataset[1])

In [10]:
pxpcl_ro = np.array([y[y == cl].size for cl in range(1,32)])

In [11]:
pxpcl_o = np.array([yo[yo == cl].size for cl in range(1,32)])

In [12]:
pxpcl

array([  3075, 391632, 360178,  41292,  66594,  44811, 141600,  22293,
        76187,   2410, 390418, 119866, 561611,  22691,  86128,  43360,
       136050,  12731,   8686, 213173,   1576,  20744,   1906,  41933,
        15784,  40880, 523000, 763402,  56011,  43334,  74392])

In [13]:
pxpcl_ro

array([  2306, 293724, 270133,  30969,  49945,  33608, 106200,  16720,
        57140,   1807, 292813,  89899, 421208,  17018,  64596,  32520,
       102037,   9548,   6514, 159880,   1182,  15558,   1429,  31450,
        11838,  30660, 392250, 572551,  42008,  32500,  55794])

In [14]:
pxpcl_o

array([   769,  97908,  90045,  10323,  16649,  11203,  35400,   5573,
        19047,    603,  97605,  29967, 140403,   5673,  21532,  10840,
        34013,   3183,   2172,  53293,    394,   5186,    477,  10483,
         3946,  10220, 130750, 190851,  14003,  10834,  18598])

#### split dataset
to have a subsample of X, y we take 20000 per class and in case a class has less than 20000, the whole data of that class is taken. We shuffle the data, because X,y is ordered and that is not convenient for training sake

In [15]:
indexes = [np.where(y== cl)[0] for cl in range(1,32)]

In [16]:
len(indexes[0])

2306

In [17]:
def shuffled(l):
    o = []
    _ = [o.append(i) for i in l]
    random.shuffle(o)
    return o

In [18]:
random.seed(35473234)
new_indexes = [ shuffled(indexes[cl])[:20000] if len(indexes[cl]) >= 20000 else shuffled(indexes[cl]) for cl in range(31) ]

In [19]:
len(new_indexes[21])

15558

In [20]:
merged_new_indexes = np.array(np.concatenate([new_indexes[0],new_indexes[1],new_indexes[2],new_indexes[3],new_indexes[4],
                                              new_indexes[5],new_indexes[6],new_indexes[7],new_indexes[8],new_indexes[9],
                                              new_indexes[10],new_indexes[11],new_indexes[12],new_indexes[13],
                                              new_indexes[14],new_indexes[15],new_indexes[16],new_indexes[17],
                                              new_indexes[18],new_indexes[19],new_indexes[20],new_indexes[21],
                                              new_indexes[22],new_indexes[23],new_indexes[24],new_indexes[25],
                                              new_indexes[26],new_indexes[27],new_indexes[28],new_indexes[29],
                                              new_indexes[30]]),int)

In [21]:
len(merged_new_indexes)

503920

In [22]:
new_X = X[merged_new_indexes,:]

In [23]:
new_y = y[merged_new_indexes]

In [24]:
new_X.shape

(503920, 15)

In [25]:
new_y.shape

(503920,)

In [26]:
# We split the new subset into training and test datasets, where the test dataset is the 20%
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=546842346)

In [27]:
X_shuff, y_shuff = shuffle(X_train,y_train,random_state=658432434)

In [28]:
# data per class of final training dataset
pxpcl_tr = np.array([y_train[y_train == cl].size for cl in range(1,32)])

In [30]:
pxpcl_tr

array([ 1847, 16082, 15960, 16015, 16092, 15900, 15984, 13385, 15998,
        1460, 16002, 16040, 15970, 13560, 15989, 15894, 16007,  7615,
        5134, 16080,   954, 12512,  1174, 15989,  9442, 15950, 15992,
       16015, 16129, 15951, 16014])

## Random search

In [34]:
def define_hyper_params():
    """
        Esta función devuelve un diccionario con
        los clasificadores que vamos a utilizar y
        una rejilla de hiperparámetros
    """
    seed = 234567943
    clfs = {'RF': RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=seed),
        'LGB': LGBMClassifier(n_estimators=10, n_jobs=-1, random_state=seed),
        'XGB': XGBClassifier(objective='multi:softmax', n_estimators=10, n_jobs=-1, random_state=seed)
            }

    grid = { 
    'RF':{'n_estimators': [500,700,900], 'max_depth': [None,30,45], 'max_features': ['sqrt','log2', None],'min_samples_split': [2,3,5], 'n_jobs': [17], 'class_weight': [None, 'balanced'], 'random_state': [seed]},
    'LGB': { 'n_estimators': [500,700,900], 'learning_rate' : [0.001,0.01,0.1,0.5], 'n_jobs': [17], 'max_depth': [-1,15,30], 'reg_alpha': [0.0,0.01,0.1,0.5], 'reg_lambda': [0.0,0.01,0.1,0.5], 'random_state': [seed]},
    'XGB': { 'objective': ['multi:softmax'], 'n_estimators': [500,700,900], 'n_jobs': [17], 'max_depth': [5,10,15], 
             'reg_alpha': [0.0,0.01,0.1,0.5], 'reg_lambda': [1,0.9,0.5], 'learning_rate' : [0.001,0.0005], 'gamma': [0,10,20], 'random_state': [seed]}
           }

    return clfs, grid

In [35]:
def magic_loop(models_to_run, clfs, grid, X_train, y_train, X_test, y_test, cv = 5):
        modelos = []
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            logger.debug(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            try:
#                    grid_search = GridSearchCV(clf, parameter_values, cv=cv)
                    random_search = RandomizedSearchCV(clf, parameter_values, cv=cv, n_iter=10)
                    start = time()
#                    modelo = grid_search.fit(X_train, y_train)
                    modelo = random_search.fit(X_train, y_train)
#                    y_pred = modelo.predict(X_test)
#                    y_pred_prob = modelo.predict_proba(X_test)[:,1]

                    logger.debug("----------------------------------------")
                    logger.debug("{}".format(models_to_run[index]))
                    logger.debug("----------------------------------------")
                    logger.debug("Best parameters set found on development set with GridSearchCV:")
                    logger.debug(" ")
                    logger.debug(modelo.best_params_)
                    logger.debug(" ")
                    logger.debug("Grid scores on development set:")
                    logger.debug(" ")
                    means = modelo.cv_results_['mean_test_score']
                    stds = modelo.cv_results_['std_test_score']
                    for mean, std, params in zip(means, stds, modelo.cv_results_['params']):
                        logger.debug("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
                    logger.debug(" ")
                    modelos.append(modelo)
#                    logger.debug(metrics.classification_report(y_pred, y_test))
#                    logger.debug("Best model accuracy: {}".format(metrics.accuracy_score(y_test, y_pred)))
#                    logger.debug("Best model log loss: {}".format(metrics.log_loss(y_test, y_pred_prob)))
#                    logger.debug("grid_search took %.2f seconds" % (time() - start))

            except IndexError as e:
                logger.debug('Error:', e)
                continue
        return modelos

In [None]:
# hyperparameters
clfs, grid = define_hyper_params()
#models to train
models_to_run=['RF','LGB','XGB']

modelos = magic_loop(models_to_run, clfs, grid, X_shuff, y_shuff, X_test, y_test, cv = 5)

In [37]:
len(modelos)

3

In [38]:
modelos[0].best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=900,
                       n_jobs=17, oob_score=False, random_state=234567943,
                       verbose=0, warm_start=False)

In [40]:
print("-------------")
print("Best parameters: {}".format(modelos[0].best_params_))

-------------
Best parameters: {'random_state': 234567943, 'n_jobs': 17, 'n_estimators': 900, 'min_samples_split': 3, 'max_features': 'sqrt', 'max_depth': None, 'class_weight': None}


In [43]:
modelos[2].best_score_

0.7722654389585648

In [47]:
modelos[2].cv_results_

{'mean_fit_time': array([2780.9299521 , 2539.46438828,  797.07712936, 2515.12980523,
        3494.61757379, 1554.08900809, 1549.03772554, 3583.1464467 ,
        1473.28001885,  818.72612305]),
 'std_fit_time': array([150.18664026,  34.43072209,  21.28272577,  46.98247102,
        125.60693383,  32.44984775,  40.50586882, 294.69847297,
         15.46881802,   6.22328495]),
 'mean_score_time': array([16.00794172, 52.63443503,  8.38592067, 17.16482635, 36.21229987,
        16.35490947, 16.595929  , 45.42249007, 29.05839672,  8.48639474]),
 'std_score_time': array([0.48056158, 0.4275225 , 0.08191802, 0.51925698, 0.70183604,
        0.64252379, 0.59420411, 1.05945265, 1.05812072, 0.21452484]),
 'param_reg_lambda': masked_array(data=[0.5, 1, 1, 0.5, 0.5, 1, 0.9, 1, 0.9, 0.5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_reg_alpha': masked_array(data=[0.5, 0.1, 0.1, 0.

## Try new hyperparameters

#### xgboost

In [119]:
params = {'reg_lambda': 0.8, 'reg_alpha': 0.2, 'random_state': 234567943, 'objective': 'multi:softmax', 
          'n_jobs': 17, 'n_estimators': 900, 'max_depth': 15, 'learning_rate': 0.0005, 'gamma': 0}

In [120]:
modeloXGB = XGBClassifier(**params)

In [None]:
modeloXGB.fit(X_shuff,y_shuff)

In [122]:
modeloXGB

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.0005, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=900, n_jobs=17,
              nthread=None, objective='multi:softprob', random_state=234567943,
              reg_alpha=0.2, reg_lambda=0.8, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

### metrics test

In [123]:
y_pred_test_xgb = modeloXGB.predict(X_test)
y_pred_prob_test_xgb = modeloXGB.predict_proba(X_test)

In [124]:
classification_report(y_pred_test_xgb, y_test)
"Best model accuracy: {}".format(accuracy_score(y_test, y_pred_test_xgb))

'Best model accuracy: 0.7777821876488331'

In [125]:
"Best model log loss: {}".format(log_loss(y_test, y_pred_prob_test_xgb))

'Best model log loss: 1.9005280435239063'

### metrics trainig

In [126]:
y_pred_shuff_xgb = modeloXGB.predict(X_shuff)
y_pred_prob_shuff_xgb = modeloXGB.predict_proba(X_shuff)

In [127]:
classification_report(y_pred_shuff_xgb, y_shuff)
"Best model accuracy: {}".format(accuracy_score(y_shuff, y_pred_shuff_xgb))

'Best model accuracy: 0.8576733410065089'

In [128]:
"Best model log loss: {}".format(log_loss(y_shuff, y_pred_prob_shuff_xgb))

'Best model log loss: 1.7649682254908547'

### metrics test

In [58]:
tmp = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
classes_31 = {index: tmp[index-1] for index in range(1,32)}

In [59]:
# confusion matrix and producer and user accuracies
scheme = 'madmex_31'

acc_dict = validate(y_true=[classes_31[x] for x in y_test], y_pred=[classes_31[x] for x in y_pred_test_xgb],
                            scheme=scheme)
pprint_val_dict(acc_dict)

Numeric code    User's Accuracy      Producer's Accuracy  Class Name                                        
1               0.94                 0.93                 Bosque de Coníferas: Oyamel, Ayarín y Cedro       
2               0.74                 0.75                 Bosque de Coníferas: de Pino y Táscate            
3               0.63                 0.59                 Bosque de Encino y Bosque de Galería              
4               0.83                 0.78                 Chaparral                                         
5               0.78                 0.83                 Mezquital y Matorral Submontano                   
6               0.86                 0.88                 Bosque Mesófilo y Selva Baja Perennifolia         
7               0.83                 0.81                 Selva Baja y Mediana Subperennifolia, Bosque de Galería y Palmar Natural
8               0.90                 0.86                 Manglar y Petén                                 

### metrics trainig

In [60]:
# confusion matrix and producer and user accuracies
scheme = 'madmex_31'

acc_dict = validate(y_true=[classes_31[x] for x in y_shuff], y_pred=[classes_31[x] for x in y_pred_shuff_xgb],
#acc_dict = validate(y_true=y_shuff, y_pred=y_pred_shuff_xgb,
                            scheme=scheme)
pprint_val_dict(acc_dict)

Numeric code    User's Accuracy      Producer's Accuracy  Class Name                                        
1               0.97                 0.97                 Bosque de Coníferas: Oyamel, Ayarín y Cedro       
2               0.84                 0.87                 Bosque de Coníferas: de Pino y Táscate            
3               0.79                 0.74                 Bosque de Encino y Bosque de Galería              
4               0.93                 0.88                 Chaparral                                         
5               0.87                 0.91                 Mezquital y Matorral Submontano                   
6               0.92                 0.93                 Bosque Mesófilo y Selva Baja Perennifolia         
7               0.90                 0.89                 Selva Baja y Mediana Subperennifolia, Bosque de Galería y Palmar Natural
8               0.94                 0.92                 Manglar y Petén                                 

### metrics with coe dataset

In [31]:
coe_ds_file = '/shared_volume/datacube/training_objects/training_mexico_coe_nacional_1M_L7L8_1415.pkl'

In [32]:
with open(coe_ds_file, 'rb') as f:
    coe_training_dataset = pickle.load(f)

In [63]:
coe_X, coe_y_id, coe_Xo, coe_yo_id = remove_outliers(coe_training_dataset[0],coe_training_dataset[1])

In [64]:
tmp = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,20,21,22,23,27,28,29,30,31,15,16,17,18,24,25,26,19]
coe_y = [tmp[y-1] for y in coe_y_id]
coe_yo = [tmp[y-1] for y in coe_yo_id]

In [129]:
y_pred_coe_xgb = modeloXGB.predict(coe_X)
y_pred_prob_coe_xgb = modeloXGB.predict_proba(coe_X)

In [130]:
classification_report(y_pred_coe_xgb, coe_y)
"Best model accuracy: {}".format(accuracy_score(coe_y, y_pred_coe_xgb))

'Best model accuracy: 0.6866415181271057'

In [131]:
"Best model log loss: {}".format(log_loss(coe_y, y_pred_prob_coe_xgb))

'Best model log loss: 2.062787560020533'

In [68]:
tmp = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,20,21,22,23,27,28,29,30,31,15,16,17,18,24,25,26,19]
classes_31_coe = {index: tmp[index-1] for index in range(1,32)}

In [69]:
# confusion matrix and producer and user accuracies
scheme = 'madmex_31'

acc_dict = validate(y_true=[classes_31_coe[x] for x in coe_y], y_pred=[classes_31_coe[x] for x in y_pred_coe_xgb],
#acc_dict = validate(y_true=coe_y, y_pred=y_pred_coe_xgb,
                            scheme=scheme)
pprint_val_dict(acc_dict)

Numeric code    User's Accuracy      Producer's Accuracy  Class Name                                        
1               0.83                 0.85                 Bosque de Coníferas: Oyamel, Ayarín y Cedro       
2               0.73                 0.74                 Bosque de Coníferas: de Pino y Táscate            
3               0.62                 0.58                 Bosque de Encino y Bosque de Galería              
4               0.52                 0.65                 Chaparral                                         
5               0.53                 0.81                 Mezquital y Matorral Submontano                   
6               0.72                 0.89                 Bosque Mesófilo y Selva Baja Perennifolia         
7               0.92                 0.75                 Selva Baja y Mediana Subperennifolia, Bosque de Galería y Palmar Natural
8               0.91                 0.83                 Manglar y Petén                                 