In [195]:
import os
import random
from time import time
from glob import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
#import dill as pickle

from plotly import graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from matplotlib import pyplot as plt
%matplotlib inline

import cv2
from functools import partial

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils.class_weight import compute_sample_weight

from hyperopt import Trials, fmin, tpe, rand, STATUS_OK, hp

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow.keras.metrics
import numpy as np
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import tensorflow.keras.backend as K
import tensorflow as tf
#import tensorflow_addons as tfa

In [33]:
random.seed(101)
np.random.seed(101)

In [34]:
!ls /data2/ntua/data/planet/planet


fold_0.tfrecords  fold_4.tfrecords	 test_2000.csv	 train_classes.csv
fold_1.tfrecords  sample_submission.csv  train-jpg
fold_2.tfrecords  test-jpg		 train_1000.csv
fold_3.tfrecords  test_1000.csv		 train_2000.csv


In [35]:
path = "/data2/ntua/data/planet/planet"
path_train = os.path.join(path, "train-jpg")
path_test = os.path.join(path, "test-jpg")
print(
    f"train files: {len(os.listdir(path_train))}, "
    f"test files: {len(os.listdir(path_test))}"
)

train files: 40479, test files: 40669


In [36]:
def load_img(path_file):
    img = cv2.imread(path_file)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (100, 100), cv2.INTER_LINEAR).astype(float)
    img = cv2.normalize(img, None, 0.0, 1.0, cv2.NORM_MINMAX)
    img = img.reshape(1, -1)
    return img

In [37]:
def load_dataset(path, csvfile):
    path_class = os.path.join(path, csvfile)
    df_class = pd.read_csv(path_class)
    #df_class = df_class.sample(n=10000) # limit images
    #print(df_class.shape)
    print('df shape')
    print(df_class.shape)
    #df_class.head()
    #filenames = df_class.image_name.sample(500).values
    df_class["list_tags"] = df_class.tags.str.split(" ")
    filenames = df_class.image_name.values
    path_files = [os.path.join(path_train, filename+".jpg") for filename in filenames]
    X = np.vstack([load_img(path_file) for path_file in path_files])
    print('X shape')
    print(X.shape)
    return df_class, X

In [38]:
def apply_pca(X, pca=None):
    if pca is None:
        pca = PCA(n_components=0.95, random_state=2020)
        X_pca = pca.fit_transform(X)
    else:
        X_pca = pca.transform(X)
    df_pca=pd.DataFrame(X_pca)
    return df_pca, pca

In [39]:
def get_data_ML(df_orig, df_part, df_pca, encoder=None):

    fitenc=False
    if encoder is None:
        encoder = MultiLabelBinarizer()
        fitenc=True
    if fitenc:
        ohe_tags = encoder.fit(df_orig.list_tags.values)
    ohe_tags = encoder.transform(df_part.list_tags.values)
    Y=ohe_tags

    X_img = df_part.loc[:, ~df_part.columns.isin(['list_tags','tags'])]
    X = pd.merge(X_img, df_pca, left_index=True, right_index=True)
    X = X.loc[:, ~X.columns.isin(['Unnamed: 0','image_name'])]
    
    return X.values, Y, encoder

In [222]:
def fit_model(model, params, X_train, y_train, X_val=None, y_val=None):
    es = EarlyStopping(monitor=params['ES_monitor'], patience=params['ES_patience'], min_delta=params['ES_mindelta'])
    if X_val is not None and y_val is not None:
        res = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['max_epochs'], verbose=0, callbacks=[es],
                        validation_data=(X_val, y_val), class_weight=params['class_weights'])
    else:
        res = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['max_epochs'], verbose=0, callbacks=[es],
                        class_weight=params['class_weights'])
    return model, res


In [81]:
def create_NN_model(params, X, Y):
    # initializer
    #initializer = initializers.Constant(0.5)
    # define model
    model = Sequential()
    n_features = X.shape[1]
    intlayers = int(params['n_internal_layers'][0])
    model.add(Dense(params['n_internal_layers'][1]['layer_1_' + str(intlayers) + '_nodes'], activation='relu', input_shape=(n_features,))) #kernel_initializer=initializer))
    if not params['dropout'] is None:
        model.add(Dropout(params['dropout']))
    for i in range(2, intlayers + 2):
        model.add(Dense(int(params['n_internal_layers'][1]['layer_' + str(i) + '_' + str(intlayers) + '_nodes']),
                        activation='relu', )) #kernel_initializer=initializer))
        if not params['dropout'] is None:
            model.add(Dropout(params['dropout']))

        # model.add(Dense(1, activation='sigmoid'))
    #model.add(Dense(Y.shape[1], activation='softmax'))
    model.add(Dense(Y.shape[1], activation='softmax'))

    # compile the model

    if params['optimizer']['name']=='Adam':
        # adam = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
        if params['optimizer']['adam_params'] is None:
            opt = Adam()
        else:
            opt = Adam(learning_rate=params['optimizer']['adam_params']['learning_rate_adam'], beta_1=params['optimizer']['adam_params']['beta_1'],
                       beta_2=params['optimizer']['adam_params']['beta_2'],amsgrad=params['optimizer']['adam_params']['amsgrad'])
    elif params['optimizer']['name']=='SGD':
        opt = SGD(learning_rate=params['optimizer']['learning_rate_SGD'])

    if params['metric'] == 'accuracy':
        metrics = ['accuracy']
    if 'loss' in params and params['loss'] == 'unbalanced':
        lossf=unbalanced_loss
    else:
        lossf='sparse_categorical_crossentropy'
    #model.compile(optimizer=opt, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=metrics)  # AUC(multi_label=False)])
    model.compile(optimizer=opt, loss=tf.keras.losses.BinaryCrossentropy(), metrics=metrics)
    # model.compile(optimizer=opt, loss=recallloss, metrics=metrics)
    return model


In [104]:
def fit_model(model, params, X_train, y_train, X_val=None, y_val=None):
    es = EarlyStopping(monitor=params['ES_monitor'], patience=params['ES_patience'], min_delta=params['ES_mindelta'])
    if X_val is not None and y_val is not None:
        res = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['max_epochs'], verbose=0, callbacks=[es],
                        validation_data=(X_val, y_val))#, class_weight=params['class_weights'])
    else:
        res = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['max_epochs'], verbose=0, callbacks=[es])
        #,class_weight=params['class_weights'])


In [42]:
def find_best_thresholds(Y_hat, Y):
    N_tags = Y.shape[1]
    best_threshs = [0.2] * N_tags
    resolution = 100
    #for jdx in tqdm(range(N_tags)):
    for jdx in range(N_tags):
        best_score = 0
        #threshs = np.zeros_like(best_threshs)
        threshs = best_threshs.copy()
        for kdx in range(resolution):
            kdx /= resolution
            threshs[jdx] = kdx
            Y_hat_thresh = (Y_hat > threshs).astype(float)
            score = fbeta_score(Y, Y_hat_thresh, beta=2, average="samples")
            if score > best_score:
                best_score = score
                best_threshs[jdx] = kdx
    
    global_best_score = fbeta_score(Y, (Y_hat > best_threshs).astype(float), beta=2, average="samples")
    print(f"threshs: {best_threshs} -- best score: {global_best_score}")
    
    return best_threshs, global_best_score

In [131]:
def global_best_score(Y_hat, Y, thresholds):
    return fbeta_score(Y, (Y_hat > thresholds).astype(float), beta=2, average="samples")    

## Model cross validation

In [172]:
def validatemodel(cv, X, Y, params):

    model = create_NN_model(params, X, Y)
    print('params : %s'%params)
    metrics={'f2score':[]}
    cnt=0
    
    for train_index, val_index in cv.split(X):
        cnt+=1
        print("Fitting fold %d"%cnt)
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        
        fit_model(model, params, X_train, Y_train, X_val, Y_val)
        Y_pred_train=model.predict(X_train)
        threshs, best_score_train = find_best_thresholds(Y_pred_train, Y_train)
        Y_pred_val=model.predict(X_val)
        best_score = global_best_score(Y_pred_val, Y_val, threshs)

        metrics['f2score'].append(best_score)


    mean_metrics = {}
    mean_metrics['f2score']=sum(metrics['f2score'])/len(metrics['f2score'])
    print('Mean fbeta: %.5f'%mean_metrics['f2score'])

    return {
        'loss': -mean_metrics['f2score'],
        'status': STATUS_OK,
        'metrics': mean_metrics,
        'thresholds': threshs,
        'params': '%s'%params
    }


In [173]:
    space = {'n_internal_layers': hp.choice('n_internal_layers',
                [
                    (0, {'layer_1_0_nodes': hp.quniform('layer_1_0_w_nodes', 100, 2100, 100)}),
                    (1, {'layer_1_1_nodes': hp.quniform('layer_1_1_w_nodes', 100, 2100, 100), 'layer_2_1_nodes': hp.quniform('layer_2_1_w_nodes', 100, 2100, 100)}),
                    (0, {'layer_1_0_nodes': hp.quniform('layer_1_0_nodes', 10, 100, 10)}),
                    (1, {'layer_1_1_nodes': hp.quniform('layer_1_1_nodes', 10, 100, 10), 'layer_2_1_nodes': hp.quniform('layer_2_1_nodes', 10, 100, 10)}),
                ]
                ),
             'dropout': hp.choice('dropout',[None, 0.1, 0.2, 0.3]),
             'class_weights': None,
             'feature_drop': [],
             'max_epochs': hp.choice('max_epochs', [2000]),
             'metric': hp.choice('metric',['accuracy']),
             'optimizer': hp.choice('optimizer',[{'name': 'Adam','adam_params':hp.choice('adam_params',[None])}]),
             'ES_monitor':hp.choice('ES_monitor', ['val_loss']),#'val_loss','loss'
             'ES_patience':hp.choice('ES_patience', [10]),
             'ES_mindelta':hp.choice('ES_mindelta', [0.0001]),
             'batch_size':hp.choice('batch_size', [512])
             }


In [256]:
# Load Dataset
df_train, X_train = load_dataset(path, 'train_2000.csv')

df shape
(2005, 4)
X shape
(2005, 30000)


In [257]:
# get_data
X, Y, encoder = get_data_ML(df_train, df_train, df_train_pca)

In [258]:
# Apply pca
df_train_pca, pca = apply_pca(X_train)
df_train_pca.shape

(2005, 551)

In [268]:
X_train = df_train_pca.values
Y_train = Y
type(X_train), type(Y_train), X_train.shape, Y_train.shape

(numpy.ndarray, numpy.ndarray, (2005, 551), (2005, 17))

In [48]:
w=compute_sample_weight('balanced', Y)

In [180]:
random_state=42
kf = KFold(n_splits=5)
trials = Trials() # trials will contain logging information
validatemodelpart = partial(validatemodel, kf, X_train, Y_train)

In [181]:
best=fmin(fn=validatemodelpart, # function to optimize
          space=space,
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=50, # maximum number of iterations
          trials=trials,# logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )


params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.2, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (1, {'layer_1_1_nodes': 70.0, 'layer_2_1_nodes': 20.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
Fitting fold 1                                      
threshs: [0.01, 0.01, 0.01, 0.02, 0.02, 0.0, 0.01, 0.03, 0.01, 0.01, 0.01, 0.01, 0.23, 0.01, 0.03, 0.05, 0.01] -- best score: 0.8023411091768877
Fitting fold 2                                      
threshs: [0.01, 0.01, 0.01, 0.01, 0.02, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.22, 0.01, 0.02, 0.06, 0.01] -- best score: 0.8017502652890677
Fitting fold 3                                      
threshs: [0.01, 0.01, 0.01, 0.01, 0.04, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.21, 0.01, 0.02, 0.04, 0.01] -- best score: 0.7888279659813013
Fitting fold 4                                      
threshs: [0.0, 0.02, 0.01

Fitting fold 3                                                                  
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.11, 0.03, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.0] -- best score: 0.7891401648478356
Fitting fold 4                                                                  
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.04, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01] -- best score: 0.8182607265761312
Fitting fold 5                                                                  
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01] -- best score: 0.8271827164204762
Mean fbeta: 0.75281                                                             
params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.1, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 1800.

Mean fbeta: 0.71698                                                            
params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.2, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 900.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
Fitting fold 1                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.05, 0.01, 0.01, 0.01, 0.01, 0.0, 0.04, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7914794864207909
Fitting fold 2                                                                 
threshs: [0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.04, 0.01, 0.01, 0.01, 0.02, 0.01, 0.05, 0.01, 0.01, 0.01, 0.01] -- best score: 0.8042999860201332
Fitting fold 3                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.03, 0.0

Fitting fold 2                                                                 
threshs: [0.0, 0.01, 0.01, 0.02, 0.02, 0.0, 0.03, 0.02, 0.01, 0.01, 0.02, 0.01, 0.0, 0.01, 0.01, 0.01, 0.0] -- best score: 0.7833570280136224
Fitting fold 3                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.02, 0.0, 0.04, 0.02, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.02, 0.0] -- best score: 0.7823209922743899
Fitting fold 4                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.03, 0.02, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.02, 0.01, 0.01] -- best score: 0.788456584868
Fitting fold 5                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.07, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.0] -- best score: 0.8060856616364065
Mean fbeta: 0.75328                                                            
params : {'ES_mindelta': 0.0001, 'E

Fitting fold 5                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.03, 0.03, 0.01, 0.01, 0.01, 0.01, 0.04, 0.01, 0.01, 0.02, 0.0] -- best score: 0.7795149097038717
Mean fbeta: 0.71634                                                            
params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.2, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 100.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
Fitting fold 1                                                                 
threshs: [0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.03, 0.03, 0.01, 0.01, 0.01, 0.01, 0.05, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7985661631403119
Fitting fold 2                                                                 
threshs: [0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.05, 0

Fitting fold 1                                                                 
threshs: [0.01, 0.02, 0.01, 0.01, 0.01, 0.0, 0.02, 0.03, 0.01, 0.01, 0.02, 0.01, 0.11, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7964788846093324
Fitting fold 2                                                                 
threshs: [0.0, 0.02, 0.01, 0.01, 0.02, 0.0, 0.02, 0.03, 0.01, 0.01, 0.02, 0.01, 0.08, 0.01, 0.01, 0.01, 0.0] -- best score: 0.7661085756975012
Fitting fold 3                                                                 
threshs: [0.0, 0.02, 0.01, 0.01, 0.01, 0.0, 0.02, 0.03, 0.01, 0.01, 0.01, 0.0, 0.15, 0.01, 0.01, 0.02, 0.0] -- best score: 0.7591854819148298
Fitting fold 4                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.02, 0.02, 0.01, 0.01, 0.01, 0.0, 0.13, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7682858057702338
Fitting fold 5                                                                 
threshs: [0.0, 0.01, 0.01, 

Fitting fold 4                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.02, 0.0, 0.03, 0.03, 0.01, 0.01, 0.01, 0.01, 0.14, 0.01, 0.01, 0.03, 0.01] -- best score: 0.758899529223467
Fitting fold 5                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.02, 0.0, 0.06, 0.02, 0.01, 0.01, 0.01, 0.01, 0.13, 0.01, 0.01, 0.02, 0.0] -- best score: 0.7832490512480753
Mean fbeta: 0.73472                                                            
params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.3, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 80.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
Fitting fold 1                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.02, 0.0, 0.05, 0.01, 0.01, 0.01, 0.01, 0.01, 0.12, 0.01,

Fitting fold 1                                                                 
threshs: [0.0, 0.01, 0.02, 0.01, 0.01, 0.0, 0.03, 0.02, 0.01, 0.01, 0.02, 0.01, 0.03, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7940169479185067
Fitting fold 2                                                                 
threshs: [0.0, 0.01, 0.02, 0.02, 0.01, 0.0, 0.05, 0.01, 0.01, 0.01, 0.03, 0.01, 0.04, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7809050694887811
Fitting fold 3                                                                 
threshs: [0.01, 0.01, 0.02, 0.01, 0.02, 0.0, 0.07, 0.01, 0.01, 0.01, 0.02, 0.01, 0.02, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7908146029332465
Fitting fold 4                                                                 
threshs: [0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.05, 0.01, 0.01, 0.01, 0.01, 0.01, 0.03, 0.01, 0.01, 0.01, 0.01] -- best score: 0.8067378522802116
Fitting fold 5                                                                 
threshs: [0.01, 0.01, 

threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.02, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7819603711080038
Fitting fold 4                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.03, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7887404720508966
Fitting fold 5                                                                 
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01] -- best score: 0.8120755652057965
Mean fbeta: 0.75691                                                            
params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.1, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (1, {'layer_1_1_nodes': 90.0, 'layer_2_1_nodes': 80.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
Fit

Mean fbeta: 0.75566                                                              
params : {'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.1, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (1, {'layer_1_1_nodes': 80.0, 'layer_2_1_nodes': 70.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}
Fitting fold 1                                                                   
threshs: [0.01, 0.01, 0.01, 0.01, 0.02, 0.0, 0.02, 0.03, 0.01, 0.01, 0.01, 0.01, 0.08, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7900496888595214
Fitting fold 2                                                                   
threshs: [0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.19, 0.01, 0.01, 0.01, 0.01] -- best score: 0.7952193925925207
Fitting fold 3                                                                   
threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.05, 0.01

## Model test

In [215]:
def model_test(params, X_test, Y_test, X_train, Y_train):
    model = create_NN_model(params, X_test, Y_test)
    fit_model(model, params, X_train, Y_train)
    Y_pred_train=model.predict(X_train)
    Y_pred_test=model.predict(X_test)
    threshs, train_best_score = find_best_thresholds(Y_pred_train, Y_train)
    test_best_score = global_best_score(Y_pred_test, Y_test, threshs)
    print('Test score: %s'%test_best_score)
    return Y_pred_train, Y_pred_test, threshs

In [156]:
trials.best_trial['result']['params']

"{'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.3, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (0, {'layer_1_0_nodes': 60.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}"

In [262]:
df_test, X_test = load_dataset(path, 'test_2000.csv')

df shape
(2005, 4)
X shape
(2005, 30000)


In [263]:
df_test_pca, pca = apply_pca(X_test, pca)
df_test_pca.shape

(2005, 551)

In [264]:
X_test, Y_test, encoder = get_data_ML(df_test, df_test, df_test_pca, encoder)
X_test.shape, Y_test.shape

((2005, 551), (2005, 17))

In [182]:
for t in trials:
    print(t['result']['loss'])

-0.7550637929651071
-0.7787216593563283
-0.769078807237902
-0.7702976334885304
-0.7539444130309023
-0.7528132118817503
-0.7556604333525252
-0.7564802678234237
-0.7697745933874937
-0.7489511888631533
-0.7169813451405082
-0.7467738990337904
-0.7700764960047574
-0.7528749207926685
-0.7746282725123186
-0.7498531091418765
-0.753277814545023
-0.7465662030684076
-0.7571727747525502
-0.7422047337335422
-0.7583622542617747
-0.7163436749921333
-0.760884949043629
-0.7726426089941896
-0.7648521594595169
-0.7512169228737913
-0.7598891898663187
-0.730960488033357
-0.7623371956236789
-0.7603311909802719
-0.7773450961431416
-0.7660777975703967
-0.7347152625340494
-0.7542965764416282
-0.7702329793295196
-0.7255194156337119
-0.7683060115595669
-0.7693534391362726
-0.766265650964247
-0.7502937454601855
-0.7200352968857214
-0.7729329884996516
-0.7871359088133831
-0.7569107091469217
-0.7477250110196592
-0.7412372467417748
-0.7526100089508659
-0.7397192253741567
-0.7556560684239413
-0.7623482939507429


In [266]:
params={'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.1, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (1, {'layer_1_1_nodes': 100.0, 'layer_2_1_nodes': 100.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}

params={'ES_mindelta': 0.0001, 'ES_monitor': 'loss', 'ES_patience': 10, 'batch_size': 512,\
'dropout': 0.3,\
'max_epochs': 2000, 'metric': 'accuracy','class_weights': None,\
'n_internal_layers': (1, {'layer_1_1_nodes': 1200.0, 'layer_2_1_nodes': 200.0}),\
'optimizer': {'adam_params': None, 'name': 'Adam'}}

In [186]:
sorted_trials = sorted(trials, key=lambda x:x['result']['loss'])

In [187]:
sorted_trials

[{'state': 2,
  'tid': 42,
  'spec': None,
  'result': {'loss': -0.7871359088133831,
   'status': 'ok',
   'metrics': {'f2score': 0.7871359088133831},
   'thresholds': [0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.01,
    0.0,
    0.01,
    0.01,
    0.01,
    0.01],
   'params': "{'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.1, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (1, {'layer_1_1_nodes': 100.0, 'layer_2_1_nodes': 100.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}"},
  'misc': {'tid': 42,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'ES_mindelta': [42],
    'ES_monitor': [42],
    'ES_patience': [42],
    'adam_params': [42],
    'batch_size': [42],
    'dropout': [42],
    'layer_1_0_nodes': [],
    'layer_1_0_w_nodes': [],
    'layer_1_1_nodes': [4

In [212]:
cnt=1
for t in sorted_trials:
    if cnt>1:
        break
    print('CV best score: %s:'%t['result']['loss'])
    print(t['result']['params'])
    Y_pred_train, Y_pred_test, threshs = model_test(eval(t['result']['params']), X_test, Y_test, X, Y)
    cnt+=1

CV best score: -0.7871359088133831:
{'ES_mindelta': 0.0001, 'ES_monitor': 'val_loss', 'ES_patience': 10, 'batch_size': 512, 'class_weights': None, 'dropout': 0.1, 'feature_drop': (), 'max_epochs': 2000, 'metric': 'accuracy', 'n_internal_layers': (1, {'layer_1_1_nodes': 100.0, 'layer_2_1_nodes': 100.0}), 'optimizer': {'adam_params': None, 'name': 'Adam'}}


KeyError: 'result'

In [269]:
Y_pred_train, Y_pred_test, threshs = model_test(params, X_test, Y_test, X_train, Y_train)

threshs: [0.0, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0, 0.01, 0.01, 0.01, 0.01] -- best score: 0.8309075220307095
Test score: 0.6894332841217348


In [425]:
threshs, best_score = find_best_thresholds(Y_hat_val, Y_test)

threshs: [0.0, 0.99, 0.99, 0.99, 0.0, 0.0, 0.99, 0.0, 0.94, 0.95, 0.97, 0.97, 0.0, 0.89, 0.99, 0.99, 0.89] -- best score: 0.7353907446178702


In [270]:
#Y_hat_val = np.array(train_results["Y_hat_val"])
#Y_val = np.array(train_results["Y_val"])

Y_hat_val = Y_pred_test
Y_val = Y_test

pos_probas, neg_probas = [], []
for class_, idx in encoder._cached_dict.items():
    pos_probas.append(Y_hat_val[np.where(Y_val[:, idx] != 0), idx].mean())
    neg_probas.append(Y_hat_val[np.where(Y_val[:, idx] == 0), idx].mean())
go.Figure([
    go.Bar(x=list(encoder._cached_dict), y=pos_probas, name="Y_hat proba | Y = 1"),
    go.Bar(x=list(encoder._cached_dict), y=neg_probas, name="Y_hat proba | Y = 0")
]).show()

In [271]:
class_scores = {}
classes = encoder.classes_
for jdx in range(Y_val.shape[1]):
    y_val = Y_val[:, jdx].ravel()
    y_hat_val = (Y_hat_val[:, jdx].ravel() > threshs[jdx]).astype(float)
    score = fbeta_score(y_val, y_hat_val, beta=2)
    class_scores[classes[jdx]] = round(score, 4)

df_score = pd.DataFrame(dict(
    label=list(class_scores.keys()), score=list(class_scores.values()),
)).sort_values("score", ascending=False)
fig = px.bar(df_score, x="label", y="score", color="score")
fig.show()

In [276]:
fig = make_subplots(cols=5, rows=4)
for jdx in range(Y_val.shape[1]):
    y_val = Y_val[:, jdx].ravel()
    y_hat_val = (Y_hat_val[:, jdx].ravel() > threshs[jdx]).astype(float)
    tn, fp, fn, tp = confusion_matrix(y_val, y_hat_val).ravel()
    mat = np.array([[fn, tn], [tp, fp]])
    col = jdx // 4+1
    row = jdx % 4+1
    fig.add_trace(
        go.Heatmap(
            z=mat, text=[[f"fn: {fn}", f"tn: {tn}"], [f"tp: {tp}", f"fp: {fp}"]], 
            texttemplate="%{text}", colorscale='Viridis', name=encoder.classes_[jdx],
            showscale=False, textfont={"size":20}
        ),
        col=col, row=row, 
    )
    fig.update_xaxes(title=encoder.classes_[jdx], showticklabels=False, row=row, col=col)
    fig.update_yaxes(showticklabels=False, row=row, col=col)
    
fig.update_layout(
    width=1200, height=800, title="Confusion matrices", 
)
fig.show()

In [275]:
plt.savefig('matrices_tf_2000.png', bbox_inches='tight')

<Figure size 432x288 with 0 Axes>