In [8]:
import os
import random
from time import time
from glob import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
import dill as pickle

from plotly import graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from matplotlib import pyplot as plt
%matplotlib inline

import cv2
from functools import partial

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, KFold, GroupKFold

from hyperopt import Trials, fmin, tpe, rand, STATUS_OK, hp

#import torch
#import torch.nn as nn
#import torch.nn.functional as F
#from torch.utils.data import DataLoader, Dataset
#from torchvision import transforms as T, models
#from torch.optim import Adam
#from torch.optim.lr_scheduler import StepLR
#!pip install -q torchsummary --user
# from torchsummary import summary


#device = "cuda" if torch.cuda.is_available() else "cpu"
#print(f"device: {device}")

In [9]:
random.seed(101)
np.random.seed(101)
torch.manual_seed(101);

NameError: name 'torch' is not defined

In [10]:
!ls /data2/ntua/data/planet/planet


fold_0.tfrecords  fold_4.tfrecords	 test_2000.csv	 train_classes.csv
fold_1.tfrecords  sample_submission.csv  train-jpg
fold_2.tfrecords  test-jpg		 train_1000.csv
fold_3.tfrecords  test_1000.csv		 train_2000.csv


In [11]:
path = "/data2/ntua/data/planet/planet"
path_train = os.path.join(path, "train-jpg")
path_test = os.path.join(path, "test-jpg")
print(
    f"train files: {len(os.listdir(path_train))}, "
    f"test files: {len(os.listdir(path_test))}"
)

train files: 40479, test files: 40669


In [12]:
def load_img(path_file):
    img = cv2.imread(path_file)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (100, 100), cv2.INTER_LINEAR).astype(float)
    img = cv2.normalize(img, None, 0.0, 1.0, cv2.NORM_MINMAX)
    img = img.reshape(1, -1)
    return img

In [13]:
def load_dataset(path, csvfile):
    path_class = os.path.join(path, csvfile)
    df_class = pd.read_csv(path_class)
    #df_class = df_class.sample(n=10000) # limit images
    #print(df_class.shape)
    print('df shape')
    print(df_class.shape)
    #df_class.head()
    #filenames = df_class.image_name.sample(500).values
    df_class["list_tags"] = df_class.tags.str.split(" ")
    filenames = df_class.image_name.values
    path_files = [os.path.join(path_train, filename+".jpg") for filename in filenames]
    X = np.vstack([load_img(path_file) for path_file in path_files])
    print('X shape')
    print(X.shape)
    return df_class, X

In [14]:
def apply_pca(X, pca=None):
    if pca is None:
        pca = PCA(n_components=0.95, random_state=2020)
        X_pca = pca.fit_transform(X)
    else:
        X_pca = pca.transform(X)
    df_pca=pd.DataFrame(X_pca)
    return df_pca, pca

In [15]:
def get_data_ML(df_orig, df_part, df_pca, encoder=None):

    fitenc=False
    if encoder is None:
        encoder = MultiLabelBinarizer()
        fitenc=True
    if fitenc:
        ohe_tags = encoder.fit(df_orig.list_tags.values)
    ohe_tags = encoder.transform(df_part.list_tags.values)
    Y=ohe_tags

    X_img = df_part.loc[:, ~df_part.columns.isin(['list_tags','tags'])]
    X = pd.merge(X_img, df_pca, left_index=True, right_index=True)
    X = X.loc[:, ~X.columns.isin(['Unnamed: 0','image_name'])]
    
    return X, Y, encoder

In [None]:
def create_NN_model(params, X):
    # initializer
    #initializer = initializers.Constant(0.5)
    # define model
    model = Sequential()
    n_features = X.shape[1]
    intlayers = int(params['n_internal_layers'][0])
    model.add(Dense(params['n_internal_layers'][1]['layer_1_' + str(intlayers) + '_nodes'], activation='relu', input_shape=(n_features,))) #kernel_initializer=initializer))
    if not params['dropout'] is None:
        model.add(Dropout(params['dropout']))
    for i in range(2, intlayers + 2):
        model.add(Dense(int(params['n_internal_layers'][1]['layer_' + str(i) + '_' + str(intlayers) + '_nodes']),
                        activation='relu', )) #kernel_initializer=initializer))
        if not params['dropout'] is None:
            model.add(Dropout(params['dropout']))

        # model.add(Dense(1, activation='sigmoid'))
    model.add(Dense(2, activation='softmax'))

    # compile the model

    if params['optimizer']['name']=='Adam':
        # adam = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
        if params['optimizer']['adam_params'] is None:
            opt = Adam()
        else:
            opt = Adam(learning_rate=params['optimizer']['adam_params']['learning_rate_adam'], beta_1=params['optimizer']['adam_params']['beta_1'],
                       beta_2=params['optimizer']['adam_params']['beta_2'],amsgrad=params['optimizer']['adam_params']['amsgrad'])
    elif params['optimizer']['name']=='SGD':
        opt = SGD(learning_rate=params['optimizer']['learning_rate_SGD'])

    if params['metric'] == 'accuracy':
        metrics = ['accuracy']
    elif params['metric'] == 'sparse':
        metrics = [tensorflow.metrics.SparseCategoricalAccuracy()]
    elif params['metric'] == 'tn':
        metrics = [tensorflow.metrics.TrueNegatives(),tensorflow.metrics.TruePositives()]
    if 'loss' in params and params['loss'] == 'unbalanced':
        lossf=unbalanced_loss
    else:
        lossf='sparse_categorical_crossentropy'
    model.compile(optimizer=opt, loss=lossf, metrics=metrics)  # , AUC(multi_label=False)])
    # model.compile(optimizer=opt, loss=recallloss, metrics=metrics)
    return model


In [339]:
def find_best_thresholds(Y_hat, Y):
    N_tags = Y.shape[1]
    best_threshs = [0.2] * N_tags
    resolution = 100
    #for jdx in tqdm(range(N_tags)):
    for jdx in range(N_tags):
        best_score = 0
        #threshs = np.zeros_like(best_threshs)
        threshs = best_threshs.copy()
        for kdx in range(resolution):
            kdx /= resolution
            threshs[jdx] = kdx
            Y_hat_thresh = (Y_hat > threshs).astype(float)
            score = fbeta_score(Y, Y_hat_thresh, beta=2, average="samples")
            if score > best_score:
                best_score = score
                best_threshs[jdx] = kdx
    
    global_best_score = fbeta_score(Y, (Y_hat > best_threshs).astype(float), beta=2, average="samples")
    print(f"threshs: {best_threshs} -- best score: {global_best_score}")
    
    return best_threshs, global_best_score

## Model cross validation

In [369]:
def validatemodel(cv, X, Y, params):

    model = create_sklearn_model(params, X_df)
    print('params : %s'%params)
    metrics={'f2score':[]}
    cnt=0
    
    for train_index, val_index in cv.split(X):
        cnt+=1
        print("Fitting fold %d"%cnt)
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        #y_val = y_val[:,0]
        #y_train = y_train[:,0]

        model.fit(X_train, Y_train)
        #train_results_proba=model.predict_proba(X_train)
        #train_results=model.predict(X_train)
        
        val_results_proba=model.predict_proba(X_val)
        val_results=model.predict(X_val)

        Y_hat_val=val_results_proba[0][:,0]
        for i in range (1,len(val_results_proba)):
            Y_hat_val=np.vstack((Y_hat_val,val_results_proba[i][:,0]))
        Y_hat_val=Y_hat_val.T
        threshs, best_score = find_best_thresholds(Y_hat_val, Y_val)
        metrics['f2score'].append(best_score)


    mean_metrics = {}
    mean_metrics['f2score']=sum(metrics['f2score'])/len(metrics['f2score'])
    print('Mean fbeta: %.5f'%mean_metrics['f2score'])

    return {
        'loss': -mean_metrics['f2score'],
        'status': STATUS_OK,
        'metrics': mean_metrics,
        'thresholds': threshs,
        'params': '%s'%params
    }


In [6]:
space = {'algo':'XT','n_estimators': hp.choice('n_estimators',[10, 20, 40, 60, 80, 100, 200, 400, 600, 800, 1000]),
        'criterion': hp.choice('criterion',['gini', 'entropy']),
        'max_depth': hp.quniform('max_depth',2, 40, 2),
        'min_samples_split': hp.choice('min_samples_split',[2, 10, 50, 70, 100, 120, 150, 180, 200, 250, 400, 600, 1000, 1300, 2000]),
        'min_samples_leaf': hp.choice('min_samples_leaf',[5, 10, 15, 20, 25, 30, 35, 40, 45]),
        'max_features': hp.quniform('max_features', 1,10,1),
        'bootstrap': hp.choice('bootstrap',[True, False]),
        'class_weights':'balanced'}


In [394]:
# Load Dataset
df_train, X_train = load_dataset(path, 'train_2000.csv')

df shape
(2005, 4)
X shape
(2005, 30000)


In [403]:
# Apply pca
df_train_pca, pca = apply_pca(X_train)
df_train_pca.shape

(2005, 551)

In [1]:
# get_data
X_df, Y, encoder = get_data_ML(df_train, df_train, df_train_pca)

NameError: name 'get_data_ML' is not defined

In [4]:
w=compute_sample_weight('balanced', Y)

NameError: name 'Y' is not defined

In [367]:
random_state=42
kf = KFold(n_splits=5)

# trials will contain logging information
trials = Trials()

validatemodelpart = partial(validatemodel, kf, X_df, Y)



In [370]:
best=fmin(fn=validatemodelpart, # function to optimize
          space=space,
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=5, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )


params : {'algo': 'XT', 'bootstrap': False, 'criterion': 'gini', 'max_depth': 20.0, 'max_features': 4.0, 'min_samples_leaf': 15, 'min_samples_split': 400, 'n_estimators': 800}
Fitting fold 1                                     
threshs: [0.0, 0.99, 0.98, 0.99, 0.0, 0.0, 0.99, 0.0, 0.95, 0.97, 0.99, 0.97, 0.0, 0.92, 0.99, 0.99, 0.87] -- best score: 0.7172166069131742
Fitting fold 2                                     
threshs: [0.0, 0.99, 0.99, 0.99, 0.0, 0.0, 0.99, 0.0, 0.95, 0.98, 0.97, 0.97, 0.0, 0.93, 0.99, 0.99, 0.88] -- best score: 0.7427979894896967
Fitting fold 3                                     
threshs: [0.0, 0.99, 0.99, 0.99, 0.0, 0.0, 0.99, 0.99, 0.96, 0.98, 0.96, 0.97, 0.0, 0.94, 0.99, 0.99, 0.88] -- best score: 0.7207852977059992
Fitting fold 4                                     
threshs: [0.0, 0.99, 0.99, 0.99, 0.0, 0.0, 0.99, 0.0, 0.95, 0.98, 0.96, 0.97, 0.0, 0.93, 0.99, 0.99, 0.92] -- best score: 0.738324361416399
Fitting fold 5                                     


## Model test

In [372]:
trials.best_trial['result']['params']

"{'algo': 'XT', 'bootstrap': True, 'criterion': 'gini', 'max_depth': 24.0, 'max_features': 7.0, 'min_samples_leaf': 5, 'min_samples_split': 2000, 'n_estimators': 200}"

In [408]:
df_test, X_test = load_dataset(path, 'test_2000.csv')

df shape
(2005, 4)
X shape
(2005, 30000)


In [409]:
X_test

array([[0.4       , 0.68      , 0.4       , ..., 0.4       , 0.6       ,
        0.24      ],
       [0.48484848, 0.66666667, 0.24242424, ..., 0.42424242, 0.66666667,
        0.18181818],
       [0.52941176, 0.58823529, 0.32352941, ..., 0.32352941, 0.5       ,
        0.29411765],
       ...,
       [0.18691589, 0.3271028 , 0.23364486, ..., 0.21495327, 0.34579439,
        0.24299065],
       [0.31428571, 0.45714286, 0.41904762, ..., 0.37142857, 0.52380952,
        0.44761905],
       [0.18055556, 0.30555556, 0.25      , ..., 0.56944444, 0.65277778,
        0.47222222]])

In [418]:
df_test_pca, pca = apply_pca(X_test, pca)
df_test_pca.shape

(2005, 551)

In [421]:
X_test, y_test, encoder = get_data_ML(df_test, df_test, df_test_pca)
X_test.shape, y_test.shape

((2005, 551), (2005, 17))

In [422]:
params=eval(trials.best_trial['result']['params'])
params

{'algo': 'XT',
 'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 24.0,
 'max_features': 7.0,
 'min_samples_leaf': 5,
 'min_samples_split': 2000,
 'n_estimators': 200}

In [423]:
model = create_sklearn_model(params, X_test)
model.fit(X_test, y_test)
test_results_proba=model.predict_proba(X_test)
test_results=model.predict(X_test)

In [424]:
Y_hat_val=val_results_proba[0][:,0]
for i in range (1,len(val_results_proba)):
    Y_hat_val=np.vstack((Y_hat_val,val_results_proba[i][:,0]))
Y_hat_val=Y_hat_val.T
Y_val=val_results

In [425]:
threshs, best_score = find_best_thresholds(Y_hat_val, Y_val)

threshs: [0.0, 0.99, 0.99, 0.99, 0.0, 0.0, 0.99, 0.0, 0.94, 0.95, 0.97, 0.97, 0.0, 0.89, 0.99, 0.99, 0.89] -- best score: 0.7353907446178702


In [426]:
#Y_hat_val = np.array(train_results["Y_hat_val"])
#Y_val = np.array(train_results["Y_val"])

pos_probas, neg_probas = [], []
for class_, idx in encoder._cached_dict.items():
    pos_probas.append(Y_hat_val[np.where(Y_val[:, idx] != 0), idx].mean())
    neg_probas.append(Y_hat_val[np.where(Y_val[:, idx] == 0), idx].mean())
go.Figure([
    go.Bar(x=list(encoder._cached_dict), y=pos_probas, name="Y_hat proba | Y = 1"),
    go.Bar(x=list(encoder._cached_dict), y=neg_probas, name="Y_hat proba | Y = 0")
]).show()


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.



In [427]:
class_scores = {}
classes = encoder.classes_
for jdx in range(Y_val.shape[1]):
    y_val = Y_val[:, jdx].ravel()
    y_hat_val = (Y_hat_val[:, jdx].ravel() > threshs[jdx]).astype(float)
    score = fbeta_score(y_val, y_hat_val, beta=2)
    class_scores[classes[jdx]] = round(score, 4)

df_score = pd.DataFrame(dict(
    label=list(class_scores.keys()), score=list(class_scores.values()),
)).sort_values("score", ascending=False)
fig = px.bar(df_score, x="label", y="score", color="score")
fig.show()


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.

