In [438]:
import os
import random
from time import time
from glob import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
import dill as pickle

from plotly import graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from matplotlib import pyplot as plt
%matplotlib inline

import cv2
from functools import partial

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, KFold, GroupKFold

from hyperopt import Trials, fmin, tpe, rand, STATUS_OK, hp


In [439]:
random.seed(101)
np.random.seed(101)
torch.manual_seed(101);

In [440]:
!ls /data2/ntua/data/planet/planet


fold_0.tfrecords  fold_4.tfrecords	 test_2000.csv	 train_classes.csv
fold_1.tfrecords  sample_submission.csv  train-jpg
fold_2.tfrecords  test-jpg		 train_1000.csv
fold_3.tfrecords  test_1000.csv		 train_2000.csv


In [441]:
path = "/data2/ntua/data/planet/planet"
path_train = os.path.join(path, "train-jpg")
path_test = os.path.join(path, "test-jpg")
print(
    f"train files: {len(os.listdir(path_train))}, "
    f"test files: {len(os.listdir(path_test))}"
)

train files: 40479, test files: 40669


In [442]:
def load_img(path_file):
    img = cv2.imread(path_file)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (100, 100), cv2.INTER_LINEAR).astype(float)
    img = cv2.normalize(img, None, 0.0, 1.0, cv2.NORM_MINMAX)
    img = img.reshape(1, -1)
    return img

In [443]:
def load_dataset(path, csvfile):
    path_class = os.path.join(path, csvfile)
    df_class = pd.read_csv(path_class)
    #df_class = df_class.sample(n=10000) # limit images
    #print(df_class.shape)
    print('df shape')
    print(df_class.shape)
    #df_class.head()
    #filenames = df_class.image_name.sample(500).values
    df_class["list_tags"] = df_class.tags.str.split(" ")
    filenames = df_class.image_name.values
    path_files = [os.path.join(path_train, filename+".jpg") for filename in filenames]
    X = np.vstack([load_img(path_file) for path_file in path_files])
    print('X shape')
    print(X.shape)
    return df_class, X

In [444]:
def apply_pca(X, pca=None):
    if pca is None:
        pca = PCA(n_components=0.95, random_state=2020)
        X_pca = pca.fit_transform(X)
    else:
        X_pca = pca.transform(X)
    df_pca=pd.DataFrame(X_pca)
    return df_pca, pca

In [506]:
def get_data_ML(df_orig, df_part, df_pca, encoder=None):

    fitenc=False
    if encoder is None:
        encoder = MultiLabelBinarizer()
        fitenc=True
    if fitenc:
        ohe_tags = encoder.fit(df_orig.list_tags.values)
    ohe_tags = encoder.transform(df_part.list_tags.values)
    Y=ohe_tags

    X_img = df_part.loc[:, ~df_part.columns.isin(['list_tags','tags'])]
    X = pd.merge(X_img, df_pca, left_index=True, right_index=True)
    X = X.loc[:, ~X.columns.isin(['Unnamed: 0','image_name'])]
    
    return X.values, Y, encoder

In [456]:
def create_sklearn_model(params, X):
    n_features = X.shape[1]
    if params['algo']=='RF':
        max_feat = int(n_features/10*params['max_features'])
        model = RandomForestClassifier(max_depth=params['max_depth'], n_estimators=params['n_estimators'], min_samples_split=params['min_samples_split'], \
                                       min_samples_leaf=params['min_samples_leaf'],criterion=params['criterion'],max_features=max_feat,
                                       bootstrap=params['bootstrap'], class_weight=params['class_weights']
                                       )
    if params['algo']=='XT':
        max_feat = int(n_features/10*params['max_features'])
        model = ExtraTreesClassifier(max_depth=params['max_depth'], n_estimators=params['n_estimators'], min_samples_split=params['min_samples_split'], \
                                       min_samples_leaf=params['min_samples_leaf'],criterion=params['criterion'],max_features=max_feat,
                                       bootstrap=params['bootstrap'], class_weight=params['class_weights']
                                       )
    if params['algo']=='XGB':
        model = XGBClassifier(max_depth=int(params['max_depth']), n_estimators=params['n_estimators'], subsample=params['subsample'],\
                              reg_alpha=params['alpha'], gamma=params['gamma'], reg_lambda=params['lambda'],\
                              scale_pos_weight=params['scale_pos_weight'], n_jobs=8)

    return model


In [447]:
def find_best_thresholds(Y_hat, Y):
    N_tags = Y.shape[1]
    best_threshs = [0.2] * N_tags
    resolution = 100
    #for jdx in tqdm(range(N_tags)):
    for jdx in range(N_tags):
        best_score = 0
        #threshs = np.zeros_like(best_threshs)
        threshs = best_threshs.copy()
        for kdx in range(resolution):
            kdx /= resolution
            threshs[jdx] = kdx
            Y_hat_thresh = (Y_hat > threshs).astype(float)
            score = fbeta_score(Y, Y_hat_thresh, beta=2, average="samples")
            if score > best_score:
                best_score = score
                best_threshs[jdx] = kdx
    
    global_best_score = fbeta_score(Y, (Y_hat > best_threshs).astype(float), beta=2, average="samples")
    print(f"threshs: {best_threshs} -- best score: {global_best_score}")
    
    return best_threshs, global_best_score

## Model cross validation

In [523]:
def validatemodel(cv, X, Y, params):

    model = create_sklearn_model(params, X)
    print('params : %s'%params)
    metrics={'f2score':[]}
    cnt=0
    
    for train_index, val_index in cv.split(X):
        cnt+=1
        print("Fitting fold %d"%cnt)
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = Y[train_index], Y[val_index]
        #y_val = y_val[:,0]
        #y_train = y_train[:,0]

        model.fit(X_train, y_train)
        #train_results_proba=model.predict_proba(X_train)
        #train_results=model.predict(X_train)
        
        val_results_proba=model.predict_proba(X_val)
        #val_results=model.predict(X_val)

        Y_hat_val=val_results_proba[0][:,0]
        for i in range (1,len(val_results_proba)):
            Y_hat_val=np.vstack((Y_hat_val,val_results_proba[i][:,0]))
        Y_hat_val=Y_hat_val.T
        threshs, best_score = find_best_thresholds(Y_hat_val, y_val)
        metrics['f2score'].append(best_score)


    mean_metrics = {}
    mean_metrics['f2score']=sum(metrics['f2score'])/len(metrics['f2score'])
    print('Mean fbeta: %.5f'%mean_metrics['f2score'])

    return {
        'loss': -mean_metrics['f2score'],
        'status': STATUS_OK,
        # -- store other results like this
        # 'eval_time': time.time(),
        'metrics': mean_metrics,
        'thresholds': threshs,
        'params': '%s'%params
        # -- attachments are handled differently
        # 'attachments':
        #    {'time_module': pickle.dumps(time.time)}
    }


In [520]:
space = {'algo':'XT','n_estimators': hp.choice('n_estimators',[10, 20, 40, 60, 80, 100, 200, 400, 600, 800, 1000]),
        'criterion': hp.choice('criterion',['gini', 'entropy']),
        'max_depth': hp.quniform('max_depth',2, 40, 2),
        'min_samples_split': hp.choice('min_samples_split',[2, 10, 50, 70, 100, 120, 150, 180, 200, 250, 400, 600, 1000, 1300, 2000]),
        'min_samples_leaf': hp.choice('min_samples_leaf',[5, 10, 15, 20, 25, 30, 35, 40, 45]),
        'max_features': hp.quniform('max_features', 1,10,1),
        'bootstrap': hp.choice('bootstrap',[True, False]), 'class_weights':'balanced'}


In [486]:
# Load Dataset
df_train, X_train = load_dataset(path, 'train_2000.csv')

df shape
(2005, 4)
X shape
(2005, 30000)


In [451]:
# Apply pca
df_train_pca, pca = apply_pca(X_train)
df_train_pca.shape

(2005, 551)

In [511]:
# get data
X_train, Y_train, encoder = get_data_ML(df_train, df_train, df_train_pca)

In [453]:
compute_sample_weight('balanced',Y)

array([0.01027361, 0.00356953, 0.00356953, ..., 0.10789315, 0.04103978,
       1.02403645])

In [524]:
random_state=42
kf = KFold(n_splits=5) # define folds
trials = Trials() # trials will contain logging information
validatemodelpart = partial(validatemodel, kf, X_train, Y_train) # function to call from hyperopt optimizer

In [525]:
best=fmin(fn=validatemodelpart, # function to optimize
          space=space,
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=50, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )


params : {'algo': 'XT', 'bootstrap': False, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 20.0, 'max_features': 4.0, 'min_samples_leaf': 15, 'min_samples_split': 400, 'n_estimators': 800}
Fitting fold 1                                      
threshs: [0.0, 0.97, 0.94, 0.99, 0.99, 0.0, 0.95, 0.99, 0.89, 0.88, 0.81, 0.66, 0.0, 0.0, 0.99, 0.95, 0.0] -- best score: 0.7197770053439013
Fitting fold 2                                      
threshs: [0.0, 0.98, 0.96, 0.99, 0.99, 0.0, 0.96, 0.99, 0.93, 0.87, 0.89, 0.67, 0.0, 0.0, 0.99, 0.97, 0.69] -- best score: 0.7605248518880195
Fitting fold 3                                      
threshs: [0.0, 0.99, 0.95, 0.99, 0.99, 0.0, 0.95, 0.99, 0.56, 0.94, 0.92, 0.96, 0.0, 0.0, 0.99, 0.96, 0.44] -- best score: 0.7900937041354253
Fitting fold 4                                      
threshs: [0.37, 0.98, 0.94, 0.99, 0.99, 0.0, 0.96, 0.99, 0.94, 0.51, 0.93, 0.98, 0.0, 0.0, 0.99, 0.97, 0.0] -- best score: 0.7848160074979146
Fitting fold 5  

threshs: [0.33, 0.98, 0.95, 0.99, 0.99, 0.0, 0.0, 0.99, 0.0, 0.0, 0.85, 0.0, 0.0, 0.0, 0.99, 0.99, 0.0] -- best score: 0.6915812909287226
Mean fbeta: 0.74865                                                            
params : {'algo': 'XT', 'bootstrap': True, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 34.0, 'max_features': 6.0, 'min_samples_leaf': 25, 'min_samples_split': 250, 'n_estimators': 60}
Fitting fold 1                                                                 
threshs: [0.4, 0.97, 0.94, 0.99, 0.99, 0.0, 0.97, 0.99, 0.91, 0.89, 0.82, 0.0, 0.0, 0.0, 0.99, 0.96, 0.0] -- best score: 0.7197159829914319
Fitting fold 2                                                                 
threshs: [0.0, 0.98, 0.98, 0.99, 0.99, 0.0, 0.97, 0.99, 0.54, 0.91, 0.91, 0.66, 0.0, 0.0, 0.99, 0.97, 0.68] -- best score: 0.7472304702853066
Fitting fold 3                                                                 
threshs: [0.39, 0.98, 0.94, 0.99, 0.99, 0.0, 0.97, 0.99, 

threshs: [0.46, 0.96, 0.87, 0.98, 0.99, 0.0, 0.88, 0.97, 0.59, 0.72, 0.89, 0.94, 0.0, 0.47, 0.97, 0.89, 0.51] -- best score: 0.7967548128172511
Fitting fold 4                                                                 
threshs: [0.45, 0.92, 0.89, 0.99, 0.96, 0.0, 0.9, 0.98, 0.78, 0.58, 0.91, 0.94, 0.0, 0.0, 0.97, 0.89, 0.48] -- best score: 0.7896009709427483
Fitting fold 5                                                                 
threshs: [0.42, 0.84, 0.83, 0.99, 0.99, 0.0, 0.0, 0.95, 0.0, 0.45, 0.83, 0.7, 0.0, 0.0, 0.98, 0.93, 0.32] -- best score: 0.6672549380396412
Mean fbeta: 0.74577                                                            
params : {'algo': 'XT', 'bootstrap': True, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 10.0, 'max_features': 7.0, 'min_samples_leaf': 25, 'min_samples_split': 2, 'n_estimators': 600}
Fitting fold 1                                                                 
threshs: [0.46, 0.97, 0.94, 0.99, 0.99, 0.0, 0.96, 0

Fitting fold 2                                                                 
threshs: [0.0, 0.96, 0.96, 0.99, 0.98, 0.0, 0.96, 0.98, 0.87, 0.91, 0.93, 0.0, 0.0, 0.0, 0.98, 0.9, 0.74] -- best score: 0.7586635135526599
Fitting fold 3                                                                 
threshs: [0.0, 0.98, 0.93, 0.99, 0.99, 0.0, 0.95, 0.98, 0.49, 0.88, 0.92, 0.97, 0.0, 0.0, 0.97, 0.91, 0.44] -- best score: 0.7855591840588352
Fitting fold 4                                                                 
threshs: [0.38, 0.95, 0.93, 0.99, 0.98, 0.0, 0.94, 0.99, 0.85, 0.0, 0.95, 0.96, 0.0, 0.27, 0.97, 0.92, 0.44] -- best score: 0.7884176878398375
Fitting fold 5                                                                 
threshs: [0.34, 0.92, 0.88, 0.99, 0.99, 0.0, 0.0, 0.97, 0.0, 0.0, 0.85, 0.0, 0.0, 0.0, 0.99, 0.93, 0.31] -- best score: 0.6822402335452085
Mean fbeta: 0.74648                                                            
params : {'algo': 'XT', 'bootstrap':

params : {'algo': 'XT', 'bootstrap': False, 'class_weights': 'balanced', 'criterion': 'entropy', 'max_depth': 8.0, 'max_features': 10.0, 'min_samples_leaf': 25, 'min_samples_split': 250, 'n_estimators': 600}
Fitting fold 1                                                                   
threshs: [0.0, 0.97, 0.97, 0.99, 0.99, 0.0, 0.96, 0.99, 0.94, 0.94, 0.8, 0.62, 0.0, 0.0, 0.99, 0.96, 0.4] -- best score: 0.715552779196276
Fitting fold 2                                                                   
threshs: [0.0, 0.97, 0.97, 0.99, 0.99, 0.0, 0.97, 0.99, 0.51, 0.92, 0.9, 0.0, 0.0, 0.0, 0.99, 0.98, 0.71] -- best score: 0.7504896095109822
Fitting fold 3                                                                   
threshs: [0.0, 0.99, 0.96, 0.99, 0.99, 0.0, 0.96, 0.99, 0.54, 0.95, 0.93, 0.98, 0.0, 0.0, 0.99, 0.97, 0.41] -- best score: 0.7857057298311209
Fitting fold 4                                                                   
threshs: [0.33, 0.97, 0.96, 0.99, 0.99, 0.0

## Model test

In [526]:
def model_test(trial, X_test, Y_test, X_train, Y_train):
    model = create_sklearn_model(eval(trial['result']['params']), X_test)
    model.fit(X_train, Y_train)
    test_results_proba=model.predict_proba(X_test)
    #test_results=model.predict(X_test)
    Y_hat_val=test_results_proba[0][:,0]
    for i in range (1,len(val_results_proba)):
        Y_hat_val=np.vstack((Y_hat_val,test_results_proba[i][:,0]))
    Y_hat_val=Y_hat_val.T
    threshs, best_score = find_best_thresholds(Y_hat_val, Y_test)
    print('Test score: %s'%best_score)

In [527]:
trials.best_trial['result']['params']

"{'algo': 'XT', 'bootstrap': True, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 14.0, 'max_features': 9.0, 'min_samples_leaf': 25, 'min_samples_split': 1000, 'n_estimators': 600}"

In [528]:
df_test, X_test = load_dataset(path, 'test_2000.csv')

df shape
(2005, 4)
X shape
(2005, 30000)


In [529]:
df_test_pca, pca = apply_pca(X_test, pca)
df_test_pca.shape

(2005, 551)

In [530]:
X_test, Y_test, encoder = get_data_ML(df_test, df_test, df_test_pca, encoder)
X_test.shape, y_test.shape

((2005, 551), (2005, 17))

In [531]:
sorted_trials = sorted(trials, key=lambda x:x['result']['loss'])

In [532]:
cnt=1
for t in sorted_trials:
    if cnt>5:
        break
    print('CV best score: %s:'%t['result']['loss'])
    print(t['result']['params'])
    model_test(t, X_test, Y_test, X_train, Y_train)
    cnt+=1

CV best score: -0.7592057041430268:
{'algo': 'XT', 'bootstrap': True, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 14.0, 'max_features': 9.0, 'min_samples_leaf': 25, 'min_samples_split': 1000, 'n_estimators': 600}
threshs: [0.0, 0.93, 0.89, 0.99, 0.96, 0.46, 0.88, 0.96, 0.64, 0.8, 0.85, 0.0, 0.0, 0.0, 0.98, 0.88, 0.0] -- best score: 0.720317199168663
Test score: 0.720317199168663
CV best score: -0.7566696445235801:
{'algo': 'XT', 'bootstrap': False, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 2.0, 'max_features': 9.0, 'min_samples_leaf': 10, 'min_samples_split': 1300, 'n_estimators': 600}
threshs: [0.0, 0.92, 0.83, 0.99, 0.95, 0.0, 0.89, 0.91, 0.67, 0.63, 0.91, 0.0, 0.0, 0.37, 0.98, 0.86, 0.47] -- best score: 0.7210488484435428
Test score: 0.7210488484435428
CV best score: -0.7551501244680534:
{'algo': 'XT', 'bootstrap': True, 'class_weights': 'balanced', 'criterion': 'gini', 'max_depth': 8.0, 'max_features': 1.0, 'min_samples_leaf': 10, 'min_sample

In [533]:
#Y_hat_val = np.array(train_results["Y_hat_val"])
#Y_val = np.array(train_results["Y_val"])

pos_probas, neg_probas = [], []
for class_, idx in encoder._cached_dict.items():
    pos_probas.append(Y_hat_val[np.where(Y_val[:, idx] != 0), idx].mean())
    neg_probas.append(Y_hat_val[np.where(Y_val[:, idx] == 0), idx].mean())
go.Figure([
    go.Bar(x=list(encoder._cached_dict), y=pos_probas, name="Y_hat proba | Y = 1"),
    go.Bar(x=list(encoder._cached_dict), y=neg_probas, name="Y_hat proba | Y = 0")
]).show()


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.



In [534]:
class_scores = {}
classes = encoder.classes_
for jdx in range(Y_val.shape[1]):
    y_val = Y_val[:, jdx].ravel()
    y_hat_val = (Y_hat_val[:, jdx].ravel() > threshs[jdx]).astype(float)
    score = fbeta_score(y_val, y_hat_val, beta=2)
    class_scores[classes[jdx]] = round(score, 4)

df_score = pd.DataFrame(dict(
    label=list(class_scores.keys()), score=list(class_scores.values()),
)).sort_values("score", ascending=False)
fig = px.bar(df_score, x="label", y="score", color="score")
fig.show()


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defi