In [1]:
%reload_ext autoreload
%autoreload 2

In [6]:
import warnings
warnings.filterwarnings('ignore')

from IPython.display import IFrame, clear_output

In [7]:
!pip install xgboost
!pip install lightgbm
!pip install catboost
!pip install scikit-optimize
!pip install pickle

clear_output()

In [8]:
import os
import time
import logging
from functools import wraps
import pickle

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from numpy import hstack

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *

In [9]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif

from sklearn.utils import shuffle
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

In [10]:
folder_path = 'C:/Users/yaass/OneDrive/Bureau/Parser'

In [11]:
logger = logging.getLogger(__name__)
logger.setLevel("INFO")
handler = logging.StreamHandler()
logger.addHandler(handler)
#https://gist.github.com/bradmontgomery/bd6288f09a24c06746bbe54afe4b8a82

def timed(func):
    """This decorator prints the execution time for the decorated function."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        logger.info("EXECUTION TIME : {} ran in {}s".format(func.__name__, round(end - start, 2)))
        return result

    return wrapper


def pickle_results(results, file, path='nested-cv-results'):
    file_path = os.path.join(path, file)
    pickle.dump(favorite_color, open( file_path, "wb" ), protocol=pickle.HIGHEST_PROTOCOL)
    
def unpickle_results(file, path='nested-cv-results'):
    file_path = os.path.join(path, file)
    return pickle.load(open( file_path, "rb" ))

In [12]:
def get_data(folder_path, file_name, index_value = 'md5'):
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=index_value)
    return df

def print_proportion(df, label = 'label'):
    print('Proportion : {:.2f}%'.format(100*sum(df.label)/len(df)))

def create_X_y(folder_path, file_name, drop_null_columns=False, index_value = 'md5'):
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, index_col = index_value)
    X = df.drop('label', axis=1)
    #X = df.drop(['label', 'sublabel'], axis=1)
    if drop_null_columns == True:
        X = X.drop(get_null_columns(X), axis=1)
    y = df['label']
    return shuffle(X, y)

def create_X_y_(df, drop_null_columns=False, index_value = 'md5'):
    X = df.drop('label', axis=1)
    #X = df.drop(['label', 'sublabel'], axis=1)
    if drop_null_columns == True:
        X = X.drop(get_null_columns(X), axis=1)
    y = df['label']
    return shuffle(X, y)

def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=2)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

def k_best_selection_(X, y, select_function=f_classif, k=10):
    selector = SelectKBest(select_function, k=k).fit(X, y)
    selected_columns_indices = selector.get_support(indices=True)
    selected_df = X.iloc[:,selected_columns_indices]
    selected_columns = selected_df.columns.tolist()
    return selected_columns

In [13]:
@timed
def score_model_dataset(df, model):
    X, y = create_X_y_(df)
    scores = evaluate_model(model, X, y)
    return scores


def join_dfs(dfs, labels=['label']):
    joined = dfs[0]
    for df in dfs[1:]:
        joined = joined.join(df.drop(labels, axis=1)).fillna(0)
    return joined


def retrieve_subset(original_df, X, y = None, label='label', features_only=False):
    filtered_columns = original_df.columns.tolist()
    filtered_columns.remove(label)
    if features_only == False :
        filtered_index = original_df.index.intersection(X.index)
        X_new = X.loc[filtered_index, filtered_columns]
        y_new = y.loc[filtered_index]
        return X_new, y_new
    else :
        X_new = X.loc[:, filtered_columns]
        return X_new
    
    
def prepare_datasets(original_dfs, X_train_full, X_test, y_train_full, y_test, test_size = 0.4):
    datasets = []
    X_train, X_eval, y_train, y_eval = train_test_split(X_train_full, y_train_full, test_size = test_size, random_state = 1)
    for df in original_dfs:
        dataset = dict()
        dataset['train'] = retrieve_subset(df, X_train, y_train)
        dataset['eval'] = ( retrieve_subset(df, X_eval, features_only=True), y_eval )
        dataset['test'] = ( retrieve_subset(df, X_test, features_only=True), y_test )
        datasets.append(dataset)
    return datasets
    
    
def fit_ensemble(models, datasets):
    X_meta = list()
    for model, dataset in zip(models, datasets):
        model.fit(*dataset['train'])
        y_pred = model.predict(dataset['eval'][0])
        y_pred = y_pred.reshape(len(y_pred), 1)
        X_meta.append(y_pred)
    X_meta = np.hstack(X_meta)
    blender = LogisticRegression()
    blender.fit(X_meta, dataset['eval'][1])
    return blender


def predict_ensemble(models, blender, datasets):
    X_meta = list()
    for model, dataset in zip(models, datasets):
        y_pred = model.predict(dataset['test'][0])
        y_pred = y_pred.reshape(len(y_pred), 1)
        X_meta.append(y_pred)
    X_meta = np.hstack(X_meta)
    return blender.predict(X_meta)


def get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models):
    datasets = prepare_datasets(original_dfs, X_train_full, X_test, y_train_full, y_test, test_size = 0.4)
    blender = fit_ensemble(models, datasets)
    y_pred = predict_ensemble(models, blender, datasets)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [14]:
def get_gb_models():
    models = dict()
    models['Gradient Boosting'] = GradientBoostingClassifier()
    models['XGBoost'] = XGBClassifier()
    models['LightGBM'] = LGBMClassifier()
    models['CatBoost'] = CatBoostClassifier()
    return models

### API

In [15]:
file_name_1 = 'onehot_encoded_apistats_dataset.csv'

oh_apis = get_data(folder_path, file_name_1)

oh_apis.shape

(3761, 304)

In [16]:
print_proportion(oh_apis)

Proportion : 58.23%


In [59]:
#compute api cross-validation accuracy scores
api_scores = score_model_dataset(oh_apis, RandomForestClassifier())
np.mean(api_scores)

EXECUTION TIME : score_model_dataset ran in 10.24s


0.9467353782192374

In [60]:
api_scores_ = score_model_dataset(oh_apis, ExtraTreesClassifier())
np.mean(api_scores_)

EXECUTION TIME : score_model_dataset ran in 12.34s


0.945936800421393

In [43]:
for name, model in get_gb_models().items():
    api_scores_ = score_model_dataset(oh_apis, model)
    print(f'{name} : {np.mean(api_scores_)}')

EXECUTION TIME : score_model_dataset ran in 24.66s
Gradient Boosting : 0.9172190398254227
EXECUTION TIME : score_model_dataset ran in 60.18s
XGBoost : 0.9451375171661305
EXECUTION TIME : score_model_dataset ran in 11.25s
LightGBM : 0.9447850236093083
EXECUTION TIME : score_model_dataset ran in 296.85s
CatBoost : 0.941149801531313


**Variable selection**

In [62]:
for selector_func in [f_classif, chi2, mutual_info_classif]:
    pipe = Pipeline(steps=[('selector', SelectKBest(selector_func, k=150)), ('clf', RandomForestClassifier())])
    api_selected_scores = score_model_dataset(oh_apis, pipe)
    print(f'{selector_func.__name__} : {np.mean(api_selected_scores)}')

f_classif : 0.9425661248753694
chi2 : 0.9005589574279964
mutual_info_classif : 0.9387561845100362


In [63]:
X, y = create_X_y_(oh_apis)
api_k_best_cols = k_best_selection_(X, y, f_classif, k=150)

### DLL

In [32]:
file_name_2 = 'onehot_encoded_dll_dataset.csv'

dll_loaded = get_data(folder_path, file_name_2)

dll_loaded.shape

(3162, 2242)

In [33]:
print_proportion(dll_loaded)

Proportion : 58.82%


In [30]:
#compute dll cross-validation accuracy scores
dll_scores = score_model_dataset(dll_loaded, RandomForestClassifier())
np.mean(dll_scores)

EXECUTION TIME : score_model_dataset ran in 15.37s


0.8678043764724672

In [35]:
#XGBoost doesnt accept some characters in column names
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

dll_loaded.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in dll_loaded.columns.values]

#compute dll cross-validation accuracy scores
dll_scores_ = score_model_dataset(dll_loaded, XGBClassifier())
np.mean(dll_scores_)

EXECUTION TIME : score_model_dataset ran in 167.58s


0.878773842324535

**Variable Selection**

In [64]:
for selector_func in [f_classif, chi2, mutual_info_classif]:
    pipe = Pipeline(steps=[('selector', SelectKBest(selector_func, k=150)), ('clf', RandomForestClassifier())])
    dll_selected_scores = score_model_dataset(dll_loaded, pipe)
    print(f'{selector_func.__name__} : {np.mean(dll_selected_scores)}')

f_classif : 0.8721312675531421
chi2 : 0.8535685287971355
mutual_info_classif : 0.8377507354017756


In [79]:
for selector_func in [f_classif, chi2]:
    pipe = Pipeline(steps=[('selector', SelectKBest(selector_func, k=230)), ('clf', RandomForestClassifier())])
    dll_selected_scores = score_model_dataset(dll_loaded, pipe)
    print(f'{selector_func.__name__} : {np.mean(dll_selected_scores)}')

f_classif : 0.8752878382515408
chi2 : 0.8733901023572788


In [80]:
X, y = create_X_y_(dll_loaded)
dll_k_best_cols = k_best_selection_(X, y, f_classif, k=230)

### PE Imports

In [29]:
file_name_3 = 'pe_entropy_dataset.csv'

pe_imports = get_data(folder_path, file_name_3)

if 'sublabel' in pe_imports.columns.tolist():
    pe_imports.drop('sublabel', axis=1, inplace=True)

pe_imports.shape

(4308, 795)

In [30]:
print_proportion(pe_imports)

Proportion : 60.35%


In [31]:
#compute pe_imports cross-validation accuracy scores
pe_imports_scores = score_model_dataset(pe_imports, RandomForestClassifier())
np.mean(pe_imports_scores)

0.902198240975557

## Joining Datasets

### API and DLL

In [83]:
original_dfs = [oh_apis, dll_loaded]

joined_ = join_dfs(original_dfs)

#### Joining

In [84]:
joining_scores = score_model_dataset(joined_, RandomForestClassifier())

**Joining Variable Selected Datasets**

In [81]:
original_dfs_vs = [oh_apis[api_k_best_cols+['label']], dll_loaded[dll_k_best_cols+['label']]]

joined_vs = join_dfs(original_dfs_vs)

joining_scores_vs = score_model_dataset(joined_vs, RandomForestClassifier())

#### Blending

*Train-Test-Split*

In [None]:
X, y = create_X_y_(joined_)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size = 0.1, random_state = 1)

models = [RandomForestClassifier(), RandomForestClassifier()]

score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)

print('Blending Accuracy: %.3f' % score)

*Cross-validation*

In [85]:
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

models = [RandomForestClassifier(), RandomForestClassifier()]

blending_scores = []

X, y = create_X_y_(joined_)

for train_index, test_index in CV.split(X, y):
    X_train_full, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train_full, y_test = y[train_index], y[test_index]
    score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)
    blending_scores.append(score)

In [87]:
fig = plot_evaluation_boxplots([api_scores, dll_scores, blending_scores, joining_scores, joining_scores_vs], 
                               names = ['API', 'DLL', 'Blended', 'Joined All', 'Joined Selected'], 
                               title = 'Model Performance on joined "apistats" and "dll" Data', 
                               y_axis = 'Accuracy')

figure_path = 'joined/evaluation_joined_api_dll.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### API and PE Imports

In [176]:
original_dfs = [pe_imports, oh_apis]

joined = join_dfs(original_dfs)

#### Joining

In [178]:
joining_scores = score_model_dataset(joined, RandomForestClassifier())

#### Blending

In [179]:
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

models = [RandomForestClassifier(), RandomForestClassifier()]

blending_scores = []

X, y = create_X_y_(joined)

for train_index, test_index in CV.split(X, y):
    X_train_full, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train_full, y_test = y[train_index], y[test_index]
    score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)
    blending_scores.append(score)

In [180]:
fig = plot_evaluation_boxplots([api_scores, pe_imports_scores, blending_scores, joining_scores], 
                               names = ['API', 'PE Imports', 'Blended', 'Joined'], 
                               title = 'Model Performance on joined "apistats" and "pe imports" Data', 
                               y_axis = 'Accuracy')

figure_path = 'joined/evaluation_joined_api_pe_imports.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### API, DLL and PE Imports

In [34]:
original_dfs = [oh_apis, dll_loaded, pe_imports]

joined = join_dfs(original_dfs)

In [35]:
joined.shape

(3761, 3339)

**Joining**

In [36]:
joining_scores = score_model_dataset(joined, RandomForestClassifier())
np.mean(joining_scores)

0.9571921383825271

**Blending**

In [39]:
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

models = [RandomForestClassifier(), RandomForestClassifier(), RandomForestClassifier()]

blending_scores = []

X, y = create_X_y_(joined)

for train_index, test_index in CV.split(X, y):
    X_train_full, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train_full, y_test = y[train_index], y[test_index]
    score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)
    blending_scores.append(score)

In [40]:
fig = plot_evaluation_boxplots([api_scores, dll_scores, pe_imports_scores, blending_scores, joining_scores], 
                               names = ['API', 'DLL', 'PE Imports', 'Blended', 'Joined'], 
                               title = 'Model Performance on joined "apistats", "dll" and "pe imports" Data', 
                               y_axis = 'Accuracy')

figure_path = 'joined/evaluation_joined_api_dll_pe_imports.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

## Hyperparameter tuning with nested cross-validation

In [28]:
def get_ensemble_models():
    models = dict()
    models['RF'] = RandomForestClassifier()
    models['ExRF'] = ExtraTreesClassifier()
    models['AdaBoost'] = AdaBoostClassifier()
    #models['GB'] = GradientBoostingClassifier()
    models['XGBoost'] = XGBClassifier(objective = 'binary:logistic', eval_metric = 'logloss', silent=1, tree_method='approx')
    models['LightGBM'] = LGBMClassifier(objective='binary', metric='binary_logloss', verbose=0)
    models['CatBoost'] = CatBoostClassifier(thread_count=2, loss_function='Logloss', od_type = 'Iter', verbose= False)
    return models

In [None]:
tuning_params['GB'] = {
        'loss' : Categorical(['deviance']), #['deviance', 'exponential']
        'n_estimators': Integer(50, 100),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'max_depth': Integer(1, 50),
        'max_features' : Categorical(['sqrt', 'log2']),
        'min_samples_split' : Integer(2, 10),
        'min_samples_leaf' : Integer(1, 5),
        'criterion' : Categorical(['friedman_mse', 'mae']),
        'subsample' : Real(0.5, 1.0, 'uniform')}

In [30]:
tuning_params = dict()

tuning_params['RF'] = {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(1, 50),
        'max_features' : Categorical(['sqrt', 'log2']),
        'min_samples_split' : Integer(2, 10),
        'min_samples_leaf' : Integer(1, 5),
        'criterion' : Categorical(['gini', 'entropy']),
        'bootstrap' : Categorical([True, False]),
        'max_samples' : Real(0.5, 0.99, 'uniform'),
        #'warm_start' : Categorical([True, False]),  
        #'ccp_alpha' : Real(1e-9, 1.0, 'log-uniform')
}

tuning_params['ExRF'] = {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(1, 50),
        'max_features' : Categorical(['sqrt', 'log2']),
        'min_samples_split' : Integer(2, 10),
        'min_samples_leaf' : Integer(1, 5),
        'criterion' : Categorical(['gini', 'entropy']),
        'bootstrap' : Categorical([True, False]),
        'max_samples' : Real(0.5, 0.99, 'uniform'),
        #'warm_start' : Categorical([True, False]),        
        #'ccp_alpha' : Real(1e-9, 1.0, 'log-uniform')
}

tuning_params['AdaBoost'] = {
        'base_estimator' : Categorical([DecisionTreeClassifier(max_depth=2)]),
        'n_estimators': Integer(50, 500),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'algorithm' : Categorical(['SAMME', 'SAMME.R'])
}

tuning_params['XGBoost'] = {
        'n_estimators': Integer(50, 100),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'min_child_weight': Integer(0, 10),
        'max_depth': Integer(1, 20), #(0,50)
        'gamma': Real(1e-9, 0.5, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'), #(0.01, 1.0, 'uniform')                      
        'colsample_bytree': Real(0.1, 1.0, 'uniform'), #Real(0.01, 1.0, 'uniform')        
        'lambda': Real(1e-9, 1000, 'log-uniform'),
        'alpha': Real(1e-9, 1.0, 'log-uniform'),
        #'colsample_bylevel': Real(0.1, 1.0, 'uniform'), #Real(0.01, 1.0, 'uniform')
        #'max_delta_step': Integer(0, 10),  
        #'scale_pos_weight': Real(1e-6, 500, 'log-uniform')
}

tuning_params['LightGBM'] = {
        'n_estimators': Integer(50, 100),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'min_child_weight': Integer(0, 10),
        'num_leaves': Integer(1, 100),      
        'max_depth': Integer(1, 50),
        'min_child_samples': Integer(0, 50),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.1, 1.0, 'uniform'),
        'lambda': Real(1e-9, 1000, 'log-uniform'),
        'alpha': Real(1e-9, 1.0, 'log-uniform'),
        #'max_bin': Integer(100, 1000),
        #'subsample_freq': Integer(0, 10),
        #'subsample_for_bin': Integer(100000, 500000),
        #'scale_pos_weight': Real(1e-6, 500, 'log-uniform')
}
    
tuning_params['CatBoost'] = {
        'iterations': Integer(10, 1000),
        'depth': Integer(1, 10),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'random_strength': Real(1e-9, 10, 'log-uniform'),
        'bagging_temperature': Real(0.0, 1.0, 'uniform'),
        'l2_leaf_reg': Integer(2, 30),
        'grow_policy' : Categorical(['SymmetricTree', 'Depthwise', 'Lossguide']),
        #'border_count': Integer(1, 255),       
        #'scale_pos_weight':Real(0.01, 1.0, 'uniform')}
}


cv_strategy = StratifiedKFold( n_splits=5, shuffle=True, random_state=2 )

In [31]:
def hyperparameter_tuning(X_train, y_train, model, tuning_params, scoring='accuracy', cv_strategy=None, n_settings = 25, verbose=True, file_prefix='', best_model=False):

    bsearch = BayesSearchCV(estimator = model,
                            search_spaces = tuning_params,                        
                            scoring = scoring,
                            cv = cv_strategy,
                            n_jobs = -1,
                            verbose = 0,
                            random_state = 1,
                            n_iter = n_settings)
    
    
    def status_print(optim_result):
        """Status callback durring bayesian hyperparameter search"""

        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(bsearch.cv_results_)    

        # Get current parameters and the best parameters    
        best_params = pd.Series(bsearch.best_params_)
        if verbose == True:
                print('Model #{}\nBest {}: {}\nBest params: {}\n'.format(
                len(all_models),
                scoring.upper(),
                np.round(bsearch.best_score_, 4),
                bsearch.best_params_
                ))

        # Save all model results
        clf_name = bsearch.estimator.__class__.__name__
        all_models.to_csv(os.path.join("bayes-search-models", file_prefix+'_'+clf_name+"_cv_results.csv"))

        
    bsearch.fit(X_train,y_train, callback=status_print)

    result = bsearch.best_params_
    
    if best_model == True:
        result = bsearch.best_params_, bsearch.best_estimator_
    
    return result

**Testing RF**

In [28]:
X, y = create_X_y_(oh_apis)

rf_model = RandomForestClassifier()

rf_best_params = hyperparameter_tuning(X, y, 
                                       model = rf_model, 
                                       tuning_params = tuning_params['RF'],
                                       scoring = 'accuracy',
                                       cv_strategy = cv_strategy,
                                       n_settings = 25)

Model #1
Best ACCURACY: 0.9109
Best params: OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 19), ('max_features', 'log2'), ('max_samples', 0.7416559821292414), ('min_samples_leaf', 5), ('min_samples_split', 2), ('n_estimators', 88)])

Model #2
Best ACCURACY: 0.9109
Best params: OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 19), ('max_features', 'log2'), ('max_samples', 0.7416559821292414), ('min_samples_leaf', 5), ('min_samples_split', 2), ('n_estimators', 88)])

Model #3
Best ACCURACY: 0.9115
Best params: OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 39), ('max_features', 'log2'), ('max_samples', 0.7194966829176728), ('min_samples_leaf', 4), ('min_samples_split', 7), ('n_estimators', 103)])

Model #4
Best ACCURACY: 0.9115
Best params: OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 39), ('max_features', 'log2'), ('max_samples', 0.7194966829176728), ('min_samples_leaf', 4), ('min

EXECUTION TIME : hyperparameter_tuning ran in 94.41s


In [29]:
api_scores_ = score_model_dataset(oh_apis, RandomForestClassifier(**rf_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

EXECUTION TIME : score_model_dataset ran in 17.27s


 0.9480644599958615 ( +/- 0.010331328795484306 )


**Testing ExRF**

In [30]:
X, y = create_X_y_(oh_apis)

ex_model = ExtraTreesClassifier()

ex_best_params = hyperparameter_tuning(X, y, 
                                       model = ex_model, 
                                       tuning_params = tuning_params['ExRF'],
                                       scoring = 'accuracy',
                                       cv_strategy = cv_strategy,
                                       n_settings = 25)

Model #1
Best ACCURACY: 0.8899
Best params: OrderedDict([('bootstrap', True), ('ccp_alpha', 3.5881086588351177e-09), ('criterion', 'entropy'), ('max_depth', 8), ('max_features', 'log2'), ('max_samples', 0.9355734484361333), ('min_samples_leaf', 1), ('min_samples_split', 4), ('n_estimators', 122), ('warm_start', True)])

Model #2
Best ACCURACY: 0.9378
Best params: OrderedDict([('bootstrap', True), ('ccp_alpha', 1.0267362767027556e-06), ('criterion', 'entropy'), ('max_depth', 22), ('max_features', 'log2'), ('max_samples', 0.8657545635146762), ('min_samples_leaf', 1), ('min_samples_split', 5), ('n_estimators', 179), ('warm_start', True)])

Model #3
Best ACCURACY: 0.9378
Best params: OrderedDict([('bootstrap', True), ('ccp_alpha', 1.0267362767027556e-06), ('criterion', 'entropy'), ('max_depth', 22), ('max_features', 'log2'), ('max_samples', 0.8657545635146762), ('min_samples_leaf', 1), ('min_samples_split', 5), ('n_estimators', 179), ('warm_start', True)])

Model #4
Best ACCURACY: 0.9378
B

EXECUTION TIME : hyperparameter_tuning ran in 99.11s


In [31]:
api_scores_ = score_model_dataset(oh_apis, ExtraTreesClassifier(**ex_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

EXECUTION TIME : score_model_dataset ran in 8.94s


 0.9444297082228115 ( +/- 0.01088449203327669 )


**Testing GradientBoostingClassifier**

In [17]:
X, y = create_X_y_(oh_apis)

gb_model = GradientBoostingClassifier()

gb_best_params = hyperparameter_tuning(X, y, 
                                       model = gb_model, 
                                       tuning_params = tuning_params['GB'],
                                       scoring = 'accuracy',
                                       cv_strategy = cv_strategy,
                                       n_settings = 25)

Model #1
Best ACCURACY: 0.8785
Best params: OrderedDict([('criterion', 'mae'), ('learning_rate', 0.01328322299832468), ('loss', 'deviance'), ('max_depth', 8), ('max_features', 'log2'), ('min_samples_leaf', 5), ('min_samples_split', 2), ('n_estimators', 63), ('subsample', 0.7394782887996654)])

Model #2
Best ACCURACY: 0.9295
Best params: OrderedDict([('criterion', 'mae'), ('learning_rate', 0.04668884070205238), ('loss', 'deviance'), ('max_depth', 22), ('max_features', 'log2'), ('min_samples_leaf', 4), ('min_samples_split', 2), ('n_estimators', 68), ('subsample', 0.928723943060398)])

Model #3
Best ACCURACY: 0.9295
Best params: OrderedDict([('criterion', 'mae'), ('learning_rate', 0.04668884070205238), ('loss', 'deviance'), ('max_depth', 22), ('max_features', 'log2'), ('min_samples_leaf', 4), ('min_samples_split', 2), ('n_estimators', 68), ('subsample', 0.928723943060398)])

Model #4
Best ACCURACY: 0.9357
Best params: OrderedDict([('criterion', 'mae'), ('learning_rate', 0.0665496829713096

EXECUTION TIME : hyperparameter_tuning ran in 1353.42s


In [None]:
api_scores_ = score_model_dataset(oh_apis, GradientBoostingClassifier(**gb_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

**Testing AdaBoost**

In [22]:
X, y = create_X_y_(oh_apis)

ada_model = AdaBoostClassifier()

ada_best_params = hyperparameter_tuning(X, y, 
                                       model = ada_model, 
                                       tuning_params = tuning_params['AdaBoost'],
                                       scoring = 'accuracy',
                                       cv_strategy = cv_strategy,
                                       n_settings = 25)

Model #1
Best ACCURACY: 0.8897
Best params: OrderedDict([('algorithm', 'SAMME.R'), ('base_estimator', DecisionTreeClassifier(max_depth=2)), ('learning_rate', 0.05672620790508927), ('n_estimators', 116)])

Model #2
Best ACCURACY: 0.8897
Best params: OrderedDict([('algorithm', 'SAMME.R'), ('base_estimator', DecisionTreeClassifier(max_depth=2)), ('learning_rate', 0.05672620790508927), ('n_estimators', 116)])

Model #3
Best ACCURACY: 0.9295
Best params: OrderedDict([('algorithm', 'SAMME.R'), ('base_estimator', DecisionTreeClassifier(max_depth=2)), ('learning_rate', 0.3465259624840772), ('n_estimators', 176)])

Model #4
Best ACCURACY: 0.9295
Best params: OrderedDict([('algorithm', 'SAMME.R'), ('base_estimator', DecisionTreeClassifier(max_depth=2)), ('learning_rate', 0.3465259624840772), ('n_estimators', 176)])

Model #5
Best ACCURACY: 0.9295
Best params: OrderedDict([('algorithm', 'SAMME.R'), ('base_estimator', DecisionTreeClassifier(max_depth=2)), ('learning_rate', 0.3465259624840772), ('n

EXECUTION TIME : hyperparameter_tuning ran in 300.32s


In [23]:
api_scores_ = score_model_dataset(oh_apis, AdaBoostClassifier(**ada_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

EXECUTION TIME : score_model_dataset ran in 68.75s


 0.9366297006979327 ( +/- 0.01356393599292852 )


**Testing XGB**

In [27]:
X, y = create_X_y_(oh_apis)

xgb_model = XGBClassifier(objective = 'binary:logistic', eval_metric = 'error', silent=1, tree_method='approx')

xgb_best_params = hyperparameter_tuning(X, y, 
                                        model = XGBClassifier(), 
                                        tuning_params = tuning_params['XGBoost'],
                                        scoring = 'accuracy',
                                        cv_strategy = cv_strategy,
                                        n_settings = 25)

Model #1
Best ACCURACY: 0.9373
Best params: OrderedDict([('alpha', 0.002107681249770302), ('colsample_bytree', 0.15548655863280092), ('gamma', 1.8992018546362966e-06), ('lambda', 5.593977880531548e-08), ('learning_rate', 0.09690606246621411), ('max_depth', 18), ('min_child_weight', 1), ('n_estimators', 63), ('subsample', 0.7394782887996654)])

Model #2
Best ACCURACY: 0.9455
Best params: OrderedDict([('alpha', 0.0014191129358860485), ('colsample_bytree', 0.4011458906809745), ('gamma', 1.8023871181327715e-09), ('lambda', 0.00016493015774619377), ('learning_rate', 0.06410712948858617), ('max_depth', 15), ('min_child_weight', 0), ('n_estimators', 68), ('subsample', 0.928723943060398)])

Model #3
Best ACCURACY: 0.9455
Best params: OrderedDict([('alpha', 0.0014191129358860485), ('colsample_bytree', 0.4011458906809745), ('gamma', 1.8023871181327715e-09), ('lambda', 0.00016493015774619377), ('learning_rate', 0.06410712948858617), ('max_depth', 15), ('min_child_weight', 0), ('n_estimators', 68)

Model #25
Best ACCURACY: 0.9489
Best params: OrderedDict([('alpha', 0.00019924519155244455), ('colsample_bytree', 0.32989933298568475), ('gamma', 1e-09), ('lambda', 1.0342642810019773e-09), ('learning_rate', 0.29762436795987596), ('max_depth', 15), ('min_child_weight', 0), ('n_estimators', 50), ('subsample', 0.857340870632541)])



EXECUTION TIME : hyperparameter_tuning ran in 230.05s


In [28]:
api_scores_ = score_model_dataset(oh_apis, XGBClassifier(**xgb_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

EXECUTION TIME : score_model_dataset ran in 33.11s


 0.9483283010704142 ( +/- 0.009336448192940898 )


**Testing LightGBM**

In [103]:
X, y = create_X_y_(oh_apis)

lgb_model = LGBMClassifier(objective='binary', metric='auc', verbose=0)

lgb_best_params = hyperparameter_tuning(X, y, 
                                       model = lgb_model, 
                                       tuning_params = tuning_params['LightGBM'],
                                       scoring = 'roc_auc',
                                       cv_strategy = cv_strategy,
                                       n_settings = 100)

Model #1
Best ROC_AUC: 0.9722
Best params: OrderedDict([('learning_rate', 0.2542669917403192), ('max_depth', 3), ('min_child_samples', 19), ('n_estimators', 57), ('num_leaves', 50)])

Model #2
Best ROC_AUC: 0.9837
Best params: OrderedDict([('learning_rate', 0.2328707166744412), ('max_depth', 17), ('min_child_samples', 1), ('n_estimators', 72), ('num_leaves', 42)])

Model #3
Best ROC_AUC: 0.9837
Best params: OrderedDict([('learning_rate', 0.2328707166744412), ('max_depth', 17), ('min_child_samples', 1), ('n_estimators', 72), ('num_leaves', 42)])

Model #4
Best ROC_AUC: 0.9837
Best params: OrderedDict([('learning_rate', 0.2551996554556929), ('max_depth', 21), ('min_child_samples', 6), ('n_estimators', 70), ('num_leaves', 43)])

Model #5
Best ROC_AUC: 0.9837
Best params: OrderedDict([('learning_rate', 0.2551996554556929), ('max_depth', 21), ('min_child_samples', 6), ('n_estimators', 70), ('num_leaves', 43)])

Model #6
Best ROC_AUC: 0.9837
Best params: OrderedDict([('learning_rate', 0.2551

Model #45
Best ROC_AUC: 0.9838
Best params: OrderedDict([('learning_rate', 0.10304111908436134), ('max_depth', 48), ('min_child_samples', 21), ('n_estimators', 100), ('num_leaves', 37)])

Model #46
Best ROC_AUC: 0.9838
Best params: OrderedDict([('learning_rate', 0.10304111908436134), ('max_depth', 48), ('min_child_samples', 21), ('n_estimators', 100), ('num_leaves', 37)])

Model #47
Best ROC_AUC: 0.9838
Best params: OrderedDict([('learning_rate', 0.10304111908436134), ('max_depth', 48), ('min_child_samples', 21), ('n_estimators', 100), ('num_leaves', 37)])

Model #48
Best ROC_AUC: 0.9838
Best params: OrderedDict([('learning_rate', 0.10304111908436134), ('max_depth', 48), ('min_child_samples', 21), ('n_estimators', 100), ('num_leaves', 37)])

Model #49
Best ROC_AUC: 0.9838
Best params: OrderedDict([('learning_rate', 0.10304111908436134), ('max_depth', 48), ('min_child_samples', 21), ('n_estimators', 100), ('num_leaves', 37)])

Model #50
Best ROC_AUC: 0.9838
Best params: OrderedDict([('l

Model #89
Best ROC_AUC: 0.9852
Best params: OrderedDict([('learning_rate', 0.10955273917171476), ('max_depth', 0), ('min_child_samples', 12), ('n_estimators', 78), ('num_leaves', 100)])

Model #90
Best ROC_AUC: 0.9852
Best params: OrderedDict([('learning_rate', 0.10955273917171476), ('max_depth', 0), ('min_child_samples', 12), ('n_estimators', 78), ('num_leaves', 100)])

Model #91
Best ROC_AUC: 0.9852
Best params: OrderedDict([('learning_rate', 0.10955273917171476), ('max_depth', 0), ('min_child_samples', 12), ('n_estimators', 78), ('num_leaves', 100)])

Model #92
Best ROC_AUC: 0.9852
Best params: OrderedDict([('learning_rate', 0.10955273917171476), ('max_depth', 0), ('min_child_samples', 12), ('n_estimators', 78), ('num_leaves', 100)])

Model #93
Best ROC_AUC: 0.9852
Best params: OrderedDict([('learning_rate', 0.10955273917171476), ('max_depth', 0), ('min_child_samples', 12), ('n_estimators', 78), ('num_leaves', 100)])

Model #94
Best ROC_AUC: 0.9852
Best params: OrderedDict([('learni

EXECUTION TIME : hyperparameter_tuning ran in 328.26s


In [104]:
api_scores_ = score_model_dataset(oh_apis, LGBMClassifier(**lgb_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

EXECUTION TIME : score_model_dataset ran in 2.33s


 0.9483283010704141 ( +/- 0.011791962996857922 )


**Testing CatBoost**

In [None]:
X, y = create_X_y_(oh_apis)

cat_model = CatBoostClassifier(thread_count=2, loss_function='Logloss', od_type = 'Iter', verbose= False)

cat_best_params = hyperparameter_tuning(X, y, 
                                       model = cat_model, 
                                       tuning_params = tuning_params['CatBoost'],
                                       scoring = 'accuracy',
                                       cv_strategy = cv_strategy,
                                       n_settings = 25)

In [104]:
api_scores_ = score_model_dataset(oh_apis, CatBoostClassifier(**cat_best_params))
print(f' {np.mean(api_scores_)} ( +/- {np.std(api_scores_)} )')

EXECUTION TIME : score_model_dataset ran in 2.33s


 0.9483283010704141 ( +/- 0.011791962996857922 )


**Final implementation**

In [33]:
@timed
def perform_nested_cv():
    
    cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

    outer_results = { name : [] for name in tuning_params.keys()}

    #X, y = create_X_y(folder_path, file_name)
    X, y = create_X_y_(oh_api)

    for train_ix, test_ix in cv_outer.split(X) :

        X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

        ensemble_models = get_ensemble_models()

        for name, model in ensemble_models.items():

            best_params, best_model = hyperparameter_tuning(X_train = X_train, 
                                                            y_train = y_train, 
                                                            model = model, 
                                                            tuning_params = tuning_params[name], 
                                                            scoring = 'accuracy', 
                                                            cv_strategy = cv_inner, 
                                                            n_settings = 30,
                                                            verbose = False,
                                                            file_prefix = 'apistats_accuracy_30',
                                                            best_model = True)


            y_pred = best_model.predict(X_test)

            acc = accuracy_score(y_test, y_pred)

            outer_results[name].append(acc)

    pickle_results(outer_results, 'apistats_accuracy_30.pickle')
    
    return outer_results

In [None]:
outer_results = perform_nested_cv()

In [None]:
fig = plot_evaluation_boxplots(list(outer_results.values()), 
                               list(outer_results.keys()), 
                               title = 'Tunned Ensemble Model Performance On One-hot Encoded API Calls Data', 
                               y_axis = 'Accuracy')

figure_path = 'figures/model-training-performance/encoded_apistats_bayes_accuracy_30.html'

save_figures_to_html(figure_path, [fig])