<a href="https://colab.research.google.com/github/Fontoura21/Asteroid-Plus/blob/main/reproducao_artigo_collusion_detection_refactor_busca_otimizacao_parametros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install scikit-optimize, which provides the `skopt` package imported below
# (BayesSearchCV and the Real/Categorical/Integer search-space types).
!pip install scikit-optimize



In [None]:
import os
import sys
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import warnings
from joblib import dump, load
from matplotlib.lines import Line2D
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.gaussian_process import GaussianProcessClassifier
from collections import defaultdict
from scipy import stats
from itertools import cycle
from sklearn.utils import shuffle
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: display options, dataset locations and the
# user-tunable parameters read by the functions defined further down.
# ---------------------------------------------------------------------------

# Font size to plot
default_font_size = 18
plt.rcParams.update({'font.size': default_font_size})

# Format to print
pd.options.display.float_format = '{:,.4f}'.format

# To hide warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Paths and filenames of the datasets.
# The drive is mounted at the absolute location '/content/drive', so the data
# folder is addressed absolutely as well; the previous relative form
# ('../content/drive/...') only resolved when the working directory was '/content'.
path = '/content/drive/MyDrive/Céos/Artigos e Documentos'
db_collusion_brazilian = os.path.join(path, 'DB_Collusion_Brazil_processed.csv')
db_collusion_italian = os.path.join(path, 'DB_Collusion_Italy_processed.csv')
db_collusion_american = os.path.join(path, 'DB_Collusion_America_processed.csv')
db_collusion_switzerland_gr_sg = os.path.join(path, 'DB_Collusion_Switzerland_GR_and_See-Gaster_processed.csv')
db_collusion_switzerland_ticino = os.path.join(path, 'DB_Collusion_Switzerland_Ticino_processed.csv')
db_collusion_japan = os.path.join(path, 'DB_Collusion_Japan_processed.csv')
db_collusion_all = os.path.join(path, 'DB_Collusion_All_processed.csv')

# To save plots (pdf format)
plot_pdf = True

# User's parameters for the functions.
# Alternative algorithm selections (uncomment exactly one assignment):
#ml_algorithms = ['GaussianProcessClassifier', 'SGDClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'AdaBoostClassifier',
#                  'GradientBoostingClassifier', 'SVC', 'KNeighborsClassifier', 'MLPClassifier', 'BernoulliNB', 'GaussianNB', 'LogisticRegression']
#ml_algorithms = ['GradientBoostingClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier']
#ml_algorithms = ['LogisticRegression']
#ml_algorithms = ['MLPClassifier']
ml_algorithms = ['SVC']
screens = ['CV', 'SPD', 'DIFFP', 'RD', 'KURT', 'SKEW', 'KSTEST'] # Screening variables to use. There are seven: CV, SPD, DIFFP, RD, KURT, SKEW and KSTEST
train_size = 0.8 # Test and train sizes. The test_size is 1-train_size
repetitions = 40 # Number of repetitions for each ML algorithm. Minimum value > 30. Recommended value > 100
n_estimators = 300 # Number of estimators for ML algorithms
precision_recall = False # To plot precision-recall curves
load_data = False # To load the error metrics (to load previous data experimentation)
save_data = True # To save the error metrics (to persist the data experimentation)

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# NOTE(review): the dataset CSVs configured via `path` are presumably read from this mount — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def predict_collusion_company(df, dataset, predictors_column_name, targets_column_name, algorithm, train_size, n_estimators=None, seed=None):
    ''' Predict collusion applying the ML algorithm.

    The rows of `df` are split into train and test sets by tender (column 'Tender'),
    the selected classifier is fitted on the train set (wrapped in a BayesSearchCV
    hyper-parameter search for some algorithms) and evaluated on the test set.

    Parameters:
        df: dataframe holding the predictor columns, the target column(s) and a 'Tender' column.
        dataset: dataset name; only used to pick cheaper settings for the datasets listed in `simplify_process`.
        predictors_column_name: list of column names used as predictors.
        targets_column_name: list of column names used as target.
        algorithm: name of the scikit-learn classifier to train (e.g. 'SVC').
        train_size: proportion of tenders assigned to the train set.
        n_estimators: number of estimators for the ensemble algorithms.
        seed: random state forwarded to the splitter, the estimators and the search (None = not reproducible).

    Returns:
        Tuple (accuracy, balanced_accuracy, precision, recall, f1, confusion, y_test, df_predictions).
        The five scores and the normalised confusion matrix are expressed in percent;
        `df_predictions` has one column 'Forecast_collusive_competitor' indexed like `y_test`.

    Raises:
        ValueError: if `algorithm` is not one of the supported names.
    '''
    print("Init Predict!")
    # Datasets to have to simplify the process' time
    simplify_process = ['japan', 'italian', 'switzerland_gr_sg', 'american', 'all']

    # To assign the dataframes
    predictors = df[predictors_column_name]
    targets = df[targets_column_name]

    # We create the training and test sample, both for predictors and for the objective variable, based on the tender group.
    # That is, the bids of a tender either all belong to the train group or the test group. They cannot be divided between both groups.
    # The seed is forwarded so that the split is reproducible when a seed is given.
    gss = GroupShuffleSplit(n_splits=5, train_size=train_size, random_state=seed)
    train_index, test_index = next(gss.split(predictors, targets, groups=df['Tender']))
    # The splitter yields positional indices, hence .iloc (label-based .loc only
    # gives the same rows when the dataframe carries a default RangeIndex).
    x_train = predictors.iloc[train_index]
    y_train = targets.iloc[train_index]
    x_test = predictors.iloc[test_index]
    y_test = targets.iloc[test_index]

    # Scale features (SVC only); the scaler is fitted on the train set alone
    if algorithm == 'SVC':
        scaling = MinMaxScaler(feature_range=(-1,1)).fit(x_train)
        x_train = scaling.transform(x_train)
        x_test = scaling.transform(x_test)

    # Train the model with the selected algorithm
    # NOTE(review): some literal values below ('auto' for max_features, loss='deviance', 'SAMME.R')
    # may be rejected by newer scikit-learn releases — confirm against the installed version.
    if algorithm == 'ExtraTreesClassifier':
        model = ExtraTreesClassifier(n_estimators=n_estimators, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                            max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True,
                            oob_score=True, n_jobs=-1, random_state=seed, verbose=0, warm_start=False, class_weight='balanced', ccp_alpha=0.0, max_samples=None)

        param_space = {
            'n_estimators': (50, 300),
            'max_features': ['auto', 'sqrt', 'log2', None],
            'min_samples_split': (2, 20),
            'min_samples_leaf': (1, 20),
        }

        classifier = BayesSearchCV(model, param_space, cv=3, random_state=seed, n_jobs=-1)
    elif algorithm == 'RandomForestClassifier':
        classifier = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.,
                            max_features=None, max_leaf_nodes=None, min_impurity_decrease=0., bootstrap=True,
                            oob_score=True, n_jobs=-1, random_state=seed, verbose=0, warm_start=False, class_weight='balanced')
    elif algorithm == 'SGDClassifier':
        classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=10000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1,
                            n_jobs=-1, random_state=seed, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,
                            class_weight=None, warm_start=False, average=False)
    elif algorithm == 'AdaBoostClassifier':
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=1.0, algorithm='SAMME.R', random_state=seed)

        param_space = {
            'n_estimators': (50, 300),
            'learning_rate': (0.01, 1.0, 'log-uniform'),
            'algorithm': ['SAMME', 'SAMME.R']
        }

        classifier = BayesSearchCV(model, param_space, n_iter=50, cv=3, random_state=seed, n_jobs=-1)
    elif algorithm == 'GradientBoostingClassifier':
        if dataset in simplify_process:
            learning_rate = 100
            tol = 10
            estimators = int(round(n_estimators / 3))
        else:
            learning_rate = 0.1
            tol = 0.0001
            estimators = n_estimators
        model = GradientBoostingClassifier(loss='deviance', learning_rate=learning_rate, n_estimators=estimators, subsample=1.0, min_samples_split=2, min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0, max_depth=None, min_impurity_decrease=0.0, init=None, random_state=seed, max_features=None, verbose=0,
                            max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=tol, ccp_alpha=0.0)
        param_space = {
            'n_estimators': (50, 200),
            'learning_rate': (0.01, 0.1, 'log-uniform'),
            'max_depth': (3, 10),
        }
        # The estimator and its search space must be handed to the search
        # (they were missing here, which made this branch raise a TypeError).
        classifier = BayesSearchCV(model, param_space, n_iter=50, cv=3, random_state=seed, n_jobs=-1)
    elif algorithm == 'SVC':
        model = SVC(coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200,
                            class_weight='balanced', verbose=False, decision_function_shape='ovr', break_ties=False, random_state=seed)
        param_space = {
            'C': (0.01, 0.1, 1, 10, 100, 1000),
            'degree': Integer(1,6),
            'gamma' : Real(1e-6, 1e+1, prior='log-uniform'),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        }

        classifier = BayesSearchCV(model, param_space, n_iter=32, cv=5, random_state=seed, n_jobs=-1)
    elif algorithm == 'KNeighborsClassifier':
        classifier = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
    elif algorithm == 'MLPClassifier':
        model = MLPClassifier(batch_size='auto', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=seed, tol=0.0001, verbose=0, warm_start=False, momentum=0.9, nesterovs_momentum=True,
                        early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)
        param_space = {
            'hidden_layer_sizes' : (240, 120, 70, 35),
            'activation': ['tanh'],
            'solver': ['adam', 'sgd', 'lbfgs'],
            'alpha': [0.0001, 0.01],
            'learning_rate': ['constant','adaptive'],
        }
        classifier = BayesSearchCV(model, param_space, n_iter=32, cv=5, random_state=seed, n_jobs=-1)
    elif algorithm == 'GaussianNB':
        classifier = GaussianNB(priors=None, var_smoothing=1e-09)
    elif algorithm == 'BernoulliNB':
        classifier = BernoulliNB(alpha=0.5, binarize=0, fit_prior=True, class_prior=None)
    elif algorithm == 'LogisticRegression':
        model = LogisticRegression(dual=False, tol=1e-4, fit_intercept=True, intercept_scaling=1.0, class_weight=None,multi_class='auto', verbose=0, warm_start=False, l1_ratio=None, random_state=seed)
        param_space = {
            'max_iter' : (50, 300),
            'C' : (0.01, 0.5),
            'solver' : ['newton-cg'],
            'penalty' : ('l2', None)
        }
        classifier = BayesSearchCV(model, param_space, n_iter=50, cv=3, random_state=seed, n_jobs=-1)
    elif algorithm == 'GaussianProcessClassifier':
        if dataset in simplify_process:
            max_iter_predict = 5
            n_restarts_optimizer = 2
        else:
            max_iter_predict = 5000
            n_restarts_optimizer = 50
        classifier = GaussianProcessClassifier(kernel=None, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=n_restarts_optimizer, max_iter_predict=max_iter_predict, warm_start=False, copy_X_train=True, random_state=seed,
                        multi_class='one_vs_rest', n_jobs=-1)
    else:
        # Fail early with a clear message instead of an UnboundLocalError at fit time
        raise ValueError(f'Unsupported algorithm: {algorithm!r}')

    # We build the model for the train group
    classifier.fit(x_train, y_train.values.ravel())

    # We predict for the values of the test group
    predictions = classifier.predict(x_test)
    df_predictions = pd.DataFrame(data=predictions, index=y_test.index, columns=['Forecast_collusive_competitor'])

    # To calculate the error metrics for the classification binary model
    accuracy = accuracy_score(y_test, predictions) * 100
    balanced_accuracy = balanced_accuracy_score(y_test, predictions) * 100
    precision = precision_score(y_test, predictions, pos_label=1, average='binary', zero_division=1) * 100 # Ratio of true positives: tp / (tp + fp)
    recall = recall_score(y_test, predictions, pos_label=1, average='binary', zero_division=1) * 100 # Ratio of true positives: tp / (tp + fn)
    f1 = f1_score(y_test, predictions, pos_label=1, average='binary', zero_division=1) * 100 # F1 = 2 * (precision * recall) / (precision + recall)
    confusion = confusion_matrix(y_test, predictions, normalize='all') * 100

    # Only the BayesSearchCV-wrapped algorithms expose best_params_; the plain
    # estimators (RandomForest, SGD, KNN, NB, GaussianProcess) do not.
    if hasattr(classifier, 'best_params_'):
        print(f'Best params for {algorithm}: {classifier.best_params_}')

    return accuracy, balanced_accuracy, precision, recall, f1, confusion, y_test, df_predictions

def shuffle_tenders(df1, seed):
    ''' Shuffle the rows and renumber the tenders.

    The reason is that maybe the colluded tenders are concentrated in some parts
    of the excel (dataframe). The input dataframe is left untouched.

    Parameters:
        df1: dataframe with a 'Tender' column.
        seed: random state forwarded to the shuffle.

    Returns:
        A shuffled copy of `df1` with a fresh 0..n-1 index whose 'Tender' values are
        replaced by consecutive integers starting at 1, numbered in order of first
        appearance after the shuffle. Rows of the same original tender keep sharing a label.
    '''
    df = shuffle(df1.copy(), random_state=seed).reset_index(drop=True)

    # Tenders are compared by their string form, as before.
    tender_as_text = df['Tender'].astype(str)

    # factorize numbers the distinct values by first appearance (0-based) in one pass.
    # This replaces the row-by-row loop that ran a whole-column replace per tender and
    # relied on a chained in-place replace, which does not update `df` under pandas Copy-on-Write.
    df['Tender'] = pd.factorize(tender_as_text)[0] + 1
    return df

def algorithm_comparison(df, dataset, predictors, targets, algorithms, train_size, repetitions, n_estimators, precision_recall=False, load_data=False, save_data=False, seed=None):
    ''' Train and evaluate every algorithm on every predictor setting and record the mean metrics.

    For each setting in `predictors` and each algorithm, the per-run metrics are
    collected (or loaded from a previous experiment), optionally persisted to disk,
    and their means are appended to the module-level dictionaries `accuracy_mean`,
    `balanced_accuracy_mean`, `false_positive_mean`, `false_negative_mean`,
    `precision_mean`, `recall_mean` and `f1_mean` under the key (setting, algorithm).
    Those dictionaries and the module-level `path` must exist before calling.

    Parameters:
        df: dataframe with the dataset; its tenders are shuffled and renumbered first.
        dataset: dataset name (used for the file name and dataset-specific shortcuts).
        predictors: mapping setting name -> list of predictor columns.
        targets: list with the target column name(s).
        algorithms: list of algorithm names understood by predict_collusion_company.
        train_size: proportion of tenders used for training.
        repetitions: nominal number of repetitions; currently only used in the file name (see `loop` below).
        n_estimators: number of estimators for the ensemble algorithms.
        precision_recall: currently unused (the precision-recall plot is disabled).
        load_data: if true, load previously saved metrics instead of training.
        save_data: if true, persist the collected metrics with joblib.
        seed: random state for the tender shuffle, the sub-sampling and the training runs.

    Returns:
        None. Results are written to the module-level dictionaries listed above.
    '''

    df = shuffle_tenders(df, seed)

    for setting in predictors:
        print('')
        print('Generating models for ' + setting)
        accuracy = defaultdict(list)
        balanced_accuracy = defaultdict(list)
        false_positive = defaultdict(list)
        false_negative = defaultdict(list)
        precision = defaultdict(list)
        recall = defaultdict(list)
        f1 = defaultdict(list)
        tenders_test = defaultdict(list)
        tenders_predictions = defaultdict(list)

        # Create namefile
        namefile = dataset + '_ML_algorithms_experimentation_' + setting + '_' + str(repetitions) + 'repetitions'

        if not load_data:
            for algorithm in algorithms:
                print('Training algorithm ' + algorithm)
                df_copy = df.copy()
                if dataset == 'all' and algorithm == 'GaussianProcessClassifier':
                    # Exception: reduce the dataset to be able to compute this dataset and algorithm.
                    # The seed is forwarded so the sub-sample is reproducible.
                    df_copy = df_copy.sample(frac=0.5, random_state=seed).reset_index(drop=True)

                # A single run per call. The former repetition count (`repetitions`, or
                # repetitions / 40 for the slow algorithms) was always overwritten by 1,
                # so that dead computation has been dropped.
                loop = 1
                for i in range(loop):
                    # Forward the seed so each run is reproducible; offset it per repetition
                    # so repeated runs would not all reuse the very same split.
                    run_seed = seed + i if isinstance(seed, int) else seed
                    item_accuracy, item_balanced_accuracy, item_precision, item_recall, item_f1, item_confusion, item_tenders_test, item_tenders_predictions = \
                                predict_collusion_company(df_copy, dataset, predictors[setting], targets, algorithm, train_size, n_estimators, seed=run_seed)
                    accuracy[algorithm].append(item_accuracy)
                    balanced_accuracy[algorithm].append(item_balanced_accuracy)
                    # Off-diagonal cells are only read from a two-column matrix
                    # (it is smaller when the test run contains a single class).
                    if item_confusion.shape[1] == 2:
                        false_positive[algorithm].append(item_confusion[0][1])
                        false_negative[algorithm].append(item_confusion[1][0])
                    else:
                        false_positive[algorithm].append(0)
                        false_negative[algorithm].append(0)
                    precision[algorithm].append(item_precision)
                    recall[algorithm].append(item_recall)
                    f1[algorithm].append(item_f1)
                    tenders_test[algorithm].append(item_tenders_test)
                    tenders_predictions[algorithm].append(item_tenders_predictions)

            # Save dictionaries to persist the data experimentation
            if save_data:
                path_namefile = os.path.join(path, namefile + '.pkl')
                file = [accuracy, balanced_accuracy, false_positive, false_negative, precision, recall, f1, df, tenders_test, tenders_predictions]
                dump(file, path_namefile, compress=6)

        else:
            # To load data.
            # NOTE: joblib.load unpickles the file; only load files produced by this notebook.
            # The loaded dataframe replaces `df` for the remaining settings as well.
            pkl_file = os.path.join(path, namefile + '.pkl')
            [accuracy, balanced_accuracy, false_positive, false_negative, precision, recall, f1, df, tenders_test, tenders_predictions] = load(pkl_file)

        # Append the mean of every metric to the module-level accumulators
        for algorithm in algorithms:
            key = (setting, algorithm)
            accuracy_mean[key].append(np.mean(accuracy[algorithm]))
            balanced_accuracy_mean[key].append(np.mean(balanced_accuracy[algorithm]))
            false_positive_mean[key].append(np.mean(false_positive[algorithm]))
            false_negative_mean[key].append(np.mean(false_negative[algorithm]))
            precision_mean[key].append(np.mean(precision[algorithm]))
            recall_mean[key].append(np.mean(recall[algorithm]))
            f1_mean[key].append(np.mean(f1[algorithm]))

        # Print curve precision vs recall with iso-F1 lines (disabled)
        # if precision_recall:
            # plot_precision_vs_recall(dataset, algorithms, precision, recall, min_f1=0.4, max_f1=0.86, f1_curves=24, min_x_y_lim=0.5, max_x_y_lim=1, namefile=namefile)


def get_dataset(dataset):
    ''' Get the collusive dataset and their fields to use in the ML algorithms.

    Parameters:
        dataset: one of 'brazilian', 'switzerland_gr_sg', 'switzerland_ticino',
                 'italian', 'american', 'japan' or 'all'.

    Returns:
        Tuple (df_collusion, predictors, targets):
        - df_collusion: dataframe read from the CSV file configured for the dataset.
        - predictors: defaultdict(list) mapping a setting name to its predictor columns.
          Every dataset has 'common' and 'common+screens'; every dataset except 'all'
          also has 'all_setting' and 'all_setting+screens'. The '+screens' settings
          append the module-level `screens` list.
        - targets: ['Collusive_competitor'].

    Raises:
        ValueError: if `dataset` is not one of the supported names.
    '''

    # Dataset-specific predictor columns ('all' has no dataset-specific setting)
    all_setting_columns = {
        'brazilian': ['Tender', 'Bid_value', 'Pre-Tender Estimate (PTE)', 'Difference Bid/PTE', 'Site', 'Date', 'Brazilian State', 'Winner', 'Number_bids'],
        'switzerland_gr_sg': ['Tender', 'Bid_value', 'Contract_type', 'Date', 'Winner', 'Number_bids'],
        'switzerland_ticino': ['Tender', 'Bid_value', 'Consortium', 'Winner', 'Number_bids'],
        'italian': ['Tender', 'Bid_value', 'Pre-Tender Estimate (PTE)', 'Difference Bid/PTE', 'Site', 'Capital', 'Legal_entity_type', 'Winner', 'Number_bids'],
        'american': ['Tender', 'Bid_value', 'Bid_value_without_inflation', 'Bid_value_inflation_raw_milk_price_adjusted_bid', 'Date', 'Winner', 'Number_bids'],
        'japan': ['Tender', 'Bid_value', 'Pre-Tender Estimate (PTE)', 'Difference Bid/PTE', 'Site', 'Date', 'Winner', 'Number_bids'],
    }

    # Predictor columns of the 'common' setting of each dataset
    common_columns = {
        'brazilian': ['Tender', 'Bid_value', 'Winner', 'Date', 'Number_bids'],
        'switzerland_gr_sg': ['Tender', 'Bid_value', 'Winner', 'Date', 'Number_bids'],
        'switzerland_ticino': ['Tender', 'Bid_value', 'Winner', 'Number_bids'],
        'italian': ['Tender', 'Bid_value', 'Winner', 'Number_bids'],
        'american': ['Tender', 'Bid_value', 'Winner', 'Date', 'Number_bids'],
        'japan': ['Tender', 'Bid_value', 'Winner', 'Date', 'Number_bids'],
        'all': ['Tender', 'Bid_value', 'Winner', 'Number_bids', 'Dataset'],
    }

    # Reject unknown names up front; previously they surfaced as an
    # UnboundLocalError on `df_collusion` at the return statement.
    if dataset not in common_columns:
        raise ValueError('Unknown dataset {!r}. Expected one of: {}'.format(dataset, ', '.join(common_columns)))

    # CSV file of each dataset (module-level paths from the configuration cell)
    csv_files = {
        'brazilian': db_collusion_brazilian,
        'switzerland_gr_sg': db_collusion_switzerland_gr_sg,
        'switzerland_ticino': db_collusion_switzerland_ticino,
        'italian': db_collusion_italian,
        'american': db_collusion_american,
        'japan': db_collusion_japan,
        'all': db_collusion_all,
    }
    df_collusion = pd.read_csv(csv_files[dataset], header=0)

    # Insertion order matters: callers iterate the settings in this order
    predictors = defaultdict(list)
    if dataset in all_setting_columns:
        predictors['all_setting'] = all_setting_columns[dataset]
        predictors['all_setting+screens'] = predictors['all_setting'] + screens
    predictors['common'] = common_columns[dataset]
    predictors['common+screens'] = predictors['common'] + screens

    # Output fields of the datasets to the ML algorithms.
    targets = ['Collusive_competitor']

    return df_collusion, predictors, targets


In [None]:
# Dataset chosen by the user for this analysis
dataset = 'brazilian'

# 1. Load the processed dataset together with its predictor settings and target column,
#    ready to use with the ML algorithms
df_collusion, predictors, targets = get_dataset(dataset)

# Seeds 1 to 11
seeds = list(range(1, 12))

In [None]:
# 7. Execute algorithm comparison and print table comparison

# Accumulators keyed by (setting, algorithm); algorithm_comparison appends the mean metrics to these
(accuracy_mean, balanced_accuracy_mean, false_positive_mean, false_negative_mean,
 precision_mean, recall_mean, f1_mean) = (defaultdict(list) for _ in range(7))

# Matching containers for the standard deviations
(accuracy_std, balanced_accuracy_std, false_positive_std, false_negative_std,
 precision_std, recall_std, f1_std) = (defaultdict(list) for _ in range(7))

# Single run with seed=1
algorithm_comparison(
    df_collusion, dataset, predictors, targets, ml_algorithms,
    train_size, repetitions, n_estimators,
    precision_recall, load_data, save_data,
    seed=1,
)


Generating models for all_setting
Training algorithm SVC
Init Predict!
Best params for SVC: OrderedDict([('C', 10.0), ('degree', 6), ('gamma', 0.42243034366179416), ('kernel', 'poly')])

Generating models for all_setting+screens
Training algorithm SVC
Init Predict!
Best params for SVC: OrderedDict([('C', 1000.0), ('degree', 1), ('gamma', 0.032521397638469306), ('kernel', 'rbf')])

Generating models for common
Training algorithm SVC
Init Predict!
Best params for SVC: OrderedDict([('C', 1000.0), ('degree', 5), ('gamma', 9.01566090470945), ('kernel', 'rbf')])

Generating models for common+screens
Training algorithm SVC
Init Predict!
Best params for SVC: OrderedDict([('C', 1000.0), ('degree', 4), ('gamma', 4.0710328666530895), ('kernel', 'poly')])




```
Generating models for all_setting
Training algorithm GradientBoostingClassifier
Init Predict!
Best params for GradientBoostingClassifier: OrderedDict([('learning_rate', 0.09991742700717861), ('max_depth', 4), ('n_estimators', 170)])
Training algorithm ExtraTreesClassifier
Init Predict!
Best params for ExtraTreesClassifier: OrderedDict([('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 300)])
Training algorithm AdaBoostClassifier
Init Predict!
Best params for AdaBoostClassifier: OrderedDict([('algorithm', 'SAMME'), ('learning_rate', 1.0), ('n_estimators', 300)])

Generating models for all_setting+screens
Training algorithm GradientBoostingClassifier
Init Predict!
Best params for GradientBoostingClassifier: OrderedDict([('learning_rate', 0.09126660627584679), ('max_depth', 10), ('n_estimators', 111)])
Training algorithm ExtraTreesClassifier
Init Predict!
Best params for ExtraTreesClassifier: OrderedDict([('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 300)])
Training algorithm AdaBoostClassifier
Init Predict!
Best params for AdaBoostClassifier: OrderedDict([('algorithm', 'SAMME.R'), ('learning_rate', 0.6104443013206723), ('n_estimators', 290)])

Generating models for common
Training algorithm GradientBoostingClassifier
Init Predict!
Best params for GradientBoostingClassifier: OrderedDict([('learning_rate', 0.1), ('max_depth', 4), ('n_estimators', 181)])
Training algorithm ExtraTreesClassifier
Init Predict!
Best params for ExtraTreesClassifier: OrderedDict([('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 65)])
Training algorithm AdaBoostClassifier
Init Predict!
Best params for AdaBoostClassifier: OrderedDict([('algorithm', 'SAMME.R'), ('learning_rate', 0.2850791538944987), ('n_estimators', 191)])

Generating models for common+screens
Training algorithm GradientBoostingClassifier
Init Predict!
Best params for GradientBoostingClassifier: OrderedDict([('learning_rate', 0.06881923355419943), ('max_depth', 3), ('n_estimators', 126)])
Training algorithm ExtraTreesClassifier
Init Predict!
Best params for ExtraTreesClassifier: OrderedDict([('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 300)])
Training algorithm AdaBoostClassifier
Init Predict!
Best params for AdaBoostClassifier: OrderedDict([('algorithm', 'SAMME'), ('learning_rate', 1.0), ('n_estimators', 130)])
```



In [None]:
# Collapse the list stored under each (setting, algorithm) key into its
# standard deviation (kept in the *_std dict) and its mean (overwriting the
# list in the *_mean dict).
keys = accuracy_mean.keys()

mean_and_std_dicts = [
    (accuracy_mean, accuracy_std),
    (balanced_accuracy_mean, balanced_accuracy_std),
    (false_positive_mean, false_positive_std),
    (false_negative_mean, false_negative_std),
    (precision_mean, precision_std),
    (recall_mean, recall_std),
    (f1_mean, f1_std),
]

for key in keys:
    for mean_by_key, std_by_key in mean_and_std_dicts:
        collected = mean_by_key[key]
        # Take the std before the list is replaced by its mean
        std_by_key[key] = np.std(collected)
        mean_by_key[key] = np.mean(collected)

In [None]:
import json

# Metric dictionaries to export, in output order: the means first, then the standard deviations
metrics_to_export = {
    'accuracy_mean': accuracy_mean,
    'balanced_accuracy_mean': balanced_accuracy_mean,
    'false_positive_mean': false_positive_mean,
    'false_negative_mean': false_negative_mean,
    'precision_mean': precision_mean,
    'recall_mean': recall_mean,
    'f1_mean': f1_mean,
    'accuracy_std': accuracy_std,
    'balanced_accuracy_std': balanced_accuracy_std,
    'false_positive_std': false_positive_std,
    'false_negative_std': false_negative_std,
    'precision_std': precision_std,
    'recall_std': recall_std,
    'f1_std': f1_std,
}

# JSON object keys must be strings, so each (setting, algorithm) tuple key is joined as "setting:algorithm"
results_json = defaultdict(dict)
for metric_name, metric_values in metrics_to_export.items():
    results_json[metric_name] = {
        ":".join(map(str, pair)): number for pair, number in metric_values.items()
    }

json_data = json.dumps(results_json, indent=4)

print(json_data)

{
    "accuracy_mean": {
        "all_setting:SVC": 84.12698412698413,
        "all_setting+screens:SVC": 82.20338983050848,
        "common:SVC": 78.37837837837837,
        "common+screens:SVC": 83.33333333333334
    },
    "balanced_accuracy_mean": {
        "all_setting:SVC": 64.20454545454545,
        "all_setting+screens:SVC": 78.95153626860944,
        "common:SVC": 74.56073338426279,
        "common+screens:SVC": 90.07633587786259
    },
    "false_positive_mean": {
        "all_setting:SVC": 7.936507936507936,
        "all_setting+screens:SVC": 6.779661016949152,
        "common:SVC": 10.81081081081081,
        "common+screens:SVC": 16.666666666666664
    },
    "false_negative_mean": {
        "all_setting:SVC": 7.936507936507936,
        "all_setting+screens:SVC": 11.016949152542372,
        "common:SVC": 10.81081081081081,
        "common+screens:SVC": 0.0
    },
    "precision_mean": {
        "all_setting:SVC": 37.5,
        "all_setting+screens:SVC": 77.77777777777779,
  



```
{
    "accuracy_mean": {
        "all_setting:GradientBoostingClassifier": 70.29702970297029,
        "all_setting:ExtraTreesClassifier": 91.0828025477707,
        "all_setting:AdaBoostClassifier": 84.9624060150376,
        "all_setting+screens:GradientBoostingClassifier": 97.91666666666666,
        "all_setting+screens:ExtraTreesClassifier": 93.71069182389937,
        "all_setting+screens:AdaBoostClassifier": 85.41666666666666,
        "common:GradientBoostingClassifier": 88.7218045112782,
        "common:ExtraTreesClassifier": 90.08264462809917,
        "common:AdaBoostClassifier": 92.90780141843972,
        "common+screens:GradientBoostingClassifier": 97.95918367346938,
        "common+screens:ExtraTreesClassifier": 86.22754491017965,
        "common+screens:AdaBoostClassifier": 90.19607843137256
    },
    "balanced_accuracy_mean": {
        "all_setting:GradientBoostingClassifier": 65.9090909090909,
        "all_setting:ExtraTreesClassifier": 94.85294117647058,
        "all_setting:AdaBoostClassifier": 58.7474645030426,
        "all_setting+screens:GradientBoostingClassifier": 98.79032258064517,
        "all_setting+screens:ExtraTreesClassifier": 75.0,
        "all_setting+screens:AdaBoostClassifier": 85.87719298245614,
        "common:GradientBoostingClassifier": 82.26950354609929,
        "common:ExtraTreesClassifier": 83.33333333333333,
        "common:AdaBoostClassifier": 83.87096774193547,
        "common+screens:GradientBoostingClassifier": 97.01963534361852,
        "common+screens:ExtraTreesClassifier": 80.33681214421253,
        "common+screens:AdaBoostClassifier": 73.21428571428572
    },
    "false_positive_mean": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 8.9171974522293,
        "all_setting:AdaBoostClassifier": 5.263157894736842,
        "all_setting+screens:GradientBoostingClassifier": 2.083333333333333,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 11.805555555555555,
        "common:GradientBoostingClassifier": 1.5037593984962405,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 1.3605442176870748,
        "common+screens:ExtraTreesClassifier": 8.383233532934131,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "false_negative_mean": {
        "all_setting:GradientBoostingClassifier": 29.7029702970297,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 9.774436090225564,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 6.289308176100629,
        "all_setting+screens:AdaBoostClassifier": 2.7777777777777777,
        "common:GradientBoostingClassifier": 9.774436090225564,
        "common:ExtraTreesClassifier": 9.917355371900827,
        "common:AdaBoostClassifier": 7.092198581560284,
        "common+screens:GradientBoostingClassifier": 0.6802721088435374,
        "common+screens:ExtraTreesClassifier": 5.389221556886228,
        "common+screens:AdaBoostClassifier": 9.803921568627452
    },
    "precision_mean": {
        "all_setting:GradientBoostingClassifier": 100.0,
        "all_setting:ExtraTreesClassifier": 60.0,
        "all_setting:AdaBoostClassifier": 36.36363636363637,
        "all_setting+screens:GradientBoostingClassifier": 86.95652173913044,
        "all_setting+screens:ExtraTreesClassifier": 100.0,
        "all_setting+screens:AdaBoostClassifier": 60.46511627906976,
        "common:GradientBoostingClassifier": 92.85714285714286,
        "common:ExtraTreesClassifier": 100.0,
        "common:AdaBoostClassifier": 100.0,
        "common+screens:GradientBoostingClassifier": 91.66666666666666,
        "common+screens:ExtraTreesClassifier": 61.111111111111114,
        "common+screens:AdaBoostClassifier": 100.0
    },
    "recall_mean": {
        "all_setting:GradientBoostingClassifier": 31.818181818181817,
        "all_setting:ExtraTreesClassifier": 100.0,
        "all_setting:AdaBoostClassifier": 23.52941176470588,
        "all_setting+screens:GradientBoostingClassifier": 100.0,
        "all_setting+screens:ExtraTreesClassifier": 50.0,
        "all_setting+screens:AdaBoostClassifier": 86.66666666666667,
        "common:GradientBoostingClassifier": 66.66666666666666,
        "common:ExtraTreesClassifier": 66.66666666666666,
        "common:AdaBoostClassifier": 67.74193548387096,
        "common+screens:GradientBoostingClassifier": 95.65217391304348,
        "common+screens:ExtraTreesClassifier": 70.96774193548387,
        "common+screens:AdaBoostClassifier": 46.42857142857143
    },
    "f1_mean": {
        "all_setting:GradientBoostingClassifier": 48.275862068965516,
        "all_setting:ExtraTreesClassifier": 74.99999999999999,
        "all_setting:AdaBoostClassifier": 28.57142857142857,
        "all_setting+screens:GradientBoostingClassifier": 93.02325581395348,
        "all_setting+screens:ExtraTreesClassifier": 66.66666666666666,
        "all_setting+screens:AdaBoostClassifier": 71.23287671232877,
        "common:GradientBoostingClassifier": 77.61194029850746,
        "common:ExtraTreesClassifier": 80.0,
        "common:AdaBoostClassifier": 80.76923076923077,
        "common+screens:GradientBoostingClassifier": 93.61702127659574,
        "common+screens:ExtraTreesClassifier": 65.67164179104478,
        "common+screens:AdaBoostClassifier": 63.41463414634146
    },
    "accuracy_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "balanced_accuracy_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "false_positive_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "false_negative_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "precision_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "recall_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    },
    "f1_std": {
        "all_setting:GradientBoostingClassifier": 0.0,
        "all_setting:ExtraTreesClassifier": 0.0,
        "all_setting:AdaBoostClassifier": 0.0,
        "all_setting+screens:GradientBoostingClassifier": 0.0,
        "all_setting+screens:ExtraTreesClassifier": 0.0,
        "all_setting+screens:AdaBoostClassifier": 0.0,
        "common:GradientBoostingClassifier": 0.0,
        "common:ExtraTreesClassifier": 0.0,
        "common:AdaBoostClassifier": 0.0,
        "common+screens:GradientBoostingClassifier": 0.0,
        "common+screens:ExtraTreesClassifier": 0.0,
        "common+screens:AdaBoostClassifier": 0.0
    }
}
```

