In [1]:
import numpy as np
import pandas as pd
import os

from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, auc, plot_roc_curve
from scipy.interpolate import interpn

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from matplotlib import pyplot as plt
import statistics

In [2]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [3]:
def plotConfusionMatrix(real,prev,dir,model):
    """Image for the confusion matrix
    
    Arguments:
        real {list} -- Real labels
        prev {list} -- Predicted labels
        model {string} -- add the model name to the file name
    """
    #print('>>> Creating figure...')

    fig = plt.figure()
    plt.plot([2,2,4]) #2,3,5
    ax0 = plt.subplot(2, 2, 1) #2,3,1
    ax0 = sns.heatmap(confusion_matrix(real[0],prev[0]), annot=True, cbar = False, cmap=sns.light_palette('#dea369'))
    ax0.set(xlabel='Predicted', ylabel='Real')
    ax0.title.set_text('Confusion Matrix Fold 1')
    ax1 = plt.subplot(2, 2, 2) #2,3,2
    ax1 = sns.heatmap(confusion_matrix(real[1],prev[1]), annot=True, cbar = False, cmap=sns.light_palette('#dea369'))
    ax1.set(xlabel='Predicted', ylabel='Real')
    ax1.title.set_text('Confusion Matrix Fold 2')
    ax2 = plt.subplot(2, 2, 3) #2,3,3
    ax2 = sns.heatmap(confusion_matrix(real[2],prev[2]), annot=True, cbar = False, cmap=sns.light_palette('#dea369'))
    ax2.set(xlabel='Predicted', ylabel='Real')
    ax2.title.set_text('Confusion Matrix Fold 3')
    ax3 = plt.subplot(2, 2, 4) #2,3,4
    ax3 = sns.heatmap(confusion_matrix(real[3],prev[3]), annot=True, cbar = False, cmap=sns.light_palette('#dea369'))
    ax3.set(xlabel='Predicted', ylabel='Real')
    ax3.title.set_text('Confusion Matrix Fold 4')
    
    fig.tight_layout()
    #print('>>> Saving figure...')
    plt.savefig('../../data/figures/{}/matrix_{}.png'.format(dir, model))

In [4]:
def plotRocCurve(X,y, name, dir, names, best=False, plot=False):
    """Image for the ROC curve

    Arguments:
        X {numpy.ndarray} -- Dataset to train
        y {numpy.ndarray} -- Labels for the dataset
        model {string} -- add the model name to the file name
    """
    #print(dir, name)
    skf = StratifiedKFold(n_splits=5, shuffle = True)
    tprs, aucs, real, prev = [], [], [], []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(skf.split(X, y)):
        if name == 'svm': model, p = trainSvm(X[train], y[train], X[test], y[test], best)
        elif name == 'tree': model, p = trainTree(X[train], y[train], X[test], y[test], best)
        elif name == 'log': model, p = trainLog(X[train], y[train], X[test], y[test], best)
        prev.append(p)
        real.append(list(y[test]))
        viz = plot_roc_curve(model, X[test], y[test],
                         name='ROC fold {}'.format(i+1),
                         alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
        
        f = open('auc.txt','a')
        for a in aucs:
            f.write(str(a)+'\n')
        f.close()

    #print('>>> Creating figure...')
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
    ax.legend(loc="lower right")
    if plot==True:
        plt.show()
        print('>>> Saving figure...')
        plt.savefig('../../data/figures/{}/roc_auc_{}.png'.format(dir, name))
        print('Saved in ../../data/figures/{}/roc_auc_{}.png'.format(dir, name))
        plotConfusionMatrix(real,prev, dir, name)
    plt.close()

        

In [5]:
def trainSvm(train_X,train_y,test_X,test_y,best_params):
    """Uses a SVM model to fit the data.

    Arguments:
        train_X {numpy.ndarray} -- 0.8% of the original dataset for training
        train_y {numpy.ndarray} -- labels for the training data
        test_X {numpy.ndarray} -- 0.2% of the original dataset for testing
    """	
    if not best_params:
        #print('>>> Starting grid search...')
        parameters = {'kernel':['linear','rbf','sigmoid'], 
                    'C':[0.25,0.4,0.5,0.55,0.75,1], 
                    'tol':[1e-3,1e-4,1e-5], 
                    'gamma':[25,50,75,100,150,'auto'], 
                    'degree':[1,2,3,5,10]}
        svm_model = SVC()
        grid = GridSearchCV(svm_model, parameters, cv = 5, scoring='f1')
        grid_result = grid.fit(train_X, train_y)
        best_params = grid_result.best_params_
        print(best_params)
    #print('>>> Creating model...')
    svm_model = SVC(kernel=best_params["kernel"],
                    C=best_params["C"], 
                    tol=best_params["tol"], 
                    gamma=best_params["gamma"], 
                    degree=best_params["degree"])
    svm_model = svm_model.fit(train_X, train_y)
    prev = svm_model.predict(test_X)
    
    f = open('accuracy.txt','a')
    f.write(str(accuracy_score(test_y,prev))+'\n')
    f.close()
 
    
    f = open('f1.txt','a')
    f.write(str(f1_score(test_y,prev, average='macro'))+'\n')
    f.close()
    
    return svm_model, list(prev)

In [6]:
def trainLog(train_X,train_y,test_X,test_y,best_params):
    """Uses a Logarithmic model to fit the data.

    Arguments:
        train_X {numpy.ndarray} -- 0.8% of the original dataset for training
        train_y {numpy.ndarray} -- labels for the training data
        test_X {numpy.ndarray} -- 0.2% of the original dataset for testing
    """	
    if not best_params:
        #print('>>> Starting grid search...')
        parameters = {'penalty': ['l1', 'l2'],
                    'C':[0.25,0.4,0.5,0.55,0.75,1], 
                    'tol':[1e-3,1e-4,1e-5],
                    'solver':['liblinear']}
        log_model = LogisticRegression()
        grid = GridSearchCV(log_model, parameters, cv = 5, scoring='f1')
        grid_result = grid.fit(train_X, train_y)
        best_params = grid_result.best_params_
        print(best_params)	
    #print('>>> Creating model...')
    log_model = LogisticRegression(penalty = best_params['penalty'],
                                    C = best_params['C'], 
                                    tol = best_params['tol'],
                                    solver = best_params['solver'])
    log_model = log_model.fit(train_X, train_y)
    prev = list(log_model.predict(test_X))
    f = open('accuracy.txt','a')
    f.write(str(accuracy_score(test_y,prev))+'\n')
    f.close()
 
    
    f = open('f1.txt','a')
    f.write(str(f1_score(test_y,prev, average='macro'))+'\n')
    f.close()
    return log_model, prev


In [7]:
def trainTree(train_X,train_y,test_X,test_y,best_params):
    """Uses a tree model to fit the data.

    Arguments:
        train_X {numpy.ndarray} -- 0.8% of the original dataset for training
        train_y {numpy.ndarray} -- labels for the training data
        test_X {numpy.ndarray} -- 0.2% of the original dataset for testing
    """	
    if not best_params:
        #print('>>> Starting grid search...')
        parameters = {'n_estimators':[20,50,75,100], 
                    'criterion':['entropy','gini'], 
                    'min_samples_leaf':[1,2,3,5,10], 
                    'min_samples_split':[2,4,5,8,10], 
                    'max_leaf_nodes':[2,20,50,75,100]}
        tree_model = ExtraTreesClassifier()
        grid = GridSearchCV(tree_model, parameters, cv = 5, scoring='f1')
        grid_result = grid.fit(train_X, train_y)
        best_params = grid_result.best_params_
        print(best_params)	
    #print('>>> Creating model...')
    tree_model = ExtraTreesClassifier(criterion=best_params["criterion"], 
                max_leaf_nodes=best_params["max_leaf_nodes"], 
                min_samples_leaf=best_params["min_samples_leaf"], 
                min_samples_split=best_params["min_samples_split"], 
                n_estimators=best_params["n_estimators"])
    tree_model = tree_model.fit(train_X, train_y)
    prev = tree_model.predict(test_X)
    f = open('accuracy.txt','a')
    f.write(str(accuracy_score(test_y,prev))+'\n')
    f.close()
 
    
    f = open('f1.txt','a')
    f.write(str(f1_score(test_y,prev, average='macro'))+'\n')
    f.close()
    return tree_model, list(prev)


In [8]:
def classifier(classi, dataset, name, grid=False, plot=False):    
    X=dataset.iloc[:,:-1].values
    y=pd.to_numeric(dataset.iloc[:,-1].values.ravel())
    names=dataset.iloc[:,:-1].columns
    for c in classi:
        if c == 'svm': plotRocCurve(X, y, 'svm', name, names, grid, plot)
        elif c == 'tree': plotRocCurve(X, y, 'tree', name, names, grid, plot)
        elif c == 'log': plotRocCurve(X, y, 'log', name, names, grid, plot)
        else: print('>>> The classifier chosen is not valid!')

In [9]:
def calcMetrics(name, model):
    metrics=['accuracy','f1','auc']
    for m in metrics:
        f = open('{}.txt'.format(m),'r')
        met=[]
        met = f.read().splitlines()
        f.close()
        met = [float(i) for i in met]
        print('{} {} {}'.format(name, model, m), statistics.mean(met), ' +/- ', statistics.stdev(met))
        os.remove('{}.txt'.format(m))

# Grid Search

In [10]:
dataset = pd.read_csv('../../data/datasets/reduced_dataset_risk.csv.gz', compression = 'gzip')
classifier(['svm'], dataset, 'risk')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_pval.csv.gz', compression = 'gzip')
classifier(['svm'], dataset, 'all_pval')
dataset = pd.read_csv('../../data/datasets/top_dataset_pval.csv.gz', compression = 'gzip')
classifier(['svm'], dataset, 'pval')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_network.csv.gz', compression = 'gzip')
classifier(['svm'], dataset, 'all_network')
dataset = pd.read_csv('../../data/datasets/top_dataset_network.csv.gz', compression = 'gzip')
classifier(['svm'], dataset, 'network')

{'C': 0.75, 'degree': 1, 'gamma': 'auto', 'kernel': 'sigmoid', 'tol': 0.001}
{'C': 1, 'degree': 1, 'gamma': 75, 'kernel': 'sigmoid', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 'auto', 'kernel': 'sigmoid', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
{'C': 1, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf', 'tol': 0.001}
{'C': 1, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf', 'tol': 0.001}
{'C': 1, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf', 'tol': 0.001}
{'C': 1, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf', 'tol':

In [11]:
dataset = pd.read_csv('../../data/datasets/reduced_dataset_risk.csv.gz', compression = 'gzip')
classifier(['tree'], dataset, 'risk')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_pval.csv.gz', compression = 'gzip')
classifier(['tree'], dataset, 'all_pval')
dataset = pd.read_csv('../../data/datasets/top_dataset_pval.csv.gz', compression = 'gzip')
classifier(['tree'], dataset, 'pval')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_network.csv.gz', compression = 'gzip')
classifier(['tree'], dataset, 'all_network')
dataset = pd.read_csv('../../data/datasets/top_dataset_network.csv.gz', compression = 'gzip')
classifier(['tree'], dataset, 'network')

{'criterion': 'entropy', 'max_leaf_nodes': 100, 'min_samples_leaf': 10, 'min_samples_split': 4, 'n_estimators': 20}
{'criterion': 'gini', 'max_leaf_nodes': 75, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 20}
{'criterion': 'gini', 'max_leaf_nodes': 100, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 20}
{'criterion': 'entropy', 'max_leaf_nodes': 100, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 20}
{'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 20}
{'criterion': 'gini', 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
{'criterion': 'gini', 'max_leaf_nodes': 100, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
{'criterion': 'entropy', 'max_leaf_nodes': 75, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 20}
{'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 1, 'min_samples_split': 2

In [12]:
dataset = pd.read_csv('../../data/datasets/reduced_dataset_risk.csv.gz', compression = 'gzip')
classifier(['log'], dataset, 'risk')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_pval.csv.gz', compression = 'gzip')
classifier(['log'], dataset, 'all_pval')
dataset = pd.read_csv('../../data/datasets/top_dataset_pval.csv.gz', compression = 'gzip')
classifier(['log'], dataset, 'pval')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_network.csv.gz', compression = 'gzip')
classifier(['log'], dataset, 'all_network')
dataset = pd.read_csv('../../data/datasets/top_dataset_network.csv.gz', compression = 'gzip')
classifier(['log'], dataset, 'network')

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.75, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.5, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.4, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.25, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.001}
{'C': 1, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.0001}
{'C': 1, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.0001}
{'C': 0.55, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.55, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.75, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
{'C': 0.25, 'penalty': '

# Risk Dataset

## SVM

In [13]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_risk.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
for i in range(1000):
    classifier(['svm'], dataset, 'risk', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('risk','svm')

Progress: [####################] 100.0%
risk svm accuracy 0.6428421299554304  +/-  0.08495134974086228
risk svm f1 0.6259111215738928  +/-  0.08837585696857804
risk svm auc 0.6731316177762342  +/-  0.10060498644413439


## TREE

In [14]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_risk.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 50}
for i in range(1000):
    classifier(['tree'], dataset, 'risk', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('risk','tree')

Progress: [####################] 100.0%
risk tree accuracy 0.8817326984126984  +/-  0.05198986977458492
risk tree f1 0.8732956323932237  +/-  0.05676332635558321
risk tree auc 0.9655389146567718  +/-  0.02529392749822451


## LOG

In [15]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_risk.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
for i in range(1000):
    classifier(['log'], dataset, 'risk', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('risk','log')

Progress: [####################] 100.0%
risk log accuracy 0.7355712698412699  +/-  0.07010743274465464
risk log f1 0.7213405574068545  +/-  0.07387970769501197
risk log auc 0.7909953102453102  +/-  0.07500073484839828


# Pval Dataset

## SVM

In [16]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_pval.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
for i in range(1000):
    classifier(['svm'], dataset, 'all_pval', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('pval','svm')

Progress: [####################] 100.0%
pval svm accuracy 0.8484365079365079  +/-  0.05785962758014378
pval svm f1 0.8389795672247028  +/-  0.06191355172435989
pval svm auc 0.9237215728715729  +/-  0.04360888116686274


## TREE

In [17]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_pval.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 50}
for i in range(1000):
    classifier(['tree'], dataset, 'all_pval', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('pval','tree')

Progress: [####################] 100.0%
pval tree accuracy 0.9202925396825397  +/-  0.04444995324086781
pval tree f1 0.9179562926419127  +/-  0.04515256689849303
pval tree auc 0.9801480416408987  +/-  0.018361431439741054


## LOG

In [18]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_pval.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
for i in range(1000):
    classifier(['log'], dataset, 'all_pval', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('pval','log')

Progress: [####################] 100.0%
pval log accuracy 0.8467468253968254  +/-  0.061488865984817044
pval log f1 0.837850921608183  +/-  0.06504052966754922
pval log auc 0.9255630797773655  +/-  0.048364065946515364


# top 25 Pval Dataset

## SVM

In [19]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/top_dataset_pval.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
for i in range(1000):
    classifier(['svm'], dataset, 'pval', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('top_pval','svm')

Progress: [####################] 100.0%
top_pval svm accuracy 0.7862357142857143  +/-  0.058723366712039655
top_pval svm f1 0.7680483039580441  +/-  0.06640412382935759
top_pval svm auc 0.8436645949288807  +/-  0.06764881912703213


## TREE

In [20]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/top_dataset_pval.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 50}
for i in range(1000):
    classifier(['tree'], dataset, 'pval', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('top_pval','tree')

Progress: [####################] 100.0%
top_pval tree accuracy 0.8838312698412698  +/-  0.049810603540462316
top_pval tree f1 0.8823329355823466  +/-  0.04967873685252635
top_pval tree auc 0.9480752009894867  +/-  0.03438855335381645


## LOG

In [21]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/top_dataset_pval.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
for i in range(1000):
    classifier(['log'], dataset, 'pval', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('top_pval','log')

Progress: [####################] 100.0%
top_pval log accuracy 0.7812476190476191  +/-  0.05911068196468217
top_pval log f1 0.7595383784474967  +/-  0.06848332853191773
top_pval log auc 0.8536187487116058  +/-  0.06219496782102092


# Central Dataset

## SVM

In [22]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_network.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
for i in range(1000):
    classifier(['svm'], dataset, 'all_network', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('central','svm')

Progress: [####################] 100.0%
central svm accuracy 0.7657236507936508  +/-  0.06624858862954339
central svm f1 0.7529332547981656  +/-  0.07057365217012622
central svm auc 0.8356863739435167  +/-  0.05601429441310791


## TREE

In [23]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_network.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 50}
for i in range(1000):
    classifier(['tree'], dataset, 'all_network', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('central','tree')

Progress: [####################] 100.0%
central tree accuracy 0.9267109523809524  +/-  0.039235852060522936
central tree f1 0.9198706630399279  +/-  0.044409578591601634
central tree auc 0.9826765615337044  +/-  0.01773383594418541


## LOG

In [24]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/reduced_dataset_network.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
for i in range(1000):
    classifier(['log'], dataset, 'all_network', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('central','log')

Progress: [####################] 100.0%
central log accuracy 0.7925711111111111  +/-  0.06119135232879844
central log f1 0.7818761225050833  +/-  0.06474569254771206
central log auc 0.8703942589156874  +/-  0.0508216757168687


# top 25 Central Dataset

## SVM

In [25]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/top_dataset_network.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.25, 'degree': 1, 'gamma': 25, 'kernel': 'linear', 'tol': 0.001}
for i in range(1000):
    classifier(['svm'], dataset, 'network', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('top_central','svm')

Progress: [####################] 100.0%
top_central svm accuracy 0.9088252380952381  +/-  0.04266284713979564
top_central svm f1 0.9026219805748964  +/-  0.046342393807136055
top_central svm auc 0.9612027107812822  +/-  0.028323120925549186


## TREE

In [26]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/top_dataset_network.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'criterion': 'entropy', 'max_leaf_nodes': 50, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 50}
for i in range(1000):
    classifier(['tree'], dataset, 'network', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('top_central','tree')

Progress: [####################] 100.0%
top_central tree accuracy 0.9278220634920634  +/-  0.04279003020672129
top_central tree f1 0.9214145252598337  +/-  0.04803132456256997
top_central tree auc 0.9882001339929911  +/-  0.014431074443263026


## LOG

In [27]:
print('>>> Loading dataset...')
dataset = pd.read_csv('../../data/datasets/top_dataset_network.csv.gz', compression = 'gzip')
print(dataset.shape)
grid={'C': 0.4, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
for i in range(1000):
    classifier(['log'], dataset, 'network', grid)
    update_progress(i / 1000)

update_progress(1)
calcMetrics('top_central','log')

Progress: [####################] 100.0%
top_central log accuracy 0.9140957142857142  +/-  0.044843361358720114
top_central log f1 0.9094687418978469  +/-  0.04744716604757636
top_central log auc 0.9664025767882911  +/-  0.02583824895028706
