In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone


from pomegranate import *
from keras_performance_metrics import *

from collections import Counter
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

from threading import Thread
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from MDLP import MDLP_Discretizer
import entropy_based_binning as ebb
from sklearn.naive_bayes import MultinomialNB

#Functions
def multiclass_roc_auc_score(truth, pred):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)
    pred = lb.transform(pred)

    return roc_auc_score(truth, pred)

In [16]:
def cv_classifiers(xtrain, xtest, clf):
    wc_class_reports, wc_confmatrices, wc_auc_scores, wc_recall_scores, wc_precision_scores, wc_f1_scores, wc_accuracy_scores =[],[],[],[],[],[],[]
    ytrain = xtrain['class']
    xtrain = xtrain.drop(columns = 'class')
    trainarr = np.array(xtrain)
    ytest =  xtest['class'] 
    xtest = xtest.drop(columns = 'class')
    testarr = np.array(xtest)
    clf.fit(trainarr,ytrain)
    wc_predicted = clf.predict(testarr)
    wc_predicted = np.array( wc_predicted, dtype = "int64")
    y_true = np.array(ytest, dtype = "int64")

    print("Model Evaluation")
    _precision, _recall, _fscore, _support = precision_recall_fscore_support(y_true, wc_predicted)
    _accuracy_score = balanced_accuracy_score(y_true, wc_predicted)
    _auc_score = multiclass_roc_auc_score(y_true, wc_predicted)
    _conf_matrix = confusion_matrix(y_true, wc_predicted)
    print( classification_report(y_true, wc_predicted))
   
    
    print("Accuracy exact: %.3f" % (_accuracy_score))
    print("AUC: %.3f\n" % (_auc_score))
    

In [20]:
def genetic_feature_selection(train_x, train_y, task, model, folds, eval_metric, categorical_features = [], n_classes = 2, sparse = False, iterations = 100, generation_size = 20, generation_best_ratio = 10, mutation_prob = 0.05, stopping_rounds = -1, epochs = None, average_epochs = None, missing = np.nan, verbose = False):
    """genetic algorithm for features selection
    
    all features are encoded  as binary vector
    Parameters
    ----------
    train_x : pandas.DataDrame or numpy.ndarray
        train dataset
    train_y : np.ndarray
        target
    task : string, 'regression' or 'binary_classification' or 'multiclass_classification'
        task to solve
    model : Model
        Model  instance
    folds : list of pairs of lists
        indices of train and validation folds
        folds[i][0] - train indices in i-th train-val split
        folds[i][1] - validation indices in i-th train-val split
    eval_metric : string, possible variants: 'mse', rmse', 'auc', 'logloss', 'mlogloss', 'error', 'merror' 
        eval_metric for model
    categorical_features : list of strings or lists of integers, optional
        column names (if train is Pandas DataFrame) or column indices (if train is Numpy array) of categorical features
        [] by default
    n_classes : integer, optional
        number of classes in case of classification task
        2 by default
    sparse : boolean, optional
        whether train is sparse Scipy matrix
        False by default
    iterations : integer, optional
        number of iterations in genetic algorithm
        100 by default
    generation_size : integer, optional
        number of objects in one generation
        20 by default
    generation_best_ratio : integer, optional
        number of best objects to survive to next generation
        10 by default
    generation_best_ratio : float, optional
        mutation probability
        0.05 by default
    stopping rounds : integer, optional
        number of early stopping rounds (or epochs for neral nets) in CV evaluations, -1 means for fixed number of rounds or epochs
        -1 by default
    epochs : integer, optional
        number of epochs in case of neural network model
        10 by default
    average_epochs : integer, optional
        number of last epochs, predictions in which are averaged, in case of neural network model
        -1 by default
    missing : integer or np.nan
        missing values for xgboost models
        np.nan by default
    verbose : , optional
        whether to print running info
        False by default
    Returns 
    -------
    list of best features, best CV score  (CV score with selected features)
    """
    
    np.random.seed(1)

    n_features = train_x.shape[1]

    if isinstance(train_x, pd.DataFrame):
        train_values = train_x.values
        features_list = train_x.columns.values
    else:
        features_list = np.arange(n_features)
        train_values = train_x

    def score_features(features_sample):
            
        features_ind = np.where(features_sample == 1)[0]

        if model.type == 'xgboost':
            return cross_validation.CV_score_xgb(train_values[:, features_ind], train_y, model.params, eval_metric, folds, sparse = sparse, stopping_rounds = stopping_rounds, missing = missing, verbose = False)
        
        elif model.type == 'lightgbm':
            return cross_validation.CV_score_lgbm(train_values[:, features_ind], train_y, model.params, categorical_features, eval_metric, folds, stopping_rounds = stopping_rounds, verbose = False)

        elif model.type == 'keras':
            return cross_validation.CV_score_keras(train_values[:, features_ind], train_y, task, model.params, eval_metric, folds, n_classes = n_classes, stopping_rounds = stopping_rounds, epochs = epochs, average_epochs = average_epochs, verbose = False)

        elif model.type == 'sklearn':
            return cross_validation.CV_score_sklearn(train_values[:, features_ind], train_y, model, eval_metric, folds, verbose = False)

    generation = np.zeros((generation_size, n_features), dtype = np.int16)
    generation_scores = np.zeros((generation_size, ), dtype = np.float32)

    def random_sample():
        sample = np.zeros((n_features, ), dtype = np.int16)
        for i in range(n_features):
            if np.random.uniform(0, 1) < 0.5:
                sample[i] = 1
        return sample

    def crossover(parent_a, parent_b):
        child = np.copy(parent_a)
        for i in range(n_features):
            if np.random.uniform(0, 1) < 0.5:
                child[i] = parent_b[i]
        return child

    def mutation(sample):
        mutated_sample = np.copy(sample)
        for i in range(n_features):
            if np.random.uniform(0, 1) < mutation_prob:
                mutated_sample[i] ^= 1
        return mutated_sample

    def sort_generation(generation, generation_scores):
        
        sort_ind = np.argsort(generation_scores)
        if eval_metric == 'auc':
            sort_ind = sort_ind[::-1]

        generation = generation[sort_ind]
        generation_scores = generation_scores[sort_ind]

        return generation, generation_scores

    for i in range(generation_size):
        generation[i] = random_sample()
        generation_scores[i] = score_features(generation[i])

    for iter in range(iterations):

        generation, generation_scores = sort_generation(generation, generation_scores)
        if verbose:
            print iter, generation_scores
            print 'cur best: ', features_list[np.where(generation[0] == 1)[0]]

        for i in range(generation_best_ratio, generation_size, 1):
            parent_a = generation[np.random.randint(0, generation_best_ratio)]
            parent_b = generation[np.random.randint(0, generation_best_ratio)]
            child = crossover(parent_a, parent_b)
            mutated_child = mutation(child)
            generation[i] = np.copy(mutated_child)
            generation_scores[i] = score_features(generation[i])
            if verbose:
                print i, generation_scores[i]

    sort_generation(generation, generation_scores)
    best_features, best_score = generation[0], generation_scores[0]     
    best_features = features_list[np.where(best_features == 1)[0]]
    
    if verbose:
        print 'best features set: ', best_features
        print 'best score: ', best_score

    return best_features, best_score

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(iter, generation_scores)? (<ipython-input-20-4ecd07ff0335>, line 132)

In [38]:
#Discretization into smaller number of categories
#df = top_20_features_df
binned_data = []
discrete_data = []
df = pd.read_csv("datafiles/content_features2.csv")

for val in range(0,len(df.columns)):
    if(df.iloc[:,val].nunique() > 10):#only applied to discrete data with more than 5 values and continuous data
        [df.iloc[:,val], bins] = pd.qcut(df.iloc[:,val], 10, duplicates = 'drop', retbins = True)
        binned_data.append(bins)# store the bins used for each column (not the labels)


   sf_total_number_of_sentences  sf_total_number_of_words  \
0                            18                       224   
1                            48                       836   
2                            29                       435   
3                            50                       534   
4                             7                        90   

   sf_total_number_of_characters  sf_total_number_of_begin_upper  \
0                           1100                              47   
1                           4448                             208   
2                           2073                             104   
3                           2135                             116   
4                            448                              30   

   sf_total_number_of_begin_lower  sf_total_number_of_all_caps  \
0                             183                            4   
1                             797                           21   
2                         

In [4]:
#No of values of each variable
dfentr = pd.read_csv("datafiles/sample_featuresENTRbins.csv")
for val in range(0,len(dfentr.columns)):
   print(dfentr.iloc[:,val].nunique(), end =' ')     

4 4 2 4 3 2 4 4 4 4 4 4 4 4 4 4 4 3 4 2 

In [9]:
#Sample
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#top_20_features_discdf = pd.read_csv('datafiles/top_features.csv')
df_class0 = top_20_features_discdf.loc[top_20_features_discdf['class'] == 0]
df_class1 = top_20_features_discdf.loc[top_20_features_discdf['class'] == 1]
df0 = df_class0.sample(n = 5000)
df1 = df_class1.sample(n = 5000)
df2 = pd.concat([df0, df1])
df2.to_csv('datafiles/sample_featuresENTRbins.csv')

In [6]:
#Feature Selection
import pandas as pd
import numpy as np
from feature_selector import FeatureSelector

np.random.seed(0)
np.set_printoptions(suppress=True)
discdf =  dfentr
train_labels = discdf['class']
train = discdf.drop(columns = ['class'])
fs = FeatureSelector(data = train, labels = train_labels)
fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})
top_20_features_df = discdf[fs.feature_importances.iloc[:20]["feature"]]
top_20_features_discdf = pd.concat([top_20_features_df, train_labels], axis = 1)

top_20_features_discdf.to_csv('datafiles/top_features2.csv')

0 features with greater than 0.60 missing values.

21 features with a single unique value.

2 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0117262	valid_0's auc: 0.999774
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0114541	valid_0's auc: 0.99977
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0106496	valid_0's auc: 0.999829
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0117003	valid_0's auc: 0.999753
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iterat

In [5]:
train_y = discdf['class']
train_x = discdf.drop(columns = ['class'])
genetic_feature_selection(train_x, train_y, binary_classification, model, folds, eval_metric, categorical_features = [], n_classes = 2, sparse = False, iterations = 100, generation_size = 20, generation_best_ratio = 10, mutation_prob = 0.05, stopping_rounds = -1, epochs = None, average_epochs = None, missing = np.nan, verbose = False):

In [12]:
from pomegranate import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import *
import pandas as pd
import numpy as np

seed = 11

df3 = pd.read_csv('datafiles/sample_featuresENTRbins.csv')
#df3 = df2.reset_index(drop=True, inplace = True)

wc_class_reports, wc_confmatrices, wc_auc_scores, wc_recall_scores, wc_precision_scores, wc_f1_scores, wc_accuracy_scores =[],[],[],[],[],[],[]
kfold = StratifiedKFold(n_splits = 10, shuffle = False, random_state = seed)
folds = []
for train_index, test_index in kfold.split(df3,df3['class']):

    xtrain = df3.loc[train_index,:]
    xtest = df3.loc[test_index,:]
    ytrain = xtrain['class']
    ytest = xtest['class']
   

    xtest['class']= np.nan
    trainarr = np.array(xtrain)
    testarr = np.array(xtest)
  
    bn_model = BayesianNetwork.from_samples(trainarr, algorithm='greedy',max_parents = 4, n_jobs=-1)
    genetic_feature_selection(xtrain, ytrain, bn_model, folds, task = 'binary_classification',eval_metric = 'auc', categorical_features = list(xtrain.columns) , n_classes = 2, sparse = False, iterations = 100, generation_size = 20, generation_best_ratio = 10, mutation_prob = 0.05, stopping_rounds = -1)
    print("Inference process\n")
    
    #, folds), eval_metric = 'auc', categorical_features = list(xtrain.columns) , n_classes = 2, sparse = False, iterations = 100, generation_size = 20, generation_best_ratio = 10, mutation_prob = 0.05, stopping_rounds = -1)
    print("Inference process\n")
    print(bn_model.structure)
    bn_model.fit(trainarr, n_jobs=-1)   
       # xtest['class'] = np.nan
    wc_predicted = model.predict(testarr, n_jobs=-1)
    wc_predicted = np.array( wc_predicted, dtype = "int64")

    wc_predicted = wc_predicted[:,len(xtrain.columns)-1]
    wc_predicted = wc_predicted.reshape(-1)
    y_true = np.array(classval, dtype = "int64")
    y_true = y_true.reshape(-1)
    _precision, _recall, _fscore, _support = precision_recall_fscore_support(y_true, wc_predicted)
    _accuracy_score = accuracy_score(y_true, wc_predicted)
    _auc_score =roc_auc_score(y_true, wc_predicted)
    _conf_matrix = confusion_matrix(y_true, wc_predicted)
    _class_report = classification_report(y_true, wc_predicted)
    wc_accuracy_scores.append(_accuracy_score)
    wc_f1_scores.append(_fscore)
    wc_precision_scores.append(_precision)
    wc_recall_scores.append(_recall)
    wc_auc_scores.append(_auc_score)
    wc_confmatrices.append(_conf_matrix)
    wc_class_reports.append(_class_report)

print("Accuracy exact: %.3f" % (np.mean(wc_accuracy_scores)))
print("Precision: %.3f" % (np.mean(wc_precision_scores)))
print("Recall: %.3f" % (np.mean(wc_recall_scores)))
print("F1 Score: %.3f" % (np.mean(wc_f1_scores)))
print("AUC: %.3f\n" % (np.mean(wc_auc_scores)))
    

NameError: name 'genetic_feature_selection' is not defined

In [12]:
# BN  with GA
from pomegranate import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df3 = pd.read_csv('datafiles/sample_featuresENTRbins.csv')
np.random.seed(0)
np.set_printoptions(suppress=True)
seed = 11

wc_class_reports, wc_confmatrices, wc_auc_scores, wc_recall_scores, wc_precision_scores, wc_f1_scores, wc_accuracy_scores =[],[],[],[],[],[],[]
xtrain, xtest, ytrain, ytest = train_test_split(df3, df3['class'], test_size=0.3, random_state=42)

count = 0
xtrain = pd.read_csv('datafiles/train_dataset.csv')
xtest = pd.read_csv('datafiles/test_dataset.csv')
classval =  xtest['class']
xtest['class']= np.nan
trainarr = np.array(xtrain)
testarr = np.array(xtest) 
tub = ((),(0,2,19,),(0,),(0,5,),(0,19,),(0,2,),(0,2,3,),(2,6,),(0,2,18,),(6,7,),(6,9,),(2,6,),(3,6,),(1,5,9,),(0,9,),(1,8,16,),(0,8,),(0,2,9,),(7,9,),(0,2,5,7,))
model = BayesianNetwork.from_structure(xtrain,tub)
model.fit(trainarr, n_jobs=-1)   
   # xtest['class'] = np.nan
wc_predicted = model.predict(testarr, n_jobs=-1)
wc_predicted = np.array( wc_predicted, dtype = "int64")
wc_predicted = wc_predicted[:,len(xtrain.columns)-1]
wc_predicted = wc_predicted.reshape(-1)
y_true = np.array(classval, dtype = "int64")
y_true = y_true.reshape(-1)
_precision, _recall, _fscore, _support = precision_recall_fscore_support(y_true, wc_predicted)
_accuracy_score = accuracy_score(y_true, wc_predicted)
_auc_score = roc_auc_score(y_true, wc_predicted)
_conf_matrix = confusion_matrix(y_true, wc_predicted)
_class_report = classification_report(y_true, wc_predicted)
wc_accuracy_scores.append(_accuracy_score)
wc_f1_scores.append(_fscore)
wc_precision_scores.append(_precision)
wc_recall_scores.append(_recall)
wc_auc_scores.append(_auc_score)
wc_confmatrices.append(_conf_matrix)
wc_class_reports.append(_class_report)    
    
print("Accuracy exact: %.3f" % (np.mean(wc_accuracy_scores)))
print("Precision: %.3f" % (np.mean(wc_precision_scores)))
print("Recall: %.3f" % (np.mean(wc_recall_scores)))
print("F1 Score: %.3f" % (np.mean(wc_f1_scores)))
print("AUC: %.3f\n" % (np.mean(wc_auc_scores)))
  

Accuracy exact: 0.858
Precision: 0.859
Recall: 0.857
F1 Score: 0.858
AUC: 0.857



In [19]:
# All classifiers except BN
from sklearn.model_selection import train_test_split
df3 = pd.read_csv('datafiles/sample_featuresENTRbins.csv')
np.random.seed(0)
np.set_printoptions(suppress=True)
seed = 11

kfold = StratifiedKFold(n_splits = 10, shuffle = False, random_state = seed)
folds = []
for train_index, test_index in kfold.split(df3,df3['class']):

    xtrain = df3.loc[train_index,:]
    xtest = df3.loc[test_index,:]
    ytrain = xtrain['class']
    ytest = xtest['class']
    print("Logistic Regression")
    clf_lr = LogisticRegression(penalty = 'l2', random_state=0, solver='newton-cg',class_weight='balanced', n_jobs = -1)
    best_features, best_scores = genetic_feature_selection(xtrain, ytrain, clf_lr, folds, task = 'binary_classification',eval_metric = 'auc', categorical_features = list(xtrain.columns) , n_classes = 2, sparse = False, iterations = 100, generation_size = 20, generation_best_ratio = 10, mutation_prob = 0.05, stopping_rounds = -1)

Logistic Regression


NameError: name 'genetic_feature_selection' is not defined

In [26]:
#Logistic Regression 
#df3 = df2.reset_index(drop=True)
df3 = pd.read_csv('datafiles/sample_featuresENTRbins.csv')
np.random.seed(0)
classval = df3['class']
df3.drop(columns = 'class')
#df = pd.read_csv('discretized_textual_features2.csv')
kfold = StratifiedKFold(n_splits = 10, shuffle = False, random_state = seed)
wc_class_reports, wc_confmatrices, wc_auc_scores, wc_recall_scores, wc_precision_scores, wc_f1_scores, wc_accuracy_scores =[],[],[],[],[],[],[]
for train_index, test_index in kfold.split(df3,df3['class']):
    xtrain = df3.loc[train_index,:]
    xtest = df3.loc[test_index,:]   
    trainarr = np.array(xtrain)
    testarr = np.array(xtest)
    ytest = classval.loc[test_index]
    clf = LogisticRegression(penalty = 'l2', random_state=0, solver='newton-cg',class_weight='balanced', n_jobs = -1)
    clf.fit(trainarr,xtrain['class'])

    print("Inference process\n")
    wc_predicted = clf.predict(testarr)
    wc_predicted = np.array( wc_predicted, dtype = "int64")
    print(wc_predicted.shape)
    y_true = np.array(ytest, dtype = "int64")
    print(y_true.shape)

    print("Model Evaluation")
    _precision, _recall, _fscore, _support = precision_recall_fscore_support(y_true, wc_predicted)
    _accuracy_score = balanced_accuracy_score(y_true, wc_predicted)
    _auc_score = multiclass_roc_auc_score(y_true, wc_predicted)
    _conf_matrix = confusion_matrix(y_true, wc_predicted)
    _class_report = classification_report(y_true, wc_predicted)
    wc_accuracy_scores.append(_accuracy_score)
    wc_f1_scores.append(_fscore)
    wc_precision_scores.append(_precision)
    wc_recall_scores.append(_recall)
    wc_auc_scores.append(_auc_score)
    wc_confmatrices.append(_conf_matrix)
    wc_class_reports.append(_class_report)
    
    
print("Accuracy exact: %.3f" % (np.mean(wc_accuracy_scores)))
print("Precision: %.3f" % (np.mean(wc_precision_scores)))
print("Recall: %.3f" % (np.mean(wc_recall_scores)))
print("F1 Score: %.3f" % (np.mean(wc_f1_scores)))
print("AUC: %.3f\n" % (np.mean(wc_auc_scores)))

Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Accuracy exact: 0.950
Precision: 0.967
Recall: 0.950
F1 Score: 0.947
AUC: 0.950



In [15]:
# Random Forest Classifier
df3 = pd.read_csv('datafiles/sample_featuresENTRbins.csv')
classval = df3['class']
df3.drop(columns = 'class')

kfold = StratifiedKFold(n_splits = 10, shuffle = False, random_state = seed)
for train_index, test_index in kfold.split(df3,df3['class']):
    xtrain = df3.loc[train_index,:]
    xtest = df3.loc[test_index,:]   
    trainarr = np.array(xtrain)
    testarr = np.array(xtest)
    ytest = classval.loc[test_index]
    clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
    clf.fit(trainarr,xtrain['class'])

    print("Inference process\n")
    wc_predicted = clf.predict(testarr)
    wc_predicted = np.array( wc_predicted, dtype = "int64")
    print(wc_predicted.shape)
    y_true = np.array(ytest, dtype = "int64")
    print(y_true.shape)

    print("Model Evaluation")
    _precision, _recall, _fscore, _support = precision_recall_fscore_support(y_true, wc_predicted)
    _accuracy_score = balanced_accuracy_score(y_true, wc_predicted)
    _auc_score = multiclass_roc_auc_score(y_true, wc_predicted)
    _conf_matrix = confusion_matrix(y_true, wc_predicted)
    _class_report = classification_report(y_true, wc_predicted)
    wc_accuracy_scores.append(_accuracy_score)
    wc_f1_scores.append(_fscore)
    wc_precision_scores.append(_precision)
    wc_recall_scores.append(_recall)
    wc_auc_scores.append(_auc_score)
    wc_confmatrices.append(_conf_matrix)
    wc_class_reports.append(_class_report)
    
    
print("Accuracy exact: %.3f" % (np.mean(wc_accuracy_scores)))
print("Precision: %.3f" % (np.mean(wc_precision_scores)))
print("Recall: %.3f" % (np.mean(wc_recall_scores)))
print("F1 Score: %.3f" % (np.mean(wc_f1_scores)))
print("AUC: %.3f\n" % (np.mean(wc_auc_scores)))


Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Inference process

(1000,)
(1000,)
Model Evaluation
Accuracy exact: 0.921
Precision: 0.924
Recall: 0.921
F1 Score: 0.920
AUC: 0.921



In [3]:
counter = 1
while(counter in range(1,5)):
    counter = counter+1
    print (counter)

2
3
4
5
