In [1]:
# Please set the path to the weka.jar
# This value is inserted verbatim after `java -classpath` by parse_weka_model.
weka_path="/usr/weka/weka-3-8-2/weka.jar"


# Imports
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
from IPython.core.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
import pandas as pd
pd.set_option('display.max_columns', 50)
# NOTE(review): `display`/`HTML` were already imported above from IPython.core.display;
# this second import rebinds the same names from IPython.display.
from IPython.display import display, HTML


# NOTE(review): `pandas` is imported a second time below, and several of these
# modules are not referenced by the cells visible in this notebook section.
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import math
import itertools
from copy import deepcopy
import time
# Star import: helpers such as print_list / lists_with / lists_without used in
# later cells are not defined in this notebook, so they presumably come from here.
from utils import *

### init
# Get dictionaries of the data.
train_files=!ls DATA/GENERATED|grep -v "for_DPWGAN"|grep arff
data_train={
    "KCCR":  ["DATA/GENERATED/"+x for x in train_files if "KCCR"  in x],
    "KCCFD": ["DATA/GENERATED/"+x for x in train_files if "KCCFD" in x],
    "MNIST": ["DATA/GENERATED/"+x for x in train_files if "MNIST" in x],}
data_eval={
    "MNIST": "DATA/PREPROCESSED/TEST/MNIST_TEST.arff",
    "KCCFD": "DATA/PREPROCESSED/TEST/KCCFD_TEST.arff",
    "KCCR":  "DATA/PREPROCESSED/TEST/KCCR_TEST.arff",}

# Define weka models
# Maps a short model id to the Weka classifier specification that
# parse_weka_model appends after the `-W` flag of the generated command.
# Where present, the text after `--` carries that classifier's own options
# (presumably Weka's convention for forwarding options to a wrapped
# classifier -- confirm against the Weka CLI documentation).
# The nested, escaped quotes (SMO, IBk) are part of the command line that is
# handed to the shell, so they must be kept exactly as written.
models={
    "J48":'weka.classifiers.trees.J48 -- -C 0.25 -M 2',
    "NaiveBayes":'weka.classifiers.bayes.NaiveBayes',
    "RandomForest1":'weka.classifiers.trees.RandomForest -- -P 100 -I 500 -num-slots 1 -K 5 -M 1.0 -V 0.001 -S 1 -depth 5',
    "RandomForest2":'weka.classifiers.trees.RandomForest -- -P 100 -I 2000 -num-slots 1 -K 1 -M 1.0 -V 0.001 -S 1 -depth 1',
    "SGD":'weka.classifiers.functions.SGD -- -F 0 -L 0.01 -R 1.0E-4 -E 500 -C 0.001 -S 1',
    "SMO":'weka.classifiers.functions.SMO -- -C 1.0 -L 0.001 -P 1.0E-12 -N 0 -V -1 -W 1 -K "weka.classifiers.functions.supportVector.PolyKernel -E 1.0 -C 250007" -calibrator "weka.classifiers.functions.Logistic -R 1.0E-8 -M -1 -num-decimal-places 4"',
    "SimpleLogistic":'weka.classifiers.functions.SimpleLogistic -- -I 0 -M 500 -H 50 -W 0.0',
    "IBk":'weka.classifiers.lazy.IBk -- -K 1 -W 0 -A \"weka.core.neighboursearch.LinearNNSearch -A \\\"weka.core.EuclideanDistance -R first-last\\\"\"',
    }


# Main functions
def generate_run_lists():
    """Enumerate every experiment to run.

    Returns a list whose elements have the form
        [Dataset_name, Train_File, Test_File, Model_ID, Boosted(bool), AUROC]
    with AUROC initialised to None. All non-boosted runs come first, then the
    boosted ones; within each group the order follows `data_train`, the
    training files of each dataset, and finally `models`.
    """
    return [
        [dataset, train_path, test_path, classifier_id, use_boost, None]
        for use_boost in (False, True)
        for dataset, dataset_files in data_train.items()
        for train_path in dataset_files
        # Single-element loop: binds the test file once per training file.
        for test_path in (data_eval[dataset],)
        for classifier_id in models
    ]


def parse_weka_model(model_id=None,train=None,val=None,boost=False):
    """Build the shell command that trains and evaluates one Weka model.

    Parameters
    ----------
    model_id : key into the module-level `models` dict.
    train    : path of the training file (passed to `-t`).
    val      : path of the evaluation file (passed to `-T`).
    boost    : if truthy, wrap the model in AdaBoostM1; otherwise wrap it in a
               FilteredClassifier using the ReplaceMissingValues filter.

    Returns
    -------
    str : the complete `java -classpath ...` command line.

    Raises
    ------
    KeyError : if `model_id` is not a key of `models`.

    Notes
    -----
    `weka_path`, `train` and `val` are inserted verbatim (no shell quoting),
    so paths containing spaces or shell metacharacters must be quoted by the
    caller. The command is assembled in a single formatting pass so that
    braces inside any of the inserted values are never re-interpreted as
    format fields.
    """
    if boost:
        # AdaBoostM1 boosting meta-classifier; it takes no extra option in
        # the `-o` slot, which leaves a double space in the command.
        wrapper = "weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10"
        wrapper_options = ""
    else:
        # Pass-through wrapper: per the original author's note this filter
        # effectively does nothing and only exists so that both branches can
        # hand the model over through `-W` without extra escaping.
        wrapper = "weka.classifiers.meta.FilteredClassifier"
        wrapper_options = "-F weka.filters.unsupervised.attribute.ReplaceMissingValues"
    return "java -classpath {} {} -t {} -T {} -o {} -W {}".format(
        weka_path, wrapper, train, val, wrapper_options, models[model_id])


def get_AUROC(output):
    """Extract the AUROC value from the captured Weka output.

    Parameters
    ----------
    output : list of str, the lines printed by the Weka command.

    Returns
    -------
    float or None : the number printed directly below the last "ROC Area"
    column header in the second half of `output`, or None when it cannot be
    located or parsed (in which case that half of the output is printed to
    help diagnose the failure).

    Only parsing failures are handled here; KeyboardInterrupt and other
    unrelated exceptions propagate so that a running experiment loop can
    actually be interrupted.
    """
    # First half of output is evaluation on training data.
    test_half = output[len(output)//2:]
    try:
        header_idx = [j for j, line in enumerate(test_half) if "ROC Area" in line][-1]
        # The value sits in the same columns as the header, one line below.
        col = test_half[header_idx].find("ROC Area")
        return float(test_half[header_idx+1][col:col+8].strip())
    except (IndexError, ValueError):
        # IndexError: no "ROC Area" header, or no line below it.
        # ValueError: the cell below the header is not a number.
        print("\n".join(test_half))
        return None
    
    
def run_models(run_lists):
    global new_run_lists
    new_run_lists=[]
    for run_list in run_lists:
        if run_list[5] is None:
            train=run_list[1]
            t1=time.time() 
            command = parse_weka_model(model_id=run_list[3],train=train,val=run_list[2],boost=run_list[4])
            output =! {command}
            performance = get_AUROC(output)
            run_list[5] = performance
            new_run_lists += [run_list]
            print(new_run_lists[-1])
            print("Time taken:",time.time()-t1)
        else:
            new_run_lists += run_list
    return new_run_lists

In [None]:
# Build the full list of runs: every training file x every model, once
# without and once with boosting; the AUROC slot of each entry starts as None.
runlists = generate_run_lists()

# # Edit as desired. For example:
# runlists=lists_with(runlists,"MNIST")
# runlists=lists_without(runlists,"SGD")

# Show the selected runs before starting them. print_list is not defined in
# this notebook (presumably provided by `from utils import *`).
print_list(runlists)

In [None]:
# Note: this executes every pending run sequentially. An interrupt may be
# absorbed by error handling before it stops the loop, so the most reliable
# way to abort is to restart the kernel. While the kernel is alive, the runs
# finished so far are available in the global `new_run_lists`.
results = run_models(runlists)