### Imports

In [1]:
from subprocess import Popen, PIPE, STDOUT
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt 

### Interfacing with the Java implementation of the Negative Selection algorithm

In [2]:
def get_negsel_outputs(test_filename, train_filename="english.train", alphabetfile = None, 
                       n=10, r=4, c=True, l=True, notebook_in_folder=False):
    
    # Fix command for when notebook is not inside the negative-selection folder
    folder_prefix = "negative-selection/" if not notebook_in_folder else ""
    
    # Create command using given filenames and parameters
    command = f"java -jar {folder_prefix}negsel2.jar " \
              f"-self {folder_prefix}{train_filename} " \
              f"-n {n} -r {r} {'-c' if c else ''} {'-l' if l else ''} " \
              f"-alphabet file://{alphabetfile}" \
              f"< {folder_prefix}{test_filename}"
    
    # Run the command
    process = Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, bufsize=2, close_fds=True)
    
    # Extract and return output values
    outputs = [float(line.rstrip().decode('utf-8')) for line in iter(process.stdout.readline, b'')]
    return outputs

## Strategy?
.train files contain normal sequences 

.test contain both normal and anomalies

.labels files contains the labels for the .test files, 0 = normal 1 = anomaly

.alpha is the alphabet uste to encode system calls...

## Problems 
* labels 1 file instead of two...
* no fixed length sequences
    * training: preprocess sequences in to fixed lenght chunks..
    * testing: evaluate patterns in chunks and use a measure to obtain anomaly score for a chunk


## (possible) Solutions

* split the files based on the labels so we end up with two files..?
* create a function that splits the sequences in substrings of a specified length



### Running the algorithm

In [16]:
snd_cert = get_negsel_outputs("syscalls/snd-cert/snd-cert.1.test", 
                              train_filename = "syscalls/snd-cert/snd-cert.train",
                              alphabetfile = "syscalls/snd-cert/snd-cert.alpha")

ValueError: could not convert string to float: 'Exception in thread "main" java.lang.ExceptionInInitializerError'

### Computing ROC-AUC

In [5]:
def roc_auc(merged_outputs, title = 'Nog Mooiere en Minder Gehandicapte ROC-Curve, Maar Waarschijnlijk Net Zo Fout'):
    sensitivity = []
    specificity = []

    for v in merged_outputs:
        # For each v in merged_outputs: calculate percentage of values in tagalog that are larger
        sensitivity_v = len([x for x in tagalog_outputs if x > v]) / len(tagalog_outputs)

        # For each v in merged_outputs: calculate percentage of values in english that are smaller
        specificity_v = len([x for x in english_outputs if x < v]) / len(english_outputs)
        
        # Keep track of these values
        sensitivity.append(sensitivity_v)
        specificity.append(specificity_v)

    plt.plot(sorted(sensitivity),sorted(specificity))
    plt.plot([0.0, 1.0], [0.0,1.0], ls='--')
    plt.xlabel('sensitivity')
    plt.ylabel('specificity')
    plt.title(title)
    plt.show()
    print("ROC-AUC = {}".format(auc(sorted(sensitivity), sorted(specificity))))