### Imports

In [1]:
from subprocess import Popen, PIPE, STDOUT
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt 
import pdb
import os

### Interfacing with the Java implementation of the Negative Selection algorithm

In [2]:
def get_negsel_outputs(test_filename, train_filename="english.train", alphabetfile = None, 
                       n=10, r=4, c=True, l=True, notebook_in_folder=False):
    """Run the Java negative-selection tool and return its numeric outputs.

    The tool (``negsel2.jar``) is started through the shell with the test file
    redirected to its standard input; every line it prints is converted to a
    float, except lines containing the word "symbol" (see below).

    Args:
        test_filename: file fed to the tool on stdin, relative to the
            negative-selection folder.
        train_filename: file passed as ``-self``, relative to the same folder.
        alphabetfile: optional alphabet file passed as ``-alphabet file://...``.
            When ``None`` the flag is omitted entirely instead of pointing the
            tool at a non-existent file called "None".
        n: value passed as ``-n``.
        r: value passed as ``-r``.
        c: whether to pass the ``-c`` flag.
        l: whether to pass the ``-l`` flag.
        notebook_in_folder: set to True when the notebook lives inside the
            negative-selection folder, so no folder prefix is needed.

    Returns:
        list of float, one per retained output line, in output order.

    Raises:
        ValueError: if a retained output line cannot be parsed as a float
            (for example an error message printed by the shell or by Java).
    """
    # Fix command for when notebook is not inside the negative-selection folder
    folder_prefix = "" if notebook_in_folder else "negative-selection/"

    # Only mention the alphabet when one was actually supplied.
    if alphabetfile is None:
        alphabet_arg = ""
    else:
        alphabet_arg = f"-alphabet file://{folder_prefix}{alphabetfile} "

    # Create command using given filenames and parameters
    command = (f"java -jar {folder_prefix}negsel2.jar "
               f"-self {folder_prefix}{train_filename} "
               f"-n {n} -r {r} {'-c' if c else ''} {'-l' if l else ''} "
               f"{alphabet_arg}"
               f"< {folder_prefix}{test_filename}")

    outputs = []
    # The context manager closes the pipe and waits for the child process, so
    # no process or file descriptor is left behind. The default pipe buffer is
    # used instead of a two-byte one.
    with Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, close_fds=True) as process:
        for line in iter(process.stdout.readline, b''):
            # stderr is merged into stdout, so warnings are interleaved with the
            # scores. Chunks shorter than n are padded with "_", which makes the
            # tool print warnings mentioning "symbol"; those lines are skipped
            # because they cannot be converted to floats.
            if b"symbol" in line:
                continue
            outputs.append(float(line.rstrip().decode('utf-8')))
    return outputs

## Strategy?
.train files contain normal sequences 

.test files contain both normal and anomalous sequences

.labels files contain the labels for the .test files: 0 = normal, 1 = anomaly

.alpha is the alphabet used to encode system calls...

## Problems 
* labels 1 file instead of two...
* no fixed-length sequences
    * training: preprocess sequences into fixed-length chunks
    * testing: evaluate patterns in chunks and use a measure to obtain an anomaly score for each chunk


## (possible) Solutions

* We have to account for the labels in the test files when calculating AUC scores
    to calculate the sensitivity and specificity...
* create a function that splits the sequences in substrings of a specified length



In [3]:
# a='abcdefghijklmn'
# print(a[6:12:])

# z = range(0,len(a),6)
# print(z)
# [x for x in range(0,len(a),6)]

# for j in range(0,len(a),6):
#     print(j)
#     start = j
#     stop = j + 6
#     print(a[start:stop:])
# za = [a[x:x+6:] for x in range(0,len(a),6)]
# # print(za)

# print(len(za[-1]))
# za[-1] += "_" *  (6 - len(za[-1])) 
# print(za[-1])
# print(len(za[-1]))
# print(''.join(za))

!ls negative-selection/syscalls/snd-cert/

snd-cert.1.labels	 snd-cert.2.test    snd-cert.train
snd-cert.1.test		 snd-cert.3.labels  snd-cert.train.chunked
snd-cert.1.test.chunked  snd-cert.3.test
snd-cert.2.labels	 snd-cert.alpha


In [4]:
## we need to open a file
## split the sequences in chunks of fixed length
### what to do if a chunk of fixed length cannot be formed?
#### we append underscore, _ (this character does not occur in the alphabets..) 
     

def process_files(file,chunk_size):
    """Split every line of a file into fixed-length chunks and write them out.

    Each line of ``file`` is cut into consecutive pieces of ``chunk_size``
    characters. The last piece of a line is right-padded with "_" up to
    ``chunk_size``. All pieces are written, one per line, to
    ``<file>.chunked``, and that output path is printed.

    Blank lines produce no chunks and are skipped; previously they raised
    IndexError.

    Args:
        file: path of the text file to read, one sequence per line.
        chunk_size: length of every chunk; must be a positive integer.

    Raises:
        ValueError: from ``range`` if ``chunk_size`` is 0.
    """
    chunks = []
    with open(file) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # Nothing to chunk, and there is no last piece to pad.
                continue
            pieces = [line[start:start + chunk_size] for start in range(0, len(line), chunk_size)]
            # Pad only the final (possibly short) piece; a full piece gets "" appended.
            pieces[-1] += "_" * (chunk_size - len(pieces[-1]))
            chunks.extend(pieces)

    # Keep the historical "./" prefix for relative paths, but do not prepend it
    # to an absolute path, where it would point to a different location.
    if os.path.isabs(file):
        output_file = file + '.chunked'
    else:
        output_file = './' + file + '.chunked'
    print(output_file)
    with open(output_file, 'w') as out:  # 'w' truncates an existing file
        out.writelines(chunk + '\n' for chunk in chunks)
            
# Chunk the training set and the first test set into pieces of length 10.
for sequence_file in ("negative-selection/syscalls/snd-cert/snd-cert.train",
                      "negative-selection/syscalls/snd-cert/snd-cert.1.test"):
    process_files(sequence_file, 10)


./negative-selection/syscalls/snd-cert/snd-cert.train.chunked
./negative-selection/syscalls/snd-cert/snd-cert.1.test.chunked


### Running the algorithm

In [5]:
# Run negative selection on the chunked first snd-cert test set, using the
# chunked training file as "self" and the snd-cert alphabet.
snd_cert = get_negsel_outputs(
    "syscalls/snd-cert/snd-cert.1.test.chunked",
    alphabetfile="syscalls/snd-cert/snd-cert.alpha",
    train_filename="syscalls/snd-cert/snd-cert.train.chunked",
)

In [6]:
# Show the raw list of floats returned by get_negsel_outputs for the chunked test file.
print(snd_cert)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 36.811488546796014, 0.0, 37.40303175675036, 0.0, 36.394252606020004, 0.0, 36.394252606020004, 0.0, 36.394252606020004, 0.0, 36.40303786645184, 0.0, 0.0, 0.0, 0.0, 0.0, 36.38526061920034, 0.0, 0.0, 0.0, 0.0, 0.0, 37.13383681318645, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 34.818021979661474, 37.3715420144694, 0.0, 0.0, 36.39351852604081, 0.0, 0.0, 34.81807786845041, 35.817833597632806, 0.0, 0.0, 37.62820731260149, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

### Computing ROC-AUC

In [7]:
def roc_auc(merged_outputs, title = 'Nog Mooiere en Minder Gehandicapte ROC-Curve, Maar Waarschijnlijk Net Zo Fout',
            anomalous_outputs=None, normal_outputs=None):
    """Plot a ROC curve and print the area under it.

    Every distinct value ``v`` in ``merged_outputs`` is used as a decision
    threshold: a sample is called anomalous when its score is strictly
    greater than ``v``. For each threshold the function computes

    * sensitivity (true positive rate): share of anomalous scores > v
    * 1 - specificity (false positive rate): share of normal scores > v

    and plots sensitivity against 1 - specificity, keeping the two values of
    one threshold together as one point. The curve is anchored at (0, 0) and
    (1, 1).

    Args:
        merged_outputs: iterable of scores used as candidate thresholds
            (presumably the normal and anomalous scores combined).
        title: title of the plot.
        anomalous_outputs: scores of the truly anomalous samples. When omitted,
            the notebook-level variable ``tagalog_outputs`` is used, as before.
        normal_outputs: scores of the truly normal samples. When omitted, the
            notebook-level variable ``english_outputs`` is used, as before.

    Raises:
        ValueError: if either score collection is empty.
    """
    # Fall back to the notebook globals the original implementation relied on.
    if anomalous_outputs is None:
        anomalous_outputs = tagalog_outputs
    if normal_outputs is None:
        normal_outputs = english_outputs

    n_anomalous = len(anomalous_outputs)
    n_normal = len(normal_outputs)
    if n_anomalous == 0 or n_normal == 0:
        raise ValueError("roc_auc needs at least one anomalous and one normal score")

    # Start at (0, 0): a threshold above every score flags nothing.
    sensitivity = [0.0]
    false_positive_rate = [0.0]

    # Walk thresholds from high to low so both rates are non-decreasing and the
    # points of the curve are already in plotting order.
    for v in sorted(set(merged_outputs), reverse=True):
        # Share of anomalous samples correctly flagged at threshold v
        sensitivity.append(sum(1 for x in anomalous_outputs if x > v) / n_anomalous)
        # Share of normal samples wrongly flagged at threshold v (1 - specificity);
        # ties with v count as "not flagged" for both classes.
        false_positive_rate.append(sum(1 for x in normal_outputs if x > v) / n_normal)

    # End at (1, 1): a threshold below every score flags everything.
    sensitivity.append(1.0)
    false_positive_rate.append(1.0)

    plt.plot(false_positive_rate, sensitivity)
    plt.plot([0.0, 1.0], [0.0,1.0], ls='--')
    plt.xlabel('1 - specificity')
    plt.ylabel('sensitivity')
    plt.title(title)
    plt.show()
    print("ROC-AUC = {}".format(auc(false_positive_rate, sensitivity)))