In [1]:
from collections import defaultdict 
import pydot
import statistics
import os
import sys
import csv
import math
import random
import time
import FordFulkerson as ff
import EdmondsKarp as EK
from multiprocessing import Pool
from collections import Counter
from IPython.display import Image, display
from matplotlib import pyplot as plt
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
path = "20%_test"
pathB = "/home/achilleas/Desktop/thesis/DATASET F1/specimen_benign/20%_test"
pathT = "80%_train"
paths = ["/home/achilleas/Desktop/thesis/DATASET F1/Fold_"+str(i) for i in range(1,6)]

filename = "default_G_tone_map"
mapFilename = "default_unique_mapping"

rootDirs = {}
trainDirs = {}

for i in range(len(paths)):
    p = [os.path.join(root, name)
             for root, dirs, files in os.walk(paths[i]+"/"+path)
             for name in files
             if name.endswith(filename+".csv")]
    p.sort()
    rootDirs[i] = p

benignDir = [os.path.join(root, name)
             for root, dirs, files in os.walk(pathB)
             for name in files
             if name.endswith(filename+".csv")]
benignDir.sort()

for i in range(len(paths)):
    trainDir = [os.path.join(root, name)
                 for root, dirs, files in os.walk(paths[i]+"/"+pathT)
                 for name in files
                 if name.endswith(filename+".csv")]
    trainDir.sort()
    trainDirs[i] = trainDir


In [3]:
classes = {"ACCESS_MASK":0,"Atom":1,"BOOLEAN":2,"Debug":3,"Device":4,
                                 "Environment":5,"File":6,"HANDLE":7,"Job":8,"LONG":9,"LPC":10,
                                 "Memory":11,"NTSTATUS":12,"Object":13,"Other":14,"PHANDLE":15,
                                 "PLARGE_INTEGER":16,"Process":17,"PUNICODE_STRING":18,
                                 "PULONG":19,"PULARGE_INTEGER":20,"PVOID_SIZEAFTER":21,
                                 "PWSTR":22,"Registry":23,"Security":24,"Synchronization":25,
                                 "Time":26,"Transaction":27,"ULONG":28,"WOW64":29, "DummyStart":30,"DummyEnd":31}

In [4]:
def getSourceSink(path):
    verteces = []
    with open(path) as fp:
        line = fp.readline()
        while line:
            verteces.append(line.split(","))
            line = fp.readline()
    return [verteces[0][0],verteces[-1][1].strip()]

In [5]:
def getArray (path):
    results = []
    with open(path) as csvfile:
        reader = csv.reader(csvfile,csv.QUOTE_NONNUMERIC) # change contents to floats
        for row in reader: # each row is a list
            nums = []
            for i in row: 
                if i :
                    nums.append(int(i))
            results.append(nums)
    return results

In [6]:
def createExtendedG(g):
    parents = []
    children = []
    for i in range(len(g)):
        for j in range(len(g[i])):
            if g[i][j]!=0:
                break
            else:
                children.append(i)
    for i in range(len(g)):
        for j in range(len(g[i])):
            if g[j][i]!=0:
                break
            else:
                parents.append(i)
                
    for i in range(len(g)):
        g[i].append(0)
        g[i].append(0)
    leng= len(g)
    g.append([0 for i in range (leng+2)])
    g.append([0 for i in range (leng+2)])
    
    for i in children :
        g[i][-1] = 1
    for j in parents:
        g[-2][j]=1
    return g

In [7]:
def createImage(g):
    G = pydot.Dot(graph_type='digraph')
    for i in range(len(g)):
        x = pydot.Node(i)
        for j in range(len(g[i])):
            if g[i][j]!= 0 :
                y = pydot.Node(j)
                e = pydot.Edge(i,j)
                G.add_edge(e)
                
    im = Image(G.create_png())
#     G.write_png(path)
    display(im)             

In [8]:
def findMaxOutDegreeVertex(g):
    outDegrees={}
    return_matrix = []
    for i in range(len(g)):
        for j in range(len(g[i])):
            if g[i][j]!=0:
                if i not in outDegrees:
                    outDegrees[i]=[g[i][j],1]
                else: 
                    weight = outDegrees[i][0]+g[i][j]
                    cardinality = outDegrees[i][1]+1
                    outDegrees[i]=[weight,cardinality]
    return outDegrees

In [9]:
def findMaxInDegreeVertex(g):
    inDegrees= {}
    return_matrix = []
    for i in range (len(g)):
        for j in range(len(g[i])):
            if g[j][i]!=0:
                if i not in inDegrees:
                    inDegrees[i] = [g[j][i],1]
                else: 
                    weight = inDegrees[i][0]+g[j][i]
                    cardinality = inDegrees[i][1]+1
                    inDegrees[i]=[weight,cardinality]
                    
    return inDegrees    

In [10]:
def createCoverageGraph(g):
    cvg = []
    combinedDegrees = {}
    inDegrees = findMaxInDegreeVertex(g)
    inKeys =list(inDegrees.keys())
    outDegrees = findMaxOutDegreeVertex(g)
    outKeys =list(outDegrees.keys())
    combinedDegrees= outDegrees.copy()
    for i in inKeys:
        if i in outDegrees:
            combinedDegrees[i] = [inDegrees[i][0]+outDegrees[i][0], inDegrees[i][1]+outDegrees[i][1]]
        else : 
            combinedDegrees[i] = inDegrees[i]
    sortedcDegrees = sorted(combinedDegrees.items(), key = lambda kv:kv[0])     
    cKeys = [i[0] for i in sortedcDegrees]
    for i in range(30):
        row = [0 for k in range(30)]
        if i not in cKeys:
            cvg.append(row)
            continue
        weight = combinedDegrees[i][0]
        cardinality = combinedDegrees[i][1]
        for j in cKeys:
            if weight > combinedDegrees[j][0] and cardinality > combinedDegrees[j][1]:
                row[j] = 1
        cvg.append(row)
        

    return cvg
    

# def writeCSV()
generic method to quickly write a graph to a csv file.

In [11]:
def writeCSV(g,path,name):
    path_spl = path.split('/')
    path_spl[-1] = name
    sp_path = '/'.join(path_spl)
    with open(sp_path, mode='w') as cvg_file:
        cvg_writer = csv.writer(cvg_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for i in g:
            cvg_writer.writerow(i)

# def executeTrial ()
## here we call the max Flow algorithm 
### Max flow in GRG
* if we get the source and the sink from the default G' the algorithm stops in cases of first caller X and last called X function.
* if we get the source and the sink from the default G_tone_cardinality tests show that nothing stops the algorithm as in all cases the first caller is different than the last called

### Max flow in CVG
* the algorithm works fine.


In [12]:
def executeTrial(path):
    sp_path = path.split('/')
    family = sp_path[-3]
    graph = getArray(path)
#     for i in range(len(graph)):
#         for j in range(len(graph[i])):
#             if graph[i][j]!=0:
#                 graph[i][j] =1
#     g = Graph(createExtendedG(createCoverageGraph(graph)))
    sp_path[-1]= "default_G_tone_cardinality.txt"
#     source = 30
#     sink = 31
    source = classes[getSourceSink('/'.join(sp_path))[0]]
    sink = classes[getSourceSink('/'.join(sp_path))[1]]
    fG = Graph(graph).FordFulkerson(source,sink)
    return [fG, family]

# def experiment()
wrapper method to execute and calculate median max flow for each member of our dataset
## median
we prefer median values of the flows because they are more robust to individual extreme flows

In [13]:
def experiment(dirs):
#     print(len(dirs))
    max_flow_vals={}
    mean_max_flow={}
    mean_flows = []
    total_values = []
    for i in dirs:
        values= executeTrial(i)
        total_values.append(values[0])
        if values[1] in max_flow_vals:
            max_flow_vals[values[1]].append(values[0])
        else : 
            max_flow_vals[values[1]]= [values[0]]

    for i in max_flow_vals:
        mean_max_flow[i] = statistics.median(max_flow_vals[i])
    for i in mean_max_flow :
        mean_flows.append(mean_max_flow[i])
        
    return mean_max_flow

# main body 
here we make separate calls in our functions to test the validity of the above methods

In [14]:
def executeExp():
    root_flows = experiment(rootDir)
    benign_flows = experiment(benignDir)
    train_flows = experiment(trainDir)
    # mal_corners = [min(root_flows), max(root_flows)]
    # benign_corners = [min(benign_flows), max(benign_flows)]


    # print("mal_corners:")
    # print(mal_corners)
    # print("benign_corners:")
    # print(benign_corners)

    sort_root = sorted(root_flows.items(), key = lambda kv:kv[1])
    sort_benign = sorted(benign_flows.items(), key = lambda kv:kv[1])
    sort_train = sorted(train_flows.items(), key = lambda kv:kv[1])

    # print()

    sort_root_vals = [i[1] for i in sort_root]
    sort_benign_vals = [i[1] for i in sort_benign]
    sort_train_vals = [i[1] for i in sort_train]


    sort_root_names = [i[0] for i in sort_root]
    sort_benign_names = [i[0] for i in sort_benign]
    sort_train_names = [i[0] for i in sort_train]


    sorted_root_flows = sorted(root_flows)
    sorted_benign_flows= sorted(benign_flows)



    # bCount =sorted(Counter(benign_flows).items(),key = lambda kv:kv[0])
    # mCount = sorted(Counter(root_flows).items(),key = lambda kv:kv[0])



    # --------------------------------------------
    lists = [sort_root_vals, sort_benign_vals,sort_train_vals]
    names = [sort_root_names, sort_benign_names, sort_train_names]

    for i in lists:
        plt.plot(i,marker=11)
    fig = plt.gcf()
    plt.show()
    plt.draw()

    fig.savefig('maxFlow.png')
    #---------------------------------------------------
    # executeTrial(rootDir[0])
    # ar = [[0,1,1,1,1],
    #       [0,0,0,1,1],
    #       [0,0,0,1,1],
    #       [0,0,0,0,1],
    #       [0,0,0,0,0]]
    # createExtendedG(ar)
    # cvg = createCoverageGraph(getArray(rootDir[0]),rootDir[0])

    # for i in rootDir:
    #     writeCSV(createExtendedG(createCoverageGraph(getArray(i))),i, 'CVGB.csv')
    # for i in cvg:
    #     print(i)
    # g = createExtendedG(ar)
    # G = pydot.Dot(graph_type='digraph')
    # for i in range(len(g)):
    #     x = pydot.Node(i)
    #     for j in range(len(g[i])):
    #         if g[i][j]!= 0 :
    #             y = pydot.Node(j)
    #             e = pydot.Edge(i,j)
    #             G.add_edge(e)

    # im = Image(G.create_png())
    # display(im)             

# Flow map
## def createFlowMap(g):
* Given the capacity of each edge and the neighbors of each node of a graph, the function returns a NxN array tha represents the flow map of the given graph, having in each `g'[i][j]= MaxFlow(cap,neighs,i,j)`. **the base function used to permutate our original GrGs** 

## def createMap(dirs):
* wrapper method that calls createFlowMap(path) for every path in the dirs list. This method is used to run through the rootDir, benignDir and train Dir to create our point of reference

## def CSM(A,B):
* calculates the cosine similarity metric for two arrays A and B
   

In [15]:
def createFlowMap(capacity, neighbors): 
    values = [[0 for i in range(len(capacity))] for j in range(len(capacity))]

    for i in range(len(capacity)):
        for j in range(len(capacity)):
            values[i][j] = EK.EdmondsKarp(capacity,neighbors,i,j)
#     return values
    return values

In [16]:
def createMap(paths):
    maps ={}
    median_vals = {}
    for i in range(len(paths)):
        family = paths[i].split('/')[-3]
        cap, neigh = EK.ParseGraph(paths[i])
        map_values = createFlowMap(cap,neigh)
        if family in maps:
            maps[family].append(map_values)
        else :
            maps[family] =  [map_values]
        printProgressBar(i, len(paths))       
    
    return maps

In [17]:
def CSM(A,B):
    e = sum(A[i][j]*B[i][j] for i in range(len(A)) for j in range(len(B)))
    a = sum(A[i][j]**2 for i in range(len(A))for j in range(len(A)))
    b = sum(B[i][j]**2 for i in range(len(B))for j in range(len(B)))
    d = math.sqrt(a)*math.sqrt(b)
    return round(e/d,15)

In [18]:
def checkDatabase(dirs, db):
    total = {}
    for i in range(len(dirs)):
        csm_val ={}
        fname = dirs[i].split('/')[-3]
        cap , neigh = EK.ParseGraph(dirs[i])
        A = createFlowMap(cap,neigh)
        for d in db:
            val = []
            for g in db[d]:
                val.append(CSM(A,g))
            csm_val[d] = max(val)
        printProgressBar(i,len(dirs))
        sr = sorted(csm_val.items(), key = lambda kv:kv[1], reverse =True)
        total[i]=[fname, sr[0][0], sr[0][1]]
#         print([i, fname, sr[0][0], sr[0][1]])
    return total

In [19]:
# total = {}
#     for t in test:
#         for r in train:
#             csm_vals= {}
#             for i in range(len(test[t])):
#                 vals =[]
#                 for j in range(len(train[r])):
#                     vals.append(CSM(test[t][i],train[r][j]))
#                 csm_vals[t]=vals
#         total[t]= sorted(csm_vals[t], reverse = True)[0]

In [20]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [22]:
database = []
for i in range(len(paths)):
    database.append(createMap(trainDirs[i]))
    print("%d database created"%(i))
print(len(database))

0 database created███████████████████████████████████████████████████████████████████████████████████-| 100.0% 
1 database created███████████████████████████████████████████████████████████████████████████████████-| 100.0% 
2 database created███████████████████████████████████████████████████████████████████████████████████-| 100.0% 
3 database created███████████████████████████████████████████████████████████████████████████████████-| 100.0% 
4 database created███████████████████████████████████████████████████████████████████████████████████-| 100.0% 
5


In [28]:
malCheck = []
for i in range(len(paths)):
    ti = time.time()
    malCheck.append(checkDatabase(rootDirs[i], database[i]))
    print("\nmalcheck %d done."%(i))
    # with open('malwareCheck.txt', 'w') as f:
    #     for item in malwareCheck:
    #         f.write("%s\n" % malwareCheck[item])
    # f.close()
    print("check %d mal check finished in %f "%(i,time.time()-ti))
    ti = time.time()
    benignCheck = checkDatabase(benignDir, database[i])
    print("\nBencheck %d done."%(i)) 
    # with open('benignCheck.txt', 'w') as f:
    #     for item in benignCheck:
    #         f.write("%s\n" % benignCheck[item])
    # f.close()
    print("check %d ben check finished in %f s"%(i, time.time()-ti))

 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.8% 
malcheck 0 done.
check 0 mal check finished in 1385.261115 
 |█████████████████████████████████████████████████████████████████████████████████████████████████---| 97.1% 
Bencheck 0 done.
check 0 ben check finished in 88.375573 s
 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.8% 
malcheck 1 done.
check 1 mal check finished in 1337.751143 
 |█████████████████████████████████████████████████████████████████████████████████████████████████---| 97.1% 
Bencheck 1 done.
check 1 ben check finished in 76.439063 s
 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.8% 
malcheck 2 done.
check 2 mal check finished in 1144.466473 
 |█████████████████████████████████████████████████████████████████████████████████████████████████---| 97.1% 
Bencheck 2 done.
check 2 ben check f

In [47]:
def similarityMetric(sr):
    counterA = 0
    counterB = 0
    counterC = 0
    for i in sr :
        A,B = i[0].split(',')[0], i[0].split(',')[1]
        C,D = i[1].split(',')[0], i[1].split(',')[1]
        if A==C and B==D:
            counterA +=1
        if A==C or B==D:
            counterB +=1
        if A==C or A==D or B == C or B == D:
            counterC += 1
    print("cA = %d (%f) cB= %d (%f) cC =%d (%f)"%(counterA, counterA/len(sr), counterB, counterB/len(sr), counterC, counterC/len(sr)))
    return [counterA/len(sr),counterB/len(sr),counterC/len(sr)]

In [48]:
def saveFigures(mal_th, ben_th, start):
    lists = [mal_th,ben_th]
    v = 0
    for i in range(len(mal_th)):
        v = max(v,abs(mal_th[i]-ben_th[i]))

    print(v)

    for i in lists:
        plt.plot(i,marker= 11)
    plt.legend(["True positives","False positives"])
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xticks(np.arange(0,step, 1))
    plt.ylabel("")
    plt.draw()
    plt.savefig(str(start)+".png")
    plt.clf()



In [59]:
mal_pairs = []
sr = []
avg = [[] for i in range(3)]
for i in range(len(malCheck)):
    mal_pairs.append([malCheck[i][j] for j in malCheck[i]])
    
for i in range(len(mal_pairs)):
    sr.append(sorted(mal_pairs[i],key =lambda kv:kv[-1],reverse=True))
    
ben_pairs = [benignCheck[i] for i in benignCheck]
br = sorted(ben_pairs,key =lambda kv:kv[-1],reverse=True)
sims = [similarityMetric(i) for i in sr]
for i in range(len(sims)):
    for j in range(len(sims[i])):
        avg[j].append(sims[i][j])
res = [statistics.mean(avg[i]) for i in range(len(avg)) ]
print(avg)
print("------------------")
print()
# c =0
# # # m_vals =[sr[i][-1] for i in range(len(sr))]
# b_vals = [br[i][-1] for i in range(len(br))]
# for i in sr :
#     print(i)
# print("--------------------------")
# for j in br :
#     if j[-1] > 0.8059138664157343 :
#         c+=1
#     print(j)
# print(c)


cA = 346 (0.664107) cB= 420 (0.806142) cC =425 (0.815739)
cA = 346 (0.664107) cB= 419 (0.804223) cC =426 (0.817658)
cA = 343 (0.658349) cB= 415 (0.796545) cC =425 (0.815739)
cA = 340 (0.652591) cB= 412 (0.790787) cC =427 (0.819578)
cA = 332 (0.627599) cB= 414 (0.782609) cC =423 (0.799622)
[[0.6641074856046065, 0.6641074856046065, 0.6583493282149712, 0.6525911708253359, 0.6275992438563327], [0.8061420345489443, 0.8042226487523992, 0.7965451055662188, 0.7907869481765835, 0.782608695652174], [0.8157389635316699, 0.817658349328215, 0.8157389635316699, 0.8195777351247601, 0.7996219281663516]]
------------------
[0.6533509428211706, 0.796061086539264, 0.8136671879365333]


In [60]:
# start = 0.855737150150311
for j in br:
    start =j[-1]
    step = 10
    dif = 1.0 - start
    thresh = [0 for i in range(step)]
    for i in range(step):
        thresh[i] = start + i*(dif/step)
    mal_th = [0 for i in range(len(thresh))]
    ben_th = [0 for i in range(len(thresh))]
    for i in range(len(thresh)):
        CM=0
        val=0
        for j in range(len(sr)):
            if sr[j][-1] > thresh[i]:
                #mal_th[i]+=1
                CM += 1
        val=CM/521
        mal_th[i] = val

    for i in range(len(thresh)):
        CM=0
        val=0
        for j in range(len(br)):
            if br[j][-1] > thresh[i]:
                #mal_th[i]+=1
                CM += 1
        val=CM/35
        ben_th[i] = val
    # print(mal_th)
    print(ben_th)
    saveFigures(mal_th,ben_th,start)



TypeError: '>' not supported between instances of 'list' and 'float'