In [248]:
import sys
import os
import csv
import time
import statistics
import numpy as np
from numpy import dot
from numpy.linalg import norm
import networkx as nx
import networkx.algorithms
from collections import Counter
from IPython.display import Image, display
from matplotlib import pyplot as plt
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib.widgets import Slider, Button, RadioButtons

In [None]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    r"""
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int); a total <= 0 is
                                  drawn as an already finished (100%) bar
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    if total > 0:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Callers in this notebook pass `len(items) - 1` as the total, which is 0
        # for a single item; dividing by it used to raise ZeroDivisionError.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if total <= 0 or iteration == total:
        print()

In [None]:
def getArray (path):
    """
    Read a CSV file into a list of rows of integers.

    Every non-empty cell is converted with int(); empty cells (for example the
    one produced by a trailing comma) are skipped. A blank line yields an empty
    row.

    @params:
        path - Required : path of the CSV file to read (Str)
    @returns:
        list of lists of int, one inner list per CSV row
    @raises:
        ValueError if a non-empty cell is not a valid integer literal
    """
    # The original passed csv.QUOTE_NONNUMERIC positionally, i.e. as the
    # `dialect` argument, where it never acted as a quoting option: cells were
    # always read as strings. The bogus argument is dropped; int() does the
    # conversion. newline='' is what the csv module asks for when opening files.
    with open(path, newline='') as csvfile:
        return [[int(cell) for cell in row if cell]
                for row in csv.reader(csvfile)]

In [None]:
def _findCsvFiles(searchRoot, suffix):
    """Return the sorted paths of all files below searchRoot whose name ends with suffix."""
    matches = [os.path.join(root, name)
               for root, _dirs, files in os.walk(searchRoot)
               for name in files
               if name.endswith(suffix)]
    matches.sort()
    return matches


def createDirs(fold, baseDir="/home/achilleas/Desktop/thesis/DATASET F1"):
    """
    Collect the graph CSV files used for one cross-validation fold.

    Three directory trees are scanned recursively for files whose name ends
    with "default_G_tone_map.csv":
        <baseDir>/Fold_<fold>/20%_test        (fold test set)
        <baseDir>/specimen_benign/20%_test    (benign set, the same for every fold)
        <baseDir>/Fold_<fold>/80%_train       (fold train set)

    @params:
        fold    - Required : fold number used to build the "Fold_<fold>" directory name
        baseDir - Optional : dataset root; defaults to the location the notebook
                             was originally written against (Str)
    @returns:
        [testPaths, benignPaths, trainPaths], each a sorted list of file paths.
        A tree that does not exist simply yields an empty list.
    """
    testSubdir = "20%_test"
    trainSubdir = "80%_train"
    foldDir = baseDir + "/Fold_" + str(fold)
    print(foldDir)

    suffix = "default_G_tone_map" + ".csv"

    rootDir = _findCsvFiles(foldDir + "/" + testSubdir, suffix)
    print(len(rootDir))
    benignDir = _findCsvFiles(baseDir + "/specimen_benign/" + testSubdir, suffix)
    trainDir = _findCsvFiles(foldDir + "/" + trainSubdir, suffix)
    return [rootDir, benignDir, trainDir]

In [None]:
def _loadGraphSet(paths, label, npmode):
    """Load every adjacency-matrix CSV in paths as an array (npmode) or a networkx graph."""
    # nx.from_numpy_matrix was removed in NetworkX 3.0; use from_numpy_array
    # when the installed version provides it and fall back otherwise.
    toGraph = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
    loaded = []
    for idx, csvPath in enumerate(paths):
        A = np.array(getArray(csvPath))
        # Only the requested representation is built and kept.
        loaded.append(A if npmode else toGraph(A))
        # Count finished items out of len(paths): the previous `len - 1` total
        # was 0 for a single file and made the progress bar divide by zero.
        printProgressBar(idx + 1, len(paths), length=50, prefix=label)
    return loaded


def createGraphs(dirs, npmode):
    """
    Load the malware-test, train and benign graph files of one fold.

    @params:
        dirs   - Required : [testPaths, benignPaths, trainPaths] as returned by createDirs
        npmode - Required : if true return the adjacency matrices as numpy arrays,
                            otherwise return networkx graphs built from them (Bool)
    @returns:
        [malware, benign, train] - three lists in that order; note the order differs
        from `dirs`, where the train paths come last as well but are loaded second.
    """
    rootDir, benignDir, trainDir = dirs[0], dirs[1], dirs[2]
    malware = _loadGraphSet(rootDir, 'malware gs', npmode)
    train = _loadGraphSet(trainDir, 'train gs', npmode)
    benign = _loadGraphSet(benignDir, 'benign gs', npmode)
    return [malware, benign, train]



In [None]:
def _betweennessValues(graphSet, label):
    """Return, for each graph in graphSet, the list of its betweenness centrality values."""
    values = []
    for idx, G in enumerate(graphSet):
        values.append(list(nx.betweenness_centrality(G).values()))
        # enumerate replaces list.index(), which rescanned the list on every
        # step; counting to len(graphSet) avoids a zero total for one graph.
        printProgressBar(idx + 1, len(graphSet), length=50, prefix=label)
    return values


def findCentrality(graphs):
    """
    Compute betweenness-centrality vectors for the three graph sets of a fold.

    @params:
        graphs - Required : [malware, benign, train] lists of networkx graphs,
                            i.e. the order returned by createGraphs
    @returns:
        [malwareCentr, benignCentr, trainCentr] - same positions as the input;
        each entry is a list with one list of centrality values per graph.

    The positional contract is unchanged. Earlier the local names and progress
    labels called set 0 "benign" and set 1 "malware", contradicting createGraphs;
    they are now labelled correctly. The debug print that dumped every
    centrality list of set 1 has been removed.
    """
    mal_centr = _betweennessValues(graphs[0], 'malware')
    ben_centr = _betweennessValues(graphs[1], 'benign')
    train_centr = _betweennessValues(graphs[2], 'train')
    return [mal_centr, ben_centr, train_centr]


In [None]:
def calculateDifference(listA, listB, dirs,isBenign):
    """
    Match every vector of listA with its most similar vector of listB.

    Similarity is 1 - scipy cosine distance. For each vector of listA the best
    score and the index of the first vector of listB reaching it are kept.

    @params:
        listA    - Required : test vectors, aligned index-wise with dirs[1] when
                              isBenign is true, otherwise with dirs[0]
        listB    - Required : train vectors, aligned index-wise with dirs[2]
        dirs     - Required : [testPaths, benignPaths, trainPaths]
        isBenign - Required : selects which path list names the test samples (Bool)
    @returns:
        list of [testName, bestSimilarity, trainName]; a name is the third
        component from the end of the '/'-separated file path
    @raises:
        ValueError if listB is empty while listA is not (max() of an empty list)
    """
    rootDir, benDir, trainDir = dirs[0], dirs[1], dirs[2]
    sourceDir = benDir if isBenign else rootDir
    csm_val = []
    for i, vecA in enumerate(listA):
        val = [1 - distance.cosine(vecA, vecB) for vecB in listB]
        best = max(val)            # computed once instead of three times
        bestIdx = val.index(best)  # first train vector reaching the best score
        csm_val.append([sourceDir[i].split('/')[-3], best,
                        trainDir[bestIdx].split('/')[-3]])
        # Report finished items out of len(listA); the former `len - 1` total
        # was 0 for a single test vector and crashed the progress bar.
        printProgressBar(i + 1, len(listA), length = min(len(listA), 50), prefix='diff calculation:')
    return csm_val

In [None]:
# File lists for the five cross-validation folds (Fold_1 .. Fold_5).
iterationDirs = list(map(createDirs, range(1, 6)))

In [None]:
# One [malware, benign, train] triple of networkx graphs per fold (npmode off).
graphs = [createGraphs(foldDirs, False) for foldDirs in iterationDirs]

In [None]:
# Betweenness-centrality vectors for every graph set of every fold.
graphCentr = [findCentrality(foldGraphs) for foldGraphs in graphs]

In [None]:
# Per fold: compare set 0 (the malware test graphs) against set 2 (train), timed.
t = time.time()
malwareTests = [
    calculateDifference(foldCentr[0], foldCentr[2], iterationDirs[foldIdx], False)
    for foldIdx, foldCentr in enumerate(graphCentr)
]
print("malware took %f seconds"%(time.time()-t))

In [229]:
# Per fold: compare set 1 (the benign graphs) against set 2 (train).
benignTests = [
    calculateDifference(foldCentr[1], foldCentr[2], iterationDirs[foldIdx], True)
    for foldIdx, foldCentr in enumerate(graphCentr)
]

diff calculation: |███████████████████████████████████| 100.0% 
diff calculation: |███████████████████████████████████| 100.0% 
diff calculation: |███████████████████████████████████| 100.0% 
diff calculation: |███████████████████████████████████| 100.0% 
diff calculation: |███████████████████████████████████| 100.0% 


In [None]:
def sortTests(test):
    """
    Sort the result list of every fold by score, highest first, and print it.

    @params:
        test - Required : list (one item per fold) of lists of entries whose
                          element 1 is the score used as sort key
    @returns:
        list of the sorted per-fold lists; entries with equal scores keep
        their original relative order
    """
    sortedTests = []
    for foldIdx, foldResults in enumerate(test):
        ranked = sorted(foldResults, key=lambda entry: entry[1], reverse=True)
        sortedTests.append(ranked)
        # blank line, header, blank line - then the ranked entries
        print('', '---------iteration %d---------' % (foldIdx), '', sep='\n')
        print(ranked)
    return sortedTests

In [None]:
def saveFigures(mal_th, ben_th, start, iteration, step=None):
    """
    Plot two threshold curves and save the figure as "<iteration>/<start>.png".

    @params:
        mal_th    - Required : values plotted and labelled "True positives"
        ben_th    - Required : values plotted and labelled "False positives";
                               needs at least len(mal_th) entries
        start     - Required : used as the file name of the saved figure
        iteration - Required : used as the name of the output directory
        step      - Optional : number of x ticks; defaults to len(mal_th) (Int)
    @raises:
        ValueError if mal_th is empty, IndexError if ben_th is shorter than mal_th

    The function used to read a module-level variable `step` for the x ticks,
    which is only defined in a later cell (NameError on a fresh kernel). It is
    now an explicit optional parameter.
    NOTE(review): callers that relied on a global `step` different from
    len(mal_th) must now pass it explicitly.
    """
    if step is None:
        step = len(mal_th)
    diffs = [abs(mal_th[i] - ben_th[i]) for i in range(len(mal_th))]
    d = max(diffs)
    print(d)
    print(d, mal_th[diffs.index(d)])

    for series in (mal_th, ben_th):
        plt.plot(series, marker= 11)
    plt.title("max dif at %f"%(d))
    plt.legend(["True positives","False positives"])
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xticks(np.arange(0,step, 1))
    plt.ylabel("")
    plt.draw()
    outDir = str(iteration)
    try:
        os.mkdir(outDir)
    except FileExistsError:
        # Re-using an existing output directory is expected; any other OSError
        # (permissions, missing parent) is no longer silently swallowed.
        pass
    else:
        print ("Successfully created the directory %s" % outDir)
    plt.savefig(outDir+"/"+str(start)+".png")
    plt.clf()



In [230]:
def similarityMetric(sr):
    """
    Measure how often a test sample's name agrees with the name of its match.

    Element 0 (test name) and element 2 (matched name) of every entry are split
    on ','; only the first two fields of each are used: (A, B) for the test
    name and (C, D) for the matched name. Three counters are kept:
        cA - A == C and B == D   (both fields agree)
        cB - A == C or  B == D   (at least one field agrees in place)
        cC - any of A, B equals any of C, D

    @params:
        sr - Required : list of [testName, score, matchedName] entries
    @returns:
        [cA/len(sr), cB/len(sr), cC/len(sr)]; [0.0, 0.0, 0.0] for an empty list
        (this used to raise ZeroDivisionError)
    @raises:
        IndexError if a name holds no ','
    """
    counterA = 0
    counterB = 0
    counterC = 0
    for entry in sr:
        testFields = entry[0].split(',')     # split once instead of twice
        matchFields = entry[2].split(',')
        A, B = testFields[0], testFields[1]
        C, D = matchFields[0], matchFields[1]
        if A == C and B == D:
            counterA += 1
        if A == C or B == D:
            counterB += 1
        if A == C or A == D or B == C or B == D:
            counterC += 1
    total = len(sr)
    if total:
        ratios = [counterA/total, counterB/total, counterC/total]
    else:
        ratios = [0.0, 0.0, 0.0]
    print("cA = %d (%f) cB= %d (%f) cC =%d (%f)"%(counterA, ratios[0], counterB, ratios[1], counterC, ratios[2]))
    return ratios

In [373]:
def detection(ben, mal, itr, step =10, threshIdx = 0):
    """
    Plot the share of malware and benign scores above a range of thresholds.

    `step` thresholds are spread evenly from ben[-1-threshIdx] (inclusive)
    towards 1.0 (exclusive). The notebook passes score lists sorted in
    descending order, so ben[-1-threshIdx] is the (threshIdx+1)-th lowest benign
    score. For every threshold the number of scores strictly above it is
    printed (malware first, then benign) and the corresponding fractions are
    plotted. The figure is saved as "<itr>--BTWC/<threshIdx>--btc.png".

    @params:
        ben       - Required : benign similarity scores
        mal       - Required : malware similarity scores
        itr       - Required : fold index, used in the directory name and title
        step      - Optional : number of thresholds (Int)
        threshIdx - Optional : offset from the end of `ben` of the first threshold (Int)
    @raises:
        IndexError if `ben` has fewer than threshIdx + 1 entries

    Fixes: scores are compared as arrays, so plain Python floats work too (a
    float cannot be compared with a list); the position of the largest
    difference is a scalar index, so ties no longer break the "%d" formatting;
    the title shows the first threshold itself rather than thresh[-1-threshIdx];
    the creation message names the directory actually created; only an
    already-existing directory is tolerated by the mkdir step.
    """
    benScores = np.asarray(ben, dtype=float)
    malScores = np.asarray(mal, dtype=float)
    firstThresh = benScores[-1 - threshIdx]
    dif = 1.0 - firstThresh
    thresh = firstThresh + np.arange(step) * dif / step
    vThresh = [round(float(t), 3) for t in thresh]
    # Rows are scores, columns thresholds; summing rows counts scores above each.
    malsOT = (malScores[:, None] > thresh).sum(axis=0)
    bensOT = (benScores[:, None] > thresh).sum(axis=0)
    print(malsOT)
    print(bensOT)
    m = malsOT / len(mal)
    p = bensOT / len(ben)
    d = m - p
    for series in (m, p):
        plt.plot(series, marker= 'o')

    _markers, _stems, baseline = plt.stem(np.arange(len(p)), p, linefmt='C0:')
    plt.setp(baseline, visible=False)
    plt.yticks(np.arange(0,1.05,0.05))
    plt.xticks(np.arange(len(thresh)), vThresh, rotation= 'vertical')
    plt.legend(['malware','benign'])
    bestIdx = int(np.argmax(d))  # first threshold with the largest malware-benign gap
    plt.figtext(.2,.2, "max dif %f at %d"%(d[bestIdx], bestIdx))
    plt.draw()

    outDir = str(itr)+"--BTWC"
    try:
        os.mkdir(outDir)
    except FileExistsError:
        # The directory is reused for every threshIdx of the same fold.
        pass
    else:
        print ("Successfully created the directory %s" % outDir)

    plt.title('detection at '+str(itr)+ ' iteration // first threshold = '+ str(thresh[0]))
    plt.savefig(outDir+"/"+str(threshIdx)+'--btc.png')
    plt.clf()

In [None]:
# Rank the benign results of every fold by similarity, highest first
# (sortTests also prints each ranked list).
sortBenignTests= sortTests(benignTests)

In [None]:
# Rank the malware results of every fold by similarity, highest first
# (sortTests also prints each ranked list).
sortMalwareTests =  sortTests(malwareTests)

In [None]:
# Keep only the similarity score (element 1) of every ranked entry, per fold.
malwares = [[entry[1] for entry in foldTests] for foldTests in sortMalwareTests]
benigns = [[entry[1] for entry in foldTests] for foldTests in sortBenignTests]

In [342]:
# Sanity check: the ranked benign scores of the first fold.
print(benigns[0])

[1.0, 1.0, 1.0, 0.9997878937928353, 0.999525610606294, 0.999396477344311, 0.9993574445469795, 0.9977934076304636, 0.9968699461783223, 0.9958794239100288, 0.9954295987566565, 0.9946736375907338, 0.9943800154268131, 0.993802219789989, 0.9915109356245235, 0.9891444937318298, 0.9872611180672923, 0.9865720874859274, 0.9861086368284955, 0.9800910262407794, 0.9778535674900424, 0.9740429036811438, 0.9737885498973333, 0.9728705368452987, 0.9691700524883922, 0.967858917960301, 0.9645328960256325, 0.9614477844957747, 0.9604938373840788, 0.9541644785923894, 0.9516055029539386, 0.9507694541122511, 0.9398583261745685, 0.9346179859545285, 0.8866506879792664]


In [375]:
# Run detection for every fold, starting the threshold range at each of the
# `step` lowest benign scores in turn (threshIdx = 0 .. step-1).
step = 10
for j in range(step):
    for i, benignScores in enumerate(benigns):
        detection(benignScores, malwares[i], i, threshIdx=j)

[521 520 520 519 519 518 514 514 502 486]
[34 34 34 34 34 32 29 26 21 16]
[517 516 516 516 516 515 513 510 499 486]
[34 34 33 32 30 27 25 21 19 14]
[520 519 519 518 517 515 513 506 502 484]
[34 34 33 32 30 28 25 20 18 14]
[521 521 520 518 517 514 509 504 500 483]
[34 34 33 32 30 28 25 21 18 12]
[528 528 525 524 523 523 520 515 508 496]
[34 34 33 32 30 28 25 20 18 14]
[519 519 516 514 514 514 509 498 490 476]
[33 32 32 29 28 26 22 19 17 14]
[516 516 516 515 515 513 510 508 495 479]
[33 32 32 29 27 26 22 19 18 14]
[519 519 517 515 515 513 507 505 495 479]
[33 32 32 29 28 26 22 18 17 14]
[521 519 518 516 514 511 504 500 494 479]
[33 32 32 29 28 26 22 19 17 12]
[525 525 523 523 523 521 517 510 503 492]
[33 32 32 29 28 26 20 18 18 14]
[519 517 514 514 514 511 506 498 488 476]
[32 32 30 29 27 24 21 19 16 13]
[516 516 516 515 513 512 510 505 494 477]
[32 32 30 29 26 24 21 19 17 13]
[519 518 517 515 514 512 507 503 493 474]
[32 32 30 29 27 24 20 18 17 13]
[519 518 517 516 512 508 504 500 490 4

<Figure size 432x288 with 0 Axes>

In [None]:
# Name-agreement ratios [cA, cB, cC] of the malware matches, one triple per fold.
similarities = [similarityMetric(foldTests) for foldTests in sortMalwareTests]
for i in similarities :
    print(i)
print('-------------------------------\n')
# Transpose to one list per metric. zip(*...) replaces `range(len(i))`, which
# depended on the loop variable `i` leaking out of the for-loop above and
# failed (or used a stale value) when `similarities` was empty.
counts = [list(column) for column in zip(*similarities)]
avg = [statistics.mean(column) for column in counts]

for j in avg : print(j)