In [1]:
import sys
import os
import csv
import time
import statistics
import numpy as np
from pathlib import Path, PureWindowsPath
from numpy import dot
from numpy.linalg import norm
import networkx as nx
import networkx.algorithms.matching
from collections import Counter
from IPython.display import Image, display
from matplotlib import pyplot as plt
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int); 0 is rendered as a completed bar
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    if total == 0:
        # Nothing to iterate over (e.g. callers passing len(items) - 1 for a
        # single item): show a full bar instead of dividing by zero.
        fraction = 1.0
        filledLength = length
    else:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [3]:
def getArray (path):
    """
    Read a CSV file into a list of integer rows.

    Each CSV row becomes one list; empty cells are skipped and every
    remaining cell is converted with int(). The reader uses the default csv
    dialect, so cells arrive as strings (no automatic float conversion).

    @params:
        path - Required : path of the CSV file to read
    @returns:
        list of lists of int, one inner list per CSV row
    @raises:
        ValueError if a non-empty cell is not a valid integer literal
    """
    results = []
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile): # each row is a list of strings
            results.append([int(cell) for cell in row if cell])
    return results

In [4]:
def createDirs(fold):
    """
    Collect the graph CSV files belonging to one cross-validation fold.

    Three directory trees are walked recursively and every file whose name
    ends with "default_G_tone_map.csv" is gathered:
      * "DATASET F1/fold_<fold>/20%_test"          (test samples of the fold)
      * "DATASET F1/specimen_benign/20%_test"      (benign test samples)
      * "DATASET F1/fold_<fold>/80%_train"         (training samples of the fold)

    The roots are handed to os.walk as PureWindowsPath objects, so the
    resulting paths use backslash separators.

    @params:
        fold - Required : fold number used to build the fold directory name
    @returns:
        [test_paths, benign_paths, train_paths], each a sorted list of paths
    """
    wanted_suffix = "default_G_tone_map" + ".csv"

    def collect(base):
        # Recursively gather every matching CSV below `base`, sorted by path.
        found = []
        for root, _dirs, files in os.walk(base):
            for name in files:
                if name.endswith(wanted_suffix):
                    found.append(os.path.join(root, name))
        return sorted(found)

    fold_root = "DATASET F1/fold_" + str(fold)
    test_root = PureWindowsPath(fold_root + "/20%_test")
    train_root = PureWindowsPath(fold_root + "/80%_train")
    benign_root = PureWindowsPath("DATASET F1/specimen_benign/20%_test")

    return [collect(test_root), collect(benign_root), collect(train_root)]

In [5]:
def createGraphs(dirs, npmode):
    """
    Load the adjacency matrices listed in `dirs` and optionally turn them
    into networkx graphs.

    @params:
        dirs   - Required : [test_paths, benign_paths, train_paths] as
                            returned by createDirs
        npmode - Required : if truthy return the raw numpy matrices,
                            otherwise return networkx graphs
    @returns:
        [malware, benign, train] - three lists, in that order, holding either
        numpy arrays (npmode truthy) or networkx graphs (npmode falsy)
    """
    def _load_matrices(paths):
        # One numpy adjacency matrix per CSV file.
        return [np.array(getArray(p)) for p in paths]

    def _to_graph(matrix):
        # nx.from_numpy_matrix was removed in networkx 3.0; prefer its
        # replacement and only fall back to the old name on old releases.
        builder = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
        return builder(matrix)

    mal_np_gs = _load_matrices(dirs[0])
    ben_np_gs = _load_matrices(dirs[1])
    train_np_gs = _load_matrices(dirs[2])

    if npmode:
        # Graph objects are not needed in this mode, so do not build them.
        return [mal_np_gs, ben_np_gs, train_np_gs]

    return [[_to_graph(m) for m in group]
            for group in (mal_np_gs, ben_np_gs, train_np_gs)]



In [6]:
def findCentrality(graphs):
    """
    Compute betweenness-centrality vectors for three groups of graphs.

    The groups are processed positionally, so the output order always matches
    the input order. With the [malware, benign, train] list produced by
    createGraphs, the result is [malware, benign, train] centralities.

    @params:
        graphs - Required : list whose first three items are lists of
                            networkx graphs
    @returns:
        list of three lists; each inner item is the list of betweenness
        centrality values of one graph, in the node order returned by
        nx.betweenness_centrality
    """
    def _centrality_vectors(group):
        # One list of per-node centrality values for every graph in the group.
        return [list(nx.betweenness_centrality(g).values()) for g in group]

    # Named after the order createGraphs returns, which is also how the
    # callers index the result.
    mal_gs, ben_gs, train_gs = graphs[0], graphs[1], graphs[2]
    mal_centr = _centrality_vectors(mal_gs)
    ben_centr = _centrality_vectors(ben_gs)
    train_centr = _centrality_vectors(train_gs)
    return [mal_centr, ben_centr, train_centr]


In [7]:
def calculateDifference(listA, listB, dirs,isBenign):
    """
    Match every vector of listA with its most similar vector of listB.

    Similarity is 1 - scipy cosine distance. For each vector of listA the
    best similarity and the position of the first vector of listB reaching it
    are kept. Labels are taken from the file paths: the third-from-last
    component when the path is split on backslashes.

    @params:
        listA    - Required : vectors to classify
        listB    - Required : reference vectors, index-aligned with dirs[2]
        dirs     - Required : [test_paths, benign_paths, train_paths]
        isBenign - Required : truthy if listA is index-aligned with dirs[1],
                              otherwise it is aligned with dirs[0]
    @returns:
        list of [label_of_A, best_similarity, label_of_best_match_in_B]
    """
    mal_paths, ben_paths, train_paths = dirs[0], dirs[1], dirs[2]
    source_paths = ben_paths if isBenign else mal_paths

    matches = []
    for idx, vector in enumerate(listA):
        similarities = [1 - distance.cosine(vector, reference) for reference in listB]
        best = max(similarities)
        own_label = source_paths[idx].split('\\')[-3]
        match_label = train_paths[similarities.index(best)].split('\\')[-3]
        matches.append([own_label, best, match_label])
    return matches

In [8]:
def sortTests(test):
    """
    Sort every result list by its score, highest first.

    @params:
        test - Required : list of result lists; each result is a sequence
                          whose item at index 1 is the score
    @returns:
        new list holding one sorted copy per input list (inputs are not
        modified)
    """
    return [sorted(results, key = lambda entry: entry[1], reverse = True)
            for results in test]

In [9]:
def saveFigures(mal_th, ben_th, start, iteration, step=None):
    """
    Plot two rate curves and save the figure as "<iteration>/<start>.png".

    The title reports the largest absolute gap between the two curves. The
    directory "<iteration>" is created if it does not exist yet.

    @params:
        mal_th    - Required : values plotted under the legend "True positives"
        ben_th    - Required : values plotted under the legend "False positives";
                               needs at least len(mal_th) items
        start     - Required : used as the file name (without extension)
        iteration - Required : used as the output directory name
        step      - Optional : number of x ticks; defaults to len(mal_th) so
                               the function no longer depends on a global
    @raises:
        ValueError if mal_th is empty
    """
    diffs = [abs(mal_th[i] - ben_th[i]) for i in range(len(mal_th))]
    d = max(diffs)
    n_ticks = len(mal_th) if step is None else step

    for series in (mal_th, ben_th):
        plt.plot(series, marker= 11)
    plt.title("max dif at %f"%(d))
    plt.legend(["True positives","False positives"])
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xticks(np.arange(0, n_ticks, 1))
    plt.ylabel("")
    plt.draw()

    out_dir = str(iteration)
    try:
        os.mkdir(out_dir)
    except FileExistsError:
        # The directory is reused across calls; any other OS error would make
        # savefig fail below anyway, so it is allowed to surface here.
        pass
    else:
        print ("Successfully created the directory %s" % out_dir)
    plt.savefig(out_dir+"/"+str(start)+".png")
    plt.clf()



In [10]:
def similarityMetric(sr):
    """
    Measure how often a sample's label pair agrees with its best match.

    Item 0 (own label) and item 2 (matched label) of every entry are split on
    ',' and their first two parts are compared:
      * cA - both parts equal in the same position
      * cB - at least one part equal in the same position
      * cC - any part of one label equal to any part of the other

    The counts and ratios are printed, and the ratios are returned.

    @params:
        sr - Required : non-empty list of [label, score, matched_label]
    @returns:
        [cA / len(sr), cB / len(sr), cC / len(sr)]
    @raises:
        ZeroDivisionError if sr is empty
    """
    exact = positional = anywhere = 0
    for entry in sr :
        own_parts = entry[0].split(',')
        match_parts = entry[2].split(',')
        A, B = own_parts[0], own_parts[1]
        C, D = match_parts[0], match_parts[1]

        first_same = A == C
        second_same = B == D
        if first_same and second_same:
            exact += 1
        if first_same or second_same:
            positional += 1
        if first_same or second_same or A == D or B == C:
            anywhere += 1

    total = len(sr)
    ratios = [exact/total, positional/total, anywhere/total]
    print("cA = %d (%f) cB= %d (%f) cC =%d (%f)"%(exact, ratios[0], positional, ratios[1], anywhere, ratios[2]))
    return ratios

In [11]:
def detection(ben, mal, itr, step):
    """
    Compute detection rates over a sweep of similarity thresholds and save
    the resulting plot as "<itr>--btc.png".

    Thresholds are `step` evenly spaced values starting at ben[-1] and moving
    towards ben[0], followed by a final threshold of 0.99999999. For every
    threshold the percentage of scores strictly above it is computed.

    @params:
        ben  - Required : benign scores; expected in descending order (as
                          produced by sortTests) so ben[0] is the highest and
                          ben[-1] the lowest
        mal  - Required : malware scores
        itr  - Required : iteration label used in the plot title and file name
        step - Required : number of evenly spaced thresholds (Int)
    @returns:
        ([m, p], thresh) where m and p are numpy arrays with the percentage
        of malware / benign scores above each threshold and thresh is the
        list of step + 1 thresholds
    @raises:
        IndexError if ben is empty, ZeroDivisionError if mal is empty
    """
    dif = ben[0]-ben[-1]
    thresh = [ben[-1]+i*dif/step for i in range(step)]
    thresh.append(0.99999999)
    # Compare against an array so the element-wise test also works when the
    # scores are plain Python floats rather than numpy scalars.
    thr_arr = np.asarray(thresh)
    m = np.array((sum(score > thr_arr for score in mal)/len(mal))*100)
    p = np.array((sum(score > thr_arr for score in ben)/len(ben))*100)

    for rates in (m, p):
        plt.plot(rates)
    # Rates are percentages, so tick the axis on the 0-100 scale.
    plt.yticks(np.arange(0,105,5))
    plt.draw()
    plt.title('detection at '+str(itr)+ ' iteration')
    plt.savefig(str(itr)+'--btc.png')
    plt.clf()
    return [m,p],thresh

In [12]:
def detectSteps(testA, testB,step, metric = "Csm"):
    """
    Run detection() once per pair of score lists.

    @params:
        testA  - Required : list of score lists passed as detection's first
                            argument
        testB  - Required : list of score lists passed as detection's second
                            argument; needs at least len(testA) items
        step   - Required : number of thresholds forwarded to detection
        metric - Optional : kept for backward compatibility; it is not
                            forwarded because detection() takes no such
                            parameter
    @returns:
        (detect, thresh) - the per-pair rate lists and the per-pair threshold
        lists returned by detection, in input order
    """
    detect = []
    thresh = []
    for i in range(len(testA)):
        # detection(ben, mal, itr, step) accepts exactly these four arguments.
        det, thr = detection(testA[i], testB[i], i, step)
        detect.append(det)
        thresh.append(thr)
    return detect, thresh

In [13]:
def plot_metrics(listA, listB, listThr, metric):
    """
    Plot two rate curves against their thresholds and save the figure.

    listA is drawn in red (legend "TP-rate") and listB in green (legend
    "FP-rate"). The x ticks are labelled with the thresholds rounded to five
    decimals. The pair of values at the position of the largest absolute gap
    between the curves is printed. The figure is saved under the name
    "Average five fold with <metric> metric".

    @params:
        listA   - Required : first curve
        listB   - Required : second curve; needs at least len(listA) items
        listThr - Required : thresholds used as x tick labels
        metric  - Required : metric name inserted into the file name
    """
    # Absolute gap between the curves at every position.
    gaps = [abs(listA[k] - listB[k]) for k in range(len(listA))]
    # Seeding with 0 mirrors a running maximum that starts at zero.
    widest = max([0] + gaps)
    at = gaps.index(widest)
    print(listA[at], listB[at])

    plt.figure(num=None, figsize=(10,7), dpi=200, facecolor='w', edgecolor=None)
    plt.margins(0.01)
    for curve, colour in ((listA, 'r'), (listB, 'g')):
        plt.plot(curve, colour)

    positions = list(range(len(listThr)))
    labels = [round(value, 5) for value in listThr]
    plt.xticks(positions, labels)
    plt.xticks(fontsize=8, rotation=90)
    plt.yticks(np.arange(0,105,5))
    plt.legend(["TP-rate","FP-rate"], loc='lower left')
    plt.savefig("Average five fold with %s metric"%(metric))
    plt.clf()

In [14]:
# CSV path lists ([test, benign, train]) for each of the five folds.
iterationDirs = [createDirs(fold) for fold in range(1, 6)]

In [15]:
# Build the networkx graphs (npmode=False) of every fold.
graphs = [createGraphs(fold_dirs, False) for fold_dirs in iterationDirs]

In [16]:
# Betweenness-centrality vectors of every fold, in the same group order as `graphs`.
graphCentr = [findCentrality(fold_graphs) for fold_graphs in graphs]

In [17]:
# Match the group at index 0 of every fold against the train group (index 2)
# and report how long the whole comparison took.
t = time.time()
malwareTests = [
    calculateDifference(fold_centr[0], fold_centr[2], fold_dirs, False)
    for fold_centr, fold_dirs in zip(graphCentr, iterationDirs)
]
print("malware took %f seconds"%(time.time()-t))

malware took 153.024907 seconds


In [18]:
# Match the group at index 1 of every fold against the train group (index 2).
benignTests = [
    calculateDifference(fold_centr[1], fold_centr[2], fold_dirs, True)
    for fold_centr, fold_dirs in zip(graphCentr, iterationDirs)
]

In [19]:
# Per-fold benign results ordered by similarity, highest first.
sortBenignTests = sortTests(test=benignTests)

In [20]:
# Per-fold malware results ordered by similarity, highest first.
sortMalwareTests = sortTests(test=malwareTests)

In [21]:
# Keep only the similarity score (item 1) of every sorted result, per fold.
malwares = [[result[1] for result in fold_results] for fold_results in sortMalwareTests]
benigns = [[result[1] for result in fold_results] for fold_results in sortBenignTests]

In [22]:
# Threshold sweep for every fold, then the fold-averaged rates are printed
# and plotted to 'average--btc.png'.
step = 20
d, thresh = [], []
for i, ben_scores in enumerate(benigns):
    db, trh = detection(ben_scores, malwares[i], i, step)
    d.append(db)
    thresh.append(trh)

# detection returns [rates_of_second_argument, rates_of_first_argument], so
# item 0 holds the malware rates and item 1 the benign rates.
mal_det = np.array([rates[0] for rates in d])
ben_det = np.array([rates[1] for rates in d])
threshMean = np.mean(np.array(thresh), axis=0)
avg_d = np.mean(mal_det, axis=0)
avg_d_b = np.mean(ben_det, axis=0)
print(avg_d)
print(avg_d_b)

plt.figure(num=None, figsize=(10,7), dpi=200, facecolor='w', edgecolor=None)
plt.margins(0.01)
plt.plot(avg_d, 'r')
plt.plot(avg_d_b, 'g')
plt.yticks(np.arange(0, 105, 5))
plt.xticks(np.arange(0, len(avg_d), 1))
plt.legend(['TP-rate', 'FP-rate'], loc='lower left')
plt.draw()
plt.savefig('average--btc.png')
plt.clf()

[99.77025424 99.73186652 99.65509109 99.57947672 99.50328182 99.42650639
 99.31192378 99.23572888 99.19734116 99.0437903  98.92862715 98.65991314
 98.31616529 97.89448095 97.55131364 96.90162513 96.09664416 94.79436448
 93.18614414 90.0104133  79.10010196]
[97.14285714 97.14285714 97.14285714 97.14285714 94.85714286 92.57142857
 92.57142857 92.         88.         85.14285714 81.71428571 77.71428571
 73.71428571 69.71428571 61.71428571 56.         53.71428571 49.14285714
 40.         32.57142857  8.        ]


<Figure size 432x288 with 0 Axes>

<Figure size 2000x1400 with 0 Axes>

In [23]:
# Label-agreement ratios of the malware matches for every fold, followed by
# the mean of each ratio across folds.
similarities = [similarityMetric(fold_results) for fold_results in sortMalwareTests]
for i in similarities :
    print(i)
print('-------------------------------\n')
# Transpose fold rows into per-metric columns. This no longer relies on the
# loop variable `i` that happened to be left over from the loop above.
counts = [list(column) for column in zip(*similarities)]
avg = [statistics.mean(column) for column in counts]

for j in avg : print(j)

cA = 216 (0.414587) cB= 297 (0.570058) cC =314 (0.602687)
cA = 217 (0.416507) cB= 318 (0.610365) cC =328 (0.629559)
cA = 252 (0.483685) cB= 321 (0.616123) cC =339 (0.650672)
cA = 217 (0.416507) cB= 292 (0.560461) cC =309 (0.593090)
cA = 191 (0.361059) cB= 262 (0.495274) cC =286 (0.540643)
[0.4145873320537428, 0.5700575815738963, 0.6026871401151631]
[0.4165067178502879, 0.6103646833013435, 0.6295585412667947]
[0.4836852207293666, 0.6161228406909789, 0.6506717850287908]
[0.4165067178502879, 0.5604606525911708, 0.5930902111324377]
[0.3610586011342155, 0.4952741020793951, 0.5406427221172023]
-------------------------------

0.41846891792358015
0.570455972047357
0.6033300799320777


In [24]:
# Save the fold-averaged rate curves with the mean thresholds as x labels.
plot_metrics(listA=avg_d, listB=avg_d_b, listThr=threshMean, metric="BETWEENES")

79.1001019560319 8.0


<Figure size 2000x1400 with 0 Axes>

In [25]:
# Sanity check: print the rows of the first CSV in the first path list of the first fold.
sample_matrix = getArray(iterationDirs[0][0][0])
for i in sample_matrix:
    print(i)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0