In [1]:
from collections import defaultdict 
import statistics
import os
import sys
import csv
import math
import random
import time
import networkx as nx
from matplotlib.pyplot import figure
from networkx.algorithms.flow import edmonds_karp
from pathlib import Path, PureWindowsPath
from networkx.algorithms.link_analysis import pagerank_numpy
from collections import Counter
from IPython.display import Image, display
from matplotlib import pyplot as plt
import numpy as np
from numpy import array
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
from networkx import similarity as sm


In [2]:
def simAverages(malTest, benTest, metric):
    similarities = [similarityMetric(malTest[i]) for i in range(len(malTest))]
    ties = []
    print(malTest[0])
    for i in similarities :
        print(i)
    counts = [[i[j] for i in similarities] for j in range(len(i))]
    avg = [statistics.mean(i) for i in counts]
    print("---------------")
    print(metric)
    print("---------------")
    print(avg)

In [3]:
def similarityMetric(sr):
    counterA = 0
    counterB = 0
    counterC = 0
    for i in sr :
        A,B = i[0].split(',')[0], i[0].split(',')[1]
        C,D = i[2].split(',')[0], i[2].split(',')[1]
        if A==C and B==D:
            counterA +=1
        if A==C or B==D:
            counterB +=1
        if A==C or A==D or B == C or B == D:
            counterC += 1
#     print("cA = %d (%f) cB= %d (%f) cC =%d (%f)"%(counterA, counterA/len(sr), counterB, counterB/len(sr), counterC, counterC/len(sr)))
    return [counterA/len(sr),counterB/len(sr),counterC/len(sr)]

In [4]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [5]:
def createDirs(fold):
    path = "DATASET F1/fold_" +str(fold)
    mal_path = path+ "/20%_test"
    train_path = path+ "/80%_train"
    benign_path = "DATASET F1/specimen_benign/20%_test"
    
    w_mal_path = PureWindowsPath(mal_path)
    w_train_path= PureWindowsPath(train_path)
    w_benign_path = PureWindowsPath(benign_path)
#     print( mal_path+"\n"+ train_path+"\n"+ benign_path)
#     print( w_mal_path, w_train_path, w_benign_path)

    filename = "default_G_tone_map"
    mapFilename = "default_unique_mapping"
    
    rootDir = [os.path.join(root, name)
             for root, dirs, files in os.walk(w_mal_path)
             for name in files
             if name.endswith(filename+".csv")]
    rootDir.sort()
    # for i in range(len(paths)):
    #     p = [os.path.join(root, name)
    #              for root, dirs, files in os.walk(paths[i]+"/"+path)
    #              for name in files
    #              if name.endswith(filename+".csv")]
    #     p.sort()
    #     rootDirs[i] = p
    # print(rootDirs[10])
    benignDir = [os.path.join(root, name)
                 for root, dirs, files in os.walk(w_benign_path)
                 for name in files
                 if name.endswith(filename+".csv")]
    benignDir.sort()

    # for i in range(len(paths)):
    trainDir = [os.path.join(root, name)
                 for root, dirs, files in os.walk(w_train_path)
                 for name in files
                 if name.endswith(filename+".csv")]
    trainDir.sort()
    #     trainDirs[i] = trainDir
    return [rootDir,benignDir,trainDir]

In [6]:
def createGraphs(dirs,npmode):
    time_to = time.time()
    rootDir = dirs[0]
    benignDir = dirs[1]
    trainDir = dirs[2]
    mal_gs = []
    mal_np_gs = []
    ben_gs = []
    train_gs = []
    train_np_gs = []
    ben_np_gs = []
    csm_vals = {}
    for i in range(len(rootDir)):
        A = np.array(getArray(rootDir[i]))
        G = nx.from_numpy_matrix(A)
        mal_gs.append(G)
        mal_np_gs.append(A)
#         printProgressBar(i, len(rootDir)-1,prefix = 'malware gs')

    for i in range(len(trainDir)):
        A = np.array(getArray(trainDir[i]))
        G = nx.from_numpy_matrix(A)
        train_gs.append(G)
        train_np_gs.append(A)
#         printProgressBar(i, len(trainDir)-1, prefix = 'train gs')
        
    for i in range(len(benignDir)):
        A = np.array(getArray(benignDir[i]))
        G = nx.from_numpy_matrix(A)
        ben_gs.append(G)
        ben_np_gs.append(A)
#         printProgressBar(i, len(benignDir)-1, prefix = 'benign gs')
    if npmode:
        return [mal_np_gs, ben_np_gs, train_np_gs]
    else: 
        return [mal_gs, ben_gs, train_gs]
    

In [7]:
def calculateDifference(listA, listB, dirs,isBenign, metric = "csm"):
    ret_val = []
    rootDir = dirs[0]
    benDir = dirs[1]
    trainDir = dirs[2]
    methods = {'csm': lambda a,b : 1-distance.cosine(a,b), 
               'euc': lambda a,b: 1000.0/(1000.0+distance.euclidean(a,b)), 
               'bray': lambda a,b: 1-distance.braycurtis(a,b), 
               'mink': lambda a,b: 1000.0/(1000.0+distance.minkowski(a,b,1)),
               'prod': lambda a,b: (1-distance.cosine(a,b))*(1-distance.braycurtis(a,b)),
               'sqeuc' : lambda a,b: 1000.0/(1000.0+distance.sqeuclidean(a,b)),
               'cor' : lambda a,b: 1-distance.correlation(a,b)}
    for i in range(len(listA)):
#         printProgressBar(i,len(listA)-1, length = 50, prefix = 'diff calculation', suffix = "with " + metric)
        val = []
        e_val = []
        stats = []
        for j in range(len(listB)):
            val.append(methods[metric](listA[i], listB[j]))
        if isBenign:
            ret_val.append([benDir[i].split('\\')[-3],max(val),trainDir[val.index(max(val))].split('\\')[-3]])
        else :
            ret_val.append([rootDir[i].split('\\')[-3],max(val),trainDir[val.index(max(val))].split('\\')[-3]])
    return ret_val


In [8]:
def detection(ben, mal, itr, step, metric = 'Csm' ):
    thresh =np.append(list(np.arange(ben[-1], ben[0], abs(ben[0]-ben[-1])/step)),ben[0])
    
    m = np.array((sum(i > thresh for i in mal)/len(mal))*100)
    p = np.array((sum(i > thresh for i in ben)/len(ben))*100)
    
#     for i in [m,p]:
#         plt.plot(i)
        
#     plt.yticks(np.arange(0,100,5))
#     plt.xticks(np.arange(0,len(thresh),1))
#     plt.draw()
#     plt.title('detection at '+str(itr)+ ' iteration')
#     plt.savefig(metric+ "--"+ str(itr)+'.png')
#     plt.clf() 
    return [m, p], thresh

In [9]:
def detectSteps(testA, testB,step, metric = "Csm"):
    detect = []
    thresh = []
    for i in range(len(testA)):
        det, thr = detection(testA[i], testB[i], i, step, metric=metric)
        detect.append(det)
        thresh.append(thr)
    return detect, thresh

In [10]:
def sortTests(test):
    sortedTests =[]
    for i in range(len(test)):
        srTest = sorted(test[i], key = lambda kv:kv[1],reverse = True)
        sortedTests.append(srTest)
    return sortedTests

In [11]:
def plot_metrics(listA, listB, listThr, metric):
    
    diffs = []
    v = 0
    print(listThr)
    for i in range(len(listA)):
        v = max(v,abs(listA[i]-listB[i]))
        diffs.append(abs(listA[i]-listB[i]))
    pd = diffs.index(v)
    print(listA[pd], listB[pd])
    figure(num=None, figsize=(10,7), dpi=200, facecolor='w', edgecolor=None)
    plt.margins(0.01)
    plt.plot(listA,'r')
    plt.plot(listB,'g')
#     plt.plot(pd,listA[pd],'yo-')
#     plt.plot(pd,listB[pd],'yo-')
#     plt.plot((pd,pd),(listA[pd],listB[pd]),'k--')
    xi = list(range(len(listThr)))
    plt.xticks(xi,[round(i,5) for i in listThr])
    plt.xticks(fontsize=8, rotation=90)
    plt.yticks(np.arange(0,105,5))
    plt.legend(["TP-rate","FP-rate"], loc='lower left')
#     plt.title("Average five fold with %s metric"%(metric))
    plt.savefig("Average five fold with %s metric pagerank"%(metric))
    plt.clf()

In [12]:
def getArray (path):
    results = []
    with open(path) as csvfile:
        reader = csv.reader(csvfile,csv.QUOTE_NONNUMERIC) # change contents to floats
        for row in reader: # each row is a list
            nums = []
            for i in row: 
                if i :
                    nums.append(int(i))
            results.append(nums)
    return results

In [13]:
def createGraphs(dirs, npmode):
    rootDir = dirs[0]
    benignDir = dirs[1]
    trainDir = dirs[2]
    mal_gs = []
    mal_np_gs = []
    ben_gs = []
    train_gs = []
    train_np_gs = []
    ben_np_gs = []
    csm_vals = {}
    for i in range(len(rootDir)):
        A = np.array(getArray(rootDir[i]))
        G = nx.from_numpy_matrix(A)
        mal_gs.append(G)
        mal_np_gs.append(A)
#         printProgressBar(i, len(rootDir)-1, suffix = 'malware gs')
    for i in range(len(trainDir)):
        A = np.array(getArray(trainDir[i]))
        G = nx.from_numpy_matrix(A)
        train_gs.append(G)
        train_np_gs.append(A)
#         printProgressBar(i, len(trainDir)-1, suffix = 'train gs')
    for i in range(len(benignDir)):
        A = np.array(getArray(benignDir[i]))
        G = nx.from_numpy_matrix(A)
        ben_gs.append(G)
        ben_np_gs.append(A)
#         printProgressBar(i, len(benignDir)-1, suffix = 'benign gs')
    if npmode: 
        return [mal_np_gs, ben_np_gs, train_np_gs]
    else: 
        return [mal_gs, ben_gs, train_gs]

In [14]:
def pgrank (graphs,fold) :
    mal_g = graphs[0]
    ben_g = graphs[1]
    train_g =  graphs[2]
    
    pgr = [nx.pagerank_numpy(mal_g[i])for i in range(len(mal_g))]
    pgr_ben = [nx.pagerank_numpy(ben_g[i]) for i in range(len(ben_g))]
    pgr_train = [nx.pagerank_numpy(train_g[i]) for i in range(len(train_g))]
    
    pgrgs = [list(pgr[i].values()) for i in range(len(pgr))]
    pgrgs_ben = [list(pgr_ben[i].values())for i in range(len(pgr_ben))]
    pgrgs_train = [list(pgr_train[i].values())for i in range(len(pgr_train))]
    
    sr = calculateDifference(pgrgs_ben,pgrgs_train, itDirs[fold],True)
    wr= calculateDifference(pgrgs,pgrgs_train, itDirs[fold],False)
    
    srr = sorted(sr, key= lambda kv:kv[1],reverse= True)
    malrr = sorted(wr, key= lambda kv:kv[1],reverse = True)
    print("-------------==========================--------------------")
    malr = [malrr[i][1] for i in range(len(malrr))]
    benr = [srr [i][1] for i in range(len(srr))]
    
    return [malr,benr,sr,wr]

In [15]:
itDirs = [createDirs(i) for i in range(1,6)]
graphs = [createGraphs(i,False) for i in itDirs]


In [16]:
maps = [pgrank(graphs[i],i) for i in range(len(graphs))]



In [17]:
malwareTests= [maps[i][3] for i in range(len(maps))]
benignTests= [maps[i][2] for i in range(len(maps))]
simAverages(sortTests(malwareTests),sortTests(benignTests),"Csm")
print("===========================================")
print()

[['ABU,Banload,', 1.0, 'Banbra,Banker,'], ['ABU,Banload,', 1.0, 'Banbra,Banker,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Agent,', 1.0, 'Agent,Agent,'], ['Agent,Small,', 1.0, 'Agent,Agent,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Allaple,RAHack,', 1.0, 'Allaple,RAHack,'], ['Ardamax,Ardamax,', 1.0, 'Ardamax,Ardamax,'], ['Ardamax,Ardamax,', 1.0, 'Ardamax,Ardamax,'], ['Ardamax,Ardamax,', 1.0, 'Ardamax,Ardamax,'], ['Bactera,VB,'

In [18]:
mal = []
ben = []
steps = 20
for i,j in maps:
    mal.append(i)
    ben.append(j)
det, thr = detectSteps(ben,mal,steps, metric= "pagerank")
mal_det = np.array([det[i][0] for i in range(len(det))])
ben_det = np.array([det[i][1] for i in range(len(det))])
threshPgr = np.mean(np.array([i for i in thr]),axis=0)
avg_det = np.mean(mal_det,axis=0)
avg_det_b = np.mean(ben_det,axis=0)
plot_metrics(avg_det,avg_det_b,threshPgr,"pagerank")

ValueError: too many values to unpack (expected 2)

In [None]:
print(maps[0][1])

In [None]:
simAverages(maps[0],maps[1],"Csm")
print("===========================================")
print()

In [None]:
graphCentr = [findCentrality(graphs[i]) for i in range(len(graphs))]

In [None]:
t = time.time()
malwareTests = [calculateDifference(graphCentr[i][0],graphCentr[i][2], 
                                    itDirs[i], False) for i in range(len(graphCentr))]
print("malware took %f seconds"%(time.time()-t))
benignTests = [calculateDifference(graphCentr[i][1],graphCentr[i][2], 
                                   itDirs[i], True) for i in range(len(graphCentr))]


In [None]:
sortBenignTests= sortTests(benignTests)
sortMalwareTests =  sortTests(malwareTests)

malwares =  [[sortMalwareTests[j][i][1] for i in range(len(sortMalwareTests[j]))] 
             for j in range(len(sortMalwareTests))]
benigns =  [[sortBenignTests[j][i][1] for i in range(len(sortBenignTests[j]))] 
             for j in range(len(sortBenignTests))]

In [None]:
steps = 20
detBtc, thrBtc = detectSteps(benigns,malwares,steps, metric= "bet centr")
malbtc = np.array([detBtc[i][0] for i in range(len(detBtc))])
benbtc = np.array([detBtc[i][1] for i in range(len(detBtc))])
threshBtc = np.mean(np.array([i for i in thrBtc]),axis=0)
avg_btc = np.mean(malbtc,axis=0)
avg_btc_b = np.mean(benbtc,axis=0)
plot_metrics(avg_btc,avg_btc_b,threshBtc,"btcentrality")

In [None]:
simAverages(sortMalwareTests,sortBenignTests,"Csm")
print("===========================================")
print()