In [None]:
import sys
import os
import csv
import time
import statistics
import numpy as np
from pathlib import Path, PureWindowsPath
from numpy import dot
from numpy.linalg import norm
import networkx as nx
import networkx.algorithms.matching
from collections import Counter
from IPython.display import Image, display
from matplotlib import pyplot as plt
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    A total of 0 is drawn as a full, 100% bar instead of raising ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # total can be 0 when a caller reports against len(items) - 1 for a single item;
        # there is nothing left to wait for, so show the bar as complete.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [None]:
def similarityMetric(sr):
    """
    Count how often the comma-separated label at index 0 of a row agrees with the
    comma-separated label at index 2 of the same row, and print/return the ratios.

    Each label is split on ',' and only its first two fields (A,B and C,D) are used.
    Three counters are kept:
        counterA - both fields match (A == C and B == D)
        counterB - at least one field matches in the same position
        counterC - any field of one label equals any field of the other

    @params:
        sr - sequence of rows; row[0] and row[2] must be strings containing at
             least one ','
    @returns:
        [counterA/len(sr), counterB/len(sr), counterC/len(sr)];
        [0.0, 0.0, 0.0] when sr is empty (previously a ZeroDivisionError)
    """
    counterA = 0
    counterB = 0
    counterC = 0
    for row in sr:
        # Split each label once instead of once per field
        left = row[0].split(',')
        right = row[2].split(',')
        A, B = left[0], left[1]
        C, D = right[0], right[1]
        if A == C and B == D:
            counterA += 1
        if A == C or B == D:
            counterB += 1
        if A == C or A == D or B == C or B == D:
            counterC += 1
    total = len(sr)
    if total:
        ratios = [counterA / total, counterB / total, counterC / total]
    else:
        # Nothing to compare: report zero ratios rather than dividing by zero
        ratios = [0.0, 0.0, 0.0]
    print("cA = %d (%f) cB= %d (%f) cC =%d (%f)"%(counterA, ratios[0], counterB, ratios[1], counterC, ratios[2]))
    return ratios

In [None]:
def createDirs(fold):
    """
    Collect the sorted paths of every "*default_G_tone_map.csv" file for one fold.

    Three directory trees are walked, in this order: the fold's "20%_test" folder,
    the shared benign "20%_test" folder, and the fold's "80%_train" folder.
    The locations are printed both as plain strings and as PureWindowsPath objects;
    the PureWindowsPath objects are what is handed to os.walk.

    @params:
        fold - fold identifier appended to "DATASET F1/fold_"
    @returns:
        [test-folder paths, benign paths, train-folder paths], each list sorted
    """
    foldRoot = "DATASET F1/fold_" + str(fold)
    testLocation = foldRoot + "/20%_test"
    trainLocation = foldRoot + "/80%_train"
    benignLocation = "DATASET F1/specimen_benign/20%_test"

    winTest = PureWindowsPath(testLocation)
    winTrain = PureWindowsPath(trainLocation)
    winBenign = PureWindowsPath(benignLocation)
    print(testLocation + "\n" + trainLocation + "\n" + benignLocation)
    print(winTest, winTrain, winBenign)

    wantedSuffix = "default_G_tone_map" + ".csv"

    def collect(top):
        # Gather every matching file below `top`, returned in sorted order
        matches = []
        for root, _subdirs, files in os.walk(top):
            for name in files:
                if name.endswith(wantedSuffix):
                    matches.append(os.path.join(root, name))
        return sorted(matches)

    testFiles = collect(winTest)
    benignFiles = collect(winBenign)
    trainFiles = collect(winTrain)
    return [testFiles, benignFiles, trainFiles]

In [None]:
def createGraphs(dirs, npmode):
    """
    Load every CSV adjacency matrix listed in dirs and turn it into a graph.

    @params:
        dirs   - [malware paths, benign paths, train paths]; the three lists are
                 read as dirs[0], dirs[1] and dirs[2]
        npmode - truthy: return the numpy arrays; falsy: return the networkx graphs
    @returns:
        [malware items, benign items, train items]
    """
    def load(paths, label):
        # Read each file with getArray and keep both the array and the graph built from it
        graphList, arrayList = [], []
        for done, path in enumerate(paths, start=1):
            A = np.array(getArray(path))
            # from_numpy_array replaces from_numpy_matrix, which NetworkX 3.0 removed
            graphList.append(nx.from_numpy_array(A))
            arrayList.append(A)
            # Report done/len(paths): a total of len(paths) - 1 is 0 for a single
            # file and would divide by zero inside the progress bar
            printProgressBar(done, len(paths), prefix = label)
        return graphList, arrayList

    rootDir = dirs[0]
    benignDir = dirs[1]
    trainDir = dirs[2]

    # Same processing order as before: malware, then train, then benign
    mal_gs, mal_np_gs = load(rootDir, 'malware gs')
    train_gs, train_np_gs = load(trainDir, 'train gs')
    ben_gs, ben_np_gs = load(benignDir, 'benign gs')

    if npmode:
        return [mal_np_gs, ben_np_gs, train_np_gs]
    return [mal_gs, ben_gs, train_gs]



In [None]:
def getArray (path):
    """
    Read a CSV file of integers into a list of rows.

    Empty cells (for example the one produced by a trailing comma) are skipped;
    every other cell is converted with int().

    @params:
        path - path of the CSV file to read
    @returns:
        list of lists of int, one inner list per CSV row
    @raises:
        ValueError if a non-empty cell is not an integer literal
    """
    results = []
    # newline='' is the mode the csv module documents for files given to csv.reader
    with open(path, newline='') as csvfile:
        # The second positional parameter of csv.reader is the dialect, not the quoting
        # mode, so the csv.QUOTE_NONNUMERIC previously passed there converted nothing.
        # Cells arrive as strings and are converted explicitly below.
        reader = csv.reader(csvfile)
        for row in reader: # each row is a list of strings
            results.append([int(cell) for cell in row if cell])
    return results

In [None]:
def detection(ben, mal, step= 10):
    """
    Plot, for a range of thresholds, the fraction of scores above each threshold.

    `step` thresholds are spaced evenly from ben[-1] towards ben[0] (ben[0] itself
    is not included). Two curves are drawn on the current matplotlib figure: first
    the fraction of `mal` scores above each threshold, then the same for `ben`.

    @params:
        ben  - non-empty sequence of scores; the callers in this notebook pass it
               sorted in descending order, so ben[0] is the largest value
        mal  - non-empty sequence of scores
        step - number of thresholds
    """
    dif = ben[0]-ben[-1]
    # Keep the thresholds in an ndarray so that `score > thresh` broadcasts for plain
    # Python floats too; comparing a built-in float with a list raises TypeError.
    thresh = np.array([ben[-1]+i*dif/step for i in range(step)])
    m = np.array(sum(score > thresh for score in mal)/len(mal))
    p = np.array(sum(score > thresh for score in ben)/len(ben))
    plt.plot(m)
    plt.plot(p)


In [None]:
# File lists for fold 1; createDirs returns one list per directory tree it walks
c = createDirs(fold=1)
print(len(c))

In [None]:
# Build the networkx graphs (not the raw arrays) for the fold-1 file lists
g = createGraphs(c, npmode=False)

In [None]:
def calculateDifference(listA, listB, dirs,isBenign):
    """
    For every vector in listA, find the most cosine-similar vector in listB.

    @params:
        listA    - sequence of feature vectors to classify
        listB    - non-empty sequence of reference feature vectors; the name of
                   the best match is looked up at the same index in dirs[2]
        dirs     - [malware paths, benign paths, train paths]; paths are split on
                   a backslash and the third component from the end is the name
        isBenign - truthy: listA[i] is named after dirs[1][i];
                   falsy: after dirs[0][i]
    @returns:
        list of [name of listA[i], best similarity, name of the best match]
    """
    trainDir = dirs[2]
    sourceDir = dirs[1] if isBenign else dirs[0]
    csm_val = []
    for i in range(len(listA)):
        # Report i + 1 of len(listA): a total of len(listA) - 1 is 0 for a single
        # vector and would divide by zero inside the progress bar
        printProgressBar(i + 1, len(listA), prefix='diff calculation:')
        val = [1-distance.cosine(listA[i], candidate) for candidate in listB]
        best = max(val)
        csm_val.append([sourceDir[i].split('\\')[-3], best, trainDir[val.index(best)].split('\\')[-3]])
    return csm_val

In [None]:
from networkx.algorithms import approximation as approx

In [None]:
# Approximate average clustering of every graph in g[0]
e = [approx.average_clustering(graph) for graph in g[0]]

In [None]:
# Approximate average clustering of every graph in g[1]
q = [approx.average_clustering(graph) for graph in g[1]]

In [None]:
# Overlay 10-bin histograms of the clustering values: e was computed from g[0],
# q from g[1]
plt.hist(e,bins= 10)
plt.hist(q,bins=10)

In [None]:
# Degree assortativity coefficient of every graph in g[0]
x = [nx.degree_assortativity_coefficient(graph) for graph in g[0]]

In [None]:
# Degree assortativity coefficient of every graph in g[1]
f = [nx.degree_assortativity_coefficient(graph) for graph in g[1]]

In [None]:
# Overlay 10-bin histograms of the assortativity coefficients: x was computed
# from g[0], f from g[1]
plt.hist(x,bins= 10)
plt.hist(f,bins=10)

In [None]:
# Sort both coefficient lists in descending order before thresholding
sx = sorted(x,reverse= True)
sf = sorted(f,reverse= True)

# NOTE(review): detection is declared as detection(ben, mal, ...), but sx is built
# from g[0] (the malware graphs returned by createGraphs) and sf from g[1] (the
# benign graphs) -- confirm that this argument order is intended
detection(sx,sf, step = 25)

In [None]:
# Per-node average neighbor degree for every graph in g[0]
avgd = [nx.average_neighbor_degree(graph) for graph in g[0]]

In [None]:
# Per-node average neighbor degree for every graph in g[1]
avgd_ben = [nx.average_neighbor_degree(graph) for graph in g[1]]

In [None]:
# Keep only the values of each result mapping, as plain lists
avgdList = [list(perNode.values()) for perNode in avgd]
avgdListBen = [list(perNode.values()) for perNode in avgd_ben]

In [None]:
# Best cosine match of every benign neighbor-degree vector against avgdList.
# NOTE(review): avgdList was computed from g[0], but calculateDifference looks up
# the name of the best match in dirs[2] (the train paths) by index -- confirm that
# the reference set and the name list are meant to differ
calculateDifference(avgdListBen,avgdList, c, True)

In [None]:
from networkx.algorithms.link_analysis import pagerank_numpy

In [None]:
# PageRank of every graph in g[0], g[1] and g[2]
pgr = [nx.pagerank_numpy(graph) for graph in g[0]]
pgr_ben = [nx.pagerank_numpy(graph) for graph in g[1]]
pgr_train = [nx.pagerank_numpy(graph) for graph in g[2]]

In [None]:
# Keep only the rank values of each PageRank result, as plain lists
pgrgs = [list(ranks.values()) for ranks in pgr]
pgrgs_ben = [list(ranks.values()) for ranks in pgr_ben]
pgrgs_train = [list(ranks.values()) for ranks in pgr_train]

In [None]:
# Match every PageRank vector in pgrgs against the train vectors, then print and
# display how often the matched name agrees with the sample's own name
similarityMetric(calculateDifference(pgrgs,pgrgs_train, c, False))

In [None]:
# Best-match rows for the benign and malware PageRank vectors, highest similarity first
srr = sorted(
    calculateDifference(pgrgs_ben, pgrgs_train, c, True),
    key=lambda entry: entry[1],
    reverse=True,
)
malrr = sorted(
    calculateDifference(pgrgs, pgrgs_train, c, False),
    key=lambda entry: entry[1],
    reverse=True,
)

In [None]:
# Keep only the similarity score (index 1) of every sorted row
malr = [entry[1] for entry in malrr]
benr = [entry[1] for entry in srr]

In [None]:
# Plot the fraction of malware and benign scores above each of 35 thresholds
# derived from the range of the benign scores
detection(benr,malr,step = 35)

In [None]:
# File lists for folds 1 to 5; dirs[0] therefore belongs to fold 1
dirs = [createDirs(foldNumber) for foldNumber in range(1, 6)]

In [None]:
# networkx graphs for every fold, in the same order as dirs
graphs = [createGraphs(foldDirs, False) for foldDirs in dirs]

In [None]:
def pgrank (graphs,fold) :
    """
    Score one fold by PageRank similarity against its training graphs.

    PageRank vectors are computed for the malware, benign and train graphs. Each
    malware and benign vector is then matched to its most cosine-similar train
    vector by calculateDifference, using the module-level dirs[fold] for the names.

    @params:
        graphs - [malware graphs, benign graphs, train graphs]
        fold   - index into the module-level `dirs` list
    @returns:
        [malware similarity scores, benign similarity scores], each sorted in
        descending order
    """
    mal_g, ben_g, train_g = graphs[0], graphs[1], graphs[2]

    def rankVectors(graphList):
        # One list of PageRank values per graph
        return [list(nx.pagerank_numpy(graph).values()) for graph in graphList]

    malVectors = rankVectors(mal_g)
    benVectors = rankVectors(ben_g)
    trainVectors = rankVectors(train_g)

    def byScore(entry):
        # Rows look like [name, score, matched name]
        return entry[1]

    benRows = sorted(calculateDifference(benVectors, trainVectors, dirs[fold], True), key=byScore, reverse=True)
    malRows = sorted(calculateDifference(malVectors, trainVectors, dirs[fold], False), key=byScore, reverse=True)
    print("-------------==========================--------------------")

    malr = [entry[1] for entry in malRows]
    benr = [entry[1] for entry in benRows]
    return [malr, benr]

In [None]:
# srr = sorted(calculateDifference(pgrgs_ben,pgrgs_train,c, True), key= lambda kv:kv[1],reverse= True)
# malrr = sorted(calculateDifference(pgrgs,pgrgs_train,c, False), key= lambda kv:kv[1],reverse = True)
# print("-------------==========================--------------------")
# malr = [malrr[i][1] for i in range(len(malrr))]
# benr = [srr [i][1] for i in range(len(srr))]

In [None]:
# Run the PageRank experiment on every fold; the position in graphs is the fold index
exps = [pgrank(foldGraphs, foldIndex) for foldIndex, foldGraphs in enumerate(graphs)]

In [None]:
# A bare generator expression is a SyntaxError as a statement; build a list so the
# cell displays the length of every fold's result
[len(foldResult) for foldResult in exps]