In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Feature Extraction CSVs

In [2]:
import argparse
import csv
import glob
import os
import time
from functools import partial
from multiprocessing import Pool as ThreadPool

import networkx as nx


In [3]:
def obtain_sensitive_apis(file):
    """Load the sensitive-API list from a text file, one entry per line.

    Args:
        file: Path of the text file to read.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        stripped, in file order, with blank lines left out.
    """
    print("In Obtain_Sensitive_APIs")
    with open(file, 'r') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle.readlines()]
    # Lines that are empty after stripping carry no API name.
    sensitive_apis = [entry for entry in stripped_lines if entry != '']
    print("Out Obtain_Sensitive_APIs")
    return sensitive_apis

In [4]:
def callgraph_extraction(file):
    """Read a GEXF file and return the graph NetworkX builds from it.

    Args:
        file: Path of the ``.gexf`` file to load.

    Returns:
        The graph object produced by ``nx.read_gexf``.
    """
    return nx.read_gexf(file)

In [5]:
def degree_centrality_feature(file, sensitive_apis):
    """Build a degree-centrality feature vector for one call-graph file.

    Args:
        file: Path of the ``.gexf`` call graph ('/'-separated).
        sensitive_apis: Node names to look up, in the order the features
            should appear.

    Returns:
        tuple: ``(sha256, vector)`` where ``sha256`` is the file name up to
        its first '.' (presumably the sample's SHA-256 -- the code only uses
        the file name) and ``vector`` holds, for each entry of
        ``sensitive_apis``, that node's degree centrality or 0 when the node
        is absent from the graph.
    """
    print("In Degree")
    file_name = file.split('/')[-1]
    sha256 = file_name.split('.')[0]
    scores = nx.degree_centrality(callgraph_extraction(file))

    # APIs that never occur in this graph contribute a zero feature.
    vector = [scores.get(api, 0) for api in sensitive_apis]
    print("Out Degree")
    return (sha256, vector)

In [6]:
def katz_centrality_feature(file, sensitive_apis):
    """Build a Katz-centrality feature vector for one call-graph file.

    Args:
        file: Path of the ``.gexf`` call graph ('/'-separated).
        sensitive_apis: Node names to look up, in the order the features
            should appear.

    Returns:
        tuple: ``(sha256, vector)`` where ``sha256`` is the file name up to
        its first '.' and ``vector`` holds, for each entry of
        ``sensitive_apis``, that node's Katz centrality (computed with
        ``nx.katz_centrality`` defaults) or 0 when the node is absent.
    """
    print("In Katz")
    sha256 = file.split('/')[-1].split('.')[0]
    graph = callgraph_extraction(file)
    scores = nx.katz_centrality(graph)

    vector = []
    for api in sensitive_apis:
        # Missing APIs are encoded as 0.
        vector.append(scores[api] if api in scores else 0)
    print("Out Katz")
    return (sha256, vector)

In [7]:
def closeness_centrality_feature(file, sensitive_apis):
    """Build a closeness-centrality feature vector for one call-graph file.

    Args:
        file: Path of the ``.gexf`` call graph ('/'-separated).
        sensitive_apis: Node names to look up, in the order the features
            should appear.

    Returns:
        tuple: ``(sha256, vector)`` where ``sha256`` is the file name up to
        its first '.' and ``vector`` holds, for each entry of
        ``sensitive_apis``, that node's closeness centrality or 0 when the
        node is absent from the graph.
    """
    print("In Closeness")
    name_parts = file.split('/')
    sha256 = name_parts[-1].split('.')[0]
    scores = nx.closeness_centrality(callgraph_extraction(file))

    # One feature per sensitive API; absent nodes default to 0.
    vector = [scores.get(api, 0) for api in sensitive_apis]

    print("Out Closeness")
    return (sha256, vector)

In [8]:
def harmonic_centrality_feature(file, sensitive_apis):
    """Build a harmonic-centrality feature vector for one call-graph file.

    Args:
        file: Path of the ``.gexf`` call graph ('/'-separated).
        sensitive_apis: Node names to look up, in the order the features
            should appear.

    Returns:
        tuple: ``(sha256, vector)`` where ``sha256`` is the file name up to
        its first '.' and ``vector`` holds, for each entry of
        ``sensitive_apis``, that node's harmonic centrality or 0 when the
        node is absent from the graph.
    """
    print("In Harmonic")
    sha256 = file.split('/')[-1].split('.')[0]
    graph = callgraph_extraction(file)
    scores = nx.harmonic_centrality(graph)

    def score_of(api):
        # Nodes that do not exist in this graph get a zero feature.
        if api in scores:
            return scores[api]
        return 0

    vector = [score_of(api) for api in sensitive_apis]
    print("Out Harmonic")
    return (sha256, vector)


In [9]:
def obtain_dataset(dataset_path, centrality_type, sensitive_apis, num_workers=15):
    """Extract centrality feature vectors and labels for a whole dataset.

    The dataset directory is expected to contain ``Benign/*.gexf`` and
    ``Malign/*.gexf`` call graphs. Every file is turned into a
    ``(sha256, vector)`` pair by the extractor matching ``centrality_type``.

    Args:
        dataset_path: Directory holding the ``Benign`` and ``Malign``
            sub-directories (a trailing '/' is optional).
        centrality_type: One of ``'degree'``, ``'katz'``, ``'closeness'`` or
            ``'harmonic'``.
        sensitive_apis: API/node names forwarded to the extractor; they
            define the order of the feature columns.
        num_workers: Number of worker processes used for extraction
            (defaults to 15, the value previously hard-coded).

    Returns:
        tuple: ``(Vectors, Labels)``. ``Vectors`` lists the benign samples
        first and the malign samples second; ``Labels`` is the parallel list
        with 0 for benign and 1 for malign.

    Raises:
        ValueError: If ``centrality_type`` is not one of the supported names.
    """
    print("In Obtain_Dataset")

    # Resolve the extractor before any file scanning or process spawning so
    # that a mistyped centrality name fails immediately with a clear error
    # (previously this only printed a message and then crashed on an
    # unassigned variable).
    if centrality_type == 'degree':
        extractor = degree_centrality_feature
    elif centrality_type == 'katz':
        extractor = katz_centrality_feature
    elif centrality_type == 'closeness':
        extractor = closeness_centrality_feature
    elif centrality_type == 'harmonic':
        extractor = harmonic_centrality_feature
    else:
        raise ValueError(
            'Error Centrality Type! Got %r; expected one of '
            "'degree', 'katz', 'closeness', 'harmonic'." % (centrality_type,)
        )

    # Sorted so the row order of the resulting dataset is reproducible;
    # glob itself returns files in arbitrary order.
    apps_b = sorted(glob.glob(os.path.join(dataset_path, 'Benign', '*.gexf')))
    apps_m = sorted(glob.glob(os.path.join(dataset_path, 'Malign', '*.gexf')))
    print(len(apps_b), len(apps_m))

    worker = partial(extractor, sensitive_apis=sensitive_apis)
    # `ThreadPool` is this file's alias for multiprocessing.Pool, i.e. a pool
    # of processes. The context manager releases the workers once both
    # (blocking) map calls have returned.
    with ThreadPool(num_workers) as pool:
        vector_b = pool.map(worker, apps_b)
        vector_m = pool.map(worker, apps_m)

    Vectors = list(vector_b) + list(vector_m)
    Labels = [0] * len(vector_b) + [1] * len(vector_m)

    print("Out Obtain_Dataset")
    return Vectors, Labels

In [10]:
def main():
    """Extract one centrality feature set for the dataset and write it as CSV.

    Reads the sensitive-API list, computes the selected centrality features
    for every call graph via ``obtain_dataset`` and writes
    ``<centrality>_features.csv`` into the output directory. The CSV has a
    header row (``SHA256``, one column per sensitive API, ``Label``) followed
    by one row per sample.
    """
    sensitive_apis_path = '/content/drive/MyDrive/Project BE 2020-2021/Semi Final/sensitive_apis.txt'
    dataset_path = "/content/drive/MyDrive/Project BE 2020-2021/Semi Final/gefx files/"
    output_path = "/content/drive/MyDrive/Project BE 2020-2021/Semi Final/gefx files/"

    # Centrality measure to extract on this run.
    # Other supported values: 'degree', 'katz', 'harmonic'.
    centrality_type = 'closeness'

    sensitive_apis = obtain_sensitive_apis(sensitive_apis_path)
    Vectors, Labels = obtain_dataset(dataset_path, centrality_type, sensitive_apis)

    # Header first, then one row per sample: id, features, label.
    rows = [['SHA256'] + list(sensitive_apis) + ['Label']]
    for (sha256, vector), label in zip(Vectors, Labels):
        rows.append([sha256] + list(vector) + [label])

    # Insert a separator only when the output directory lacks a trailing '/'.
    separator = '' if output_path[-1] == '/' else '/'
    csv_path = output_path + separator + centrality_type + '_features.csv'

    with open(csv_path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

In [None]:
# Entry point: run the feature-extraction pipeline (main) when this code is
# executed as the main module.
if __name__ == '__main__':
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
Out Closeness
Out Closeness
In Closeness
In Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
Out Closeness
In Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
Out Closeness
In Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
In Closeness
Out Closeness
Out Closeness
In Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In Closeness
Out Closeness
In 

# LSTM Model Training

In [None]:
from __future__ import print_function
# `sklearn.cross_validation` was removed from scikit-learn; `train_test_split`
# is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
np.random.seed(1337)  # for reproducibility
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb
from keras.utils.np_utils import to_categorical
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn import metrics
from sklearn.preprocessing import Normalizer
import h5py
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

In [None]:
# Centrality measure used for training: it selects which '<type>_features.csv'
# file is loaded and is embedded in the saved model's file name.
# The identifier is spelled 'cetrality_type' (sic); later cells reference it by
# exactly this name, so it must not be renamed here alone.
cetrality_type = 'degree'
# cetrality_type = 'katz'
# cetrality_type = 'closeness'
# cetrality_type = 'harmonic'

In [None]:
# The feature CSV starts with a header row ('SHA256', one column per API,
# 'Label'), and the following cell selects columns by those names. Reading with
# header=None would turn that row into data and leave integer column labels, so
# the first row is parsed as the header instead.
features_csv_path = '/content/drive/MyDrive/Project BE 2020-2021/Semi Final/gefx files/' + cetrality_type + '_features.csv'
traindata = pd.read_csv(features_csv_path, header=0)
# NOTE(review): the "test" frame is read from the very same file as the training
# frame, so every validation/test figure computed later is measured on the
# training samples -- confirm whether a held-out split was intended.
testdata = pd.read_csv(features_csv_path, header=0)

In [None]:
# Separate each frame into its feature matrix (every column except the sample
# id and the target) and its 'Label' target column.
X = traindata.drop(columns=['Label', 'SHA256'])
T = testdata.drop(columns=['Label', 'SHA256'])
Y = traindata['Label']
C = testdata['Label']

In [None]:
# Print floats with three decimals when arrays are inspected.
np.set_printoptions(precision=3)
# Normalise the training features with scikit-learn's Normalizer.
scaler = Normalizer()
scaler = scaler.fit(X)
trainX = scaler.transform(X)
# Uncomment to preview the first transformed rows:
#print(trainX[0:5,:])

In [None]:
# Print floats with three decimals when arrays are inspected.
np.set_printoptions(precision=3)
# Normalise the test features with a fresh Normalizer (this rebinds `scaler`).
scaler = Normalizer()
scaler = scaler.fit(T)
testT = scaler.transform(T)
# Uncomment to preview the first transformed rows:
#print(testT[0:5,:])

In [None]:
# Label columns as plain NumPy arrays for Keras and scikit-learn.
y_train, y_test = np.array(Y), np.array(C)

In [None]:
# reshape input to be [samples, time steps, features]
X_train = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
X_test = np.reshape(testT, (testT.shape[0], 1, testT.shape[1]))

In [None]:
# Mini-batch size handed to model.fit in the training cell.
batch_size = 32

In [None]:
# 1. define the network
model = Sequential()
model.add(SimpleRNN(64,input_dim=42, return_sequences=True))  # try using a GRU instead, for fun
model.add(Dropout(0.1))
model.add(SimpleRNN(64, return_sequences=True))  # try using a GRU instead, for fun
model.add(Dropout(0.1))
model.add(SimpleRNN(64, return_sequences=True))  # try using a GRU instead, for fun
model.add(Dropout(0.1))
model.add(SimpleRNN(64, return_sequences=True))  # try using a GRU instead, for fun
model.add(Dropout(0.1))
model.add(SimpleRNN(64, return_sequences=True))  # try using a GRU instead, for fun
model.add(Dropout(0.1))
model.add(SimpleRNN(64, return_sequences=False))  # try using a GRU instead, for fun
model.add(Dropout(0.1))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [None]:
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
checkpointer = callbacks.ModelCheckpoint(filepath="logs/5/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='val_acc',mode='max')
csv_logger = CSVLogger('logs/5/training_set_iranalysis.csv',separator=',', append=False)
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=50, validation_data=(X_test, y_test),callbacks=[checkpointer,csv_logger])
model.save("/content/drive/MyDrive/Project BE 2020-2021/Semi Final/LSTM Models/LSTM_" + cetrality_type + "_model.hdf5")

In [None]:
# Restore the weights stored by the training cell, recompile, and report loss
# and accuracy on the training arrays.
model.load_weights(
    "/content/drive/MyDrive/Project BE 2020-2021/Semi Final/LSTM Models/LSTM_"
    + cetrality_type
    + "_model.hdf5"
)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
loss, accuracy = model.evaluate(X_train, y_train)
print("\nLoss: %.2f, Accuracy: %.2f%%" % (loss, accuracy * 100))

In [None]:
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)

In [None]:
# Ground-truth labels and hard 0/1 predictions for the training samples.
expected = y_train
# Sequential.predict_classes is no longer available in current Keras. For a
# single sigmoid output it was equivalent to thresholding the predicted
# probability at 0.5, which is done explicitly here.
predicted = (model.predict(X_train) > 0.5).astype("int32")

In [None]:
# Binary-classification metrics comparing `predicted` against `expected`.
accuracy = accuracy_score(expected, predicted)
# precision_score defaults to average="binary", so passing it explicitly to all
# three scorers leaves the results unchanged.
precision, recall, f1 = (
    scorer(expected, predicted, average="binary")
    for scorer in (precision_score, recall_score, f1_score)
)

In [None]:
# Print each metric's label followed by its value rounded to three decimals.
for _metric_label, _metric_value in (
    ("Accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f-score", f1),
):
    print(_metric_label)
    print("%.3f" % _metric_value)