In [None]:
# imports
import json
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import shap

# Data operations, import, parsing, preprocessing
# Load the statistics that were extracted from the Cuckoo reports.
with open('extractedInfo.json') as f:
    data = json.load(f)

completeCallList = data['completeCallList']
allFilesList = data['allFilesList']
finishedRows = data['finishedRows']
malwareList = data['malwareList']

# Build the working dataframe in a single call: one row per entry of
# finishedRows, one column per entry of completeCallList. Constructing it
# at once avoids the quadratic cost of enlarging the frame row by row and
# lets pandas infer the column dtypes from the data instead of starting
# from an empty, column-only frame.
df = pd.DataFrame(finishedRows, columns=completeCallList)

# add truth label to dataset, take off as needed
df['Malware'] = malwareList

# drop truth label from training set, define training and testing sets
X = df.drop('Malware', axis=1)
y = df['Malware']

# train/test split. 80/20 ratio, stratified on the label, fixed seed so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 2020, stratify=y)
y_train = np.array(y_train)


# API calls (features) obtained for graphics
features = X_test.columns

# NOTE(review): the sample count (105) and feature count (216) in this
# message are hard-coded; confirm they still match the loaded data.
print("This dataset consists of frequency feature behavior collected from 105",
      "malicious and benign samples collected through cuckoo sandbox on a", 
      "Windows 7 machine. 216 were observed. The list of observed system", 
      "calls is below:\n\n")
for feature in features:
    print(feature)

In [None]:
# define Random Forest Classifier RFC
# random_state is fixed (same seed as the train/test split) so that the
# grid search, the selected model and the reported metrics are repeatable
# across Restart & Run All.
rfc_clf = RandomForestClassifier(random_state=2020)

# optimize hyperparameters using gridsearch (default cross-validation,
# all available cores)
parameters = {'n_estimators':[10, 50, 100, 150, 200, 250, 300, 325, 350, 375, 400, 425, 450, 475, 500, 1000],
              'criterion':('gini', 'entropy')}
rfc_grid = GridSearchCV(rfc_clf, parameters, n_jobs=-1)
rfc_grid.fit(X_train, y_train)

# take best model from gridsearch
rfc_model = rfc_grid.best_estimator_

# predict on the held-out test set
rfc_pred = rfc_model.predict(X_test)

# RFC metrics on the test set
rfc_acc = accuracy_score(y_test, rfc_pred)
rfc_precision = precision_score(y_test, rfc_pred)
rfc_recall = recall_score(y_test, rfc_pred)
rfc_f1 = f1_score(y_test, rfc_pred)

# print RFC scores
print('\nRandom Forest Classifier')
print('Best Hyperparameters:', rfc_grid.best_params_)
print('Accuracy: %.3f' % rfc_acc)
print('Precision: %.3f' % rfc_precision)
print('Recall: %.3f' % rfc_recall)
print('F1 score: %.3f' % rfc_f1)

# show RFC confusion matrix, rows/columns ordered by the model's classes_
rfc_cm = confusion_matrix(y_test, rfc_pred, labels=rfc_model.classes_)
rfc_disp = ConfusionMatrixDisplay(confusion_matrix=rfc_cm, display_labels=rfc_model.classes_)
rfc_disp.plot()
plt.show()

In [None]:
# Explain the RFC with SHAP (SHapley Additive exPlanations)

# load the JavaScript SHAP uses to render its graphics
shap.initjs()

# Tree explainer on the RFC, followed by a bar-type summary plot.
# max_display caps how many features are drawn, most important first.
# The default is 20; it is set to 216 here so every feature is shown.
# Tip: right click the output image and choose 'open image in new tab'
# to inspect it more comfortably.
print("RFC Summary Plot")
rfc_explainer = shap.TreeExplainer(rfc_model)
rfc_shap_values = rfc_explainer.shap_values(X_test)
shap.summary_plot(
    rfc_shap_values,
    features,
    max_display=216,
    plot_type="bar",
    plot_size="auto",
    class_names=["Benign", "Malicious"],
    show=True,
)

In [None]:
# Waterfall plot explaining a single RFC prediction
print("\n\n\nRFC Waterfall Plot")

# position (within X_test) of the sample to explain
idx = 1

# Calling the explainer returns an Explanation covering every model output;
# keep only output index 1 of the values and base values
# (presumably the "Malicious" class -- confirm against rfc_model.classes_).
rfc_shap_vals = rfc_explainer(X_test)
rfc_exp = shap.Explanation(
    values=rfc_shap_vals.values[:, :, 1],
    base_values=rfc_shap_vals.base_values[:, 1],
    data=X_test.values,
    feature_names=features,
)

# max_display defaults to 10 for readability. It is raised to 216 so that
# every feature's push towards malicious or benign is visible, however
# small that contribution is.
shap.plots.waterfall(rfc_exp[idx], max_display=216)

In [None]:
# define a very simple Feed-Forward Neural Network FFNN

# Seed TensorFlow's global random generator (same seed as the train/test
# split) so repeated runs start from the same state.
tf.random.set_seed(2020)

# One hidden ReLU layer of 216 units and a single sigmoid output unit.
# The input width is taken from the training data instead of being
# hard-coded, so the model keeps working if the feature count changes.
ffnn_model = keras.Sequential([
    layers.Dense(216, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(1, activation='sigmoid')
])
ffnn_model.compile(optimizer='adam', loss='binary_crossentropy', 
                   metrics=['accuracy', keras.metrics.Precision(),
                           keras.metrics.Recall()])
ffnn_model.fit(X_train, y_train, epochs=10, batch_size=10)

# evaluate on the test set; values come back in the order
# loss, then the metrics passed to compile()
ffnn_loss, ffnn_acc, ffnn_precision, ffnn_recall = ffnn_model.evaluate(X_test, y_test)

# threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
ffnn_pred = ffnn_model.predict(X_test)
ffnn_pred_binary = [1 if pred >= 0.5 else 0 for pred in ffnn_pred]

# calc F1. Precision and recall are both zero when there are no true
# positives, which would divide by zero; handle exactly that case
# explicitly instead of hiding every possible error behind a bare except.
if (ffnn_precision + ffnn_recall) > 0:
    ffnn_f1 = 2 * ((ffnn_precision * ffnn_recall) / (ffnn_precision + ffnn_recall))
else:
    ffnn_f1 = 0.0
    print("F1 calc error")

# print FFNN scores
print('\nFeed Forward Neural Network')
print('Loss:', ffnn_loss)
print('Accuracy: %.3f' % ffnn_acc)
print('Precision: %.3f' % ffnn_precision)
print('Recall: %.3f' % ffnn_recall)
print('F1 score: %.3f' % ffnn_f1)

# show FFNN confusion matrix
ffnn_cm = confusion_matrix(y_test, ffnn_pred_binary)
ffnn_disp = ConfusionMatrixDisplay(confusion_matrix=ffnn_cm)
ffnn_disp.plot()
plt.show()

In [None]:
# Explain the FFNN with SHAP (SHapley Additive exPlanations)

# Kernel explainer on the FFNN, followed by a bar-type summary plot.
# X_test serves both as the explainer's background data and as the set of
# samples being explained.
# max_display caps how many features are drawn, most important first.
# The default is 20; it is set to 216 here so every feature is shown.
# Tip: right click the output image and choose 'open image in new tab'
# to inspect it more comfortably.
ffnn_explainer = shap.KernelExplainer(ffnn_model, X_test)
ffnn_shap_values = ffnn_explainer.shap_values(X_test)
shap.summary_plot(
    ffnn_shap_values,
    features,
    max_display=216,
    plot_type="bar",
    plot_size="auto",
    class_names=["Benign", "Malicious"],
    show=True,
)

In [None]:
# Waterfall plot explaining a single FFNN prediction.
# max_display defaults to 10 for readability. It is raised to 216 so that
# every feature's push towards malicious or benign is visible, however
# small that contribution is.
# NOTE(review): the [0] / [0][0] indexing presumably picks the first model
# output and then the first test sample; the layout returned by
# shap_values() should be confirmed for the installed shap version.
shap.plots._waterfall.waterfall_legacy(
    expected_value=ffnn_explainer.expected_value[0],
    shap_values=ffnn_shap_values[0][0],
    feature_names=features,
    max_display=216,
)

In [None]:
# define Multi-layer perceptron MLP
# random_state is fixed (same seed as the train/test split) so that the
# grid search, the selected model and the reported metrics are repeatable
# across Restart & Run All.
mlp_clf = MLPClassifier(random_state=2020)

# optimize hyperparameters using gridsearch (default cross-validation,
# all available cores)
# NOTE(review): per the scikit-learn documentation 'learning_rate' is only
# used when solver='sgd', which is not part of this grid, so that axis
# doubles the number of fits without changing the candidates. Consider
# dropping it or adding 'sgd' to the solvers.
parameters = {'hidden_layer_sizes':[100, 150, 200], 
              'activation':('logistic', 'relu'),
              'solver':('lbfgs', 'adam'),
              'learning_rate':('constant', 'invscaling'),
              'max_iter':[200, 500, 1000]}
mlp_grid = GridSearchCV(mlp_clf, parameters, n_jobs=-1)
mlp_grid.fit(X_train, y_train)

# take the best model from gridsearch
mlp_model = mlp_grid.best_estimator_

# predict on the held-out test set
mlp_pred = mlp_model.predict(X_test)

# MLP metrics on the test set
mlp_acc = accuracy_score(y_test, mlp_pred)
mlp_precision = precision_score(y_test, mlp_pred)
mlp_recall = recall_score(y_test, mlp_pred)
mlp_f1 = f1_score(y_test, mlp_pred)

# print MLP scores
print('\nMulti-layer Perceptron')
print('Best Hyperparameters:', mlp_grid.best_params_)
print('Accuracy: %.3f' % mlp_acc)
print('Precision: %.3f' % mlp_precision)
print('Recall: %.3f' % mlp_recall)
print('F1 score: %.3f' % mlp_f1)

# show MLP confusion matrix, rows/columns ordered by the model's classes_
mlp_cm = confusion_matrix(y_test, mlp_pred, labels=mlp_model.classes_)
mlp_disp = ConfusionMatrixDisplay(confusion_matrix=mlp_cm, display_labels=mlp_model.classes_)
mlp_disp.plot()
plt.show()

In [None]:
# Explain the MLP with SHAP (SHapley Additive exPlanations)

# Kernel explainer on the MLP's predict function, followed by a bar-type
# summary plot. X_test serves both as the explainer's background data and
# as the set of samples being explained.
# max_display caps how many features are drawn, most important first.
# The default is 20; it is set to 216 here so every feature is shown.
# Tip: right click the output image and choose 'open image in new tab'
# to inspect it more comfortably.
mlp_explainer = shap.KernelExplainer(mlp_model.predict, X_test)
mlp_shap_values = mlp_explainer.shap_values(X_test)
shap.summary_plot(
    mlp_shap_values,
    features,
    max_display=216,
    plot_type="bar",
    plot_size="auto",
    class_names=["Benign", "Malicious"],
    show=True,
)

In [None]:
# Waterfall plot explaining a single MLP prediction (entry 0 of the SHAP
# values computed for X_test).
# max_display defaults to 10 for readability. It is raised to 216 so that
# every feature's push towards malicious or benign is visible, however
# small that contribution is.
shap.plots._waterfall.waterfall_legacy(
    expected_value=mlp_explainer.expected_value,
    shap_values=mlp_shap_values[0],
    feature_names=features,
    max_display=216,
)

In [None]:
# Support Vector Machine (SVM): tune with a grid search, then evaluate

# base estimator and the hyperparameter grid to search over
svm_clf = svm.SVC()
parameters = dict(
    kernel=('linear', 'poly', 'rbf', 'sigmoid'),
    C=[1, 5, 10, 25, 50, 75, 100, 500],
)

# grid search with the default cross-validation, using all available cores
svm_grid = GridSearchCV(estimator=svm_clf, param_grid=parameters, n_jobs=-1)
svm_grid.fit(X_train, y_train)

# keep the best estimator found by the search
svm_model = svm_grid.best_estimator_

# predictions on the held-out test set
svm_pred = svm_model.predict(X_test)

# test-set metrics for the SVM
svm_acc = accuracy_score(y_true=y_test, y_pred=svm_pred)
svm_precision = precision_score(y_true=y_test, y_pred=svm_pred)
svm_recall = recall_score(y_true=y_test, y_pred=svm_pred)
svm_f1 = f1_score(y_true=y_test, y_pred=svm_pred)

# report the chosen hyperparameters and the scores
print('\nSupport Vector Machine')
print('Best Hyperparameters:', svm_grid.best_params_)
print('Accuracy: %.3f' % svm_acc)
print('Precision: %.3f' % svm_precision)
print('Recall: %.3f' % svm_recall)
print('F1 score: %.3f' % svm_f1)

# confusion matrix, rows/columns ordered by the model's classes_
svm_cm = confusion_matrix(y_true=y_test, y_pred=svm_pred, labels=svm_model.classes_)
svm_disp = ConfusionMatrixDisplay(
    confusion_matrix=svm_cm,
    display_labels=svm_model.classes_,
)
svm_disp.plot()
plt.show()

In [None]:
# Explain the SVM with SHAP (SHapley Additive exPlanations)

# Kernel explainer on the SVM's predict function, followed by a bar-type
# summary plot. X_test serves both as the explainer's background data and
# as the set of samples being explained.
# max_display caps how many features are drawn, most important first.
# The default is 20; it is set to 216 here so every feature is shown.
# Tip: right click the output image and choose 'open image in new tab'
# to inspect it more comfortably.

svm_explainer = shap.KernelExplainer(svm_model.predict, X_test)
svm_shap_values = svm_explainer.shap_values(X_test)
shap.summary_plot(
    svm_shap_values,
    features,
    max_display=216,
    plot_type="bar",
    plot_size="auto",
    class_names=["Benign", "Malicious"],
    show=True,
)

In [None]:
# Waterfall plot explaining a single SVM prediction (entry 0 of the SHAP
# values computed for X_test).
# max_display defaults to 10 for readability. It is raised to 216 so that
# every feature's push towards malicious or benign is visible, however
# small that contribution is.
shap.plots._waterfall.waterfall_legacy(
    expected_value=svm_explainer.expected_value,
    shap_values=svm_shap_values[0],
    feature_names=features,
    max_display=216,
)