## **Imports and configurations**

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *

In [21]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

In [5]:
# Directory holding the dataset CSV files. It can be overridden with the
# PARSER_DATA_DIR environment variable so the notebook runs on other machines;
# the original local path is kept as the fallback.
folder_path = os.environ.get('PARSER_DATA_DIR', 'C:/Users/yaass/OneDrive/Bureau/Parser')

## **Model evaluation**

In [102]:
def preview_data(folder_path, file_name, index_value = 'md5'):
    """Load a CSV dataset, print its dimensions and return a short preview.

    Parameters
    ----------
    folder_path : str
        Directory that contains the dataset file.
    file_name : str
        Name of the CSV file inside ``folder_path``.
    index_value : str, default 'md5'
        Name of the column used as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the loaded dataset.
    """
    csv_path = os.path.join(folder_path, file_name)
    dataset = pd.read_csv(csv_path, index_col=index_value)
    n_rows, n_cols = dataset.shape
    print(f'Nb Observations: {n_rows}')
    print(f'Nb Columns: {n_cols}')
    return dataset.head(n=3)


def get_null_columns(df):
    """Return the labels of the columns whose entries are all equal to zero.

    "Null" here means all-zero, not missing: a column qualifies only when the
    number of entries equal to 0 matches the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in their original order, of the all-zero columns.
    """
    n_rows = len(df)
    zero_counts = df.eq(0).sum()
    return [column for column, count in zero_counts.items() if count == n_rows]

def create_X_y(folder_path, file_name, drop_null_columns, index_value = 'md5', random_state=None):
    """Load a dataset and split it into shuffled features and target.

    Parameters
    ----------
    folder_path : str
        Directory that contains the dataset file.
    file_name : str
        Name of the CSV file inside ``folder_path``.
    drop_null_columns : bool
        When truthy, feature columns made only of zeros (as reported by
        ``get_null_columns``) are removed.
    index_value : str, default 'md5'
        Name of the column used as the DataFrame index.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an integer to make the
        row order, and therefore the cross-validation folds built on top of
        it, reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The shuffled features (every column except ``'label'``) and the shuffled
    ``'label'`` column, in that order, shuffled consistently with each other.
    """
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, index_col=index_value)
    X = df.drop('label', axis=1)
    if drop_null_columns:
        # All-zero columns are detected on the features only, never on the label.
        X = X.drop(get_null_columns(X), axis=1)
    y = df['label']
    return shuffle(X, y, random_state=random_state)


def create_regular_net():
    """Build and compile a small fully connected binary classifier.

    The network stacks two 10-unit ReLU layers followed by a single sigmoid
    output unit, and is compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the tracked metric. No input shape is declared
    here.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    net = Sequential()
    # The two hidden layers are identical apart from their names.
    for layer_name in ('dense_layer1', 'dense_layer2'):
        net.add(Dense(units=10, kernel_initializer='uniform', activation='relu', name=layer_name))
    # Single sigmoid unit: binary classification output.
    net.add(Dense(1, activation='sigmoid', name='dense_output'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


def wrap_regular_net():
    """Wrap ``create_regular_net`` in a ``KerasClassifier``.

    The wrapper is configured with 50 epochs, a batch size of 64 and silent
    training output.

    Returns
    -------
    The configured ``KerasClassifier`` instance.
    """
    fit_settings = dict(epochs=50, batch_size=64, verbose=0)
    return KerasClassifier(build_fn=create_regular_net, **fit_settings)


def create_models():
    """Create the set of classifiers to compare.

    Every estimator is instantiated with its default parameters, except the
    neural network which comes from ``wrap_regular_net``. SGDClassifier and
    LDA are currently left out of the comparison.

    Returns
    -------
    dict
        Mapping from display name to estimator, in evaluation order.
    """
    return {
        'LogisticRegression': LogisticRegression(),
        'KNN': KNeighborsClassifier(),
        'Decision tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        # 'Stochastic Gradient Descent': SGDClassifier(),   (disabled)
        'SVM': SVC(),
        'RegularNets': wrap_regular_net(),
        # 'LDA': LDA(),   (disabled)
        'Gaussian Naive Bayes': GaussianNB(),
    }


def evaluate_model(model, X, y):
    """Score an estimator with repeated stratified k-fold cross-validation.

    Uses 10 splits repeated 3 times with a fixed ``random_state`` of 1,
    accuracy as the metric, all available cores (``n_jobs=-1``) and
    ``error_score='raise'`` so that a failing fit raises instead of being
    recorded as a score.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) to evaluate.
    X, y
        Features and target passed to ``cross_val_score``.

    Returns
    -------
    The accuracy scores returned by ``cross_val_score``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )


def get_evaluation_results(folder_path, file_name, drop_null_columns=False, scaler=None):
    """Cross-validate every model from ``create_models`` on one dataset.

    Parameters
    ----------
    folder_path : str
        Directory that contains the dataset file.
    file_name : str
        Name of the CSV file inside ``folder_path``.
    drop_null_columns : bool, default False
        Forwarded to ``create_X_y``.
    scaler : transformer or None, default None
        When given, each model is evaluated as a two-step ``Pipeline``
        (``'transformer'`` then ``'estimator'``); otherwise the bare model
        is evaluated.

    Returns
    -------
    tuple of (list, list)
        Model names and, in the same order, the scores returned by
        ``evaluate_model`` for each of them.
    """
    # Features and target for this dataset.
    X, y = create_X_y(folder_path, file_name, drop_null_columns)

    names, results = [], []
    for name, model in create_models().items():
        # Put the scaler in front of the model only when one was supplied.
        if scaler is None:
            estimator = model
        else:
            estimator = Pipeline([('transformer', scaler), ('estimator', model)])
        results.append(evaluate_model(estimator, X, y))
        names.append(name)

    return names, results


def print_evaluation_results(results, names):
    """Print one summary line per model: mean accuracy and its standard deviation.

    Parameters
    ----------
    results : iterable
        Score collections, one per model.
    names : iterable of str
        Model names, paired positionally with ``results``.

    Both statistics are printed as percentages with three decimals.
    """
    for label, fold_scores in zip(names, results):
        mean_pct = np.mean(fold_scores) * 100
        std_pct = np.std(fold_scores) * 100
        print(f"{label:30} \t: {mean_pct:.3f}% ( (+/-) {std_pct:.3f}% )")

### 1. apistats (one-hot encoded)

In [7]:
# Select the one-hot encoded "apistats" dataset (file_name is reused by the
# evaluation cell below) and display its size and first rows.
file_name = 'onehot_encoded_apistats_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3761
Nb Columns: 304


Unnamed: 0_level_0,LdrUnloadDll,RegCloseKey,GetSystemTimeAsFileTime,LoadStringW,GetSystemInfo,RegQueryValueExA,LdrGetProcedureAddress,MessageBoxTimeoutW,RegSetValueExA,NtTerminateProcess,...,CIFrameElement_CreateElement,CDocument_write,WSAConnect,CopyFileExW,NtDeleteFile,ExitWindowsEx,CreateServiceW,WNetGetProviderNameW,RtlCreateUserProcess,NtLoadKeyEx
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Cross-validate every model on the currently selected dataset (no scaler)
# and time the whole run.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

# Report the total wall-clock duration of the evaluation.
print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 64.03 seconds


In [9]:
# Print mean accuracy and standard deviation (in %) for each evaluated model.
print_evaluation_results(results, names)

LogisticRegression             	: 90.862% ( (+/-) 1.074% )
KNN                            	: 92.130% ( (+/-) 1.028% )
Decision tree                  	: 90.880% ( (+/-) 1.628% )
Random Forest                  	: 94.665% ( (+/-) 0.937% )
SVM                            	: 93.140% ( (+/-) 1.072% )
RegularNets                    	: 92.422% ( (+/-) 0.854% )
LDA                            	: 89.524% ( (+/-) 1.298% )
Gaussian Naive Bayes           	: 69.556% ( (+/-) 1.994% )


In [13]:
# Plot the cross-validation scores per model (helper presumably provided by
# the plot_utils star import), save the figure as HTML and embed it.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "apistats" Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_onehot_apistats_data.html'

save_figures_to_html(figure_path, [fig])

# Last expression of the cell: renders the saved HTML file inline.
IFrame(figure_path, width=900, height=600)

### 2. apistats (count encoded)

In [17]:
# Select the count encoded "apistats" dataset (file_name is reused by the
# evaluation cell below) and display its size and first rows.
file_name = 'count_encoded_apistats_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3822
Nb Columns: 305


Unnamed: 0_level_0,LdrUnloadDll,RegCloseKey,GetSystemTimeAsFileTime,LoadStringW,GetSystemInfo,RegQueryValueExA,LdrGetProcedureAddress,MessageBoxTimeoutW,RegSetValueExA,NtTerminateProcess,...,CDocument_write,WSAConnect,ExitWindowsEx,CopyFileExW,NtDeleteFile,CreateServiceW,WNetGetProviderNameW,RtlCreateUserProcess,NtLoadKeyEx,NtLoadDriver
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,4,3,1,2,1,13,6,1,6,3,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,6,2,1,0,55,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,9,1,0,0,0,21,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Cross-validate every model on the currently selected dataset, with a
# StandardScaler chained in front of each model, and time the whole run.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

# Report the total wall-clock duration of the evaluation.
print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation on 8 models required a duration of 80.87 seconds


In [31]:
# Print mean accuracy and standard deviation (in %) for each evaluated model.
print_evaluation_results(results, names)

LogisticRegression             	: 87.031% ( (+/-) 1.824% )
KNN                            	: 88.497% ( (+/-) 1.433% )
Decision tree                  	: 91.035% ( (+/-) 1.535% )
Random Forest                  	: 95.125% ( (+/-) 0.768% )
SVM                            	: 75.475% ( (+/-) 1.817% )
RegularNets                    	: 88.654% ( (+/-) 1.667% )
LDA                            	: 82.104% ( (+/-) 1.790% )
Gaussian Naive Bayes           	: 66.841% ( (+/-) 2.230% )


In [32]:
# Plot the cross-validation scores per model (helper presumably provided by
# the plot_utils star import), save the figure as HTML and embed it.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Count Encoded "apistats" Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_count_apistats_data.html'

save_figures_to_html(figure_path, [fig])

# Last expression of the cell: renders the saved HTML file inline.
IFrame(figure_path, width=900, height=600)

### 3. dll_loaded (one-hot encoded)

In [33]:
# Select the one-hot encoded "dll_loaded" dataset (file_name is reused by the
# evaluation cell below) and display its size and first rows.
file_name = 'onehot_encoded_dll_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3162
Nb Columns: 2242


Unnamed: 0_level_0,RTUTILS.DLL,kernel32.dll,label,kernel32,C:\Users\cucko\AppData\Local\Temp\emedcfd.dll,RpcRtRemote.dll,CRYPTSP.dll,CLBCatQ.DLL,msimg32.dll,libssl32.dll,...,C:\Users\cucko\AppData\Local\Temp\VmX.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_1d4457e8e6917937845f55ebbce2fc49.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_1d50b69a05d60d4f9f703b789a2933de.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_268eef019bf65b2987e945afaf29643f.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_3c1b2fabb7d74bc5be0820eae4107f8a.exe,C:\Users\cucko\AppData\Local\Temp\VirusShare_43b844c35e1a933e9214588be81ce772.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_933b11bc4799f8d9f65466fb2e3ea659.exe,C:\Users\cucko\AppData\Local\Temp\VirusShare_9fc3ed6c9b8056fbf155f79569ca7cb1.exe,D3DIM700.DLL,C:\Users\cucko\AppData\Local\Temp\VirusShare_bcbdef1678049378be04719ed29078d2.dll
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# Cross-validate every model on the currently selected dataset (no scaler)
# and time the whole run.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

# Report the total wall-clock duration of the evaluation.
print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 7 models required a duration of 228.69 seconds


In [64]:
# Print mean accuracy and standard deviation (in %) for each evaluated model.
print_evaluation_results(results, names)

LogisticRegression             	: 87.022% ( (+/-) 1.843% )
KNN                            	: 84.061% ( (+/-) 1.940% )
Decision tree                  	: 84.535% ( (+/-) 1.923% )
Random Forest                  	: 87.118% ( (+/-) 2.130% )
SVM                            	: 87.602% ( (+/-) 1.943% )
RegularNets                    	: 86.717% ( (+/-) 1.411% )
Gaussian Naive Bayes           	: 64.800% ( (+/-) 3.020% )


In [65]:
# Plot the cross-validation scores per model (helper presumably provided by
# the plot_utils star import), save the figure as HTML and embed it.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "dll_loaded" Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_onehot_dll_data.html'

save_figures_to_html(figure_path, [fig])

# Last expression of the cell: renders the saved HTML file inline.
IFrame(figure_path, width=900, height=600)

### 4. File operations (counts summary)

In [66]:
# Select the file-operation counts dataset (file_name is reused by the
# evaluation cell below) and display its size and first rows.
file_name = 'file_operations_counts_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3496
Nb Columns: 11


Unnamed: 0_level_0,label,file_opened,file_exists,file_read,file_deleted,file_failed,file_created,file_recreated,file_written,file_copied,file_moved
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00263ca2071dc9a6ee577eb356b0d1d9,0,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,4,7,1,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,0,0,0,0,0,0,0,0,0,0


In [67]:
# Cross-validate every model on the currently selected dataset, with a
# StandardScaler chained in front of each model, and time the whole run.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

# Report the total wall-clock duration of the evaluation.
print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 7 models required a duration of 45.48 seconds


In [68]:
# Print mean accuracy and standard deviation (in %) for each evaluated model.
print_evaluation_results(results, names)

LogisticRegression             	: 58.533% ( (+/-) 2.213% )
KNN                            	: 74.649% ( (+/-) 2.860% )
Decision tree                  	: 77.918% ( (+/-) 2.120% )
Random Forest                  	: 79.597% ( (+/-) 2.150% )
SVM                            	: 57.008% ( (+/-) 1.093% )
RegularNets                    	: 62.919% ( (+/-) 2.452% )
Gaussian Naive Bayes           	: 49.589% ( (+/-) 1.510% )


In [69]:
# Plot the cross-validation scores per model (helper presumably provided by
# the plot_utils star import), save the figure as HTML and embed it.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On File Operation Counts Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_file_operation_counts.html'

save_figures_to_html(figure_path, [fig])

# Last expression of the cell: renders the saved HTML file inline.
IFrame(figure_path, width=900, height=600)

### 5. Registry Key Operations (counts summary)

In [70]:
# Select the registry-key operation counts dataset (file_name is reused by the
# evaluation cell below) and display its size and first rows.
file_name = 'regkeys_counts_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3496
Nb Columns: 5


Unnamed: 0_level_0,regkey_opened,regkey_read,regkey_written,label,regkey_deleted
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00263ca2071dc9a6ee577eb356b0d1d9,2,8,6,0,0
0137996cd3aa197ae8eb64fef12c044a,1,2,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,13,9,0,0,0


In [71]:
# Cross-validate every model on the currently selected dataset, with a
# StandardScaler chained in front of each model, and time the whole run.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

# Report the total wall-clock duration of the evaluation.
print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 7 models required a duration of 58.60 seconds


In [72]:
# Print mean accuracy and standard deviation (in %) for each evaluated model.
print_evaluation_results(results, names)

LogisticRegression             	: 63.930% ( (+/-) 2.534% )
KNN                            	: 79.443% ( (+/-) 2.758% )
Decision tree                  	: 80.215% ( (+/-) 2.147% )
Random Forest                  	: 82.112% ( (+/-) 2.233% )
SVM                            	: 64.550% ( (+/-) 2.175% )
RegularNets                    	: 65.741% ( (+/-) 2.140% )
Gaussian Naive Bayes           	: 56.875% ( (+/-) 0.516% )


In [73]:
# Plot the cross-validation scores per model (helper presumably provided by
# the plot_utils star import), save the figure as HTML and embed it.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Registry Key Operation Counts Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_registry_key_counts.html'

save_figures_to_html(figure_path, [fig])

# Last expression of the cell: renders the saved HTML file inline.
IFrame(figure_path, width=900, height=600)

### 6. PE Entropy

In [74]:
# Select the PE entropy dataset (file_name is reused by the evaluation cell
# below) and display its size and first rows.
file_name = 'pe_entropy_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 4308
Nb Columns: 795


Unnamed: 0_level_0,.text,.data,.rsrc,.reloc,label,.rdata,.pdata,_RDATA,UPX0,UPX1,...,/47,/61,/73,/84,/95,dtwwkix,afaqowd,q\xa0\x01\x00c,PAGEtext,PAGEdata
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,6.068004,4.873371,4.527203,5.159578,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0137996cd3aa197ae8eb64fef12c044a,6.49023,2.382517,0.0,5.372628,0,5.108187,5.160316,1.117635,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0282f83bbfb58c08b54dbd8015e54d2e,6.342347,1.021041,4.717804,4.662975,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
# Cross-validate every model on the currently selected dataset, dropping the
# all-zero feature columns first and chaining a StandardScaler in front of
# each model; time the whole run.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, drop_null_columns=True, scaler=StandardScaler())

# Report the total wall-clock duration of the evaluation.
print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 7 models required a duration of 99.66 seconds


In [104]:
# Print mean accuracy and standard deviation (in %) for each evaluated model.
print_evaluation_results(results, names)

LogisticRegression             	: 75.395% ( (+/-) 1.624% )
KNN                            	: 85.933% ( (+/-) 1.664% )
Decision tree                  	: 87.311% ( (+/-) 1.647% )
Random Forest                  	: 90.614% ( (+/-) 1.792% )
SVM                            	: 74.126% ( (+/-) 1.520% )
RegularNets                    	: 80.672% ( (+/-) 1.769% )
Gaussian Naive Bayes           	: 46.402% ( (+/-) 1.210% )


In [105]:
# Plot the cross-validation scores per model (helper presumably provided by
# the plot_utils star import), save the figure as HTML and embed it.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Entropy Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_pe_entropy.html'

save_figures_to_html(figure_path, [fig])

# Last expression of the cell: renders the saved HTML file inline.
IFrame(figure_path, width=900, height=600)