## **Imports and configurations**

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported local
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *  

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Colab only: mount Google Drive at /content/drive/ so the dataset folder
# configured in the next cell is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Directory on the mounted Drive that every dataset CSV below is read from.
folder_path = '/content/drive/MyDrive/processed-ransomware-datasets/'

## **Models evaluation**

In [None]:
def preview_data(folder_path, file_name, index_value = 'md5'):
    """Print the size of a dataset and return its first three feature rows.

    Both 'label' and 'sublabel' are target columns (create_X_y removes both
    before training), so neither is counted or displayed as a feature here.
    Target columns that are absent from the file are simply ignored.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV file.
    file_name : str
        Name of the CSV file inside ``folder_path``.
    index_value : str, default 'md5'
        Column used as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the feature columns only.
    """
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=index_value)
    # Only drop the target columns that actually exist, so a file lacking one
    # of them can still be previewed.
    target_columns = [col for col in ('label', 'sublabel') if col in df.columns]
    features = df.drop(target_columns, axis=1)
    print(f'Nb Observations: {features.shape[0]}')
    print(f'Nb Features: {features.shape[1]}')
    return features.head(3)

def get_data(folder_path, file_name, index_value = 'md5'):
    """Load a dataset CSV into a DataFrame indexed by ``index_value``.

    The frame is returned exactly as read: no column (targets included) is
    removed.
    """
    csv_path = os.path.join(folder_path, file_name)
    return pd.read_csv(csv_path, index_col=index_value)

def get_null_columns(df):
    """Return the names of the columns whose every value equals 0.

    "Null" here means all-zero, not missing: a NaN never compares equal to 0,
    so a column containing NaN is not reported.
    """
    # A column qualifies when the element-wise comparison holds for every row.
    is_all_zero = df.eq(0).all(axis=0)
    return is_all_zero[is_all_zero].index.tolist()

def create_X_y(folder_path, file_name, drop_null_columns=False, index_value = 'md5', random_state=None):
    """Load a dataset and return its shuffled feature matrix and target.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV file.
    file_name : str
        Name of the CSV file inside ``folder_path``.
    drop_null_columns : bool, default False
        When true, feature columns that are 0 for every row are removed.
    index_value : str, default 'md5'
        Column used as the DataFrame index.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``sklearn.utils.shuffle``. The default keeps the
        previous behaviour (a different row order on every call); pass an int
        to make the row order, and hence the cross-validation folds built on
        it, reproducible.

    Returns
    -------
    list
        ``[X, y]`` as returned by ``sklearn.utils.shuffle``: the features
        (every column except 'label' and 'sublabel') and the 'sublabel'
        target, shuffled consistently.

    Raises
    ------
    KeyError
        If the file has no 'label' or no 'sublabel' column.
    """
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, index_col = index_value)
    # Both target columns are excluded from the features; 'sublabel' is the
    # one that is predicted.
    X = df.drop(['label', 'sublabel'], axis=1)
    if drop_null_columns:
        X = X.drop(get_null_columns(X), axis=1)
    y = df['sublabel']
    return shuffle(X, y, random_state=random_state)


def create_regular_net():
    """Build and compile a small fully connected network for binary output.

    Two hidden ReLU layers of 10 units each feed a single sigmoid unit; the
    model is compiled with binary cross-entropy, the Adam optimizer and
    accuracy as the reported metric. No input shape is declared here.
    """
    hidden_settings = dict(units=10, kernel_initializer='uniform', activation='relu')
    model = Sequential([
        Dense(name='dense_layer1', **hidden_settings),
        Dense(name='dense_layer2', **hidden_settings),
        # Single sigmoid unit: binary classification output.
        Dense(1, activation='sigmoid', name='dense_output'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def wrap_regular_net():
    """Wrap ``create_regular_net`` in a KerasClassifier so it can be passed
    to the same scikit-learn evaluation code as the other models.

    Training runs silently (verbose=0) for 50 epochs with batches of 64.
    """
    training_settings = {'epochs': 50, 'batch_size': 64, 'verbose': 0}
    return KerasClassifier(build_fn=create_regular_net, **training_settings)


def create_models():
    """Return a fresh ``{display name: estimator}`` dict of the classifiers to compare.

    Every estimator uses its library defaults, except the Keras network whose
    settings are fixed in ``wrap_regular_net``. The insertion order below is
    the order in which results are later reported.
    """
    # SGDClassifier ('Stochastic Gradient Descent') is deliberately left out.
    return {
        'LogisticRegression': LogisticRegression(),
        'KNN': KNeighborsClassifier(),
        'Decision tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(),
        'RegularNets': wrap_regular_net(),
        'LDA': LDA(),
        'Gaussian Naive Bayes': GaussianNB(),
    }


def evaluate_model(model, X, y):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    Uses 10 folds repeated 3 times (splitter seeded with random_state=1) and
    returns the accuracy scores produced by ``cross_val_score``. Work is
    spread over all available cores (n_jobs=-1), and a failing fit raises
    instead of being recorded as a score (error_score='raise').
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )


def get_evaluation_results(folder_path, file_name, drop_null_columns=False, scaler=None):
    """Cross-validate every model from ``create_models`` on one dataset.

    Parameters
    ----------
    folder_path, file_name : str
        Location of the dataset CSV, forwarded to ``create_X_y``.
    drop_null_columns : bool, default False
        Forwarded to ``create_X_y``.
    scaler : transformer or None, default None
        When given, each model is evaluated as the final step of a
        ``Pipeline`` whose first step is this scaler; otherwise the bare
        model is evaluated.

    Returns
    -------
    (list, list)
        Model names and, in the same order, the scores returned by
        ``evaluate_model`` for each of them.
    """
    # Features and target are loaded once and shared by all models.
    X, y = create_X_y(folder_path, file_name, drop_null_columns)
    candidates = create_models()

    def with_preprocessing(estimator):
        # Prepend the scaler only when the caller supplied one.
        if scaler is None:
            return estimator
        return Pipeline([('transformer', scaler), ('estimator', estimator)])

    names = list(candidates)
    results = [
        evaluate_model(with_preprocessing(estimator), X, y)
        for estimator in candidates.values()
    ]
    return names, results


def print_evaluation_results(results, names):
    """Print one line per model: name, mean score and standard deviation.

    ``results`` and ``names`` are paired positionally; both statistics are
    shown as percentages with three decimals.
    """
    for name, scores in zip(names, results):
        mean_pct = np.mean(scores) * 100
        std_pct = np.std(scores) * 100
        print(f"{name:30} \t: {mean_pct:.3f}% ( (+/-) {std_pct:.3f}% )")

### 1. apistats (one-hot encoded)

In [None]:
# Dataset for this section: one-hot encoded "apistats" features.
file_name = 'onehot_encoded_apistats_ransom_dataset.csv'

# Prints the dataset size and shows its first three rows.
preview_data(folder_path, file_name)

Nb Observations: 2618
Nb Features: 300


Unnamed: 0_level_0,GetUserNameExW,SetFileTime,GetFileVersionInfoSizeW,GetFileAttributesW,RegOpenKeyExW,NtDelayExecution,SetErrorMode,RegOpenKeyExA,RtlRemoveVectoredExceptionHandler,SetFilePointerEx,RtlAddVectoredExceptionHandler,FindResourceExW,NtCreateFile,GetSystemTimeAsFileTime,GlobalMemoryStatusEx,LoadResource,CoInitializeSecurity,SetFileAttributesW,NtQueryInformationFile,RegCreateKeyExW,WriteProcessMemory,RegQueryValueExA,LookupPrivilegeValueW,NtQueryValueKey,RegQueryValueExW,CreateActCtxW,NtReadFile,NtWriteFile,LdrGetDllHandle,NtQuerySystemInformation,CreateThread,GetSystemDirectoryW,GetVolumeNameForVolumeMountPointW,CoCreateInstanceEx,GetSystemDirectoryA,NtProtectVirtualMemory,CoInitializeEx,GlobalMemoryStatus,RegSetValueExW,LdrGetProcedureAddress,...,CryptEncrypt,GetDiskFreeSpaceExW,RtlCompressBuffer,NtCreateUserProcess,CIFrameElement_CreateElement,CertOpenSystemStoreA,FindWindowExA,NetGetJoinInformation,CryptDecrypt,InternetOpenUrlW,NtQueryMultipleValueKey,Thread32First,Thread32Next,NtQueueApcThread,WSASocketA,accept,WSASend,IWbemServices_ExecMethod,WSAConnect,PRF,SendNotifyMessageA,NtDeleteFile,NtQueryFullAttributesFile,FindFirstFileExA,__anomaly__,DnsQuery_A,GetFileVersionInfoSizeExW,GetFileVersionInfoExW,CDocument_write,CertCreateCertificateContext,NetUserGetInfo,DecryptMessage,EncryptMessage,ReadCabinetState,CryptProtectMemory,CryptUnprotectMemory,WNetGetProviderNameW,CreateRemoteThreadEx,RtlCreateUserProcess,system
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,1,0,1,1,0,0,1,0,0,0,0,1,1,0,1,0,1,1,0,0,1,0,1,1,0,1,1,1,1,0,1,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load the complete frame (no column removed) for ad-hoc inspection.
df = get_data(folder_path, file_name)
df.head()

Unnamed: 0_level_0,GetUserNameExW,SetFileTime,GetFileVersionInfoSizeW,GetFileAttributesW,RegOpenKeyExW,NtDelayExecution,SetErrorMode,RegOpenKeyExA,RtlRemoveVectoredExceptionHandler,SetFilePointerEx,RtlAddVectoredExceptionHandler,FindResourceExW,NtCreateFile,GetSystemTimeAsFileTime,GlobalMemoryStatusEx,LoadResource,CoInitializeSecurity,SetFileAttributesW,NtQueryInformationFile,RegCreateKeyExW,WriteProcessMemory,RegQueryValueExA,LookupPrivilegeValueW,NtQueryValueKey,RegQueryValueExW,CreateActCtxW,NtReadFile,NtWriteFile,LdrGetDllHandle,NtQuerySystemInformation,CreateThread,GetSystemDirectoryW,GetVolumeNameForVolumeMountPointW,CoCreateInstanceEx,GetSystemDirectoryA,NtProtectVirtualMemory,CoInitializeEx,GlobalMemoryStatus,RegSetValueExW,LdrGetProcedureAddress,...,CryptEncrypt,GetDiskFreeSpaceExW,RtlCompressBuffer,NtCreateUserProcess,CIFrameElement_CreateElement,CertOpenSystemStoreA,FindWindowExA,NetGetJoinInformation,CryptDecrypt,InternetOpenUrlW,NtQueryMultipleValueKey,Thread32First,Thread32Next,NtQueueApcThread,WSASocketA,accept,WSASend,IWbemServices_ExecMethod,WSAConnect,PRF,SendNotifyMessageA,NtDeleteFile,NtQueryFullAttributesFile,FindFirstFileExA,__anomaly__,DnsQuery_A,GetFileVersionInfoSizeExW,GetFileVersionInfoExW,CDocument_write,CertCreateCertificateContext,NetUserGetInfo,DecryptMessage,EncryptMessage,ReadCabinetState,CryptProtectMemory,CryptUnprotectMemory,WNetGetProviderNameW,CreateRemoteThreadEx,RtlCreateUserProcess,system
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,1,0,1,1,0,0,1,0,0,0,0,1,1,0,1,0,1,1,0,0,1,0,1,1,0,1,1,1,1,0,1,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
00a53241bf9c9425c6df8da44a5ca4f4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
00bb04604996c97b7b4f8b2c767c0f40,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# NOTE(review): X and y are not referenced by the cells visible after this one
# (get_evaluation_results reloads the data itself) — confirm this cell is still needed.
X, y = create_X_y(folder_path, file_name)

In [None]:
# Time the cross-validation of every model on this dataset.
# No scaler is passed: the models are evaluated on the features as loaded.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Cross-validation 8 models required a duration of 132.93 seconds


In [None]:
# Mean accuracy (+/- standard deviation) per model for the run above.
print_evaluation_results(results, names)

LogisticRegression             	: 97.607% ( (+/-) 0.823% )
KNN                            	: 97.467% ( (+/-) 0.921% )
Decision tree                  	: 96.664% ( (+/-) 1.092% )
Random Forest                  	: 97.467% ( (+/-) 0.947% )
SVM                            	: 97.543% ( (+/-) 1.012% )
RegularNets                    	: 97.645% ( (+/-) 1.062% )
LDA                            	: 96.958% ( (+/-) 1.150% )
Gaussian Naive Bayes           	: 52.916% ( (+/-) 2.990% )


In [None]:
# plot_evaluation_boxplots and save_figures_to_html are not defined in the part
# of the notebook shown here; presumably they come from the plot_utils star
# import — TODO confirm.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "apistats" Ransomware Data', y_axis = 'Accuracy')

# NOTE: relative path (unlike folder_path), so it resolves against the
# current working directory.
figure_path = 'drive/MyDrive/figures/evaluation_onehot_apistats_data.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the cell output.
IFrame(figure_path, width=900, height=600)


### 2. apistats (count encoded)

In [None]:
# Dataset for this section: count encoded "apistats" features.
file_name = 'count_encoded_apistats_ransom_dataset.csv'

# Prints the dataset size and shows its first three rows.
preview_data(folder_path, file_name)

Nb Observations: 2618
Nb Features: 300


Unnamed: 0_level_0,GetUserNameExW,SetFileTime,GetFileVersionInfoSizeW,GetFileAttributesW,RegOpenKeyExW,NtDelayExecution,SetErrorMode,RegOpenKeyExA,RtlRemoveVectoredExceptionHandler,SetFilePointerEx,RtlAddVectoredExceptionHandler,FindResourceExW,NtCreateFile,GetSystemTimeAsFileTime,GlobalMemoryStatusEx,LoadResource,CoInitializeSecurity,SetFileAttributesW,NtQueryInformationFile,RegCreateKeyExW,WriteProcessMemory,RegQueryValueExA,LookupPrivilegeValueW,NtQueryValueKey,RegQueryValueExW,CreateActCtxW,NtReadFile,NtWriteFile,LdrGetDllHandle,NtQuerySystemInformation,CreateThread,GetSystemDirectoryW,GetVolumeNameForVolumeMountPointW,CoCreateInstanceEx,GetSystemDirectoryA,NtProtectVirtualMemory,CoInitializeEx,GlobalMemoryStatus,RegSetValueExW,LdrGetProcedureAddress,...,CryptEncrypt,GetDiskFreeSpaceExW,RtlCompressBuffer,NtCreateUserProcess,CIFrameElement_CreateElement,CertOpenSystemStoreA,FindWindowExA,NetGetJoinInformation,CryptDecrypt,InternetOpenUrlW,NtQueryMultipleValueKey,Thread32First,Thread32Next,NtQueueApcThread,WSASocketA,accept,WSASend,IWbemServices_ExecMethod,WSAConnect,PRF,SendNotifyMessageA,NtDeleteFile,NtQueryFullAttributesFile,FindFirstFileExA,__anomaly__,DnsQuery_A,GetFileVersionInfoSizeExW,GetFileVersionInfoExW,CDocument_write,CertCreateCertificateContext,NetUserGetInfo,DecryptMessage,EncryptMessage,ReadCabinetState,CryptProtectMemory,CryptUnprotectMemory,WNetGetProviderNameW,CreateRemoteThreadEx,RtlCreateUserProcess,system
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000a3ea381d7d70be8b6fe1ee51dca22,8,3,1173,876,475,1387,810,58,1,102,2,16,860,101,4,34,5,3,318,461,672,66,17,953,503,19,1600,13442,830,125,34,11,10,4,2,436,32,121,438,3406,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,8,10,1,10,2,0,0,0,0,4,0,0,0,0,1,0,0,6,0,2,5,2,0,0,0,230,0,0,0,0,0,0,0,0,0,0,361,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,2,0,6,1,0,0,1,0,0,0,0,5,8,0,11,0,2,5,0,0,2,0,8,1,0,2,13,11,1,0,2,0,0,3,518,0,0,0,311706,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time the cross-validation of every model on this dataset.
# The counts span very different magnitudes (see the preview), so a
# StandardScaler is placed in front of each model inside the evaluated pipeline.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Cross-validation on 8 models required a duration of 146.07 seconds


In [None]:
# Mean accuracy (+/- standard deviation) per model for the run above.
print_evaluation_results(results, names)

LogisticRegression             	: 92.564% ( (+/-) 1.412% )
KNN                            	: 96.906% ( (+/-) 0.979% )
Decision tree                  	: 96.995% ( (+/-) 0.907% )
Random Forest                  	: 97.555% ( (+/-) 0.786% )
SVM                            	: 92.666% ( (+/-) 1.091% )
RegularNets                    	: 92.615% ( (+/-) 1.715% )
LDA                            	: 91.966% ( (+/-) 1.322% )
Gaussian Naive Bayes           	: 60.325% ( (+/-) 2.991% )


In [None]:
# Box plot of the per-model accuracy scores (helper presumably provided by
# plot_utils — TODO confirm).
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Count Encoded "apistats" Data', y_axis = 'Accuracy')

# NOTE: relative path, resolved against the current working directory.
figure_path = 'drive/MyDrive/figures/evaluation_count_apistats_data.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the cell output.
IFrame(figure_path, width=900, height=600)

### 3. dll_loaded (one-hot encoded)

In [None]:
# Dataset for this section: one-hot encoded "dll_loaded" features.
file_name = 'onehot_encoded_dll_ransom_dataset.csv'

# Prints the dataset size and shows its first three rows.
preview_data(folder_path, file_name)

Nb Observations: 2168
Nb Features: 1603


Unnamed: 0_level_0,NETAPI32.dll,wab32res.dll,C:\Windows\system32\acctres.dll,API-MS-Win-Security-LSALookup-L1-1-0.dll,DNSAPI.dll,UxTheme.dll,msftedit.dll,dwmapi.dll,C:\Program Files (x86)\Common Files\System\wab32.dll,slc.dll,C:\Windows\system32\uxtheme.dll,ncrypt.dll,API-MS-WIN-Service-Management-L2-1-0.dll,C:\Windows\system32\inetres.dll,imagehlp.dll,SspiCli.dll,advapi32.dll,comctl32,psapi.dll,SHLWAPI.dll,USER32.dll,C:\Program Files\Windows Media Player\wmpnssci.dll,C:\Program Files\Common Files\System\wab32.dll,gdi32.dll,C:\Windows\System32\mswsock.dll,SHELL32.dll,CLBCatQ.DLL,C:\Program Files\Windows Mail\WinMail.exe,WINMM.dll,C:\Program Files\Common Files\System\wab32res.dll,rpcrt4.dll,COMCTL32.DLL,urlmon.dll,kernel32.dll,C:\Windows\system32\IMM32.DLL,CRYPTBASE.dll,oleaut32.dll,C:\Windows\system32\wbem\wbemsvc.dll,C:\Windows\system32\napinsp.dll,C:\Windows\System32\msxml6.dll,...,C:\Users\cucko\AppData\Local\Temp\VirusShare_1d86214fb2e5e9df06044614795c1710.dll,rasauto32.dll,C:\Users\cucko\AppData\Local\Temp\psapi,psapi,C:\Users\cucko\AppData\Local\Temp\IXP000.TMP\DVDSHR~1LOC.dll,C:\Users\cucko\AppData\Local\Temp\IXP000.TMP\DVDSHR~1PTB.dll,C:\Windows\kernel32.dll,C:\Users\Administrator\AppData\Local\Temp\kernel32.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_4c6bddcca2695d6202df38708e14fc7e.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_51326bf40da5a5357a143dd9a6e6a11c.exe,Iphlpapi.dll,C:\Program Files (x86)\Mozilla Firefox\nssutil3.dll,C:\Program Files (x86)\Mozilla Firefox\plds4.dll,C:\Program Files (x86)\Mozilla Firefox\plc4.dll,C:\Program Files (x86)\Mozilla Firefox\softokn3.dll,C:\Program Files (x86)\Mozilla Firefox\nspr4.dll,C:\Program Files (x86)\Mozilla Firefox\sqlite3.dll,C:\Program Files (x86)\Mozilla Firefox\mozcrt19.dll,C:\Program Files (x86)\Mozilla 
Firefox\nss3.dll,C:\Windows\SysWOW64\28463\KGCU.006,C:\Users\cucko\AppData\Local\Temp\@ABB4.tmp,C:\Users\cucko\AppData\Local\Temp\@71F7.tmp,KGCU.006,KGCU.007,oleacc.dll,C:\Windows\WinSxS\amd64_microsoft.vc80.crt_1fc8b3b9a1e18e3b_8.0.50727.4940_none_88df89932faf0bf6\MSVCR80.dll,C:\Windows\WinSxS\amd64_microsoft.vc80.crt_1fc8b3b9a1e18e3b_8.0.50727.4940_none_88df89932faf0bf6\msvcm80.dll,C:\Windows\Microsoft.NET\Framework64\v2.0.50727\VERSION.dll,IdnDL.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_9e860622fee66074dfe81dcfcc40c4e2.dll,msvcr100.dll,C:\Windows\DXGIDebug.dll,C:\Windows\System32\hhctrl.ocx,C:\Users\cucko\AppData\Local\Temp\VirusShare_baabd9b76bff84ed27fd432cfc6df241.exe,C:\Users\cucko\AppData\Local\Temp\VirusShare_bf80dbf969b73790253f683cd723fd71.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_c6a4bb1a4e4f69ec71855d70d6960859.dll,msvcrt,wInINeT.dll,setupapi,C:\Users\cucko\AppData\Local\Temp\VirusShare_ef8e0fb20e7228c7492ccdc59d87c690.dll
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time the cross-validation of every model on this dataset.
# No scaler is passed: the models are evaluated on the features as loaded.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Cross-validation 8 models required a duration of 322.31 seconds


In [None]:
# Mean accuracy (+/- standard deviation) per model for the run above.
print_evaluation_results(results, names)

LogisticRegression             	: 97.817% ( (+/-) 0.981% )
KNN                            	: 97.525% ( (+/-) 1.113% )
Decision tree                  	: 97.017% ( (+/-) 1.236% )
Random Forest                  	: 97.802% ( (+/-) 0.993% )
SVM                            	: 97.586% ( (+/-) 1.000% )
RegularNets                    	: 97.586% ( (+/-) 0.957% )
LDA                            	: 95.788% ( (+/-) 1.517% )
Gaussian Naive Bayes           	: 71.018% ( (+/-) 2.062% )


In [None]:
# Box plot of the per-model accuracy scores (helper presumably provided by
# plot_utils — TODO confirm).
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "dll_loaded" Data', y_axis = 'Accuracy')

# NOTE: relative path, resolved against the current working directory.
figure_path = 'drive/MyDrive/figures/evaluation_onehot_dll_data.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the cell output.
IFrame(figure_path, width=900, height=600)

### 4. File operations (counts summary)

In [None]:
# Dataset for this section: per-sample counts of file operations.
file_name = 'file_operations_counts_ransom_dataset.csv'

# Prints the dataset size and shows its first three rows.
preview_data(folder_path, file_name)

Nb Observations: 2392
Nb Features: 11


Unnamed: 0_level_0,file_opened,file_read,file_created,file_moved,file_written,file_recreated,file_failed,file_deleted,file_exists,sublabel,file_copied
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
000a3ea381d7d70be8b6fe1ee51dca22,69,18,16,1,18,7,7,6,37,1,0
001cfa63ad79aaf3e4a2b85a2e7f227f,2,0,0,1,0,0,1,0,4,1,0
003e845bdcc5367220bf13f7170da16f,7,1,3,0,2,0,0,0,4,1,0


In [None]:
# Time the cross-validation of every model on this dataset.
# Count features: a StandardScaler is placed in front of each model inside
# the evaluated pipeline.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Cross-validation 8 models required a duration of 88.68 seconds


In [None]:
# Mean accuracy (+/- standard deviation) per model for the run above.
print_evaluation_results(results, names)

LogisticRegression             	: 83.083% ( (+/-) 0.489% )
KNN                            	: 93.255% ( (+/-) 1.831% )
Decision tree                  	: 92.990% ( (+/-) 1.662% )
Random Forest                  	: 94.203% ( (+/-) 1.273% )
SVM                            	: 82.887% ( (+/-) 0.209% )
RegularNets                    	: 90.886% ( (+/-) 2.046% )
LDA                            	: 83.222% ( (+/-) 0.469% )
Gaussian Naive Bayes           	: 56.470% ( (+/-) 24.123% )


In [None]:
# Box plot of the per-model accuracy scores (helper presumably provided by
# plot_utils — TODO confirm).
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On File Operation Counts Data', y_axis = 'Accuracy')

# NOTE: relative path, resolved against the current working directory.
figure_path = 'drive/MyDrive/figures/evaluation_file_operation_counts.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the cell output.
IFrame(figure_path, width=900, height=600)

### 5. Registry Key Operations (counts summary)

In [None]:
# Dataset for this section: per-sample counts of registry key operations.
file_name = 'regkeys_counts_ransom_dataset.csv'

# Prints the dataset size and shows its first three rows.
preview_data(folder_path, file_name)

Nb Observations: 2392
Nb Features: 5


Unnamed: 0_level_0,regkey_opened,regkey_written,regkey_deleted,regkey_read,sublabel
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000a3ea381d7d70be8b6fe1ee51dca22,222,60,4,561,1
001cfa63ad79aaf3e4a2b85a2e7f227f,9,0,0,5,1
003e845bdcc5367220bf13f7170da16f,3,1,0,9,1


In [None]:
# Time the cross-validation of every model on this dataset.
# Count features: a StandardScaler is placed in front of each model inside
# the evaluated pipeline.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Cross-validation 8 models required a duration of 85.94 seconds


In [None]:
# Mean accuracy (+/- standard deviation) per model for the run above.
# NOTE(review): in the recorded output LogisticRegression, SVM and RegularNets
# share exactly the same score (82.818% +/- 0.108%), which looks like constant
# majority-class prediction — verify before drawing conclusions.
print_evaluation_results(results, names)

LogisticRegression             	: 82.818% ( (+/-) 0.108% )
KNN                            	: 91.138% ( (+/-) 1.519% )
Decision tree                  	: 91.011% ( (+/-) 1.534% )
Random Forest                  	: 91.471% ( (+/-) 1.288% )
SVM                            	: 82.818% ( (+/-) 0.108% )
RegularNets                    	: 82.818% ( (+/-) 0.108% )
LDA                            	: 82.860% ( (+/-) 0.172% )
Gaussian Naive Bayes           	: 51.045% ( (+/-) 2.671% )


In [None]:
# Box plot of the per-model accuracy scores (helper presumably provided by
# plot_utils — TODO confirm).
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Registry Key Operation Counts Data', y_axis = 'Accuracy')

# NOTE: relative path, resolved against the current working directory.
figure_path = 'drive/MyDrive/figures/evaluation_registry_key_counts.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the cell output.
IFrame(figure_path, width=900, height=600)

### 6. PE Entropy

In [None]:
# Dataset for this section: PE entropy features (columns are PE section names).
file_name = 'pe_entropy_ransom_dataset.csv'

# Prints the dataset size and shows its first three rows.
preview_data(folder_path, file_name)

Nb Observations: 3051
Nb Features: 714


Unnamed: 0_level_0,.text,.rdata,.data,.data2,.rsrc,.reloc,sublabel,.pdata,UPX0,UPX1,.idata,fdata,CODE,DATA,BSS,.data6,.data5,.data4,.data3,.gdata,INIT,\xcd\x9d\x02\x00oc,evglglf,PAGE,.edata,.tls,.rdata2,.INIT3,.text3,.text2,.rdata4,\x9b4\x0c\x00\xd2\x14,rt_ecx,rt_eax,rt_esp,rt_edx,rt_ebp,rt_ebx,rt_edi,.ndata,...,10,.flat,.api,\xe1CO\x00\xbc\x11,.klgnmx,.svcjif,.rql,\x87v\x8f\x00\xa8c,.wrqnuz,.bax,.tqpgx,.pgb,0\x1bu\x00\x00\xa0\x03,\xe7@\xd0\x00\xd6#,9e)\x00\xf0\x1c\x02,\x00,.idata,Unnamed: 59_level_0,gbgaegbf,ahgpkzfv,\xc3=\xf6\x00\x00\xb0\x84,\x8e\x07\x00c,\xb1\xaa\x02\x00c,hevuxhr,\x8dH\x01\x00oc,\x01\\x07\x00c,\x99\x87\x02\x00c,!\x9f\x04\x00c,\xf9\x00\x03\x00c,^\x06\x01\x00ta,hultkjl,#\r\n\x00c,.init,\x99\xe3\x00\x00c,T\t\x00oc,CDS0,CDS1,PS\xff\xd5\xab\xeb\xe7\xc3,\x10@\x00\x88=A,\xfc\xe4@\x00\xfc\x0f@
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000a3ea381d7d70be8b6fe1ee51dca22,7,5,0,0,5,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,4,0,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,6,0,5,0,7,0,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time a full cross-validation run over every model on the dataset in `file_name`.
# NOTE(review): `drop_null_columns=True` presumably removes the all-zero columns
# found by `get_null_columns`, and `scaler` is presumably applied before each model
# is fitted - confirm against `get_evaluation_results`.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, drop_null_columns=True, scaler=StandardScaler())

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Cross-validation 8 models required a duration of 181.84 seconds


In [None]:
# One line per model: score and spread, both as percentages (see the output below).
print_evaluation_results(results, names)

LogisticRegression             	: 96.067% ( (+/-) 1.030% )
KNN                            	: 97.203% ( (+/-) 1.118% )
Decision tree                  	: 96.766% ( (+/-) 1.242% )
Random Forest                  	: 97.433% ( (+/-) 1.063% )
SVM                            	: 85.469% ( (+/-) 0.209% )
RegularNets                    	: 96.832% ( (+/-) 1.120% )
LDA                            	: 93.008% ( (+/-) 1.375% )
Gaussian Naive Bayes           	: 28.066% ( (+/-) 1.950% )


In [None]:
# Box plot of the per-model cross-validation scores.
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Entropy Data', y_axis = 'Accuracy')

# NOTE(review): this path is relative to the working directory and points into the
# mounted drive, while later cells save under a plain 'figures/' directory - confirm
# which location is intended.
figure_path = 'drive/MyDrive/figures/evaluation_pe_entropy.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the notebook output.
IFrame(figure_path, width=900, height=600)

### 7. PE Imports (One-Hot Encoded)

Considering all PE import functions at once generates a dataset with over 20,000 columns. We will therefore analyse the imports belonging to different libraries (e.g. kernel32.dll, user32.dll...) separately and try to select the ones with the highest feature importance (top 1000).

**Libraries**

In [None]:
# One-hot encoded dataset: one column per imported DLL, plus `label` and `sublabel`.
file_name = 'encoded_pe_imports_dll_libraries_dataset.csv'

# Reuse the notebook's loader (reads the CSV indexed by 'md5'); the frame is kept
# around for the class-distribution plots further down.
libraries_df = get_data(folder_path, file_name)

libraries_df.head(3)

Unnamed: 0_level_0,advapi32.dll,kernel32.dll,user32.dll,msvcrt.dll,cmutil.dll,ole32.dll,shell32.dll,version.dll,label,sublabel,...,libgobject-2.0-0.dll,libgtk-win32-2.0-0.dll,libgimp-2.0-0.dll,libgimpbase-2.0-0.dll,libgimpwidgets-2.0-0.dll,pcwum.dll,xevtchn.sys,xenutil.sys,netinst.dll,srcore.dll
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,1,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time the cross-validation of every model on the DLL-libraries dataset,
# using the default options of `get_evaluation_results`.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 85.37 seconds


In [None]:
# One line per model: score and spread, both as percentages (see the output below).
print_evaluation_results(results, names)

LogisticRegression             	: 79.128% ( (+/-) 1.738% )
KNN                            	: 82.537% ( (+/-) 1.740% )
Decision tree                  	: 85.003% ( (+/-) 1.612% )
Random Forest                  	: 86.140% ( (+/-) 1.735% )
SVM                            	: 84.323% ( (+/-) 1.543% )
RegularNets                    	: 83.882% ( (+/-) 1.658% )
LDA                            	: 78.602% ( (+/-) 1.595% )
Gaussian Naive Bayes           	: 67.038% ( (+/-) 1.100% )


In [None]:
# Box plot of the per-model cross-validation scores on the DLL-libraries dataset.
# The title names the PE *imports* data evaluated in this section (it previously
# carried the "PE Entropy" wording copied from the preceding section).
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Imports Libraries Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_pe_imports.html'

save_figures_to_html(figure_path, [fig])

# Embed the saved HTML file in the notebook output.
IFrame(figure_path, width=900, height=600)

**Selecting Libraries**

In [None]:
def k_best_selection(folder_path, file_name, k=10):
    """Return the names of the `k` features ranked highest by `SelectKBest(f_classif)`.

    Parameters
    ----------
    folder_path, file_name
        Forwarded unchanged to `create_X_y`, which supplies the feature
        frame and the target.
    k : int, default 10
        Number of features to keep.

    Returns
    -------
    list
        Selected column names, in the order in which they appear in the
        feature frame.
    """
    features, target = create_X_y(folder_path, file_name)

    fitted_selector = SelectKBest(f_classif, k=k)
    fitted_selector.fit(features, target)

    # Positional indices of the retained columns, mapped back to their names.
    kept_positions = fitted_selector.get_support(indices=True)
    return features.columns[kept_positions].tolist()

In [None]:
# The 10 library columns ranked highest by the univariate F-test selection.
k_best_selected_libraries = k_best_selection(folder_path, file_name, k=10)
k_best_selected_libraries

['kernel32.dll',
 'user32.dll',
 'msvcrt.dll',
 'ole32.dll',
 'shell32.dll',
 'oleaut32.dll',
 'comctl32.dll',
 'comdlg32.dll',
 'winmm.dll',
 'ntdll.dll']

In [None]:
def most_recurrent(folder_path, file_name, portion=0.05):
    """Return the most frequently set feature columns of a dataset.

    Parameters
    ----------
    folder_path, file_name
        Forwarded unchanged to `get_data` to load the dataset.
    portion : float, default 0.05
        Fraction of the dataset's column count to keep.

    Returns
    -------
    pandas.Series
        Column sums (indexed by column name) sorted in descending order and
        truncated to the requested portion. `label` and `sublabel` are excluded.
    """
    dataset = get_data(folder_path, file_name)

    # NOTE(review): the cut-off is derived from *all* columns, i.e. it still counts
    # `label` and `sublabel`, which are dropped just below - confirm this is intended.
    top_n = int(portion * len(dataset.columns))

    occurrences = dataset.drop(['label', 'sublabel'], axis=1).sum()
    ranked = occurrences.sort_values(ascending=False)
    return ranked.iloc[:top_n]

In [None]:
# Retrieve the most recurrent libraries in PE imports across all data samples
most_recurrent_libraries = most_recurrent(folder_path, file_name, portion=0.05)

# Highlight the bars of the libraries that were also picked by the k-best selection.
# The mask is built only from names defined at notebook level, so the cell works on a
# fresh kernel; a k-best library that is not among the most recurrent ones is simply
# not highlighted instead of raising an error.
is_k_best = most_recurrent_libraries.index.isin(k_best_selected_libraries)
colors = np.where(is_k_best, 'lightblue', 'lightslategray')

# Plot
fig = go.Figure(data=[go.Bar(x=most_recurrent_libraries.index,
                             y=most_recurrent_libraries.values,
                             marker_color=colors)])

fig.update_layout(title='The top 5% most recurrent libraries in PE imports', title_x=0.3)

fig.show()

Let's explore which class is in the majority among the samples that import each of these libraries:

In [None]:
def plot_class_distribution(df, libraries, title):
    """Show a grouped bar chart of the class counts per feature column.

    For every column name in `libraries`, only the rows where that column is
    non-zero are considered. The 'Malware' bar is the sum of `label` over those
    rows and the 'Goodware' bar is the remaining row count.
    NOTE(review): this assumes `label` is 1 for malware and 0 for goodware - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the columns listed in `libraries` and a `label` column.
    libraries : list
        Column names to plot along the x axis.
    title : str
        Figure title.
    """
    malware = []
    goodware = []
    for library in libraries:
        # Rows (samples) in which this column is set.
        subset = df[df[library] != 0]
        positives = subset['label'].sum()
        malware.append(positives)
        goodware.append(len(subset) - positives)

    n_bars = len(libraries)
    malware_bar = go.Bar(name='Malware', x=libraries, y=malware,
                         marker_color=np.repeat('crimson', n_bars))
    goodware_bar = go.Bar(name='Goodware', x=libraries, y=goodware,
                          marker_color=np.repeat('lightslategray', n_bars))

    fig = go.Figure(data=[malware_bar, goodware_bar])
    fig.update_layout(barmode='group', title=title, title_x=0.2)
    fig.show()

In [None]:
# Class distribution for the 10 most recurrent libraries.
plot_class_distribution(df = libraries_df,
                        libraries = most_recurrent_libraries.iloc[:10].index.tolist(),
                        title = 'The class distribution for the most recurrent libraries in PE imports')

In [None]:
# Class distribution for the k-best selected libraries (the list computed by
# `k_best_selection` above, not a function-local name).
plot_class_distribution(df = libraries_df,
                        libraries = k_best_selected_libraries,
                        title = 'The class distribution for the k-selected libraries in PE imports')

**kernel32.dll**

In [None]:
# Library whose imported functions are evaluated in this subsection.
dll_name = 'kernel32.dll'

In [None]:
# Per-library dataset file name built from the DLL's base name,
# e.g. 'kernel32.dll' -> 'encoded_kernel32_pe_imports_dataset.csv'.
file_name = 'encoded_' + dll_name.split('.')[0] + '_pe_imports_dataset.csv'

# Prints the dataset dimensions and shows its first three rows without `label`.
preview_data(folder_path, file_name)

Nb Observations: 3609
Nb Features: 1007


Unnamed: 0_level_0,GetCommandLineW,CloseHandle,CreateFileW,lstrlenW,GetWindowsDirectoryW,WritePrivateProfileSectionW,CompareStringW,GetPrivateProfileSectionW,lstrcmpW,GetLastError,...,SetSearchPathMode,ExpungeConsoleCommandHistoryW,SetConsoleNumberOfCommandsW,GetConsoleCommandHistoryLengthW,GetConsoleCommandHistoryW,K32QueryWorkingSetEx,NormalizeString,CallbackMayRunLong,TrySubmitThreadpoolCallback,CloseThreadpoolCleanupGroup
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,1,1,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,1,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time the cross-validation of every model on the current per-library dataset.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 199.82 seconds


In [None]:
# One line per model: score and spread, both as percentages (see the output below).
print_evaluation_results(results, names)

LogisticRegression             	: 88.593% ( (+/-) 1.489% )
KNN                            	: 87.467% ( (+/-) 1.712% )
Decision tree                  	: 86.756% ( (+/-) 1.484% )
Random Forest                  	: 90.395% ( (+/-) 1.765% )
SVM                            	: 89.563% ( (+/-) 1.700% )
RegularNets                    	: 88.685% ( (+/-) 1.775% )
LDA                            	: 85.249% ( (+/-) 2.146% )
Gaussian Naive Bayes           	: 63.664% ( (+/-) 4.716% )


In [None]:
# Box plot of the per-model scores; the title embeds the DLL's base name.
fig = plot_evaluation_boxplots(results, 
                               names, 
                               title = 'Models Performance On One-hot Encoded ' + dll_name.split('.')[0] + ' PE Imports Data', 
                               y_axis = 'Accuracy')

# NOTE(review): every per-library subsection saves to this same path. Whether
# `save_figures_to_html` appends to or overwrites an existing file is not visible
# here - confirm that the figures of the other libraries are not lost.
figure_path = 'figures/evaluation_encoded_pe_imports.html'

save_figures_to_html(figure_path, [fig])

**user32.dll**

In [None]:
# Library whose imported functions are evaluated in this subsection.
dll_name = 'user32.dll'

In [None]:
# Per-library dataset file name built from the DLL's base name,
# e.g. 'user32.dll' -> 'encoded_user32_pe_imports_dataset.csv'.
file_name = 'encoded_' + dll_name.split('.')[0] + '_pe_imports_dataset.csv'

# Prints the dataset dimensions and shows its first three rows without `label`.
preview_data(folder_path, file_name)

Nb Observations: 2746
Nb Features: 728


Unnamed: 0_level_0,EndDialog,CheckDlgButton,SetWindowTextW,CharPrevW,CharNextW,LoadStringW,MessageBoxW,IsDlgButtonChecked,SetFocus,GetDlgItem,...,UnregisterSessionPort,RegisterSessionPort,CheckDesktopByThreadId,DwmStopRedirection,DwmStartRedirection,AlignRects,GetGestureConfig,IsTouchWindow,GetIconInfoExW,GetWindowMinimizeRect
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time the cross-validation of every model on the current per-library dataset.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 135.40 seconds


In [None]:
# One line per model: score and spread, both as percentages (see the output below).
print_evaluation_results(results, names)

LogisticRegression             	: 84.134% ( (+/-) 2.529% )
KNN                            	: 83.345% ( (+/-) 2.424% )
Decision tree                  	: 84.377% ( (+/-) 2.211% )
Random Forest                  	: 88.541% ( (+/-) 2.037% )
SVM                            	: 85.238% ( (+/-) 2.788% )
RegularNets                    	: 85.786% ( (+/-) 1.991% )
LDA                            	: 81.500% ( (+/-) 2.523% )
Gaussian Naive Bayes           	: 51.529% ( (+/-) 1.441% )


In [None]:
# Box plot of the per-model scores; the title embeds the DLL's base name.
fig = plot_evaluation_boxplots(results, 
                               names, 
                               title = 'Models Performance On One-hot Encoded ' + dll_name.split('.')[0] + ' PE Imports Data', 
                               y_axis = 'Accuracy')

# NOTE(review): every per-library subsection saves to this same path. Whether
# `save_figures_to_html` appends to or overwrites an existing file is not visible
# here - confirm that the figures of the other libraries are not lost.
figure_path = 'figures/evaluation_encoded_pe_imports.html'

save_figures_to_html(figure_path, [fig])

Let's find the features that best distinguish malware from goodware:

In [None]:
# Names of the 20 user32.dll import columns ranked highest by the univariate F-test selection.
k_best_selected_user32 = k_best_selection(folder_path, file_name, k=20)

In [None]:
# Class distribution for the k-best selected PE imports of the user32.dll library.
# The dataset is re-read from disk for this plot.
plot_class_distribution(df = get_data(folder_path, file_name),
                        libraries = k_best_selected_user32,
                        title = 'The class distribution for the k-best selected PE imports in "user32.dll"')

In [None]:
# First 20 names from the default (top 5%) slice of the most frequently set columns.
most_recurrent_ = most_recurrent(folder_path, file_name).index.tolist()[:20]

In [None]:
# Class distribution for the most recurrent PE imports of the user32.dll library.
# The dataset is re-read from disk for this plot.
plot_class_distribution(df = get_data(folder_path, file_name),
                        libraries = most_recurrent_,
                        title = 'The class distribution for the most recurrent PE imports in "user32.dll"')

**msvcrt.dll**

In [None]:
# Library whose imported functions are evaluated in this subsection.
dll_name = 'msvcrt.dll'

In [None]:
# Per-library dataset file name built from the DLL's base name,
# e.g. 'msvcrt.dll' -> 'encoded_msvcrt_pe_imports_dataset.csv'.
file_name = 'encoded_' + dll_name.split('.')[0] + '_pe_imports_dataset.csv'

# Prints the dataset dimensions and shows its first three rows without `label`.
preview_data(folder_path, file_name)

Nb Observations: 1675
Nb Features: 789


Unnamed: 0_level_0,_controlfp,?terminate@@YAXXZ,memset,_vsnwprintf,__set_app_type,__p__fmode,__p__commode,__setusermatherr,_amsg_exit,_initterm,...,_adj_fptan,_execvp,_wspawnl,_execlp,__lc_handle,_utime,__crtGetLocaleInfoW,_safe_fprem1,_cabs,_findnext64
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
03e11b5b2a4c124b867d18e00783024e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Time the cross-validation of every model on the current per-library dataset.
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 77.34 seconds


In [None]:
# One line per model: score and spread, both as percentages (see the output below).
print_evaluation_results(results, names)

LogisticRegression             	: 89.991% ( (+/-) 2.135% )
KNN                            	: 90.569% ( (+/-) 2.009% )
Decision tree                  	: 91.343% ( (+/-) 1.749% )
Random Forest                  	: 93.154% ( (+/-) 1.577% )
SVM                            	: 90.568% ( (+/-) 1.883% )
RegularNets                    	: 90.926% ( (+/-) 2.215% )
LDA                            	: 86.111% ( (+/-) 1.868% )
Gaussian Naive Bayes           	: 55.403% ( (+/-) 1.852% )


In [None]:
# Box plot of the per-model scores; the title embeds the DLL's base name.
fig = plot_evaluation_boxplots(results, 
                               names, 
                               title = 'Models Performance On One-hot Encoded ' + dll_name.split('.')[0] + ' PE Imports Data', 
                               y_axis = 'Accuracy')

# NOTE(review): every per-library subsection saves to this same path. Whether
# `save_figures_to_html` appends to or overwrites an existing file is not visible
# here - confirm that the figures of the other libraries are not lost.
figure_path = 'figures/evaluation_encoded_pe_imports.html'

save_figures_to_html(figure_path, [fig])

#### Summary

In [None]:
# Embed the HTML file most recently assigned to `figure_path` (the shared
# per-library figure file) in the notebook output.
IFrame(figure_path, width=900, height=600)