## **Imports and configurations**

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

In [5]:
folder_path = 'C:/Users/yaass/OneDrive/Bureau/Parser'

## **Models evaluation**

In [24]:
def preview_data(folder_path, file_name, index_value = 'md5'):
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=index_value)
    print(f'Nb Observations: {df.shape[0]}')
    print(f'Nb Features: {df.shape[1] - 1}')
    return df.drop(['label'], axis=1).head(3)

def get_data(folder_path, file_name, index_value = 'md5'):
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=index_value)
    return df

def get_null_columns(df):
    null_per_column = (df==0).sum()
    null_columns = null_per_column[null_per_column==len(df)].index.tolist()
    return null_columns

def create_X_y(folder_path, file_name, drop_null_columns=False, index_value = 'md5'):
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, index_col = index_value)
    #X = df.drop('label', axis=1)
    X = df.drop(['label', 'sublabel'], axis=1)
    if drop_null_columns == True:
        X = X.drop(get_null_columns(X), axis=1)
    y = df['sublabel']
    return shuffle(X, y)


def create_regular_net():
    model = Sequential()
    model.add(Dense(units=10, kernel_initializer = 'uniform', activation = 'relu', name='dense_layer1'))
    model.add(Dense(units=10, kernel_initializer = 'uniform', activation = 'relu', name='dense_layer2'))
    model.add(Dense(1, activation = 'sigmoid', name = 'dense_output'))   #sigmoid for binary 
    model.compile(loss='binary_crossentropy', optimizer= 'adam', metrics = ['accuracy'])
    return model


def wrap_regular_net():
    model = KerasClassifier(build_fn=create_regular_net, epochs=50, batch_size=64, verbose=0)
    return model


def create_models():
    models = dict()
    models['LogisticRegression'] = LogisticRegression()
    models['KNN'] = KNeighborsClassifier()
    models['Decision tree'] = DecisionTreeClassifier()
    models['Random Forest'] = RandomForestClassifier()
    #models['Stochastic Gradient Descent'] = SGDClassifier()
    models['SVM'] = SVC()
    models['RegularNets'] = wrap_regular_net()
    models['LDA'] = LDA()
    models['Gaussian Naive Bayes'] = GaussianNB()
    return models


def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores


def get_evaluation_results(folder_path, file_name, drop_null_columns=False, scaler=None):  
    
    #generate features and target
    X, y = create_X_y(folder_path, file_name, drop_null_columns)
    
    #generate models
    models = create_models()
    
    #evaluate models and store results
    results, names = list(), list()
    for name, model in models.items():
        if scaler is not None:
            pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
        else:
            pipeline = model
        scores = evaluate_model(pipeline, X, y)
        results.append(scores)
        names.append(name) 
        
    return names, results


def print_evaluation_results(results, names):  
    for name, scores in zip(names, results):
        print(f"{name:30} \t: {np.mean(scores)*100:.3f}% ( (+/-) {np.std(scores)*100:.3f}% )")  

### 1. apistats (one-hot encoded)

In [20]:
file_name = 'onehot_encoded_apistats_ransom_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 2618
Nb Features: 300


Unnamed: 0_level_0,GetUserNameExW,SetFileTime,GetFileVersionInfoSizeW,GetFileAttributesW,RegOpenKeyExW,NtDelayExecution,SetErrorMode,RegOpenKeyExA,RtlRemoveVectoredExceptionHandler,SetFilePointerEx,...,NetUserGetInfo,DecryptMessage,EncryptMessage,ReadCabinetState,CryptProtectMemory,CryptUnprotectMemory,WNetGetProviderNameW,CreateRemoteThreadEx,RtlCreateUserProcess,system
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df = get_data(folder_path, file_name)
df.head()

Unnamed: 0_level_0,GetUserNameExW,SetFileTime,GetFileVersionInfoSizeW,GetFileAttributesW,RegOpenKeyExW,NtDelayExecution,SetErrorMode,RegOpenKeyExA,RtlRemoveVectoredExceptionHandler,SetFilePointerEx,...,NetUserGetInfo,DecryptMessage,EncryptMessage,ReadCabinetState,CryptProtectMemory,CryptUnprotectMemory,WNetGetProviderNameW,CreateRemoteThreadEx,RtlCreateUserProcess,system
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
00a53241bf9c9425c6df8da44a5ca4f4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00bb04604996c97b7b4f8b2c767c0f40,0,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
X, y = create_X_y(folder_path, file_name)

In [26]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 71.57 seconds


In [27]:
print_evaluation_results(results, names)

LogisticRegression             	: 97.632% ( (+/-) 1.045% )
KNN                            	: 97.479% ( (+/-) 1.139% )
Decision tree                  	: 96.855% ( (+/-) 1.011% )
Random Forest                  	: 97.454% ( (+/-) 1.021% )
SVM                            	: 97.492% ( (+/-) 1.087% )
RegularNets                    	: 97.734% ( (+/-) 0.898% )
LDA                            	: 96.995% ( (+/-) 0.965% )
Gaussian Naive Bayes           	: 52.916% ( (+/-) 2.695% )


In [28]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "apistats" Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_onehot_apistats_data.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 1. apistats (count encoded)

In [17]:
file_name = 'count_encoded_apistats_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3822
Nb Columns: 305


Unnamed: 0_level_0,LdrUnloadDll,RegCloseKey,GetSystemTimeAsFileTime,LoadStringW,GetSystemInfo,RegQueryValueExA,LdrGetProcedureAddress,MessageBoxTimeoutW,RegSetValueExA,NtTerminateProcess,...,CDocument_write,WSAConnect,ExitWindowsEx,CopyFileExW,NtDeleteFile,CreateServiceW,WNetGetProviderNameW,RtlCreateUserProcess,NtLoadKeyEx,NtLoadDriver
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,4,3,1,2,1,13,6,1,6,3,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,6,2,1,0,55,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,9,1,0,0,0,21,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

print(f'Cross-validation on {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation on 8 models required a duration of 80.87 seconds


In [31]:
print_evaluation_results(results, names)

LogisticRegression             	: 87.031% ( (+/-) 1.824% )
KNN                            	: 88.497% ( (+/-) 1.433% )
Decision tree                  	: 91.035% ( (+/-) 1.535% )
Random Forest                  	: 95.125% ( (+/-) 0.768% )
SVM                            	: 75.475% ( (+/-) 1.817% )
RegularNets                    	: 88.654% ( (+/-) 1.667% )
LDA                            	: 82.104% ( (+/-) 1.790% )
Gaussian Naive Bayes           	: 66.841% ( (+/-) 2.230% )


In [32]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Count Encoded "apistats" Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_count_apistats_data.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 3. dll_loaded (one-hot encoded)

In [33]:
file_name = 'onehot_encoded_dll_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3162
Nb Columns: 2242


Unnamed: 0_level_0,RTUTILS.DLL,kernel32.dll,label,kernel32,C:\Users\cucko\AppData\Local\Temp\emedcfd.dll,RpcRtRemote.dll,CRYPTSP.dll,CLBCatQ.DLL,msimg32.dll,libssl32.dll,...,C:\Users\cucko\AppData\Local\Temp\VmX.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_1d4457e8e6917937845f55ebbce2fc49.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_1d50b69a05d60d4f9f703b789a2933de.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_268eef019bf65b2987e945afaf29643f.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_3c1b2fabb7d74bc5be0820eae4107f8a.exe,C:\Users\cucko\AppData\Local\Temp\VirusShare_43b844c35e1a933e9214588be81ce772.dll,C:\Users\cucko\AppData\Local\Temp\VirusShare_933b11bc4799f8d9f65466fb2e3ea659.exe,C:\Users\cucko\AppData\Local\Temp\VirusShare_9fc3ed6c9b8056fbf155f79569ca7cb1.exe,D3DIM700.DLL,C:\Users\cucko\AppData\Local\Temp\VirusShare_bcbdef1678049378be04719ed29078d2.dll
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 7 models required a duration of 228.69 seconds


In [64]:
print_evaluation_results(results, names)

LogisticRegression             	: 87.022% ( (+/-) 1.843% )
KNN                            	: 84.061% ( (+/-) 1.940% )
Decision tree                  	: 84.535% ( (+/-) 1.923% )
Random Forest                  	: 87.118% ( (+/-) 2.130% )
SVM                            	: 87.602% ( (+/-) 1.943% )
RegularNets                    	: 86.717% ( (+/-) 1.411% )
Gaussian Naive Bayes           	: 64.800% ( (+/-) 3.020% )


In [65]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "dll_loaded" Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_onehot_dll_data.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 4. File operations (counts summary)

In [107]:
file_name = 'file_operations_counts_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3496
Nb Columns: 11


Unnamed: 0_level_0,label,file_opened,file_exists,file_read,file_deleted,file_failed,file_created,file_recreated,file_written,file_copied,file_moved
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00263ca2071dc9a6ee577eb356b0d1d9,0,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,4,7,1,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,0,0,0,0,0,0,0,0,0,0


In [108]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 22.46 seconds


In [109]:
print_evaluation_results(results, names)

LogisticRegression             	: 58.266% ( (+/-) 2.109% )
KNN                            	: 75.192% ( (+/-) 2.862% )
Decision tree                  	: 78.042% ( (+/-) 2.153% )
Random Forest                  	: 79.091% ( (+/-) 2.083% )
SVM                            	: 57.179% ( (+/-) 1.495% )
RegularNets                    	: 62.568% ( (+/-) 2.701% )
LDA                            	: 56.665% ( (+/-) 0.367% )
Gaussian Naive Bayes           	: 49.542% ( (+/-) 1.778% )


In [110]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On File Operation Counts Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_file_operation_counts.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 5. Registry Key Operations (counts summary)

In [111]:
file_name = 'regkeys_counts_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3496
Nb Columns: 5


Unnamed: 0_level_0,regkey_opened,regkey_read,regkey_written,label,regkey_deleted
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00263ca2071dc9a6ee577eb356b0d1d9,2,8,6,0,0
0137996cd3aa197ae8eb64fef12c044a,1,2,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,13,9,0,0,0


In [112]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, scaler=StandardScaler())

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 18.64 seconds


In [113]:
print_evaluation_results(results, names)

LogisticRegression             	: 63.883% ( (+/-) 2.358% )
KNN                            	: 80.293% ( (+/-) 2.049% )
Decision tree                  	: 80.692% ( (+/-) 1.993% )
Random Forest                  	: 82.237% ( (+/-) 2.137% )
SVM                            	: 64.560% ( (+/-) 1.944% )
RegularNets                    	: 65.400% ( (+/-) 2.344% )
LDA                            	: 56.388% ( (+/-) 0.153% )
Gaussian Naive Bayes           	: 56.913% ( (+/-) 0.597% )


In [114]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Registry Key Operation Counts Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_registry_key_counts.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 6. PE Entropy

In [115]:
file_name = 'pe_entropy_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 4308
Nb Columns: 795


Unnamed: 0_level_0,.text,.data,.rsrc,.reloc,label,.rdata,.pdata,_RDATA,UPX0,UPX1,...,/47,/61,/73,/84,/95,dtwwkix,afaqowd,q\xa0\x01\x00c,PAGEtext,PAGEdata
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,6.068004,4.873371,4.527203,5.159578,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0137996cd3aa197ae8eb64fef12c044a,6.49023,2.382517,0.0,5.372628,0,5.108187,5.160316,1.117635,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0282f83bbfb58c08b54dbd8015e54d2e,6.342347,1.021041,4.717804,4.662975,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name, drop_null_columns=True, scaler=StandardScaler())

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 89.60 seconds


In [117]:
print_evaluation_results(results, names)

LogisticRegression             	: 75.487% ( (+/-) 2.132% )
KNN                            	: 85.894% ( (+/-) 1.149% )
Decision tree                  	: 87.388% ( (+/-) 1.321% )
Random Forest                  	: 90.475% ( (+/-) 1.124% )
SVM                            	: 74.288% ( (+/-) 1.953% )
RegularNets                    	: 80.671% ( (+/-) 2.255% )
LDA                            	: 75.294% ( (+/-) 2.105% )
Gaussian Naive Bayes           	: 46.425% ( (+/-) 1.060% )


In [118]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Entropy Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_pe_entropy.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 7. PE Imports (One-Hot Encoded)

Considering all pe import functions at once ends up generating a dataset that has over 20 000 columns. So we will analyse imports belonging to different libraries (e.g. kernel32.dll, user32.dll...) separately and try to select the ones with the highest feature importance (top 1000).

**Libraries**

In [126]:
file_name = 'encoded_pe_imports_dll_libraries_dataset.csv'

libraries_df = pd.read_csv(os.path.join(folder_path, file_name), index_col='md5')

libraries_df.head(3)

Unnamed: 0_level_0,advapi32.dll,kernel32.dll,user32.dll,msvcrt.dll,cmutil.dll,ole32.dll,shell32.dll,version.dll,label,sublabel,...,libgobject-2.0-0.dll,libgtk-win32-2.0-0.dll,libgimp-2.0-0.dll,libgimpbase-2.0-0.dll,libgimpwidgets-2.0-0.dll,pcwum.dll,xevtchn.sys,xenutil.sys,netinst.dll,srcore.dll
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,1,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 85.37 seconds


In [76]:
print_evaluation_results(results, names)

LogisticRegression             	: 79.128% ( (+/-) 1.738% )
KNN                            	: 82.537% ( (+/-) 1.740% )
Decision tree                  	: 85.003% ( (+/-) 1.612% )
Random Forest                  	: 86.140% ( (+/-) 1.735% )
SVM                            	: 84.323% ( (+/-) 1.543% )
RegularNets                    	: 83.882% ( (+/-) 1.658% )
LDA                            	: 78.602% ( (+/-) 1.595% )
Gaussian Naive Bayes           	: 67.038% ( (+/-) 1.100% )


In [77]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Entropy Libraries Data', y_axis = 'Accuracy')

figure_path = 'figures/evaluation_pe_imports.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

**Selecting Libraries**

In [121]:
def k_best_selection(folder_path, file_name, k=10):
    X, y = create_X_y(folder_path, file_name)
    selector = SelectKBest(f_classif, k=k).fit(X, y)
    selected_columns_indices = selector.get_support(indices=True)
    selected_df = X.iloc[:,selected_columns_indices]
    selected_columns = selected_df.columns.tolist()
    return selected_columns

In [127]:
k_best_selected_libraries = k_best_selection(folder_path, file_name, k=10)
k_best_selected_libraries

['kernel32.dll',
 'user32.dll',
 'msvcrt.dll',
 'ole32.dll',
 'shell32.dll',
 'oleaut32.dll',
 'comctl32.dll',
 'comdlg32.dll',
 'winmm.dll',
 'ntdll.dll']

In [98]:
def most_recurrent(folder_path, file_name, portion=0.05):
    df = get_data(folder_path, file_name)
    limit = int(portion*len(df.columns))
    most_recurrent_ = df.drop(['label', 'sublabel'], axis=1).sum().sort_values(ascending=False).iloc[:limit]
    return most_recurrent_

In [131]:
#Retrieve the most recurrent libraries in PE imports across all data samples
most_recurrent_libraries = most_recurrent(folder_path, file_name, portion=0.05)
selected_libraries_indices = [ most_recurrent_libraries.index.tolist().index(column) for column in selected_columns ]

#Plot
colors = np.repeat('lightslategray', limit)
colors[selected_libraries_indices] = 'lightblue'

fig = go.Figure( data = [ go.Bar(x=most_recurrent_libraries.index, 
                                 y=most_recurrent_libraries.values,  
                                 marker_color=colors) ] )

fig.update_layout(title='The top 5% most recurrent libraries in PE imports', title_x=0.3)

fig.show()

Let's explore what's the majority class in function of the involved libraries in PE imports:

In [113]:
def plot_class_distribution(df, libraries, title):
    
    malware = []
    goodware = []
    for library in libraries:
        subset = df[df[library] != 0]
        total = len(subset)
        malware.append(subset['label'].sum())
        goodware.append(total - subset['label'].sum())

    fig = go.Figure(data=[
        go.Bar(name='Malware', x=libraries, y=malware, marker_color=np.repeat('crimson', len(libraries))),
        go.Bar(name='Goodware', x=libraries, y=goodware, marker_color=np.repeat('lightslategray', len(libraries))) ])
    
    fig.update_layout(barmode='group', 
                      title=title, 
                      title_x=0.2)
    fig.show()

In [76]:
#Most recurrent libraries: 
plot_class_distribution(df = libraries_df,
                        libraries = most_recurrent_libraries.iloc[:10].index.tolist(),
                        title = 'The class distribution for the most recurrent libraries in PE imports')

In [78]:
#Most recurrent libraries: 
plot_class_distribution(df = libraries_df,
                        libraries = selected_columns,
                        title = 'The class distribution for the k-selected libraries in PE imports')

**kernel32.dll**

In [7]:
dll_name = 'kernel32.dll'

In [8]:
file_name = 'encoded_' + dll_name.split('.')[0] + '_pe_imports_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 3609
Nb Features: 1007


Unnamed: 0_level_0,GetCommandLineW,CloseHandle,CreateFileW,lstrlenW,GetWindowsDirectoryW,WritePrivateProfileSectionW,CompareStringW,GetPrivateProfileSectionW,lstrcmpW,GetLastError,...,SetSearchPathMode,ExpungeConsoleCommandHistoryW,SetConsoleNumberOfCommandsW,GetConsoleCommandHistoryLengthW,GetConsoleCommandHistoryW,K32QueryWorkingSetEx,NormalizeString,CallbackMayRunLong,TrySubmitThreadpoolCallback,CloseThreadpoolCleanupGroup
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,1,1,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,1,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 199.82 seconds


In [10]:
print_evaluation_results(results, names)

LogisticRegression             	: 88.593% ( (+/-) 1.489% )
KNN                            	: 87.467% ( (+/-) 1.712% )
Decision tree                  	: 86.756% ( (+/-) 1.484% )
Random Forest                  	: 90.395% ( (+/-) 1.765% )
SVM                            	: 89.563% ( (+/-) 1.700% )
RegularNets                    	: 88.685% ( (+/-) 1.775% )
LDA                            	: 85.249% ( (+/-) 2.146% )
Gaussian Naive Bayes           	: 63.664% ( (+/-) 4.716% )


In [84]:
fig = plot_evaluation_boxplots(results, 
                               names, 
                               title = 'Models Performance On One-hot Encoded ' + dll_name.split('.')[0] + ' PE Imports Data', 
                               y_axis = 'Accuracy')

figure_path = 'figures/evaluation_encoded_pe_imports.html'

save_figures_to_html(figure_path, [fig])

**user32.dll**

In [85]:
dll_name = 'user32.dll'

In [86]:
file_name = 'encoded_' + dll_name.split('.')[0] + '_pe_imports_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 2746
Nb Features: 728


Unnamed: 0_level_0,EndDialog,CheckDlgButton,SetWindowTextW,CharPrevW,CharNextW,LoadStringW,MessageBoxW,IsDlgButtonChecked,SetFocus,GetDlgItem,...,UnregisterSessionPort,RegisterSessionPort,CheckDesktopByThreadId,DwmStopRedirection,DwmStartRedirection,AlignRects,GetGestureConfig,IsTouchWindow,GetIconInfoExW,GetWindowMinimizeRect
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0137996cd3aa197ae8eb64fef12c044a,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 135.40 seconds


In [108]:
print_evaluation_results(results, names)

LogisticRegression             	: 84.134% ( (+/-) 2.529% )
KNN                            	: 83.345% ( (+/-) 2.424% )
Decision tree                  	: 84.377% ( (+/-) 2.211% )
Random Forest                  	: 88.541% ( (+/-) 2.037% )
SVM                            	: 85.238% ( (+/-) 2.788% )
RegularNets                    	: 85.786% ( (+/-) 1.991% )
LDA                            	: 81.500% ( (+/-) 2.523% )
Gaussian Naive Bayes           	: 51.529% ( (+/-) 1.441% )


In [109]:
fig = plot_evaluation_boxplots(results, 
                               names, 
                               title = 'Models Performance On One-hot Encoded ' + dll_name.split('.')[0] + ' PE Imports Data', 
                               y_axis = 'Accuracy')

figure_path = 'figures/evaluation_encoded_pe_imports.html'

save_figures_to_html(figure_path, [fig])

Let's find the features that best classify malware from goodware:

In [122]:
k_best_selected_user32 = k_best_selection(folder_path, file_name, k=20)

In [123]:
#K best selected PE imports of the user32.dll library
plot_class_distribution(df = get_data(folder_path, file_name),
                        libraries = k_best_selected_user32,
                        title = 'The class distribution for the k-best selected PE imports in "user32.dll"')

In [124]:
most_recurrent_ = most_recurrent(folder_path, file_name).index.tolist()[:20]

In [125]:
#most recurrent PE imports of the user32.dll library
plot_class_distribution(df = get_data(folder_path, file_name),
                        libraries = most_recurrent_,
                        title = 'The class distribution for the most recurrent PE imports in "user32.dll"')

**advapi32.dll**

In [135]:
dll_name = 'msvcrt.dll'

In [136]:
file_name = 'encoded_' + dll_name.split('.')[0] + '_pe_imports_dataset.csv'

preview_data(folder_path, file_name)

Nb Observations: 1675
Nb Features: 789


Unnamed: 0_level_0,_controlfp,?terminate@@YAXXZ,memset,_vsnwprintf,__set_app_type,__p__fmode,__p__commode,__setusermatherr,_amsg_exit,_initterm,...,_adj_fptan,_execvp,_wspawnl,_execlp,__lc_handle,_utime,__crtGetLocaleInfoW,_safe_fprem1,_cabs,_findnext64
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00263ca2071dc9a6ee577eb356b0d1d9,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
0282f83bbfb58c08b54dbd8015e54d2e,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
03e11b5b2a4c124b867d18e00783024e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
start = time.time()

names, results = get_evaluation_results(folder_path, file_name)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 77.34 seconds


In [138]:
print_evaluation_results(results, names)

LogisticRegression             	: 89.991% ( (+/-) 2.135% )
KNN                            	: 90.569% ( (+/-) 2.009% )
Decision tree                  	: 91.343% ( (+/-) 1.749% )
Random Forest                  	: 93.154% ( (+/-) 1.577% )
SVM                            	: 90.568% ( (+/-) 1.883% )
RegularNets                    	: 90.926% ( (+/-) 2.215% )
LDA                            	: 86.111% ( (+/-) 1.868% )
Gaussian Naive Bayes           	: 55.403% ( (+/-) 1.852% )


In [139]:
fig = plot_evaluation_boxplots(results, 
                               names, 
                               title = 'Models Performance On One-hot Encoded ' + dll_name.split('.')[0] + ' PE Imports Data', 
                               y_axis = 'Accuracy')

figure_path = 'figures/evaluation_encoded_pe_imports.html'

save_figures_to_html(figure_path, [fig])

#### Summary

In [140]:
IFrame(figure_path, width=900, height=600)