## **Imports and configurations**

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [8]:
from helpers import *
from learners import *
from plot_utils import *

## Data loading

In [11]:
#Location of the parsed datasets. The default is the original author's local folder;
#set the RANSOM_DATASETS_DIR environment variable to run the notebook on another machine
#without editing this cell.
folder_path = os.environ.get('RANSOM_DATASETS_DIR',
                             'C:/Users/yaass/OneDrive/Bureau/Parser/ransom_datasets')

#List of the compressed ('gz') dataset files found in the folder
files = get_file_list(folder_path, extensions=['gz'])

#Load every dataset; the result is indexed by dataset name (e.g. datasets['apistats_counts'])
datasets = prepare_datasets(datasets_path = folder_path)

apistats_counts
loaded data shape :   (2618, 301)
fileops_created_nested_files
loaded data shape :   (1232, 9747)
fileops_deleted_nested_files
loaded data shape :   (1029, 5537)
fileops_exists_nested_files
loaded data shape :   (1545, 6847)
fileops_failed_nested_files
loaded data shape :   (1300, 3073)
fileops_opened_nested_files
loaded data shape :   (2094, 12066)
fileops_read_nested_files
loaded data shape :   (1508, 10430)
fileops_recreated_nested_files
loaded data shape :   (539, 445)
fileops_summary
loaded data shape :   (2392, 12)
fileops_written_nested_files
loaded data shape :   (1265, 9891)
loaded_dll_onehot
loaded data shape :   (2168, 821)
pe_entropy_analysis
loaded data shape :   (3051, 715)
pe_imports_advapi32
loaded data shape :   (1532, 455)
pe_imports_comctl32
loaded data shape :   (598, 92)
pe_imports_gdi32
loaded data shape :   (883, 541)
pe_imports_kernel32
loaded data shape :   (2425, 886)
pe_imports_libraries
loaded data shape :   (3058, 286)
pe_imports_msvcrt
load

## **Models evaluation**

### 1. apistats (count encoded)

In this section, on top of studying apistats, we'll also compare the performance of different SMOTE techniques to decide which gives the highest performance.

In [12]:
datasets['apistats_counts'].head()

Unnamed: 0_level_0,GetUserNameExW,SetFileTime,GetFileVersionInfoSizeW,GetFileAttributesW,RegOpenKeyExW,NtDelayExecution,SetErrorMode,RegOpenKeyExA,RtlRemoveVectoredExceptionHandler,SetFilePointerEx,...,NetUserGetInfo,DecryptMessage,EncryptMessage,ReadCabinetState,CryptProtectMemory,CryptUnprotectMemory,WNetGetProviderNameW,CreateRemoteThreadEx,RtlCreateUserProcess,system
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,8,3,1173,876,475,1387,810,58,1,102,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,8,10,1,10,2,0,0,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,2,0,6,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
00a53241bf9c9425c6df8da44a5ca4f4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00bb04604996c97b7b4f8b2c767c0f40,0,0,0,4,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
class_distribution(datasets['apistats_counts'], label='sublabel')

Class distribution:
malware      : 2206
ransomware   : 412

Majority class classifier accuracy = 84.26%


In [104]:
start = time.time()

#Resampling settings for the SMOTE step (same layout as the other evaluation cells)
smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

#Build the (unfitted) resampler from the features / target of the apistats data
X, y = get_X_y(datasets['apistats_counts'], label='sublabel')
smoter = smote(X, y, **smote_params, fit=False)

#Cross-validate every model with resampling and min-max scaling, scored on accuracy
names, results = get_evaluation_results(
    df = datasets['apistats_counts'],
    label='sublabel',
    smoter=smoter,
    scaler=MinMaxScaler(),
    scoring ='accuracy',
)

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 73.79 seconds


In [103]:
#standard smote
print_evaluation_results(results, names)

LogisticRegression             	: 92.734% ( (+/-) 1.368% )
KNN                            	: 95.985% ( (+/-) 1.203% )
Decision tree                  	: 96.183% ( (+/-) 1.309% )
Random Forest                  	: 97.710% ( (+/-) 0.865% )
SVM                            	: 92.543% ( (+/-) 1.089% )
RegularNets                    	: 86.450% ( (+/-) 3.343% )
LDA                            	: 87.405% ( (+/-) 2.152% )
Gaussian Naive Bayes           	: 62.405% ( (+/-) 3.027% )


In [94]:
#borderline smote
print_evaluation_results(results, names)

LogisticRegression             	: 92.366% ( (+/-) 1.323% )
KNN                            	: 94.656% ( (+/-) 1.223% )
Decision tree                  	: 95.420% ( (+/-) 1.054% )
Random Forest                  	: 97.515% ( (+/-) 0.930% )
SVM                            	: 92.366% ( (+/-) 5.243% )
RegularNets                    	: 84.351% ( (+/-) 3.141% )
LDA                            	: 84.924% ( (+/-) 3.367% )
Gaussian Naive Bayes           	: 60.878% ( (+/-) 2.657% )


In [97]:
#svm smote
print_evaluation_results(results, names)

LogisticRegression             	: 92.734% ( (+/-) 1.202% )
KNN                            	: 95.420% ( (+/-) 1.500% )
Decision tree                  	: 95.802% ( (+/-) 1.471% )
Random Forest                  	: 97.710% ( (+/-) 0.739% )
SVM                            	: 92.748% ( (+/-) 1.061% )
RegularNets                    	: 83.397% ( (+/-) 2.909% )
LDA                            	: 86.999% ( (+/-) 2.794% )
Gaussian Naive Bayes           	: 62.333% ( (+/-) 2.966% )


In [105]:
#adaptive smote
print_evaluation_results(results, names)

LogisticRegression             	: 92.734% ( (+/-) 1.438% )
KNN                            	: 94.466% ( (+/-) 1.365% )
Decision tree                  	: 95.785% ( (+/-) 1.346% )
Random Forest                  	: 98.084% ( (+/-) 0.900% )
SVM                            	: 92.366% ( (+/-) 1.182% )
RegularNets                    	: 86.998% ( (+/-) 3.656% )
LDA                            	: 87.380% ( (+/-) 2.593% )
Gaussian Naive Bayes           	: 60.115% ( (+/-) 2.599% )


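**Conclusion** : The adaptive SMOTE variant (ADASYN) gives the highest accuracy for the best-performing model (Random Forest: 98.08%), with a variability comparable to the other SMOTE techniques.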

In [106]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On Count Encoded "apistats" Data', y_axis = 'Accuracy')

figure_path = 'figures/ransomware/evaluation_count_apistats.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 2. dll_loaded (one-hot encoded)

In [109]:
df = datasets['loaded_dll_onehot']
class_distribution(df, label='sublabel')
df.head()

Class distribution:
malware      : 1875
ransomware   : 293

Majority class classifier accuracy = 86.49%


Unnamed: 0_level_0,advapi32,user32,api-ms-win-service-management-l2-1-0,shell32,mlang,c:/windows/system32/wbem/wbemsvc,wininet,sspicli,wintrust,api-ms-win-security-sddl-l1-1-0,...,c:/program files (x86)/mozilla firefox/mozcrt19,c:/windows/syswow64/28463/kgcu,kgcu,c:/windows/winsxs/amd64_microsoft.vc80.crt_1fc8b3b9a1e18e3b_8.0.50727.4940_none_88df89932faf0bf6/msvcr80,c:/windows/winsxs/amd64_microsoft.vc80.crt_1fc8b3b9a1e18e3b_8.0.50727.4940_none_88df89932faf0bf6/msvcm80,c:/windows/microsoft.net/framework64/v2.0.50727/version,idndl,msvcr100,c:/windows/system32/hhctrl,c:/windows/dxgidebug
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00bb04604996c97b7b4f8b2c767c0f40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00cc33352ae8e526f7533119fb823cb1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
start = time.time()

names, results = get_evaluation_results(df, label='sublabel')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

Cross-validation 8 models required a duration of 93.55 seconds


In [131]:
start = time.time()

#create a smote pipeline
X, y = get_X_y(df, label='sublabel')

smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

smoter = smote(X, y, **smote_params, fit=False)

print_new_distribution(X, y, smote_params)


#compute cross-validation scores
names, results = get_evaluation_results(df = df, 
                                        label='sublabel', 
                                        smoter=smoter, 
                                        scaler=MinMaxScaler(),
                                        scoring ='accuracy')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

The new class distribution after SAMPLING strategy : {'malware': 1192, 'ransomware': 954}
Cross-validation 8 models required a duration of 87.02 seconds


In [125]:
print_evaluation_results(results, names)

LogisticRegression             	: 89.171% ( (+/-) 3.654% )
KNN                            	: 86.605% ( (+/-) 4.136% )
Decision tree                  	: 92.166% ( (+/-) 2.821% )
Random Forest                  	: 93.765% ( (+/-) 3.144% )
SVM                            	: 89.171% ( (+/-) 3.841% )
RegularNets                    	: 92.166% ( (+/-) 2.895% )
LDA                            	: 84.064% ( (+/-) 3.592% )
Gaussian Naive Bayes           	: 55.300% ( (+/-) 4.434% )


In [132]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On One-Hot Encoded "dll_loaded" Data', y_axis = 'Accuracy')

figure_path = 'figures/ransomware/evaluation_onehot_dll_data.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 3. File operations 

In this section we'll study the summary information about file operations. The nested file operations of each category comprise on the order of 10k features each, so we'll study them in the feature selection notebook, where variable selection is performed before training.

**File Operations (Counts Summary)**

In [126]:
df = datasets['fileops_summary']
class_distribution(df, label='sublabel')
df.head()

Class distribution:
malware      : 1981
ransomware   : 411

Majority class classifier accuracy = 82.82%


Unnamed: 0_level_0,file_opened,file_read,file_created,file_moved,file_written,file_recreated,file_failed,file_deleted,file_exists,label,sublabel,file_copied
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
000a3ea381d7d70be8b6fe1ee51dca22,69,18,16,1,18,7,7,6,37,1,1,0
001cfa63ad79aaf3e4a2b85a2e7f227f,2,0,0,1,0,0,1,0,4,1,1,0
003e845bdcc5367220bf13f7170da16f,7,1,3,0,2,0,0,0,4,1,1,0
00bb04604996c97b7b4f8b2c767c0f40,0,0,0,0,0,0,0,0,4,1,1,0
00cc33352ae8e526f7533119fb823cb1,0,0,0,0,0,0,0,0,0,1,1,0


In [127]:
start = time.time()

#create a smote pipeline
X, y = get_X_y(df, label='sublabel')

smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

smoter = smote(X, y, **smote_params, fit=False)

print_new_distribution(X, y, smote_params)


#compute cross-validation scores
names, results = get_evaluation_results(df = df, 
                                        label='sublabel', 
                                        smoter=smoter, 
                                        scaler=MinMaxScaler(),
                                        scoring ='accuracy')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

The new class distribution after SAMPLING strategy : {'malware': 1188, 'ransomware': 951}
Cross-validation 8 models required a duration of 77.37 seconds


In [128]:
print_evaluation_results(results, names)

LogisticRegression             	: 82.845% ( (+/-) 0.533% )
KNN                            	: 87.657% ( (+/-) 3.403% )
Decision tree                  	: 90.377% ( (+/-) 2.089% )
Random Forest                  	: 92.469% ( (+/-) 2.603% )
SVM                            	: 81.420% ( (+/-) 14.718% )
RegularNets                    	: 82.427% ( (+/-) 1.505% )
LDA                            	: 82.427% ( (+/-) 0.799% )
Gaussian Naive Bayes           	: 79.541% ( (+/-) 24.767% )


In [130]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On File Operation Counts Data', y_axis = 'Accuracy')

figure_path = 'figures/ransomware/evaluation_file_operation_counts.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 4. Registry Key Operations

In this section we'll study the summary information about registry key operations. The nested registry keys of each category comprise on the order of 10k features each, so we'll study them in the feature selection notebook, where variable selection is performed before training.

**Registry Key Operations (Counts Summary)**

In [135]:
df = datasets['regkeys_summary']
class_distribution(df, label='sublabel')
df.head()

Class distribution:
malware      : 1981
ransomware   : 411

Majority class classifier accuracy = 82.82%


Unnamed: 0_level_0,regkey_opened,regkey_written,regkey_deleted,regkey_read,label,sublabel
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000a3ea381d7d70be8b6fe1ee51dca22,222,60,4,561,1,1
001cfa63ad79aaf3e4a2b85a2e7f227f,9,0,0,5,1,1
003e845bdcc5367220bf13f7170da16f,3,1,0,9,1,1
00bb04604996c97b7b4f8b2c767c0f40,1,0,0,6,1,1
00cc33352ae8e526f7533119fb823cb1,0,0,0,0,1,1


In [136]:
start = time.time()

#create a smote pipeline
X, y = get_X_y(df, label='sublabel')

smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

smoter = smote(X, y, **smote_params, fit=False)

print_new_distribution(X, y, smote_params)


#compute cross-validation scores
names, results = get_evaluation_results(df = df, 
                                        label='sublabel', 
                                        smoter=smoter, 
                                        scaler=MinMaxScaler(),
                                        scoring ='accuracy')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

The new class distribution after SAMPLING strategy : {'malware': 1155, 'ransomware': 924}
Cross-validation 8 models required a duration of 74.01 seconds


In [137]:
print_evaluation_results(results, names)

LogisticRegression             	: 54.071% ( (+/-) 6.116% )
KNN                            	: 86.611% ( (+/-) 2.683% )
Decision tree                  	: 87.238% ( (+/-) 2.764% )
Random Forest                  	: 87.657% ( (+/-) 2.982% )
SVM                            	: 52.720% ( (+/-) 2.173% )
RegularNets                    	: 54.393% ( (+/-) 3.101% )
LDA                            	: 50.105% ( (+/-) 4.028% )
Gaussian Naive Bayes           	: 46.138% ( (+/-) 3.400% )


In [138]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On RegKeys Operation Counts Data', y_axis = 'Accuracy')

figure_path = 'figures/ransomware/evaluation_registry_key_counts.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 5. PE Entropy

In [139]:
df = datasets['pe_entropy_analysis']
class_distribution(df, label='sublabel')
df.head()

Class distribution:
malware      : 2608
ransomware   : 443

Majority class classifier accuracy = 85.48%


Unnamed: 0_level_0,.text,.rdata,.data,.data2,.rsrc,.reloc,label,sublabel,.pdata,UPX0,...,hultkjl,#\r\n\x00c,.init,\x99\xe3\x00\x00c,T\t\x00oc,CDS0,CDS1,PS\xff\xd5\xab\xeb\xe7\xc3,\x10@\x00\x88=A,\xfc\xe4@\x00\xfc\x0f@
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,7,5,0,0,5,3,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,4,0,2,0,3,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,6,0,5,0,7,0,1,1,4,0,...,0,0,0,0,0,0,0,0,0,0
00a53241bf9c9425c6df8da44a5ca4f4,4,0,3,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
00bb04604996c97b7b4f8b2c767c0f40,4,0,7,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
start = time.time()

#create a smote pipeline
X, y = get_X_y(df, label='sublabel')

smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

smoter = smote(X, y, **smote_params, fit=False)

print_new_distribution(X, y, smote_params)


#compute cross-validation scores
names, results = get_evaluation_results(df = df, 
                                        label='sublabel', 
                                        smoter=smoter, 
                                        scaler=MinMaxScaler(),
                                        scoring ='accuracy')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

The new class distribution after SAMPLING strategy : {'malware': 1616, 'ransomware': 1293}
Cross-validation 8 models required a duration of 96.63 seconds


In [141]:
print_evaluation_results(results, names)

LogisticRegression             	: 78.525% ( (+/-) 2.559% )
KNN                            	: 88.852% ( (+/-) 2.010% )
Decision tree                  	: 91.162% ( (+/-) 1.759% )
Random Forest                  	: 92.295% ( (+/-) 1.486% )
SVM                            	: 86.885% ( (+/-) 2.291% )
RegularNets                    	: 83.770% ( (+/-) 4.018% )
LDA                            	: 77.869% ( (+/-) 2.841% )
Gaussian Naive Bayes           	: 27.169% ( (+/-) 2.127% )


In [142]:
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Entropy Data', y_axis = 'Accuracy')

figure_path = 'figures/ransomware/evaluation_pe_entropy.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### 6. PE Imports (One-Hot Encoded)

Considering all PE import functions at once generates a dataset with over 20 000 columns. So we will analyse the imports belonging to different libraries (e.g. kernel32.dll, user32.dll...) separately and try to select the ones with the highest feature importance among the most recurrent libraries. <br/>

For now, let's first study the performance on training on just the libraries, before checking the individual pe imports.

**Libraries**

In [144]:
df = datasets['pe_imports_libraries']
class_distribution(df, label='sublabel')
df.head()

Class distribution:
malware      : 2614
ransomware   : 444

Majority class classifier accuracy = 85.48%


Unnamed: 0_level_0,kernel32.dll,user32.dll,gdi32.dll,advapi32.dll,shell32.dll,ole32.dll,shlwapi.dll,comctl32.dll,label,sublabel,...,cmnclim.dll,api-ms-win-core-threadpool-l1-1-0.dll,ktmw32.dll,msvcm80.dll,bdeui.dll,mrxsmb.sys,sspisrv.dll,api-ms-win-crt-convert-l1-1-0.dll,api-ms-win-crt-filesystem-l1-1-0.dll,api-ms-win-crt-environment-l1-1-0.dll
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,1,0,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,1,1,1,1,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
00a53241bf9c9425c6df8da44a5ca4f4,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
00bb04604996c97b7b4f8b2c767c0f40,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [145]:
start = time.time()

#create a smote pipeline
X, y = get_X_y(df, label='sublabel')

smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

smoter = smote(X, y, **smote_params, fit=False)

print_new_distribution(X, y, smote_params)


#compute cross-validation scores
names, results = get_evaluation_results(df = df, 
                                        label='sublabel', 
                                        smoter=smoter, 
                                        scaler=MinMaxScaler(),
                                        scoring ='accuracy')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

The new class distribution after SAMPLING strategy : {'malware': 1621, 'ransomware': 1297}
Cross-validation 8 models required a duration of 42.07 seconds


In [146]:
print_evaluation_results(results, names)

LogisticRegression             	: 78.396% ( (+/-) 2.206% )
KNN                            	: 76.961% ( (+/-) 3.557% )
Decision tree                  	: 80.719% ( (+/-) 2.402% )
Random Forest                  	: 80.882% ( (+/-) 2.228% )
SVM                            	: 80.524% ( (+/-) 2.282% )
RegularNets                    	: 80.065% ( (+/-) 2.470% )
LDA                            	: 76.432% ( (+/-) 2.705% )
Gaussian Naive Bayes           	: 34.641% ( (+/-) 2.285% )


In [147]:
#Boxplots of the cross-validation scores obtained on the PE imports *libraries* dataset
#(the title previously said "PE Entropy", a leftover from the previous section)
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Imports Libraries Data', y_axis = 'Accuracy')

figure_path = 'figures/ransomware/evaluation_pe_imports.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

**Selecting Libraries**

In [192]:
from learners import *

In [194]:
X, y = get_X_y(df, label='sublabel')
smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
    }
smoter = smote(X, y, **smote_params, fit=False)

In [195]:
k_best_selected_libraries = select_k_best(df, select_function=chi2, k=10, label='sublabel', smoter=smoter)
k_best_selected_libraries

['gdi32.dll',
 'advapi32.dll',
 'shell32.dll',
 'shlwapi.dll',
 'comctl32.dll',
 'msvcrt.dll',
 'version.dll',
 'comdlg32.dll',
 'wininet.dll',
 'ntdll.dll']

In [196]:
limit= 15
most_recurrent_libraries = get_most_recurrent(df, portion=0.05, limit=limit)
most_recurrent_libraries

kernel32.dll    2425
user32.dll      1845
advapi32.dll    1532
gdi32.dll        883
msvcrt.dll       851
shell32.dll      773
ole32.dll        627
comctl32.dll     598
version.dll      487
oleaut32.dll     430
shlwapi.dll      327
wininet.dll      295
ntdll.dll        246
ws2_32.dll       240
comdlg32.dll     220
dtype: int64

In [197]:
#Retrieve the most recurrent libraries in PE imports across all data samples
selected_libraries_indices = []
for column in k_best_selected_libraries:
    if column in most_recurrent_libraries.index.tolist():
        selected_libraries_indices.append(most_recurrent_libraries.index.tolist().index(column))

#Plot
colors = np.repeat('lightslategray', limit)
colors[selected_libraries_indices] = 'lightblue'

fig = go.Figure( data = [ go.Bar(x=most_recurrent_libraries.index, 
                                 y=most_recurrent_libraries.values,  
                                 marker_color=colors) ] )

fig.update_layout(title='The top 5% most recurrent libraries in PE imports', title_x=0.3)

fig.show()

Let's explore which class is in the majority as a function of the libraries involved in PE imports:

In [198]:
def plot_class_distribution(df, libraries, title, label='sublabel'):
    """Plot, for each library, how many balanced samples of each class import it.

    The data is first resampled with the notebook's usual adaptive SMOTE settings,
    then for every library the samples with a non-zero value in that column are
    counted per class and shown as grouped bars.

    Parameters
    ----------
    df : DataFrame holding one column per library plus the `label` column.
    libraries : list of column names (libraries) to show on the x axis.
    title : title of the figure.
    label : name of the target column.

    Notes
    -----
    The counting assumes the target is encoded as 1 = malware / 0 = ransomware, so
    that the sum of the label column is the number of malware samples
    (TODO confirm against the encoding used by `get_X_y` / `smote`).
    """
    #Balancing:
    X, y = get_X_y(df, label=label)
    smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
    }
    X, y = smote(X, y, **smote_params, fit=True)

    #Re-attach the target under the requested column name (was hard-coded to 'sublabel',
    #which broke the lookups below for any other `label`)
    balanced = X.copy()
    balanced[label] = y

    malware = []
    ransomware = []
    for library in libraries:
        #Samples that import this library
        subset = balanced[balanced[library] != 0]
        malware_count = subset[label].sum()
        malware.append(malware_count)
        #Remaining samples are ransomware (was `sum - total`, i.e. a negative count)
        ransomware.append(len(subset) - malware_count)

    fig = go.Figure(data=[
        go.Bar(name='Malware', x=libraries, y=malware, marker_color='crimson'),
        go.Bar(name='Ransomware', x=libraries, y=ransomware, marker_color='lightslategray') ])
    
    fig.update_layout(barmode='group', 
                      title=title, 
                      title_x=0.2)
    fig.show()

In [199]:
#Most recurrent libraries: 
plot_class_distribution(df = df,
                        libraries = most_recurrent_libraries.iloc[:10].index.tolist(),
                        title = 'The class distribution for the most recurrent libraries in PE imports')

In [200]:
#K-best selected libraries: 
plot_class_distribution(df = df,
                        libraries = k_best_selected_libraries,
                        title = 'The class distribution for the k-selected libraries in PE imports')

**kernel32.dll**

We can look at the performance on the PE imports of the most recurrent library, kernel32.dll, as an example. As for our final model, we'll pick the data from the 8 most recurrent libraries, as they provide enough observations for training.

In [202]:
df = datasets['pe_imports_kernel32']
class_distribution(df, label='sublabel')
df.head()

Class distribution:
malware      : 2077
ransomware   : 348

Majority class classifier accuracy = 85.65%


Unnamed: 0_level_0,GetCommandLineW,GetComputerNameA,GetConsoleAliasW,GetConsoleAliasesLengthA,GetConsoleAliasesLengthW,GetConsoleCP,GetCurrentDirectoryW,GetCurrentProcess,GetCurrentProcessId,GetCurrentThreadId,...,SetThreadStackGuarantee,GetSystemRegistryQuota,SetDllDirectoryA,CreateSocketHandle,InvalidateConsoleDIBits,GetConsoleCommandHistoryA,SetConsoleFont,LZCopy,DeleteFileTransactedW,RtlDeleteFunctionTable
md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a3ea381d7d70be8b6fe1ee51dca22,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
001cfa63ad79aaf3e4a2b85a2e7f227f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
003e845bdcc5367220bf13f7170da16f,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
00cc33352ae8e526f7533119fb823cb1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00d9d5a0319cddef48add9257d4721df,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [203]:
start = time.time()

#create a smote pipeline
X, y = get_X_y(df, label='sublabel')

smote_params = {
    'category' : 'adaptive',
    'over_strategy' : 0.5,
    'under_strategy' : 0.8,
    'k_neighbors' : 5,
}

smoter = smote(X, y, **smote_params, fit=False)

print_new_distribution(X, y, smote_params)


#compute cross-validation scores
names, results = get_evaluation_results(df = df, 
                                        label='sublabel', 
                                        smoter=smoter, 
                                        scaler=MinMaxScaler(),
                                        scoring ='accuracy')

print(f'Cross-validation {len(names)} models required a duration of {time.time()-start:.2f} seconds')

The new class distribution after SAMPLING strategy : {'malware': 1292, 'ransomware': 1034}
Cross-validation 8 models required a duration of 85.86 seconds


In [204]:
print_evaluation_results(results, names)

LogisticRegression             	: 91.424% ( (+/-) 1.879% )
KNN                            	: 84.703% ( (+/-) 2.667% )
Decision tree                  	: 93.085% ( (+/-) 1.936% )
Random Forest                  	: 94.323% ( (+/-) 1.636% )
SVM                            	: 89.801% ( (+/-) 1.692% )
RegularNets                    	: 93.237% ( (+/-) 1.548% )
LDA                            	: 83.011% ( (+/-) 2.358% )
Gaussian Naive Bayes           	: 56.275% ( (+/-) 4.084% )


In [205]:
#Boxplots of the cross-validation scores obtained on the kernel32.dll PE imports
fig = plot_evaluation_boxplots(results, names, title = 'Models Performance On PE Imports of kernel32.dll library Data', y_axis = 'Accuracy')

#Dedicated output file: 'evaluation_pe_imports.html' is already used for the figure of the
#PE imports libraries data, which this cell would otherwise overwrite
figure_path = 'figures/ransomware/evaluation_pe_imports_kernel32.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)