In [1]:
# Reload edited project modules automatically so changes to the helper
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
# NOTE: this silences every warning for the whole session, not only the noisy ones.
warnings.filterwarnings('ignore')

from IPython.display import IFrame, clear_output

In [3]:
import os
import time
import logging
from functools import wraps
import pickle
import re

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from numpy import hstack

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *

In [9]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif

from sklearn.utils import shuffle
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
#from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

In [5]:
# Directory holding the gzipped pickle datasets. The default is the original
# local path; set the RANSOM_DATASETS_DIR environment variable to point the
# notebook at another location without editing this cell.
folder_path = os.environ.get('RANSOM_DATASETS_DIR',
                             'C:/Users/yaass/OneDrive/Bureau/Parser/ransom_datasets')

In [13]:
from helpers import *
from learners import *

In [39]:
def batch_feature_selection(categories, 
                            k_range = (100, 1001), 
                            step=100, 
                            scoring='accuracy',
                            dataset_prefix='encoded_nested_fileops_',
                            dataset='file_operations', 
                            label='sublabel',
                            with_smote=True,
                            random_state=None):
    """Run the k-feature search on one dataset file per category.

    For every entry of ``categories`` the matching ``.pkl.gz`` file is loaded
    from ``folder_path``, a (optionally resampled) random-forest pipeline is
    built and handed to ``search_feature_importances`` together with the
    candidate k values. The per-k scores are plotted and written to an HTML
    file, and the per-category summaries are passed to ``summaries_to_df``
    for export as CSV.

    Parameters
    ----------
    categories : iterable of str
        Dataset variants to process; each one selects a file name.
    k_range : sequence of two ints
        ``(start, stop)`` of the candidate k values; ``stop`` is exclusive,
        as with ``range``.
    step : int
        Spacing between candidate k values.
    scoring : str
        Metric name forwarded to ``search_feature_importances`` and used in
        titles and output file names.
    dataset_prefix : str or None
        When given, the file is ``dataset_prefix + category + '.pkl.gz'``.
        When ``None``, the nested-dataset naming scheme is chosen from
        ``dataset`` (see below).
    dataset : str
        Dataset family name used in titles and output file names. With
        ``dataset_prefix=None`` it also selects the file pattern:
        ``'regkeys_operations'`` -> ``regkeys_<category>_nested_keys.pkl.gz``,
        anything else -> ``fileops_<category>_nested_files.pkl.gz``.
    label : str
        Name of the target column.
    with_smote : bool
        Whether to prepend the steps returned by the project's ``smote``
        helper to the pipeline.
    random_state : int or None
        Seed forwarded to ``RandomForestClassifier``; ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    None
        Results are printed and written to disk by the helper functions.
    """

    # File-name patterns for the "nested" datasets, used when no prefix is given.
    # Unknown dataset names fall back to the file-operation pattern, which was
    # the only one supported before.
    nested_patterns = {
        'file_operations': 'fileops_{}_nested_files.pkl.gz',
        'regkeys_operations': 'regkeys_{}_nested_keys.pkl.gz',
    }

    #initialization
    k_range_ = range(k_range[0], k_range[1], step)
    summaries = dict()
    run_tag = dataset + '_' + scoring + '_step' + str(step)
    figure_path = 'figures/feature-selection/ransomware/' + run_tag + '.html'

    #loop through datasets
    for category in categories:

        #load data
        print('-------------------------------------------------')
        print(category.capitalize(), ':\n')
        if dataset_prefix is not None:
            file_name = dataset_prefix + category + '.pkl.gz'
        else:
            pattern = nested_patterns.get(dataset, nested_patterns['file_operations'])
            file_name = pattern.format(category)
        df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

        #print class distribution
        class_distribution(df, label=label)

        #build a model with smote
        steps = []
        if with_smote:
            smote_params = {
                'category' : 'adaptive',
                'over_strategy' : 0.5,
                'under_strategy' : 0.8,
                'k_neighbors' : 5,
            }
            X, y = get_X_y(df, label=label)
            # fit=False: only the (unfitted) pipeline steps are requested here
            steps.extend(smote(X, y, **smote_params, fit=False))
        steps.append(('classifier', RandomForestClassifier(random_state=random_state)))
        pipe = Pipeline(steps=steps)
        
        #perform grid-search
        scores, summary = search_feature_importances(df = df, 
                                                     k_values = k_range_, 
                                                     model=pipe, 
                                                     scoring=scoring, 
                                                     verbose=False,
                                                     label=label)

        #append summary results
        summaries[category] = summary

        #plot scores
        fig = plot_evaluation_boxplots(list(scores.values()), 
                                       list(scores.keys()), 
                                       title='Performance per selected ' + category.capitalize() + ' ' + dataset + ' features  [ ' + scoring.capitalize() + '=f(k) ]', 
                                       y_axis=scoring.capitalize(), 
                                       x_axis='k', 
                                       showlegend=False)

        #save plot to html
        # NOTE(review): every category writes to the same figure_path; confirm that
        # save_figures_to_html appends rather than overwrites.
        save_figures_to_html(figure_path, [fig])
        print()


    #compile summary results and save them
    summaries_to_df(summaries, 
                    k_range=k_range_, 
                    path='selected-features/k-search-summary/ransomware', 
                    file=run_tag + '.csv')

## API Calls

In [10]:
# Only one API-call dataset variant is searched in this section.
categories = [
    'counts',
]

In [25]:
# Candidate k values: 100, 125, ..., 300 (the stop value 301 is exclusive).
batch_feature_selection(
    categories,
    k_range=[100, 301],
    step=25,
    scoring='accuracy',
    dataset_prefix='apistats_',
    dataset='apistats',
    label='sublabel',
)

-------------------------------------------------
Counts :

loaded data shape :   (2618, 301)
Class distribution:
malware      : 2206
ransomware   : 412

Majority class classifier accuracy = 84.26%

Best value : k = 125 --> accuracy = 98.09% ( (+/-) 0.67% )



In [27]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/apistats_accuracy_step25.csv',
    index_col=0,
)

Unnamed: 0,counts
100,97.71% (+/- 0.63%)
125,98.09% (+/- 0.67%)
150,98.08% (+/- 0.52%)
175,98.09% (+/- 0.65%)
200,98.09% (+/- 0.6%)
225,97.9% (+/- 0.71%)
250,98.09% (+/- 0.62%)
275,98.09% (+/- 0.55%)
300,98.09% (+/- 0.71%)


In [31]:
# One k per entry of `categories`, in the same order.
selected_k = [250]

# zip() stops at the shorter input, so a `categories` list left over from another
# cell would silently skip datasets; fail loudly instead.
if len(selected_k) != len(categories):
    raise ValueError('selected_k has {} value(s) but categories has {} entries'
                     .format(len(selected_k), len(categories)))

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'apistats_' + category + '.pkl.gz'
    df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

    #build model: resampling steps from the project's smote() helper, then a random forest
    steps = []
    smote_params = {
            'category' : 'adaptive',
            'over_strategy' : 0.5,
            'under_strategy' : 0.8,
            'k_neighbors' : 5,
          }
    X, y = get_X_y(df, label='sublabel')
    steps.extend(smote(X, y, **smote_params, fit=False))
    steps.append(('classifier', RandomForestClassifier()))
    pipe = Pipeline(steps=steps)
        
    #save k selected column names
    save_selected_features(df = df,
                           k = k, 
                           model=pipe, 
                           prefix = 'apistats_' + category,
                           path='selected-features/ransomware',
                           file= 'apistats_' + category + '.pkl',
                           label = 'sublabel')

loaded data shape :   (2618, 301)


## Registry Key Operations

In [33]:
# Registry-key operation types; each one has its own dataset file.
categories = [
    'opened',
    'read',
    'written',
    'deleted',
]

In [36]:
# Candidate k values: 100, 150, ..., 1000 (the stop value 1001 is exclusive).
batch_feature_selection(
    categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix=None,
    dataset='regkeys_operations',
    label='sublabel',
)

-------------------------------------------------
Opened :

loaded data shape :   (1852, 5554)
Class distribution:
malware      : 1581
ransomware   : 271

Majority class classifier accuracy = 85.37%

Best value : k = 350 --> accuracy = 98.39% ( (+/-) 0.77% )

-------------------------------------------------
Read :

loaded data shape :   (2268, 7286)
Class distribution:
malware      : 1869
ransomware   : 399

Majority class classifier accuracy = 82.41%

Best value : k = 850 --> accuracy = 97.36% ( (+/-) 2.74% )

-------------------------------------------------
Written :

loaded data shape :   (1244, 2086)
Class distribution:
malware      : 991
ransomware   : 253

Majority class classifier accuracy = 79.66%

Best value : k = 100 --> accuracy = 95.97% ( (+/-) 1.41% )

-------------------------------------------------
Deleted :

loaded data shape :   (534, 471)
Class distribution:
malware      : 521
ransomware   : 13

Majority class classifier accuracy = 97.57%

Best value : k = 300 --> 

In [37]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/regkeys_operations_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,opened,read,written,deleted
100,98.11% (+/- 0.82%),96.7% (+/- 2.44%),95.97% (+/- 1.41%),88.68% (+/- 7.12%)
150,98.38% (+/- 0.79%),95.15% (+/- 2.45%),95.2% (+/- 1.81%),88.68% (+/- 7.23%)
200,98.38% (+/- 0.84%),96.48% (+/- 2.66%),95.18% (+/- 1.62%),88.68% (+/- 6.62%)
250,98.38% (+/- 0.75%),94.71% (+/- 2.46%),95.2% (+/- 1.63%),88.68% (+/- 6.72%)
300,98.38% (+/- 0.85%),94.71% (+/- 2.43%),95.2% (+/- 1.76%),89.62% (+/- 6.67%)
350,98.39% (+/- 0.77%),94.48% (+/- 2.47%),95.2% (+/- 1.3%),88.68% (+/- 6.85%)
400,98.38% (+/- 0.89%),94.71% (+/- 2.55%),95.56% (+/- 1.49%),87.86% (+/- 6.38%)
450,98.38% (+/- 0.86%),94.71% (+/- 2.28%),95.58% (+/- 1.59%),87.86% (+/- 6.68%)
500,98.38% (+/- 0.85%),94.7% (+/- 2.31%),95.97% (+/- 1.37%),87.74% (+/- 6.94%)
550,98.38% (+/- 0.71%),94.7% (+/- 2.38%),95.18% (+/- 1.46%),88.68% (+/- 6.68%)


Although some k values give the highest accuracy, we also want to **minimize model complexity and the variance of the accuracy results**, so we opt for the following k values: <br/><br/>
**Opened regkeys :** k=150 <br/>
**Read regkeys :** k=850 <br/>
**Written regkeys :** k=100 <br/>
**Deleted regkeys :** k=50 <br/>

**Total regkeys operations selected columns =** 1150 features

In [38]:
# One k per entry of `categories`, in the same order (opened, read, written, deleted).
selected_k = [150, 850, 100, 50]

# zip() stops at the shorter input, so a `categories` list left over from another
# cell would silently skip datasets; fail loudly instead.
if len(selected_k) != len(categories):
    raise ValueError('selected_k has {} value(s) but categories has {} entries'
                     .format(len(selected_k), len(categories)))

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'regkeys_' + category + '_nested_keys.pkl.gz'
    df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')
    
    #build model: resampling steps from the project's smote() helper, then a random forest
    steps = []
    smote_params = {
            'category' : 'adaptive',
            'over_strategy' : 0.5,
            'under_strategy' : 0.8,
            'k_neighbors' : 5,
          }
    X, y = get_X_y(df, label='sublabel')
    steps.extend(smote(X, y, **smote_params, fit=False))
    steps.append(('classifier', RandomForestClassifier()))
    pipe = Pipeline(steps=steps)

    #save k selected column names
    save_selected_features(df = df, 
                           k = k, 
                           model=pipe, 
                           prefix = 'regkeys_' + category,
                           path='selected-features/ransomware', 
                           file= 'regkeys_' + category + '_nested_keys.pkl',
                           label='sublabel')

loaded data shape :   (1852, 5554)
loaded data shape :   (2268, 7286)
loaded data shape :   (1244, 2086)
loaded data shape :   (534, 471)


## File Operations

In [42]:
# File operation types; each one has its own dataset file.
categories = [
    'opened',
    'exists',
    'read',
    'written',
    'created',
    'deleted',
    'failed',
    'recreated',
]

In [43]:
# Candidate k values: 100, 150, ..., 1000 (the stop value 1001 is exclusive).
batch_feature_selection(
    categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix=None,
    dataset='file_operations',
    label='sublabel',
)

-------------------------------------------------
Opened :

loaded data shape :   (2094, 12066)
Class distribution:
malware      : 1692
ransomware   : 402

Majority class classifier accuracy = 80.8%

Best value : k = 500 --> accuracy = 98.56% ( (+/-) 0.72% )

-------------------------------------------------
Exists :

loaded data shape :   (1545, 6847)
Class distribution:
malware      : 1275
ransomware   : 270

Majority class classifier accuracy = 82.52%

Best value : k = 100 --> accuracy = 98.38% ( (+/-) 0.94% )

-------------------------------------------------
Read :

loaded data shape :   (1508, 10430)
Class distribution:
malware      : 1236
ransomware   : 272

Majority class classifier accuracy = 81.96%

Best value : k = 100 --> accuracy = 98.34% ( (+/-) 0.79% )

-------------------------------------------------
Written :

loaded data shape :   (1265, 9891)
Class distribution:
malware      : 1006
ransomware   : 259

Majority class classifier accuracy = 79.53%

Best value : k = 650

In [44]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/file_operations_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,opened,exists,read,written,created,deleted,failed,recreated
100,98.1% (+/- 0.78%),98.38% (+/- 0.94%),98.34% (+/- 0.79%),94.07% (+/- 1.76%),96.75% (+/- 2.19%),98.06% (+/- 0.93%),97.69% (+/- 1.11%),87.04% (+/- 4.13%)
150,98.09% (+/- 0.87%),98.06% (+/- 1.05%),98.01% (+/- 0.7%),94.44% (+/- 1.8%),94.74% (+/- 1.64%),98.06% (+/- 1.02%),97.69% (+/- 1.4%),88.89% (+/- 4.33%)
200,98.09% (+/- 0.95%),98.38% (+/- 0.95%),98.01% (+/- 0.81%),94.49% (+/- 1.97%),95.12% (+/- 1.29%),98.54% (+/- 1.16%),97.69% (+/- 1.2%),88.78% (+/- 4.05%)
250,98.09% (+/- 0.76%),98.06% (+/- 1.01%),98.34% (+/- 0.89%),94.86% (+/- 1.72%),95.12% (+/- 1.69%),98.06% (+/- 1.22%),97.69% (+/- 1.41%),87.04% (+/- 3.93%)
300,97.62% (+/- 0.72%),98.06% (+/- 0.95%),98.01% (+/- 0.73%),95.24% (+/- 1.89%),96.75% (+/- 1.8%),98.06% (+/- 1.14%),97.69% (+/- 1.42%),88.89% (+/- 4.29%)
350,98.09% (+/- 0.82%),98.38% (+/- 0.93%),98.01% (+/- 0.8%),94.86% (+/- 1.88%),95.14% (+/- 1.54%),98.06% (+/- 1.18%),97.69% (+/- 1.39%),87.04% (+/- 4.19%)
400,98.1% (+/- 0.79%),98.06% (+/- 0.98%),98.01% (+/- 0.74%),94.49% (+/- 1.77%),95.12% (+/- 1.76%),98.06% (+/- 1.28%),97.69% (+/- 1.42%),88.89% (+/- 4.5%)
450,98.1% (+/- 0.76%),98.06% (+/- 1.01%),98.01% (+/- 0.89%),94.49% (+/- 2.0%),95.12% (+/- 1.78%),98.06% (+/- 1.26%),97.69% (+/- 1.27%),87.04% (+/- 4.4%)
500,98.56% (+/- 0.72%),98.06% (+/- 0.86%),98.34% (+/- 0.98%),95.24% (+/- 1.67%),95.55% (+/- 1.65%),98.06% (+/- 1.21%),97.69% (+/- 1.28%),87.96% (+/- 4.19%)
550,98.33% (+/- 0.82%),98.06% (+/- 0.95%),98.01% (+/- 0.68%),94.86% (+/- 1.86%),95.93% (+/- 1.82%),98.06% (+/- 1.28%),97.69% (+/- 1.41%),88.89% (+/- 4.13%)


Although some k values give the highest accuracy, we also want to **minimize model complexity and the variance of the accuracy results**, so we opt for the following k values: <br/><br/>
**Opened files :** k=500 <br/>
**Exists files :** k=100 <br/>
**Read files :** k=100 <br/>
**Written files :** k=300 <br/>
**Created files :** k=300 <br/>
**Deleted files :** k=200 <br/>
**Failed files :** k=100 <br/>
**Recreated files :** k=50 <br/>

**Total file operations selected columns =** 1650 features

In [45]:
# One k per entry of `categories`, in the same order
# (opened, exists, read, written, created, deleted, failed, recreated).
selected_k = [500, 100, 100, 300, 300, 200, 100, 50]

# zip() stops at the shorter input, so a `categories` list left over from another
# cell would silently skip datasets; fail loudly instead.
if len(selected_k) != len(categories):
    raise ValueError('selected_k has {} value(s) but categories has {} entries'
                     .format(len(selected_k), len(categories)))

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'fileops_' + category + '_nested_files.pkl.gz'
    df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')
    
    #build model: resampling steps from the project's smote() helper, then a random forest
    steps = []
    smote_params = {
            'category' : 'adaptive',
            'over_strategy' : 0.5,
            'under_strategy' : 0.8,
            'k_neighbors' : 5,
          }
    X, y = get_X_y(df, label='sublabel')
    steps.extend(smote(X, y, **smote_params, fit=False))
    steps.append(('classifier', RandomForestClassifier()))
    pipe = Pipeline(steps=steps)

    #save k selected column names
    save_selected_features(df = df, 
                           k = k, 
                           model=pipe, 
                           prefix = 'fileops_' + category,
                           path='selected-features/ransomware', 
                           file= 'fileops_' + category + '_nested_files.pkl',
                           label='sublabel')

loaded data shape :   (2094, 12066)
loaded data shape :   (1545, 6847)
loaded data shape :   (1508, 10430)
loaded data shape :   (1265, 9891)
loaded data shape :   (1232, 9747)
loaded data shape :   (1029, 5537)
loaded data shape :   (1300, 3073)
loaded data shape :   (539, 445)


## Loaded DLL

In [46]:
# Candidate k values: 100, 150, ..., 1000 (the stop value 1001 is exclusive).
batch_feature_selection(
    categories=['onehot'],
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='loaded_dll_',
    dataset='loaded_dll',
    label='sublabel',
)

-------------------------------------------------
Onehot :

loaded data shape :   (2168, 821)
Class distribution:
malware      : 1875
ransomware   : 293

Majority class classifier accuracy = 86.49%

Best value : k = 1000 --> accuracy = 93.78% ( (+/-) 1.85% )



In [47]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/loaded_dll_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,onehot
100,91.46% (+/- 2.89%)
150,89.14% (+/- 2.74%)
200,88.94% (+/- 2.97%)
250,90.3% (+/- 2.68%)
300,93.09% (+/- 2.84%)
350,93.55% (+/- 2.43%)
400,93.76% (+/- 1.47%)
450,93.55% (+/- 1.78%)
500,93.53% (+/- 1.77%)
550,93.55% (+/- 2.55%)


In [49]:
#load data
file_name = 'loaded_dll_onehot.pkl.gz'
df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

#build model
steps = []
smote_params = {
      'category' : 'adaptive',
       'over_strategy' : 0.5,
       'under_strategy' : 0.8,
       'k_neighbors' : 5,
  }
X, y = get_X_y(df, label='sublabel')
steps.extend(smote(X, y, **smote_params, fit=False))
steps.append(('classifier', RandomForestClassifier()))
pipe = Pipeline(steps=steps)

#save k selected column names
save_selected_features(df = df,
                       k = 400, 
                       model=pipe, 
                       prefix = 'loaded_dll',
                       path='selected-features/ransomware',
                       file= 'loaded_dll_onehot.pkl',
                       label='sublabel')

loaded data shape :   (2168, 821)


## PE Entropy

In [50]:
# Candidate k values: 100, 150, ..., 1000 (the stop value 1001 is exclusive).
batch_feature_selection(
    categories=['analysis'],
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='pe_entropy_',
    dataset='pe_entropy',
    label='sublabel',
)

-------------------------------------------------
Analysis :

loaded data shape :   (3051, 715)
Class distribution:
malware      : 2608
ransomware   : 443

Majority class classifier accuracy = 85.48%

Best value : k = 100 --> accuracy = 93.11% ( (+/-) 1.69% )



In [51]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/pe_entropy_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,analysis
100,93.11% (+/- 1.69%)
150,92.31% (+/- 1.45%)
200,92.13% (+/- 1.09%)
250,92.62% (+/- 1.31%)
300,92.46% (+/- 1.18%)
350,92.46% (+/- 1.42%)
400,92.3% (+/- 1.18%)
450,91.97% (+/- 1.47%)
500,92.47% (+/- 0.93%)
550,92.3% (+/- 1.2%)


In [52]:
#load data
file_name = 'pe_entropy_analysis.pkl.gz'
df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

#build model
steps = []
smote_params = {
      'category' : 'adaptive',
       'over_strategy' : 0.5,
       'under_strategy' : 0.8,
       'k_neighbors' : 5,
  }
X, y = get_X_y(df, label='sublabel')
steps.extend(smote(X, y, **smote_params, fit=False))
steps.append(('classifier', RandomForestClassifier()))
pipe = Pipeline(steps=steps)

#save k selected column names
save_selected_features(df = df,
                       k = 100, 
                       model=pipe, 
                       prefix = 'pe_entropy',
                       path='selected-features/ransomware',
                       file= 'pe_entropy_analysis.pkl',
                       label='sublabel')

loaded data shape :   (3051, 715)


## PE Imports

**Libraries :**

In [53]:
# Candidate k values: 50, 75, ..., 550 (the stop value 551 is exclusive).
batch_feature_selection(
    categories=['libraries'],
    k_range=[50, 551],
    step=25,
    scoring='accuracy',
    dataset_prefix='pe_imports_',
    dataset='pe_imports',
    label='sublabel',
)

-------------------------------------------------
Libraries :

loaded data shape :   (3058, 286)
Class distribution:
malware      : 2614
ransomware   : 444

Majority class classifier accuracy = 85.48%

Best value : k = 425 --> accuracy = 81.86% ( (+/-) 2.28% )



In [54]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/pe_imports_accuracy_step25.csv',
    index_col=0,
)

Unnamed: 0,libraries
50,81.05% (+/- 2.07%)
75,81.18% (+/- 2.09%)
100,80.88% (+/- 2.33%)
125,81.51% (+/- 2.2%)
150,81.37% (+/- 2.22%)
175,81.51% (+/- 2.16%)
200,81.51% (+/- 2.34%)
225,80.72% (+/- 2.16%)
250,81.05% (+/- 2.22%)
275,81.05% (+/- 2.05%)


In [55]:
#load data
file_name = 'pe_imports_libraries.pkl.gz'
df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

#build model
steps = []
smote_params = {
      'category' : 'adaptive',
       'over_strategy' : 0.5,
       'under_strategy' : 0.8,
       'k_neighbors' : 5,
  }
X, y = get_X_y(df, label='sublabel')
steps.extend(smote(X, y, **smote_params, fit=False))
steps.append(('classifier', RandomForestClassifier()))
pipe = Pipeline(steps=steps)

#save k selected column names
save_selected_features(df = df,
                       k = 75, 
                       model=pipe, 
                       prefix = 'pe_imports_libraries',
                       path='selected-features/ransomware',
                       file= 'pe_imports_libraries.pkl',
                       label='sublabel')

loaded data shape :   (3058, 286)


**Imports per library (most frequent libraries):**

In [57]:
# Libraries whose per-library import datasets are searched individually.
categories = [
    'kernel32',
    'user32',
    'advapi32',
    'shell32',
    'ole32',
]

# Candidate k values: 100, 150, ..., 1000 (the stop value 1001 is exclusive).
batch_feature_selection(
    categories=categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='pe_imports_',
    dataset='pe_imports_top_libraries',
    label='sublabel',
)

-------------------------------------------------
Kernel32 :

loaded data shape :   (2425, 886)
Class distribution:
malware      : 2077
ransomware   : 348

Majority class classifier accuracy = 85.65%

Best value : k = 700 --> accuracy = 95.06% ( (+/-) 1.49% )

-------------------------------------------------
User32 :

loaded data shape :   (1845, 678)
Class distribution:
malware      : 1512
ransomware   : 333

Majority class classifier accuracy = 81.95%

Best value : k = 1000 --> accuracy = 93.77% ( (+/-) 1.99% )

-------------------------------------------------
Advapi32 :

loaded data shape :   (1532, 455)
Class distribution:
malware      : 1404
ransomware   : 128

Majority class classifier accuracy = 91.64%

Best value : k = 400 --> accuracy = 95.12% ( (+/-) 1.49% )

-------------------------------------------------
Shell32 :

loaded data shape :   (773, 156)
Class distribution:
malware      : 718
ransomware   : 55

Majority class classifier accuracy = 92.88%

Best value : k = 750 

In [58]:
# Display the per-k summary table exported by the search above.
pd.read_csv(
    'selected-features/k-search-summary/ransomware/pe_imports_top_libraries_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,kernel32,user32,advapi32,shell32,ole32
100,94.21% (+/- 1.59%),91.89% (+/- 2.0%),90.85% (+/- 2.91%),92.91% (+/- 2.85%),85.6% (+/- 4.01%)
150,95.04% (+/- 1.36%),93.48% (+/- 2.02%),91.21% (+/- 2.62%),93.51% (+/- 3.37%),87.2% (+/- 4.49%)
200,94.63% (+/- 1.41%),93.5% (+/- 1.8%),91.83% (+/- 2.63%),93.51% (+/- 2.95%),90.48% (+/- 3.31%)
250,94.85% (+/- 1.32%),92.97% (+/- 2.13%),92.81% (+/- 2.64%),92.21% (+/- 2.83%),90.4% (+/- 3.37%)
300,94.43% (+/- 1.31%),93.5% (+/- 1.81%),93.46% (+/- 1.91%),93.51% (+/- 2.95%),90.48% (+/- 3.11%)
350,94.63% (+/- 1.27%),92.93% (+/- 2.0%),94.77% (+/- 1.69%),93.51% (+/- 2.9%),90.48% (+/- 4.91%)
400,94.43% (+/- 1.29%),93.5% (+/- 1.96%),95.12% (+/- 1.49%),93.51% (+/- 2.93%),89.61% (+/- 3.85%)
450,94.24% (+/- 1.52%),92.97% (+/- 1.79%),94.77% (+/- 2.09%),92.91% (+/- 2.56%),90.4% (+/- 3.56%)
500,94.63% (+/- 1.42%),93.48% (+/- 1.86%),95.12% (+/- 1.83%),93.51% (+/- 2.68%),90.4% (+/- 3.86%)
550,94.63% (+/- 1.46%),93.48% (+/- 1.76%),94.12% (+/- 1.96%),93.51% (+/- 2.77%),90.48% (+/- 4.09%)


Although some k values give the highest accuracy, we also want to **minimize model complexity and the variance of the accuracy results**, so we opt for the following k values: <br/><br/>
**kernel32 :** k=150 <br/>
**user32 :** k=200 <br/>
**advapi32 :** k=400 <br/>
**shell32 :** k=150 <br/>
**ole32 :** k=300 <br/>

**Total PE imports selected columns =** 1200 features

In [59]:
# One k per entry of `categories`, in the same order
# (kernel32, user32, advapi32, shell32, ole32).
selected_k = [150, 200, 400, 150, 300]

# zip() stops at the shorter input, so a `categories` list left over from another
# cell would silently skip datasets; fail loudly instead.
if len(selected_k) != len(categories):
    raise ValueError('selected_k has {} value(s) but categories has {} entries'
                     .format(len(selected_k), len(categories)))

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'pe_imports_' + category + '.pkl.gz'
    df = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

    #build model: resampling steps from the project's smote() helper, then a random forest
    steps = []
    smote_params = {
            'category' : 'adaptive',
            'over_strategy' : 0.5,
            'under_strategy' : 0.8,
            'k_neighbors' : 5,
          }
    X, y = get_X_y(df, label='sublabel')
    steps.extend(smote(X, y, **smote_params, fit=False))
    steps.append(('classifier', RandomForestClassifier()))
    pipe = Pipeline(steps=steps)
    
    #save k selected column names
    save_selected_features(df = df, 
                           k = k, 
                           model=pipe, 
                           prefix = 'pe_imports_' + category,
                           path='selected-features/ransomware', 
                           file= 'pe_imports_' + category + '.pkl',
                           label='sublabel')

loaded data shape :   (2425, 886)
loaded data shape :   (1845, 678)
loaded data shape :   (1532, 455)
loaded data shape :   (773, 156)
loaded data shape :   (627, 303)
