In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the local modules imported below are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
import warnings
# Suppresses every warning for the rest of the session (no category or module
# filter is given), so deprecation and numerical warnings raised by later
# cells will not be shown.
warnings.filterwarnings('ignore')

from IPython.display import IFrame, clear_output

In [4]:
import os
import time
import logging
from functools import wraps
import pickle
import re

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from numpy import hstack

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *

In [5]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif

from sklearn.utils import shuffle
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

In [6]:
# Directory holding the encoded '*.pkl.gz' datasets read by get_data below.
# Set the PARSER_DATA_DIR environment variable to run the notebook on another
# machine; the original absolute path is kept as the fallback.
folder_path = os.environ.get('PARSER_DATA_DIR', 'C:/Users/yaass/OneDrive/Bureau/Parser')

In [43]:
from helpers import *
from learners import *

In [96]:
def batch_feature_selection(categories, 
                            k_range = [100, 1001], 
                            step=100, 
                            scoring='accuracy',
                            dataset_prefix='encoded_nested_fileops_',
                            dataset='file_operations', 
                            label='label',
                            random_state=None):
    """Sweep the number of selected features k for each dataset category.

    For every entry of ``categories`` the file
    ``<folder_path>/<dataset_prefix><category>.pkl.gz`` is loaded with
    ``get_data``, its class distribution is printed with
    ``class_distribution`` and ``search_feature_importances`` is run over the
    k grid with a fresh ``RandomForestClassifier``. One box-plot figure is
    built per category with ``plot_evaluation_boxplots``.

    After the loop, all figures are written in a single call to
    ``figures/feature-selection/<dataset>_<scoring>_step<step>.html`` and the
    per-category summaries are passed to ``summaries_to_df`` together with the
    target ``selected-features/k-search-summary/<dataset>_<scoring>_step<step>.csv``.

    Parameters
    ----------
    categories : iterable of str
        Category suffixes; each one is appended to ``dataset_prefix`` to build
        the data file name and is used as the key of its summary.
    k_range : sequence of two ints
        ``(start, stop)`` of the k grid; ``stop`` is exclusive, as in ``range``.
        The default list is only indexed, never mutated.
    step : int
        Increment between consecutive k values.
    scoring : str
        Forwarded to ``search_feature_importances``; also used in the plot
        title, the y-axis label and the output file names.
    dataset_prefix : str
        Prefix of the ``.pkl.gz`` files inside ``folder_path``.
    dataset : str
        Name used in the plot titles and the output file names.
    label : str
        Forwarded to ``class_distribution`` and ``search_feature_importances``.
    random_state : int or None
        Seed forwarded to ``RandomForestClassifier``. ``None`` (default) keeps
        the original unseeded behaviour; pass an int for repeatable forests.

    Returns
    -------
    The object returned by ``summaries_to_df`` (it was previously computed and
    discarded).

    NOTE(review): some recorded runs use datasets with fewer columns than the
    largest k in the grid (e.g. 304 columns with k up to 1000). How
    ``search_feature_importances`` treats k larger than the number of features
    is not visible here -- confirm before interpreting those rows.
    """

    #initialization
    k_range_ = range(k_range[0], k_range[1], step)
    run_tag = dataset + '_' + scoring + '_step' + str(step)
    figure_path = 'figures/feature-selection/' + run_tag + '.html'
    summaries = dict()
    figures = []

    #loop through datasets
    for category in categories:

        #load data
        print('-------------------------------------------------')
        print(category.capitalize(), ':\n')
        file_name = dataset_prefix + category + '.pkl.gz'
        fileops = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

        #print class distribution
        class_distribution(fileops, label=label)

        #perform grid-search (a new estimator per category, so no state is shared)
        scores, summary = search_feature_importances(df = fileops, 
                                                     k_values = k_range_, 
                                                     model=RandomForestClassifier(random_state=random_state), 
                                                     scoring=scoring, 
                                                     verbose=False,
                                                     label=label)

        #append summary results
        summaries[category] = summary

        #plot scores
        fig = plot_evaluation_boxplots(list(scores.values()), 
                                       list(scores.keys()), 
                                       title='Performance per selected ' + category.capitalize() + ' ' + dataset + ' features  [ ' + scoring.capitalize() + '=f(k) ]', 
                                       y_axis=scoring.capitalize(), 
                                       x_axis='k', 
                                       showlegend=False)
        figures.append(fig)
        print()

    #save every category's plot to the shared html file in one call.
    #The file used to be re-saved once per category with only that category's
    #figure. NOTE(review): whether save_figures_to_html overwrites or appends
    #is not visible here; a single call yields all figures in either case.
    if figures:
        save_figures_to_html(figure_path, figures)

    #compile summary results and save them
    overall_summary = summaries_to_df(summaries, 
                                      k_range=k_range_, 
                                      path='selected-features/k-search-summary', 
                                      file=run_tag + '.csv')

    return overall_summary

## API Calls

In [37]:
# Encodings evaluated in this section; each name is the suffix of a data file
categories = [
    'onehot',
    'counts',
]

In [45]:
# k sweep over the 'encoded_apistats_*' datasets: k = 100, 150, ..., 1000.
# NOTE(review): the recorded output reports 304/305 columns for these
# datasets, so k values above that presumably cannot add features -- confirm.
batch_feature_selection(
    categories=categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='encoded_apistats_',
    dataset='apistats',
    label='label',
)

-------------------------------------------------
Onehot :

loaded data shape :  (3761, 304)
Class distribution:
malware      : 2190
goodware     : 1571

Majority class classifier accuracy = 58.23%

Best value : k = 250 --> accuracy = 94.73% ( (+/-) 0.95% )

-------------------------------------------------
Counts :

loaded data shape :  (3822, 305)
Class distribution:
malware      : 2244
goodware     : 1578

Majority class classifier accuracy = 58.71%

Best value : k = 350 --> accuracy = 95.19% ( (+/-) 1.5% )



In [46]:
# Summary CSV for the apistats sweep; the path matches the one
# batch_feature_selection passes to summaries_to_df
pd.read_csv(
    'selected-features/k-search-summary/apistats_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,onehot,counts
100,94.35% (+/- 1.13%),94.82% (+/- 1.53%)
150,94.52% (+/- 1.05%),94.93% (+/- 1.44%)
200,94.52% (+/- 1.14%),95.02% (+/- 1.4%)
250,94.73% (+/- 0.95%),95.03% (+/- 1.53%)
300,94.66% (+/- 1.07%),94.98% (+/- 1.37%)
350,94.57% (+/- 0.98%),95.19% (+/- 1.5%)
400,94.57% (+/- 1.07%),94.9% (+/- 1.44%)
450,94.58% (+/- 1.0%),95.13% (+/- 1.54%)
500,94.67% (+/- 1.05%),95.01% (+/- 1.3%)
550,94.58% (+/- 0.89%),94.88% (+/- 1.4%)


In [47]:
# k retained for each encoding, listed in the same order as `categories`
selected_k = [250, 250]

# zip() silently drops trailing items when the two lists differ in length
# (e.g. when `categories` is left over from another section), so fail loudly.
if len(selected_k) != len(categories):
    raise ValueError('selected_k must provide exactly one k per category')

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'encoded_apistats_' + category + '.pkl.gz'
    apistats = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

    #save k selected column names
    #(the forest is seeded so that re-running the cell is repeatable)
    save_selected_features(df = apistats,
                           k = k, 
                           model=RandomForestClassifier(random_state=42), 
                           prefix = 'apistats_' + category,
                           path='selected-features',
                           file= 'apistats_' + category + '.pkl')

loaded data shape :  (3761, 304)
loaded data shape :  (3822, 305)


## Registry Key Operations

In [21]:
# Registry-key operation types; each name is the suffix of a data file
categories = [
    'opened',
    'read',
    'written',
    'deleted',
]

In [23]:
# k sweep over the 'encoded_nested_regkeys_*' datasets: k = 100, 150, ..., 1000
batch_feature_selection(
    categories=categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='encoded_nested_regkeys_',
    dataset='regkeys_operations',
    label='label',
)

-------------------------------------------------
Opened :

loaded data shape :  (2614, 19974)
Class distribution:
malware      : 1572
goodware     : 1042

Majority class classifier accuracy = 60.14%

Best value : k = 250 --> accuracy = 91.58% ( (+/-) 1.45% )

-------------------------------------------------
Read :

loaded data shape :  (3354, 20305)
Class distribution:
malware      : 1856
goodware     : 1498

Majority class classifier accuracy = 55.34%

Best value : k = 800 --> accuracy = 88.81% ( (+/-) 1.32% )

-------------------------------------------------
Written :

loaded data shape :  (1232, 2970)
Class distribution:
malware      : 985
goodware     : 247

Majority class classifier accuracy = 79.95%

Best value : k = 150 --> accuracy = 93.05% ( (+/-) 1.98% )

-------------------------------------------------
Deleted :

loaded data shape :  (614, 475)
Class distribution:
malware      : 518
goodware     : 96

Majority class classifier accuracy = 84.36%

Best value : k = 600 --> 

In [26]:
# Summary CSV for the registry-key sweep; the path matches the one
# batch_feature_selection passes to summaries_to_df
pd.read_csv(
    'selected-features/k-search-summary/regkeys_operations_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,opened,read,written,deleted
100,90.87% (+/- 1.05%),87.82% (+/- 1.29%),92.69% (+/- 1.93%),90.56% (+/- 3.11%)
150,90.96% (+/- 1.25%),88.02% (+/- 1.28%),93.05% (+/- 1.98%),89.8% (+/- 3.5%)
200,90.89% (+/- 1.47%),88.25% (+/- 1.11%),92.67% (+/- 2.04%),89.8% (+/- 3.35%)
250,91.58% (+/- 1.45%),88.45% (+/- 1.17%),92.88% (+/- 2.13%),90.34% (+/- 3.45%)
300,91.39% (+/- 1.49%),88.47% (+/- 1.13%),92.61% (+/- 2.01%),90.5% (+/- 3.12%)
350,91.46% (+/- 1.49%),88.52% (+/- 1.25%),92.4% (+/- 2.15%),90.83% (+/- 3.2%)
400,91.48% (+/- 1.48%),88.57% (+/- 1.34%),92.43% (+/- 2.12%),90.82% (+/- 3.6%)
450,91.26% (+/- 1.38%),88.49% (+/- 1.23%),92.4% (+/- 2.14%),90.78% (+/- 3.4%)
500,91.21% (+/- 1.29%),88.52% (+/- 1.22%),92.37% (+/- 2.06%),90.83% (+/- 3.63%)
550,91.14% (+/- 1.42%),88.78% (+/- 1.22%),92.15% (+/- 2.17%),90.99% (+/- 3.47%)


Although some other k values give a slightly higher accuracy, we also want to **minimize model complexity and the variance of the accuracy results**, so we opt for the following k values: <br/><br/>
**Opened regkeys :** k=250 <br/>
**Read regkeys :** k=550 <br/>
**Written regkeys :** k=150 <br/>
**Deleted regkeys :** k=100 <br/>

**Total regkeys operations selected columns =** 1050 features

In [27]:
# k retained for each operation type, listed in the same order as `categories`
selected_k = [250, 550, 150, 100]

# zip() silently drops trailing items when the two lists differ in length
# (e.g. when `categories` is left over from another section), so fail loudly.
if len(selected_k) != len(categories):
    raise ValueError('selected_k must provide exactly one k per category')

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'encoded_nested_regkeys_' + category + '.pkl.gz'
    regkeys = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

    #save k selected column names
    #(the forest is seeded so that re-running the cell is repeatable)
    save_selected_features(df = regkeys, 
                           k = k, 
                           model=RandomForestClassifier(random_state=42), 
                           prefix = 'regkeys_' + category,
                           path='selected-features', 
                           file= 'regkeys_' + category + '_nested_keys.pkl')

loaded data shape :  (2614, 19974)
loaded data shape :  (3354, 20305)
loaded data shape :  (1232, 2970)
loaded data shape :  (614, 475)


## File Operations

In [14]:
# File operation types; each name is the suffix of a data file
categories = [
    'opened',
    'exists',
    'read',
    'written',
    'created',
    'deleted',
    'failed',
    'recreated',
]

In [13]:
# k sweep over the 'encoded_nested_fileops_*' datasets: k = 100, 150, ..., 1000
batch_feature_selection(
    categories=categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='encoded_nested_fileops_',
    dataset='file_operations',
    label='label',
)

-------------------------------------------------
Opened :

loaded data shape :  (2763, 16035)
Class distribution:
malware      : 1676
goodware     : 1087

Majority class classifier accuracy = 60.66%

Best value : k = 250 --> accuracy = 82.64% ( (+/-) 2.18% )

-------------------------------------------------
Exists :

loaded data shape :  (2163, 22777)
Class distribution:
malware      : 1264
goodware     : 899

Majority class classifier accuracy = 58.44%

Best value : k = 750 --> accuracy = 77.02% ( (+/-) 2.14% )

-------------------------------------------------
Read :

loaded data shape :  (1820, 10428)
Class distribution:
malware      : 1225
goodware     : 595

Majority class classifier accuracy = 67.31%

Best value : k = 300 --> accuracy = 84.05% ( (+/-) 1.93% )

-------------------------------------------------
Written :

loaded data shape :  (1343, 9190)
Class distribution:
malware      : 998
goodware     : 345

Majority class classifier accuracy = 74.31%

Best value : k = 250 -

In [17]:
# Reload the saved summary rather than relying on `fileops_accuracy_summary_`,
# a kernel variable that no visible cell defines (fails on Restart & Run All).
# The file name follows the dataset/scoring/step used in the sweep above.
pd.read_csv('selected-features/k-search-summary/file_operations_accuracy_step50.csv', index_col=0)

Unnamed: 0,opened,exists,read,written,created,deleted,failed,recreated
100,81.87% (+/- 2.35%),75.9% (+/- 2.63%),83.72% (+/- 2.12%),87.69% (+/- 1.87%),84.9% (+/- 2.27%),82.7% (+/- 3.55%),87.27% (+/- 1.9%),84.01% (+/- 4.26%)
150,82.22% (+/- 2.2%),76.5% (+/- 2.43%),83.79% (+/- 2.29%),87.81% (+/- 1.93%),85.0% (+/- 2.27%),83.21% (+/- 3.39%),86.93% (+/- 1.81%),84.41% (+/- 3.46%)
200,82.14% (+/- 2.05%),76.38% (+/- 2.43%),83.88% (+/- 1.94%),88.21% (+/- 2.19%),85.51% (+/- 2.2%),83.02% (+/- 3.31%),86.58% (+/- 1.69%),83.56% (+/- 3.66%)
250,82.64% (+/- 2.18%),76.71% (+/- 2.29%),83.59% (+/- 1.98%),88.48% (+/- 2.33%),85.59% (+/- 2.25%),82.48% (+/- 3.15%),87.4% (+/- 1.78%),81.31% (+/- 3.47%)
300,82.46% (+/- 1.87%),76.73% (+/- 2.3%),84.05% (+/- 1.93%),88.21% (+/- 2.3%),85.54% (+/- 2.36%),82.67% (+/- 3.12%),87.06% (+/- 2.26%),81.86% (+/- 4.02%)
350,82.25% (+/- 2.09%),76.73% (+/- 2.41%),84.01% (+/- 1.86%),87.96% (+/- 2.38%),85.72% (+/- 2.41%),82.7% (+/- 3.09%),87.46% (+/- 1.97%),82.22% (+/- 3.82%)
400,82.18% (+/- 1.96%),76.54% (+/- 2.34%),83.96% (+/- 1.99%),88.16% (+/- 2.32%),85.46% (+/- 2.29%),82.51% (+/- 2.98%),87.25% (+/- 2.04%),81.81% (+/- 3.75%)
450,82.31% (+/- 1.91%),76.71% (+/- 2.34%),83.88% (+/- 1.97%),87.96% (+/- 2.31%),85.51% (+/- 2.45%),82.39% (+/- 3.06%),86.94% (+/- 2.18%),81.85% (+/- 4.03%)
500,82.3% (+/- 1.78%),76.91% (+/- 2.21%),83.81% (+/- 1.93%),87.84% (+/- 2.24%),85.54% (+/- 2.41%),82.39% (+/- 3.05%),87.15% (+/- 2.2%),82.36% (+/- 3.99%)
550,82.28% (+/- 1.88%),76.7% (+/- 2.13%),83.86% (+/- 1.98%),87.99% (+/- 2.26%),85.36% (+/- 2.54%),82.48% (+/- 3.01%),86.89% (+/- 2.24%),81.16% (+/- 4.09%)


Although some other k values give a slightly higher accuracy, we also want to **minimize model complexity and the variance of the accuracy results**, so we opt for the following k values: <br/><br/>
**Opened files :** k=250 <br/>
**Exists files :** k=250 <br/>
**Read files :** k=350 <br/>
**Written files :** k=200 <br/>
**Created files :** k=350 <br/>
**Deleted files :** k=150 <br/>
**Failed files :** k=300 <br/>
**Recreated files :** k=150 <br/>

**Total file operations selected columns =** 2000 features

In [20]:
# k retained for each operation type, listed in the same order as `categories`
selected_k = [250, 250, 350, 200, 350, 150, 300, 150]

# zip() silently drops trailing items when the two lists differ in length
# (e.g. when `categories` is left over from another section), so fail loudly.
if len(selected_k) != len(categories):
    raise ValueError('selected_k must provide exactly one k per category')

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'encoded_nested_fileops_' + category + '.pkl.gz'
    fileops = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

    #save k selected column names
    #(the forest is seeded so that re-running the cell is repeatable)
    save_selected_features(df = fileops, 
                           k = k, 
                           model=RandomForestClassifier(random_state=42), 
                           prefix = 'fileops_' + category,
                           path='selected-features', 
                           file= 'fileops_' + category + '_nested_files.pkl')

loaded data shape :  (2763, 16035)
loaded data shape :  (2163, 22777)
loaded data shape :  (1820, 10428)
loaded data shape :  (1343, 9190)
loaded data shape :  (1291, 13449)
loaded data shape :  (1058, 5024)
loaded data shape :  (1744, 5545)
loaded data shape :  (667, 588)


## Loaded DLL

In [105]:
# k sweep over 'loaded_dlls_onehot.pkl.gz': k = 100, 150, ..., 1000
batch_feature_selection(
    categories=['onehot'],
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='loaded_dlls_',
    dataset='loaded_dlls',
    label='label',
)

-------------------------------------------------
Onehot :

loaded data shape :  (3162, 1082)
Class distribution:
malware      : 1860
goodware     : 1302

Majority class classifier accuracy = 58.82%

Best value : k = 100 --> accuracy = 87.34% ( (+/-) 1.56% )



In [106]:
# Summary CSV for the loaded-DLL sweep; the path matches the one
# batch_feature_selection passes to summaries_to_df
pd.read_csv(
    'selected-features/k-search-summary/loaded_dlls_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,onehot
100,87.34% (+/- 1.56%)
150,87.34% (+/- 1.61%)
200,86.71% (+/- 1.68%)
250,86.73% (+/- 1.81%)
300,86.41% (+/- 1.77%)
350,86.39% (+/- 1.59%)
400,86.71% (+/- 1.72%)
450,85.78% (+/- 1.83%)
500,86.39% (+/- 1.75%)
550,86.23% (+/- 1.64%)


In [108]:
#load data
file_name = 'loaded_dlls_onehot.pkl.gz'
loaded_dlls = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

#save k selected column names
#(the forest is seeded so that re-running the cell is repeatable)
save_selected_features(df = loaded_dlls,
                       k = 150, 
                       model=RandomForestClassifier(random_state=42), 
                       prefix = 'loaded_dll',
                       path='selected-features',
                       file= 'loaded_dll_onehot.pkl')

loaded data shape :  (3162, 1082)


## PE Entropy

In [112]:
# k sweep over 'pe_entropy_analysis.pkl.gz': k = 100, 150, ..., 1000
batch_feature_selection(
    categories=['analysis'],
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='pe_entropy_',
    dataset='pe_entropy',
    label='label',
)

-------------------------------------------------
Analysis :

loaded data shape :  (4308, 795)
Class distribution:
malware      : 2600
goodware     : 1708

Majority class classifier accuracy = 60.35%

Best value : k = 400 --> accuracy = 90.82% ( (+/-) 1.35% )



In [113]:
# Summary CSV for the PE-entropy sweep; the path matches the one
# batch_feature_selection passes to summaries_to_df
pd.read_csv(
    'selected-features/k-search-summary/pe_entropy_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,analysis
100,90.6% (+/- 1.34%)
150,90.47% (+/- 1.08%)
200,90.6% (+/- 1.3%)
250,90.72% (+/- 1.13%)
300,90.71% (+/- 1.17%)
350,90.26% (+/- 1.29%)
400,90.82% (+/- 1.35%)
450,90.48% (+/- 1.27%)
500,90.48% (+/- 1.15%)
550,90.47% (+/- 1.26%)


In [114]:
#load data
file_name = 'pe_entropy_analysis.pkl.gz'
pe_entropy = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

#save k selected column names
#(the forest is seeded so that re-running the cell is repeatable)
save_selected_features(df = pe_entropy,
                       k = 300, 
                       model=RandomForestClassifier(random_state=42), 
                       prefix = 'pe_entropy',
                       path='selected-features',
                       file= 'pe_entropy_analysis.pkl')

loaded data shape :  (4308, 795)


## PE Imports

**Libraries :**

In [119]:
# k sweep over 'pe_imports_libraries.pkl.gz': k = 50, 75, ..., 550
batch_feature_selection(
    categories=['libraries'],
    k_range=[50, 551],
    step=25,
    scoring='accuracy',
    dataset_prefix='pe_imports_',
    dataset='pe_imports',
    label='label',
)

-------------------------------------------------
Libraries :

loaded data shape :  (4312, 545)
Class distribution:
malware      : 2603
goodware     : 1709

Majority class classifier accuracy = 60.37%

Best value : k = 325 --> accuracy = 86.21% ( (+/-) 1.64% )



In [120]:
# Summary CSV for the PE import-libraries sweep; the path matches the one
# batch_feature_selection passes to summaries_to_df
pd.read_csv(
    'selected-features/k-search-summary/pe_imports_accuracy_step25.csv',
    index_col=0,
)

Unnamed: 0,libraries
50,84.24% (+/- 1.75%)
75,85.17% (+/- 1.81%)
100,85.42% (+/- 1.66%)
125,85.52% (+/- 1.73%)
150,85.61% (+/- 1.7%)
175,85.75% (+/- 1.7%)
200,86.1% (+/- 1.58%)
225,85.88% (+/- 1.77%)
250,85.85% (+/- 1.75%)
275,85.85% (+/- 1.72%)


In [121]:
#load data
file_name = 'pe_imports_libraries.pkl.gz'
pe_imports_libraries = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

#save k selected column names
#(the forest is seeded so that re-running the cell is repeatable)
save_selected_features(df = pe_imports_libraries,
                       k = 200, 
                       model=RandomForestClassifier(random_state=42), 
                       prefix = 'pe_imports_libraries',
                       path='selected-features',
                       file= 'pe_imports_libraries.pkl')

loaded data shape :  (4312, 545)


**Imports per library (most frequent libraries):**

In [124]:
# Libraries whose imports are analysed individually (one data file per library).
# Alternative list left commented out by the author:
# ['kernel32', 'user32', 'msvcrt', 'ole32', 'shell32', 'oleaut32', 'comctl32', 'comdlg32', 'winmm', 'ntdll']
categories = [
    'kernel32',
    'user32',
    'advapi32',
    'msvcrt',
    'gdi32',
    'shell32',
    'ole32',
    'comctl32',
]

In [125]:
# k sweep over the per-library 'pe_imports_*' datasets: k = 100, 150, ..., 1000
batch_feature_selection(
    categories=categories,
    k_range=[100, 1001],
    step=50,
    scoring='accuracy',
    dataset_prefix='pe_imports_',
    dataset='pe_imports_top_libraries',
    label='label',
)

-------------------------------------------------
Kernel32 :

loaded data shape :  (3609, 1008)
Class distribution:
malware      : 2070
goodware     : 1539

Majority class classifier accuracy = 57.36%

Best value : k = 550 --> accuracy = 90.98% ( (+/-) 1.48% )

-------------------------------------------------
User32 :

loaded data shape :  (2746, 729)
Class distribution:
malware      : 1508
goodware     : 1238

Majority class classifier accuracy = 54.92%

Best value : k = 250 --> accuracy = 88.89% ( (+/-) 2.1% )

-------------------------------------------------
Advapi32 :

loaded data shape :  (2482, 503)
Class distribution:
malware      : 1399
goodware     : 1083

Majority class classifier accuracy = 56.37%

Best value : k = 300 --> accuracy = 88.1% ( (+/-) 2.49% )

-------------------------------------------------
Msvcrt :

loaded data shape :  (1675, 790)
Class distribution:
malware      : 847
goodware     : 828

Majority class classifier accuracy = 50.57%

Best value : k = 200 --

In [126]:
# Summary CSV for the per-library PE imports sweep; the path matches the one
# batch_feature_selection passes to summaries_to_df
pd.read_csv(
    'selected-features/k-search-summary/pe_imports_top_libraries_accuracy_step50.csv',
    index_col=0,
)

Unnamed: 0,kernel32,user32,advapi32,msvcrt,gdi32,shell32,ole32,comctl32
100,90.57% (+/- 1.53%),88.16% (+/- 2.13%),87.5% (+/- 2.43%),94.05% (+/- 1.92%),85.37% (+/- 2.19%),82.1% (+/- 2.35%),82.25% (+/- 3.25%),75.38% (+/- 3.79%)
150,90.44% (+/- 1.29%),88.16% (+/- 2.22%),87.9% (+/- 2.59%),94.01% (+/- 1.89%),85.11% (+/- 1.96%),82.04% (+/- 2.59%),81.52% (+/- 3.07%),75.57% (+/- 3.68%)
200,90.86% (+/- 1.42%),88.52% (+/- 2.07%),87.3% (+/- 2.52%),94.33% (+/- 1.9%),85.11% (+/- 2.18%),81.73% (+/- 2.77%),81.46% (+/- 2.88%),75.67% (+/- 3.74%)
250,90.58% (+/- 1.29%),88.89% (+/- 2.1%),87.5% (+/- 2.45%),94.03% (+/- 2.22%),85.67% (+/- 2.15%),82.92% (+/- 2.99%),82.18% (+/- 2.99%),75.67% (+/- 3.5%)
300,90.86% (+/- 1.31%),88.71% (+/- 1.96%),88.1% (+/- 2.49%),93.45% (+/- 1.72%),85.37% (+/- 1.93%),81.73% (+/- 2.53%),81.88% (+/- 3.11%),75.0% (+/- 3.75%)
350,90.86% (+/- 1.43%),87.8% (+/- 2.27%),87.7% (+/- 2.72%),93.73% (+/- 1.93%),85.37% (+/- 2.14%),82.1% (+/- 2.65%),82.61% (+/- 3.19%),74.9% (+/- 3.61%)
400,90.58% (+/- 1.46%),88.36% (+/- 2.01%),87.53% (+/- 2.48%),93.45% (+/- 1.98%),85.72% (+/- 2.39%),81.99% (+/- 2.64%),81.82% (+/- 3.08%),75.38% (+/- 3.64%)
450,90.72% (+/- 1.49%),88.36% (+/- 2.03%),87.7% (+/- 2.47%),93.71% (+/- 1.99%),85.37% (+/- 2.34%),81.99% (+/- 2.5%),81.82% (+/- 3.1%),75.0% (+/- 3.59%)
500,90.72% (+/- 1.4%),88.73% (+/- 2.03%),87.7% (+/- 2.21%),93.45% (+/- 2.14%),85.41% (+/- 2.03%),82.04% (+/- 2.54%),81.46% (+/- 3.27%),75.0% (+/- 3.72%)
550,90.98% (+/- 1.48%),88.36% (+/- 2.05%),87.7% (+/- 2.48%),94.01% (+/- 1.99%),85.37% (+/- 2.02%),82.1% (+/- 2.44%),82.18% (+/- 2.96%),75.29% (+/- 3.65%)


Although some other k values give a slightly higher accuracy, we also want to **minimize model complexity and the variance of the accuracy results**, so we opt for the following k values: <br/><br/>
**kernel32 :** k=200 <br/>
**user32 :** k=250 <br/>
**advapi32 :** k=150 <br/>
**msvcrt :** k=200 <br/>
**gdi32 :** k=250 <br/>
**shell32 :** k=150 <br/>
**ole32 :** k=100 <br/>
**comctl32 :** k=100 <br/>

**Total PE imports selected columns =** 1400 features

In [127]:
# k retained for each library, listed in the same order as `categories`.
# NOTE(review): the recorded output shows the last dataset (comctl32) with only
# 84 columns, fewer than its k=100 -- confirm how save_selected_features
# handles a k larger than the number of available columns.
selected_k = [200, 250, 150, 200, 250, 150, 100, 100]

# zip() silently drops trailing items when the two lists differ in length
# (e.g. when `categories` is left over from another section), so fail loudly.
if len(selected_k) != len(categories):
    raise ValueError('selected_k must provide exactly one k per category')

for category, k in zip(categories, selected_k):
    
    #load data
    file_name = 'pe_imports_' + category + '.pkl.gz'
    pe_imports = get_data(file_path = os.path.join(folder_path, file_name), compression='gzip')

    #save k selected column names
    #(the forest is seeded so that re-running the cell is repeatable)
    save_selected_features(df = pe_imports, 
                           k = k, 
                           model=RandomForestClassifier(random_state=42), 
                           prefix = 'pe_imports_' + category,
                           path='selected-features', 
                           file= 'pe_imports_' + category + '.pkl')

loaded data shape :  (3609, 1008)
loaded data shape :  (2746, 729)
loaded data shape :  (2482, 503)
loaded data shape :  (1675, 790)
loaded data shape :  (1641, 543)
loaded data shape :  (1616, 176)
loaded data shape :  (1374, 312)
loaded data shape :  (1318, 84)
