# Data Sprint #21: Classification of Malware with PE headers

### About the Data

        Dataset is related to Portable Executable files for malware detection. There are 55 features in the dataset 
        (excluding target variable). The features consist of 19 image dos headers, 7 file headers and 29 optional headers.

   #### IMAGE_DOS_HEADER (19)

        "e_magic", "e_cblp", "e_cp","e_crlc","e_cparhdr",
        "e_minalloc","e_maxalloc","e_ss","e_sp",
        "e_csum","e_ip","e_cs","e_lfarlc","e_ovno","e_res",
        "e_oemid","e_oeminfo","e_res2","e_lfanew"
        
   #### FILE_HEADER (7)

        "Machine","NumberOfSections","CreationYear","PointerToSymbolTable",
        "NumberOfSymbols","SizeOfOptionalHeader","Characteristics"
        
   #### OPTIONAL_HEADER (29)

        "Magic", "MajorLinkerVersion", "MinorLinkerVersion", "SizeOfCode", "SizeOfInitializedData", 
        "SizeOfUninitializedData", "AddressOfEntryPoint",
        "BaseOfCode", "BaseOfData", "ImageBase", "SectionAlignment", "FileAlignment",
        "MajorOperatingSystemVersion", "MinorOperatingSystemVersion",
        "MajorImageVersion", "MinorImageVersion", "MajorSubsystemVersion",
        "MinorSubsystemVersion", "SizeOfImage", "SizeOfHeaders", "CheckSum",
        "Subsystem", "DllCharacteristics", "SizeOfStackReserve", "SizeOfStackCommit",
        "SizeOfHeapReserve", "SizeOfHeapCommit", "LoaderFlags", "NumberOfRvaAndSizes"
        

        The first field, e_magic, is the so-called magic number. 
        This field is used to identify an MS-DOS-compatible file type. 
        
  #### TARGET_VARIABLE: 
        class - 0 (benign), 1 (malware)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [3]:
SEED = 7

In [4]:
data = pd.read_csv('MalwareData.csv')
data.head()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,class
0,23117,144,3,0,4,0,65535,0,184,0,...,115397,2,33792,1048576,4096,1048576,4096,0,16,1
1,23117,144,3,0,4,0,65535,0,184,0,...,0,2,0,10485760,40960,6291456,16384,0,16,1
2,23117,144,3,0,4,0,65535,0,184,0,...,0,2,0,1048576,4096,1048576,4096,0,16,0
3,23117,144,3,0,4,0,65535,0,184,0,...,142244,2,33088,262144,8192,1048576,4096,0,16,0
4,23117,144,3,0,4,0,65535,0,184,0,...,60601,2,1024,1048576,4096,1048576,4096,0,16,0


In [5]:
data.shape

(3888, 56)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3888 entries, 0 to 3887
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   e_magic                      3888 non-null   int64  
 1   e_cblp                       3888 non-null   int64  
 2   e_cp                         3888 non-null   int64  
 3   e_crlc                       3888 non-null   int64  
 4   e_cparhdr                    3888 non-null   int64  
 5   e_minalloc                   3888 non-null   int64  
 6   e_maxalloc                   3888 non-null   int64  
 7   e_ss                         3888 non-null   int64  
 8   e_sp                         3888 non-null   int64  
 9   e_csum                       3888 non-null   int64  
 10  e_ip                         3888 non-null   int64  
 11  e_cs                         3888 non-null   int64  
 12  e_lfarlc                     3888 non-null   int64  
 13  e_ovno            

In [7]:
data.isnull().sum()

e_magic                           0
e_cblp                            0
e_cp                              0
e_crlc                            0
e_cparhdr                         0
e_minalloc                        0
e_maxalloc                        0
e_ss                              0
e_sp                              0
e_csum                            0
e_ip                              0
e_cs                              0
e_lfarlc                          0
e_ovno                            0
e_res                          3888
e_oemid                           0
e_oeminfo                         0
e_res2                         3888
e_lfanew                          0
Machine                           0
NumberOfSections                  0
CreationYear                      0
PointerToSymbolTable              0
NumberOfSymbols                   0
SizeOfOptionalHeader              0
Characteristics                   0
Magic                             0
MajorLinkerVersion          

In [8]:
corrs = pd.DataFrame(data.corr())
corrs.head()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,class
e_magic,,,,,,,,,,,...,,,,,,,,,,
e_cblp,,1.0,0.991646,,-0.160597,-0.004072,-0.184235,-0.002285,-0.004813,-0.000188,...,-0.000226,-0.002031,-0.007984,-0.001539,-0.001506,-0.00173,-0.002546,-0.00158,0.000302,0.009586
e_cp,,0.991646,1.0,,-0.092625,-0.020386,-0.108101,-0.013631,-0.015435,0.000606,...,0.000431,0.006677,-0.001415,0.005063,-0.002009,0.004193,0.003982,-0.00515,-0.000973,-0.011097
e_crlc,,,,,,,,,,,...,,,,,,,,,,
e_cparhdr,,-0.160597,-0.092625,,1.0,-0.002329,0.828384,0.001455,0.017205,0.002377,...,0.002278,0.037409,-0.038135,0.019246,0.007428,0.021682,0.024091,0.00213,-0.003817,-0.081856


In [9]:
data.drop(columns = ['e_res', 'e_res2'], inplace = True, axis = 'columns')
data.shape

(3888, 54)

In [65]:
data.describe()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,class
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,...,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,23117.0,148.252058,2.947016,0.0,3.967593,2.120885,65110.442901,16.851852,199.741255,0.001029,...,1785479.0,2.132202,9489.058899,2008231.0,10724.42,2065907.0,5694.156379,31511.87,15.972737,0.51749
std,0.0,591.610421,2.291419,0.0,0.357188,65.785975,5195.740601,1050.77749,1048.23158,0.039275,...,69063780.0,0.474741,14841.770405,4682014.0,82568.07,4397444.0,6510.114435,1342464.0,0.648187,0.499758
min,23117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23117.0,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,0.0,...,0.0,2.0,0.0,1048576.0,4096.0,1048576.0,4096.0,0.0,16.0,0.0
50%,23117.0,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,0.0,...,78941.5,2.0,0.0,1048576.0,4096.0,1048576.0,4096.0,0.0,16.0,1.0
75%,23117.0,144.0,3.0,0.0,4.0,0.0,65535.0,0.0,184.0,0.0,...,212812.2,2.0,32768.0,1048576.0,8192.0,1048576.0,4096.0,0.0,16.0,1.0
max,23117.0,37008.0,144.0,0.0,4.0,4096.0,65535.0,65520.0,65534.0,2.0,...,4294967000.0,16.0,37184.0,33554430.0,2097152.0,33554430.0,65536.0,63429510.0,16.0,1.0


In [12]:
x = data.drop('class', axis = 'columns')
y = data['class']
x.shape, y.shape

((3888, 53), (3888,))

In [13]:
ros = RandomOverSampler(random_state=42)
# fit predictor and target variable
x_ros, y_ros = ros.fit_resample(x, y)
print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({1: 2012, 0: 1876})
Resample dataset shape Counter({1: 2012, 0: 2012})


In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size = 0.1)
x_train.shape, x_test.shape

((3499, 53), (389, 53))

In [28]:
std = StandardScaler()
x_train_scaled = std.fit_transform(x_train)
x_test_scaled = std.fit_transform(x_test)


In [15]:
def GetBasedModel():
    basedModels = []
    #basedModels.append(('LR'   , LogisticRegression()))
    #basedModels.append(('LDA'  , LinearDiscriminantAnalysis()))
    basedModels.append(('KNN'  , KNeighborsClassifier(n_jobs = -1)))
    basedModels.append(('CART' , DecisionTreeClassifier()))
    #basedModels.append(('NB'   , GaussianNB()))
    #basedModels.append(('SVM'  , SVC(probability=True)))
    basedModels.append(('AB'   , AdaBoostClassifier()))
    basedModels.append(('GBM'  , GradientBoostingClassifier()))
    basedModels.append(('RF'   , RandomForestClassifier(n_jobs = -1)))
    basedModels.append(('ET'   , ExtraTreesClassifier(n_jobs = -1)))

    
    return basedModels

In [16]:
def BasedLine2(X_train, y_train,models):
    # Test options and evaluation metric
    num_folds = 5
    scoring = 'accuracy'

    results = []
    names = []
    for name, model in models:
        kfold = StratifiedKFold(n_splits=num_folds, random_state=SEED)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1', n_jobs = -1, verbose = 3)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        
    return names, results

In [61]:
models = GetBasedModel()
names,results = BasedLine2(x_ros, y_ros,models)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.8s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


KNN: 0.934598 (0.007310)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.3s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


CART: 0.968225 (0.006969)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


AB: 0.966334 (0.002780)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.6s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


GBM: 0.977216 (0.003001)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


RF: 0.984198 (0.003806)
ET: 0.981216 (0.005487)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [64]:
results[2].max()

0.9715698393077874

In [74]:
models = GetBasedModel()
names,results = BasedLine2(x_train, y_train,models)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


KNN: 0.935078 (0.017047)
CART: 0.966297 (0.006456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


AB: 0.966981 (0.007376)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


GBM: 0.973569 (0.008667)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


RF: 0.982685 (0.001791)
ET: 0.980457 (0.005006)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [50]:
rf_tuner = RandomForestClassifier(n_jobs = -1)
param_grid = { 
    'n_estimators': [500, 600, 800],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8, None],
    'criterion' :['gini', 'entropy'],
    'min_samples_split':[1, 2, 3],
    'min_samples_leaf':[1, 2, 3],
    'bootstrap':[True, False]
}
CV_rfc.fit(x_ros, y_ros)
CV_rfc.best_score_




Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.0s finished


0.9836003213744939

In [51]:
CV_rfc.best_params_

{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 3,
 'max_features': 'sqrt',
 'max_depth': None,
 'criterion': 'entropy',
 'bootstrap': False}

In [19]:
rfc_tuned = RandomForestClassifier(n_estimators= 500,
 min_samples_split= 5,
 min_samples_leaf= 1,
 max_features= 'log2',
 max_depth=None,
 criterion= 'entropy',
 bootstrap=False)
rfc_tuned.fit(x_train, y_train)
rfc_tuned_pred = rfc_tuned.predict(x_test)
rfc_report = classification_report(y_test, rfc_tuned_pred)
print(rfc_report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       187
           1       0.98      1.00      0.99       202

    accuracy                           0.99       389
   macro avg       0.99      0.99      0.99       389
weighted avg       0.99      0.99      0.99       389



In [20]:


et = ExtraTreesClassifier(n_jobs = -1)
et.fit(x_train, y_train)
et_pred = et.predict(x_test)
et_report = classification_report(y_test, et_pred)
print(et_report)


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       187
           1       0.98      1.00      0.99       202

    accuracy                           0.99       389
   macro avg       0.99      0.99      0.99       389
weighted avg       0.99      0.99      0.99       389



In [60]:
gb = GradientBoostingClassifier()
gb.fit(x_ros, y_ros)
gb_pred = gb.predict(x_test)
gb_report = classification_report(y_test, et_pred)
print(et_report)


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       187
           1       0.98      1.00      0.99       202

    accuracy                           0.99       389
   macro avg       0.99      0.99      0.99       389
weighted avg       0.99      0.99      0.99       389



In [52]:
gb.fit(x_ros, y_ros)
et.fit(x_ros, y_ros)

ExtraTreesClassifier(n_jobs=-1)

In [53]:
voting_clf = VotingClassifier(estimators=[('RFC', rfc_tuned), ('ETC', et),('GBC', gb) ], voting='hard', n_jobs = -1)
voting_clf.fit(x_ros, y_ros)
preds = voting_clf.predict(x_test)
acc = accuracy_score(y_test, preds)
#l_loss = log_loss(y_test, preds)
f1 = f1_score(y_test, preds)

In [54]:
f1

1.0

In [43]:
rfc_tuned.fit(x_ros, y_ros)


RandomForestClassifier(bootstrap=False, criterion='entropy',
                       max_features='log2', min_samples_split=5,
                       n_estimators=500)

In [28]:
test = pd.read_csv('test.csv')
test.isnull().sum()

e_magic                           0
e_cblp                            0
e_cp                              0
e_crlc                            0
e_cparhdr                         0
e_minalloc                        0
e_maxalloc                        0
e_ss                              0
e_sp                              0
e_csum                            0
e_ip                              0
e_cs                              0
e_lfarlc                          0
e_ovno                            0
e_res                          1296
e_oemid                           0
e_oeminfo                         0
e_res2                         1296
e_lfanew                          0
Machine                           0
NumberOfSections                  0
CreationYear                      0
PointerToSymbolTable              0
NumberOfSymbols                   0
SizeOfOptionalHeader              0
Characteristics                   0
Magic                             0
MajorLinkerVersion          

In [29]:
test.drop(columns = ['e_res', 'e_res2'], inplace = True, axis = 'columns')
test.shape

(1296, 53)

In [55]:
preds = voting_clf.predict(test)

In [44]:
preds = rfc_tuned.predict(test)

In [56]:
predictions = pd.DataFrame({'prediction':preds})
predictions.to_csv('preds.csv', index = False)

In [57]:
gbc = GradientBoostingClassifier()
parameters = {
    'n_estimators': [80, 90, 100, 125, 150, 200, 250, 300, 350, 400],
    'max_depth': [2,3,4,5,8,16,None],
    'learning_rate': [0.03, 0.1, 0.3, 0.5, 0.8]
}
gbm_cv = RandomizedSearchCV(gbc, parameters, cv=5, n_jobs = -1, verbose = 2)
gbm_cv.fit(x_ros, y_ros)
gbm_cv.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   19.1s finished


0.985090077562498

In [58]:
gbm_cv.best_params_

{'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.1}