In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import pickle
import os
from  xml.dom import minidom
import xmltodict
import json



In [33]:
df = pd.read_csv("Dataset/Ransomware.csv")
df.dropna(how = 'any' , inplace = True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62485 entries, 0 to 62484
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   FileName            62485 non-null  object
 1   md5Hash             62485 non-null  object
 2   Machine             62485 non-null  int64 
 3   DebugSize           62485 non-null  int64 
 4   DebugRVA            62485 non-null  int64 
 5   MajorImageVersion   62485 non-null  int64 
 6   MajorOSVersion      62485 non-null  int64 
 7   ExportRVA           62485 non-null  int64 
 8   ExportSize          62485 non-null  int64 
 9   IatVRA              62485 non-null  int64 
 10  MajorLinkerVersion  62485 non-null  int64 
 11  MinorLinkerVersion  62485 non-null  int64 
 12  NumberOfSections    62485 non-null  int64 
 13  SizeOfStackReserve  62485 non-null  int64 
 14  DllCharacteristics  62485 non-null  int64 
 15  ResourceSize        62485 non-null  int64 
 16  BitcoinAddresses    62

In [34]:
df.columns

Index(['FileName', 'md5Hash', 'Machine', 'DebugSize', 'DebugRVA',
       'MajorImageVersion', 'MajorOSVersion', 'ExportRVA', 'ExportSize',
       'IatVRA', 'MajorLinkerVersion', 'MinorLinkerVersion',
       'NumberOfSections', 'SizeOfStackReserve', 'DllCharacteristics',
       'ResourceSize', 'BitcoinAddresses', 'Benign'],
      dtype='object')

In [35]:
df_features = df [['DebugSize','ExportSize','IatVRA','SizeOfStackReserve','DllCharacteristics', 'MajorLinkerVersion', 'MinorLinkerVersion', 'NumberOfSections','ResourceSize','DebugRVA','ExportRVA']]
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62485 entries, 0 to 62484
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   DebugSize           62485 non-null  int64
 1   ExportSize          62485 non-null  int64
 2   IatVRA              62485 non-null  int64
 3   SizeOfStackReserve  62485 non-null  int64
 4   DllCharacteristics  62485 non-null  int64
 5   MajorLinkerVersion  62485 non-null  int64
 6   MinorLinkerVersion  62485 non-null  int64
 7   NumberOfSections    62485 non-null  int64
 8   ResourceSize        62485 non-null  int64
 9   DebugRVA            62485 non-null  int64
 10  ExportRVA           62485 non-null  int64
dtypes: int64(11)
memory usage: 5.2 MB


In [36]:
df_result = df["Benign"]
#Convert target column value to int (27118 good(1) and 35367 infrcted(3))
df_result.value_counts()

0    35367
1    27118
Name: Benign, dtype: int64

In [37]:
feature_train, feature_test, result_train, result_test = train_test_split(df_features, df_result, test_size=0.20, random_state=42)
print(f'feature_train : {feature_train.shape}')
print(f'feature_test : {feature_test.shape}')
print(f'result_train : {result_train.shape}')
print(f'result_test : {result_test.shape}')

feature_train : (49988, 11)
feature_test : (12497, 11)
result_train : (49988,)
result_test : (12497,)


In [38]:
#Generate Hyperparameter lists for tuning

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt', None]
# Maximum number of levels in tree
max_depth = [2,5,8,11]
# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [False]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['log2', 'sqrt', None], 'max_depth': [2, 5, 8, 11], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [False]}


In [39]:
rfc_Model = RandomForestClassifier()
rfc_Grid = GridSearchCV(estimator = rfc_Model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 5)
rfc_Grid.fit(feature_train , result_train )
rfc_Grid.best_params_

Fitting 3 folds for each of 960 candidates, totalling 2880 fits


{'bootstrap': False,
 'max_depth': 11,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 10}

In [40]:
rfc_Grid.score(feature_test,result_test)

0.9930383291990078

In [41]:
classifier= RandomForestClassifier(bootstrap= rfc_Grid.best_params_["bootstrap"],
 max_depth= rfc_Grid.best_params_["max_depth"],
 max_features= rfc_Grid.best_params_["max_features"],
 min_samples_leaf= rfc_Grid.best_params_["min_samples_leaf"],
 min_samples_split= rfc_Grid.best_params_["min_samples_split"],
 n_estimators= rfc_Grid.best_params_["n_estimators"])  
RansomwareDetector_RandomForest = classifier.fit(feature_train,result_train)  


In [42]:
with open("Models/RansomwareDetector_RandomForest",'wb') as f:
    pickle.dump(RansomwareDetector_RandomForest,f)

In [1]:
import pefile
import pandas as pd

pe = pefile.PE("protoc-gen-grpc-web-1.5.0-windows-x86_64.exe", fast_load=True)



features ={}
# Version of linker that produced file.
features = features|{"DebugSize": [pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].Size]}
features = features|{"ExportSize": [pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].Size]}
features = features|{"IatVRA":[pe.OPTIONAL_HEADER.DATA_DIRECTORY[12].VirtualAddress]}
features = features|{"SizeOfStackReserve": [pe.OPTIONAL_HEADER.SizeOfStackReserve]}
features = features|{"DllCharacteristics": [pe.OPTIONAL_HEADER.DllCharacteristics]}
#features = features|{"BitcoinAddresses": [pe.OPTIONAL_HEADER.DllCharacteristics]}
features = features|{"MajorLinkerVersion": [pe.OPTIONAL_HEADER.MajorLinkerVersion]}
features = features|{"MinorLinkerVersion": [pe.OPTIONAL_HEADER.MinorLinkerVersion]}
features = features|{"NumberOfSections": [pe.FILE_HEADER.NumberOfSections]}
features = features|{"ResourceSize": [pe.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size]}
features = features|{"DebugRVA": [pe.OPTIONAL_HEADER.DATA_DIRECTORY[6].VirtualAddress]}
features = features|{"ExportRVA": [pe.OPTIONAL_HEADER.DATA_DIRECTORY[0].VirtualAddress]}
print(features)
df_input_predict = pd.DataFrame(features)



print(df_input_predict)

{'DebugSize': [84], 'ExportSize': [84], 'IatVRA': [1990656], 'SizeOfStackReserve': [1048576], 'DllCharacteristics': [33120], 'MajorLinkerVersion': [14], 'MinorLinkerVersion': [37], 'NumberOfSections': [6], 'ResourceSize': [0], 'DebugRVA': [2423776], 'ExportRVA': [0]}
   DebugSize  ExportSize   IatVRA  SizeOfStackReserve  DllCharacteristics  \
0         84          84  1990656             1048576               33120   

   MajorLinkerVersion  MinorLinkerVersion  NumberOfSections  ResourceSize  \
0                  14                  37                 6             0   

   DebugRVA  ExportRVA  
0   2423776          0  


In [5]:
with open("Models/RansomwareDetector_RandomForest",'rb') as f:
    model = pickle.load(f)
result = model.predict(df_input_predict)
print(result)

[1]
