## Ransomware random forest ensemble

Uses dataset with 20 features selected from random forest

In [140]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
%matplotlib inline
import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
dataSet = pd.read_csv('dataset/ransomware_rf.csv', encoding = 'unicode_escape')
dataSet = dataSet.iloc[:, 1:]
dataSet.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
0,0,0.662169,-0.230795,-0.368331,-0.499224,-0.479053,-0.143198,-0.128846,1.377379,-0.223468,...,-0.349775,-0.422875,-0.393004,-0.068145,-0.193024,-0.3589,-0.40636,-0.335907,1.329492,Benign
1,0,0.662169,-0.230743,-0.369939,-0.501952,-0.485911,-0.035761,-0.009968,-0.704224,-0.223433,...,-0.35239,-0.425028,-0.398424,-0.042102,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
2,0,-0.238853,-0.23078,-0.369972,-0.501965,-0.485922,0.175303,0.221401,-0.70774,-0.223467,...,-0.35242,-0.425044,-0.398436,0.074325,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
3,0,0.90328,-0.224925,-0.3647,-0.499898,-0.484241,-0.143935,-0.129112,-0.705722,-0.223484,...,-0.352436,-0.425053,-0.398442,-0.101575,-0.18863,-0.3589,-0.40879,-0.33805,-0.505671,Benign
4,0,0.330493,-0.23079,-0.367863,-0.498489,-0.477793,-0.143578,-0.128975,1.377379,-0.223459,...,-0.349157,-0.42001,-0.39174,-0.094055,-0.19314,-0.3589,-0.405693,-0.333292,-0.044641,Benign


Train test split

In [142]:
train_ratio = 0.9
n_samples = dataSet.shape[0]
indices = np.random.permutation(n_samples)

train_size = int(train_ratio * n_samples)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

train_data = dataSet.iloc[train_indices]
test_data = dataSet.iloc[test_indices]


In [143]:
def split_Xy(df):
    X = df.drop('Label', axis=1)
    y = df['Label']
    return X, y

In [144]:
ransom_types = dataSet['Label'].value_counts()
ransom_list = list(ransom_types.keys())
print(ransom_list)


['SVpeng', 'PornDroid', 'Koler', 'Benign', 'RansomBO', 'Charger', 'Simplocker', 'WannaLocker', 'Jisut', 'Lockerpin', 'Pletor']


### Binary classification decision tree

**Decision tree classifier**

_Creates binary classification dataset for each class an trains a decision tree_

In [145]:
class RandForestBinaryClassifier:
    def __init__(self):
        self.label = 'Benign'
        self.dtc = RandomForestClassifier(n_estimators=15, random_state=42, min_samples_split=3)
        self.df = None

    def split_Xy(self, df):
        X = df.drop('Label', axis=1)
        y = df['Label']
        return X, y
    
    def modify_and_balance_labels(self, df):
        base = self.label
        df['Label'] = df['Label'].apply(lambda x: 0 if x == base else 1)

        num_label_1 = df[df['Label'] == 1].shape[0]
        num_label_0 = df[df['Label'] == 0].shape[0]

        if num_label_1 > num_label_0:
            indices_to_remove = random.sample(df[df['Label'] == 1].index.tolist(), num_label_1 - num_label_0)
            df = df.drop(indices_to_remove)

        return df

    def set_binary_label(self, df, label):
        self.label = label
        self.df = df.copy()
        self.df = self.modify_and_balance_labels(self.df)

    def train_tree(self):
        X_train, y_train = self.split_Xy(self.df)
        self.dtc.fit(X_train,y_train)

    def predict_probability(self, X_test):
        return self.dtc.predict_proba(X_test)[:,1]

    def test_model_accuracy(self, test_data):
        td = test_data.copy()
        X_test, y_test = self.split_Xy(
            self.modify_and_balance_labels(td)
        )
        y_pred = self.dtc.predict(X_test)
        probs = self.dtc.predict_proba(X_test)
        certainty = np.max(probs,axis=1)
        mean = np.mean(certainty)
        std = np.std(certainty)
        accuracy = np.mean(y_test == y_pred)

        return accuracy,mean,std
    
    def save_dataset(self):
        self.df.to_csv(f"{self.label}_balanced_ransomware.csv")


**Create decision tree for each malware class**

In [146]:
dtB = RandForestBinaryClassifier()
dtB.set_binary_label(train_data,'Pletor')
dtB.train_tree()
accuracy,mean,std = dtB.test_model_accuracy(test_data)

print(f"Accuracy: {accuracy * 100:.2f}% | Certainty: {mean* 100:.2f} ±{std* 100:.2f} ")

Accuracy: 92.30% | Certainty: 91.88 ±13.15 


In [147]:
tree_ensemble = {}

for label in ransom_list:
    tree_ensemble[label] = RandForestBinaryClassifier()
    dtB = tree_ensemble[label]
    dtB.set_binary_label(train_data,label)
    dtB.train_tree()
    accuracy,mean,std = tree_ensemble[label].test_model_accuracy(test_data)
    print(f"Accuracy for {label}: {accuracy * 100:.2f}% | Certainty: {mean* 100:.2f} ±{std* 100:.2f} ")


Accuracy for SVpeng: 96.78% | Certainty: 96.21 ±9.50 


KeyboardInterrupt: 

### Unified model with random forests

In [None]:
ransom_predictions = {}
for label in ransom_list:
    ransom_predictions[label] = [] 

In [None]:
def predict_for_row(X_test_row):
    prediction = 'Benign'
    for label in ransom_list:
        ans = tree_ensemble[label].predict_probability(X_test_row.to_frame().T)
        ransom_predictions[label].append(ans[0])
        if ans[0] == 1:
            prediction = label

    return prediction

In [None]:
def traverse_tree_ensemble(X_test):
    y_pred = X_test.apply(predict_for_row,axis=1)

    return y_pred

In [None]:
pred_df = pd.DataFrame(columns=ransom_types)

X_test, y_test = split_Xy(test_data)

y_pred = traverse_tree_ensemble(X_test)

accuracy = np.mean(y_test == y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 0.75%


In [None]:
new_df = pd.DataFrame(ransom_predictions)
new_df.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor
0,1.0,1.0,1.0,0.933333,0.066667,1.0,0.066667,0.311111,1.0,1.0,0.933333
1,1.0,1.0,1.0,0.333333,0.111111,1.0,0.0,0.1,1.0,1.0,0.622222
2,1.0,0.466667,0.477778,0.455556,1.0,0.166667,0.833333,1.0,0.466667,0.466667,0.8
3,1.0,1.0,0.933333,0.8,0.233333,0.933333,0.155556,0.066667,0.933333,0.866667,0.311111
4,1.0,1.0,1.0,0.8,0.066667,1.0,0.066667,0.155556,1.0,1.0,1.0


In [None]:
new_df['Label'] = y_test.reset_index(drop=True)
new_df.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1.0,1.0,1.0,0.933333,0.066667,1.0,0.066667,0.311111,1.0,1.0,0.933333,Simplocker
1,1.0,1.0,1.0,0.333333,0.111111,1.0,0.0,0.1,1.0,1.0,0.622222,WannaLocker
2,1.0,0.466667,0.477778,0.455556,1.0,0.166667,0.833333,1.0,0.466667,0.466667,0.8,Charger
3,1.0,1.0,0.933333,0.8,0.233333,0.933333,0.155556,0.066667,0.933333,0.866667,0.311111,Simplocker
4,1.0,1.0,1.0,0.8,0.066667,1.0,0.066667,0.155556,1.0,1.0,1.0,WannaLocker


In [None]:
y_test.head()

296049     Simplocker
380978    WannaLocker
58639         Charger
284541     Simplocker
388250    WannaLocker
Name: Label, dtype: object

In [None]:
new_df.to_csv('ransom_forest_class.csv')

In [None]:
new_df.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1.0,1.0,1.0,0.933333,0.066667,1.0,0.066667,0.311111,1.0,1.0,0.933333,Simplocker
1,1.0,1.0,1.0,0.333333,0.111111,1.0,0.0,0.1,1.0,1.0,0.622222,WannaLocker
2,1.0,0.466667,0.477778,0.455556,1.0,0.166667,0.833333,1.0,0.466667,0.466667,0.8,Charger
3,1.0,1.0,0.933333,0.8,0.233333,0.933333,0.155556,0.066667,0.933333,0.866667,0.311111,Simplocker
4,1.0,1.0,1.0,0.8,0.066667,1.0,0.066667,0.155556,1.0,1.0,1.0,WannaLocker


In [None]:
y_test.head()

296049     Simplocker
380978    WannaLocker
58639         Charger
284541     Simplocker
388250    WannaLocker
Name: Label, dtype: object

In [None]:
boolpreds = pd.read_csv('ransom_forest_class.csv', encoding = 'unicode_escape')
boolpreds = boolpreds.iloc[:, 1:]
boolpreds.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1.0,1.0,1.0,0.933333,0.066667,1.0,0.066667,0.311111,1.0,1.0,0.933333,Simplocker
1,1.0,1.0,1.0,0.333333,0.111111,1.0,0.0,0.1,1.0,1.0,0.622222,WannaLocker
2,1.0,0.466667,0.477778,0.455556,1.0,0.166667,0.833333,1.0,0.466667,0.466667,0.8,Charger
3,1.0,1.0,0.933333,0.8,0.233333,0.933333,0.155556,0.066667,0.933333,0.866667,0.311111,Simplocker
4,1.0,1.0,1.0,0.8,0.066667,1.0,0.066667,0.155556,1.0,1.0,1.0,WannaLocker


In [None]:
train_ratio = 0.8
n_samples = boolpreds.shape[0]
indices = np.random.permutation(n_samples)

train_size = int(train_ratio * n_samples)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

train_preds = boolpreds.iloc[train_indices]
test_preds = boolpreds.iloc[test_indices]

In [None]:
X_train, y_train = split_Xy(train_preds)
dtc = DecisionTreeClassifier(max_depth=50,random_state=42)
dtc.fit(X_train,y_train)

In [None]:
X_test, y_test = split_Xy(test_preds)

y_pred = dtc.predict(X_test)

accuracy = np.mean(y_test == y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 37.41%


Adding binary tree predictions to std dataset

In [None]:
stock_data = test_data.reset_index(drop=True)
stock_data.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
0,337,0.016232,-0.230793,-0.351246,-0.48516,-0.456046,-0.14479,-0.129581,1.377379,-0.215504,...,-0.329183,-0.403445,-0.379433,-0.101714,-0.18759,-0.382595,-0.382795,-0.308781,-0.288983,Simplocker
1,337,0.926284,-0.230798,-0.108085,3.129296,4.023403,-0.144953,-0.129671,1.377379,-0.223483,...,-0.180692,1.980268,1.426782,-0.102186,-0.218692,-0.3589,1.23964,0.762593,0.834839,WannaLocker
2,3,-2.056377,-0.230395,-0.369626,-0.501829,-0.485811,-0.129989,-0.121468,-0.754196,-0.223484,...,-0.352436,-0.425053,-0.398442,-0.10234,-0.073589,2.136308,-0.40879,-0.33805,-0.658338,Charger
3,337,0.690978,-0.230793,-0.314513,0.395174,0.451535,-0.144914,-0.129647,1.377379,-0.223286,...,-0.256361,0.722368,0.701142,-0.102212,-0.230022,2.22443,-0.041062,0.165149,-0.394227,Simplocker
4,337,0.29416,-0.225299,1.1309,0.672928,0.471166,-0.144961,-0.129673,-0.704192,-0.218376,...,1.020774,1.077608,0.724167,-0.10234,-0.233953,-0.3589,2.067711,2.138695,-0.658338,WannaLocker


In [None]:
pred_data = new_df.reset_index(drop=True)
pred_data.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1.0,1.0,1.0,0.933333,0.066667,1.0,0.066667,0.311111,1.0,1.0,0.933333,Simplocker
1,1.0,1.0,1.0,0.333333,0.111111,1.0,0.0,0.1,1.0,1.0,0.622222,WannaLocker
2,1.0,0.466667,0.477778,0.455556,1.0,0.166667,0.833333,1.0,0.466667,0.466667,0.8,Charger
3,1.0,1.0,0.933333,0.8,0.233333,0.933333,0.155556,0.066667,0.933333,0.866667,0.311111,Simplocker
4,1.0,1.0,1.0,0.8,0.066667,1.0,0.066667,0.155556,1.0,1.0,1.0,WannaLocker


In [None]:
full_df = pd.concat([stock_data,pred_data],axis=1)

In [None]:
full_df = full_df.groupby(level=0, axis=1).first()
full_df['Label']

0         Simplocker
1        WannaLocker
2            Charger
3         Simplocker
4        WannaLocker
            ...     
39197          Koler
39198       RansomBO
39199         Benign
39200         SVpeng
39201      PornDroid
Name: Label, Length: 39202, dtype: object

In [None]:
full_df.to_csv('ransom_forest_pred_meta.csv')