## Ransomware decision tree ensemble

Uses dataset with 20 features selected from random forest

In [359]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
%matplotlib inline
import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
dataSet = pd.read_csv('dataset/ransomware_rf.csv', encoding = 'unicode_escape')
dataSet = dataSet.iloc[:, 1:]
dataSet.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
0,0,0.662169,-0.230795,-0.368331,-0.499224,-0.479053,-0.143198,-0.128846,1.377379,-0.223468,...,-0.349775,-0.422875,-0.393004,-0.068145,-0.193024,-0.3589,-0.40636,-0.335907,1.329492,Benign
1,0,0.662169,-0.230743,-0.369939,-0.501952,-0.485911,-0.035761,-0.009968,-0.704224,-0.223433,...,-0.35239,-0.425028,-0.398424,-0.042102,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
2,0,-0.238853,-0.23078,-0.369972,-0.501965,-0.485922,0.175303,0.221401,-0.70774,-0.223467,...,-0.35242,-0.425044,-0.398436,0.074325,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
3,0,0.90328,-0.224925,-0.3647,-0.499898,-0.484241,-0.143935,-0.129112,-0.705722,-0.223484,...,-0.352436,-0.425053,-0.398442,-0.101575,-0.18863,-0.3589,-0.40879,-0.33805,-0.505671,Benign
4,0,0.330493,-0.23079,-0.367863,-0.498489,-0.477793,-0.143578,-0.128975,1.377379,-0.223459,...,-0.349157,-0.42001,-0.39174,-0.094055,-0.19314,-0.3589,-0.405693,-0.333292,-0.044641,Benign


Train test split

In [361]:
train_ratio = 0.9
n_samples = dataSet.shape[0]
indices = np.random.permutation(n_samples)

train_size = int(train_ratio * n_samples)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

train_data = dataSet.iloc[train_indices]
test_data = dataSet.iloc[test_indices]

train_data.shape, test_data.shape


((352817, 21), (39202, 21))

In [362]:
def split_Xy(df):
    X = df.drop('Label', axis=1)
    y = df['Label']
    return X, y

In [363]:
ransom_types = dataSet['Label'].value_counts()
ransom_list = list(ransom_types.keys())
print(ransom_list)


['SVpeng', 'PornDroid', 'Koler', 'Benign', 'RansomBO', 'Charger', 'Simplocker', 'WannaLocker', 'Jisut', 'Lockerpin', 'Pletor']


### Binary classification decision tree

**Decision tree classifier**

_Creates binary classification dataset for each class an trains a decision tree_

In [364]:
class DecisionTreeBinaryClassifier:
    def __init__(self):
        self.label = 'Benign'
        self.dtc = DecisionTreeClassifier(max_depth=20,random_state=42)
        self.df = None

    def split_Xy(self, df):
        X = df.drop('Label', axis=1)
        y = df['Label']
        return X, y
    
    def modify_and_balance_labels(self, df):
        base = self.label
        df['Label'] = df['Label'].apply(lambda x: 0 if x == base else 1)

        num_label_1 = df[df['Label'] == 1].shape[0]
        num_label_0 = df[df['Label'] == 0].shape[0]

        if num_label_1 > num_label_0:
            indices_to_remove = random.sample(df[df['Label'] == 1].index.tolist(), num_label_1 - num_label_0)
            df = df.drop(indices_to_remove)

        return df

    def set_binary_label(self, df, label):
        self.label = label
        self.df = df.copy()
        self.df = self.modify_and_balance_labels(self.df)

    def train_tree(self):
        X_train, y_train = self.split_Xy(self.df)
        self.dtc.fit(X_train,y_train)

    def predict(self, X_test):
        return self.dtc.predict(X_test)

    def test_model_accuracy(self, test_data):
        td = test_data.copy()
        X_test, y_test = self.split_Xy(
            self.modify_and_balance_labels(td)
        )
        y_pred = self.predict(X_test)

        return np.mean(y_test == y_pred) # accuracy
    
    def save_dataset(self):
        self.df.to_csv(f"{self.label}_balanced_ransomware.csv")


**Create decision tree for each malware class**

In [365]:
dtB = DecisionTreeBinaryClassifier()
dtB.set_binary_label(train_data,'Benign')
dtB.train_tree()
accuracy = dtB.test_model_accuracy(test_data)

print(f"Accuracy: {accuracy * 100:.2f}%")

test_data.head()

Accuracy: 74.49%


Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
222055,0,0.596382,-0.230761,-0.369706,-0.501777,-0.485656,-0.132171,-0.115652,-0.710051,-0.223449,...,-0.352176,-0.424803,-0.398124,-0.10234,-0.233953,-0.382595,-0.408516,-0.337776,-0.658338,PornDroid
155433,0,0.783584,0.600242,0.378305,-0.208605,-0.24734,-0.144957,-0.129668,-0.670302,0.548378,...,0.332201,-0.049846,-0.118594,-0.10234,-0.233953,-0.3589,-0.40879,-0.33805,-0.658338,Lockerpin
119187,0,0.72758,-0.230797,-0.238919,0.51593,0.391678,-0.144942,-0.129663,1.377379,-0.223466,...,-0.123497,0.876812,0.630932,-0.102119,0.272575,-0.3589,0.25262,0.573083,0.387047,Koler
209802,0,0.223805,-0.23078,-0.369972,-0.501965,-0.485922,0.178017,0.224376,-0.756117,-0.223467,...,-0.35242,-0.425044,-0.398436,-0.10234,-0.233953,-0.3589,-0.40879,-0.33805,-0.658338,PornDroid
69190,0,1.194645,-0.230797,-0.077684,4.254091,3.521639,-0.144954,-0.129672,1.377379,-0.223483,...,-0.327401,-0.253546,-0.234716,-0.102167,0.21569,-0.3589,1.755835,-0.239483,1.209252,Charger


In [366]:
tree_ensemble = {}

for label in ransom_list:
    tree_ensemble[label] = DecisionTreeBinaryClassifier()
    dtB = tree_ensemble[label]
    dtB.set_binary_label(train_data,label)
    dtB.train_tree()
    accuracy = tree_ensemble[label].test_model_accuracy(test_data)
    print(f"Accuracy for {label}: {accuracy * 100:.2f}%")

# for label in ransom_list:
#     accuracy = tree_ensemble[label].test_model_accuracy(test_data)
#     print(f"Accuracy for {label}: {accuracy * 100:.2f}%")

Accuracy for SVpeng: 96.73%
Accuracy for PornDroid: 76.42%
Accuracy for Koler: 75.70%
Accuracy for Benign: 74.45%
Accuracy for RansomBO: 84.19%
Accuracy for Charger: 75.83%
Accuracy for Simplocker: 83.76%
Accuracy for WannaLocker: 83.00%
Accuracy for Jisut: 75.69%
Accuracy for Lockerpin: 74.27%
Accuracy for Pletor: 93.15%


### Unified model with decision trees

In [367]:
ransom_predictions = {}
for label in ransom_list:
    ransom_predictions[label] = [] 

In [368]:
def predict_for_row(X_test_row):
    prediction = 'Benign'
    for label in ransom_list:
        ans = tree_ensemble[label].predict(X_test_row.to_frame().T)
        ransom_predictions[label].append(ans[0])
        if ans[0] == 1:
            prediction = label

    return prediction

In [369]:
def traverse_tree_ensemble(X_test):
    y_pred = X_test.apply(predict_for_row,axis=1)

    return y_pred

In [370]:
pred_df = pd.DataFrame(columns=ransom_types)

X_test, y_test = split_Xy(test_data)

y_pred = traverse_tree_ensemble(X_test)

accuracy = np.mean(y_test == y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 0.19%


In [371]:
new_df = pd.DataFrame(ransom_predictions)
new_df.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor
0,1,0,1,1,1,1,1,1,1,1,1
1,1,0,0,1,1,0,1,1,0,0,1
2,1,1,0,0,1,0,1,1,1,0,1
3,1,0,0,1,1,0,1,1,0,0,1
4,1,0,0,1,1,1,1,1,0,1,1


In [372]:
new_df['Label'] = y_test.reset_index(drop=True)
new_df.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1,0,1,1,1,1,1,1,1,1,1,PornDroid
1,1,0,0,1,1,0,1,1,0,0,1,Lockerpin
2,1,1,0,0,1,0,1,1,1,0,1,Koler
3,1,0,0,1,1,0,1,1,0,0,1,PornDroid
4,1,0,0,1,1,1,1,1,0,1,1,Charger


In [373]:
new_df.to_csv('ransomware_tree_classes.csv')

In [375]:
boolpreds = pd.read_csv('ransomware_tree_classes.csv', encoding = 'unicode_escape')
boolpreds = boolpreds.iloc[:, 1:]
boolpreds.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1,0,1,1,1,1,1,1,1,1,1,PornDroid
1,1,0,0,1,1,0,1,1,0,0,1,Lockerpin
2,1,1,0,0,1,0,1,1,1,0,1,Koler
3,1,0,0,1,1,0,1,1,0,0,1,PornDroid
4,1,0,0,1,1,1,1,1,0,1,1,Charger


In [376]:
train_ratio = 0.5
n_samples = boolpreds.shape[0]
indices = np.random.permutation(n_samples)

train_size = int(train_ratio * n_samples)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

train_preds = boolpreds.iloc[train_indices]
test_preds = boolpreds.iloc[test_indices]

In [377]:
X_train, y_train = split_Xy(train_preds)
dtc = DecisionTreeClassifier(max_depth=50,random_state=42)
dtc.fit(X_train,y_train)

In [378]:
X_test, y_test = split_Xy(test_preds)

y_pred = dtc.predict(X_test)

accuracy = np.mean(y_test == y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 38.66%


Adding binary tree predictions to std dataset

In [379]:
stock_data = test_data.reset_index(drop=True)
stock_data.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
0,0,0.596382,-0.230761,-0.369706,-0.501777,-0.485656,-0.132171,-0.115652,-0.710051,-0.223449,...,-0.352176,-0.424803,-0.398124,-0.10234,-0.233953,-0.382595,-0.408516,-0.337776,-0.658338,PornDroid
1,0,0.783584,0.600242,0.378305,-0.208605,-0.24734,-0.144957,-0.129668,-0.670302,0.548378,...,0.332201,-0.049846,-0.118594,-0.10234,-0.233953,-0.3589,-0.40879,-0.33805,-0.658338,Lockerpin
2,0,0.72758,-0.230797,-0.238919,0.51593,0.391678,-0.144942,-0.129663,1.377379,-0.223466,...,-0.123497,0.876812,0.630932,-0.102119,0.272575,-0.3589,0.25262,0.573083,0.387047,Koler
3,0,0.223805,-0.23078,-0.369972,-0.501965,-0.485922,0.178017,0.224376,-0.756117,-0.223467,...,-0.35242,-0.425044,-0.398436,-0.10234,-0.233953,-0.3589,-0.40879,-0.33805,-0.658338,PornDroid
4,0,1.194645,-0.230797,-0.077684,4.254091,3.521639,-0.144954,-0.129672,1.377379,-0.223483,...,-0.327401,-0.253546,-0.234716,-0.102167,0.21569,-0.3589,1.755835,-0.239483,1.209252,Charger


In [380]:
pred_data = new_df.reset_index(drop=True)
pred_data.head()

Unnamed: 0,SVpeng,PornDroid,Koler,Benign,RansomBO,Charger,Simplocker,WannaLocker,Jisut,Lockerpin,Pletor,Label
0,1,0,1,1,1,1,1,1,1,1,1,PornDroid
1,1,0,0,1,1,0,1,1,0,0,1,Lockerpin
2,1,1,0,0,1,0,1,1,1,0,1,Koler
3,1,0,0,1,1,0,1,1,0,0,1,PornDroid
4,1,0,0,1,1,1,1,1,0,1,1,Charger


In [381]:
full_df = pd.concat([stock_data,pred_data],axis=1)

In [382]:
full_df = full_df.groupby(level=0, axis=1).first()
full_df['Label']

0          PornDroid
1          Lockerpin
2              Koler
3          PornDroid
4            Charger
            ...     
39197      Lockerpin
39198         Benign
39199      Lockerpin
39200    WannaLocker
39201    WannaLocker
Name: Label, Length: 39202, dtype: object

In [383]:
full_df.to_csv('ransom_pred_meta.csv')