In [861]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
import random
%matplotlib inline
import warnings
warnings.filterwarnings(action = 'ignore')

In [896]:
# Load the flow-level CSV and discard its first column in one step.
dataSet = (
    pd.read_csv('dataset/ransom_with_ip.csv', encoding = 'unicode_escape')
    .iloc[:, 1:]
)

# Alternative column selections kept for reference (disabled):
# dataSet = dataSet.drop(['Flow ID',' Timestamp'],axis=1)
# dataSet = pd.concat([dataSet[' Timestamp'],dataSet['Label'],dataSet['Flow ID']],axis=1)
dataSet.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
0,0,0.662169,-0.230795,-0.368331,-0.499224,-0.479053,-0.143198,-0.128846,1.377379,-0.223468,...,-0.349775,-0.422875,-0.393004,-0.068145,-0.193024,-0.3589,-0.40636,-0.335907,1.329492,Benign
1,0,0.662169,-0.230743,-0.369939,-0.501952,-0.485911,-0.035761,-0.009968,-0.704224,-0.223433,...,-0.35239,-0.425028,-0.398424,-0.042102,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
2,0,-0.238853,-0.23078,-0.369972,-0.501965,-0.485922,0.175303,0.221401,-0.70774,-0.223467,...,-0.35242,-0.425044,-0.398436,0.074325,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
3,0,0.90328,-0.224925,-0.3647,-0.499898,-0.484241,-0.143935,-0.129112,-0.705722,-0.223484,...,-0.352436,-0.425053,-0.398442,-0.101575,-0.18863,-0.3589,-0.40879,-0.33805,-0.505671,Benign
4,0,0.330493,-0.23079,-0.367863,-0.498489,-0.477793,-0.143578,-0.128975,1.377379,-0.223459,...,-0.349157,-0.42001,-0.39174,-0.094055,-0.19314,-0.3589,-0.405693,-0.333292,-0.044641,Benign


Binary label conversion (optional)

In [897]:
def modify_and_balance_labels(df, base = 'Benign', random_state = None):
    """Binarise the ``Label`` column and downsample class 1 if it is the majority.

    Every row whose label equals ``base`` becomes 0; every other row becomes 1.
    If there are more 1-rows than 0-rows, randomly chosen 1-rows are dropped
    until both classes have the same number of rows. When class 0 is the
    majority (or the classes are already equal) no rows are removed.

    The input frame is never modified: the relabelling is done on a copy, so
    running the calling cell twice on the same frame gives the same result
    instead of turning every label into 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Label`` column.
    base : object, default 'Benign'
        Label value mapped to class 0.
    random_state : int or None, default None
        Seed for the downsampling. ``None`` uses the global ``random`` module
        state (honouring any prior ``random.seed`` call).

    Returns
    -------
    pandas.DataFrame
        A new frame with a 0/1 ``Label`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``Label`` column.
    """
    # `assign` builds a new frame, leaving the caller's `Label` column intact.
    balanced = df.assign(Label = (df['Label'] != base).astype(int))

    positive_index = balanced.index[balanced['Label'] == 1].tolist()
    num_label_1 = len(positive_index)
    num_label_0 = len(balanced) - num_label_1

    if num_label_1 > num_label_0:
        # A private generator gives a repeatable sample without reseeding
        # the global `random` state for the rest of the notebook.
        rng = random if random_state is None else random.Random(random_state)
        indices_to_remove = rng.sample(positive_index, num_label_1 - num_label_0)
        # NOTE: rows are dropped by index label; with duplicate index labels
        # `drop` removes every row sharing a sampled label.
        balanced = balanced.drop(indices_to_remove)

    return balanced

In [898]:
# Optional binary mode: enabling the next line replaces the class names in
# `Label` with 0 ('Benign') / 1 (anything else) and downsamples class 1 when
# it is the majority. Leave it commented out to keep the multiclass labels.
# dataSet = modify_and_balance_labels(dataSet) # Comment out for multiclass
dataSet.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,...,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size,Label
0,0,0.662169,-0.230795,-0.368331,-0.499224,-0.479053,-0.143198,-0.128846,1.377379,-0.223468,...,-0.349775,-0.422875,-0.393004,-0.068145,-0.193024,-0.3589,-0.40636,-0.335907,1.329492,Benign
1,0,0.662169,-0.230743,-0.369939,-0.501952,-0.485911,-0.035761,-0.009968,-0.704224,-0.223433,...,-0.35239,-0.425028,-0.398424,-0.042102,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
2,0,-0.238853,-0.23078,-0.369972,-0.501965,-0.485922,0.175303,0.221401,-0.70774,-0.223467,...,-0.35242,-0.425044,-0.398436,0.074325,-0.233953,-0.3589,-0.40879,-0.33805,-0.545069,Benign
3,0,0.90328,-0.224925,-0.3647,-0.499898,-0.484241,-0.143935,-0.129112,-0.705722,-0.223484,...,-0.352436,-0.425053,-0.398442,-0.101575,-0.18863,-0.3589,-0.40879,-0.33805,-0.505671,Benign
4,0,0.330493,-0.23079,-0.367863,-0.498489,-0.477793,-0.143578,-0.128975,1.377379,-0.223459,...,-0.349157,-0.42001,-0.39174,-0.094055,-0.19314,-0.3589,-0.405693,-0.333292,-0.044641,Benign


Train test split

In [899]:
# Hold-out split: shuffle the row positions once, then cut at `train_ratio`.
train_ratio = 0.5  # fraction of rows assigned to the training set
split_seed = 42    # fixed seed so the split is identical on every fresh run
n_samples = dataSet.shape[0]

# A seeded local generator makes the shuffle repeatable across kernel
# restarts without touching NumPy's global random state.
indices = np.random.default_rng(split_seed).permutation(n_samples)

# Train/test split
train_size = int(train_ratio * n_samples)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

# `indices` holds row positions (0..n_samples-1), hence positional `.iloc`.
train_data = dataSet.iloc[train_indices]
test_data = dataSet.iloc[test_indices]


In [900]:
def split_Xy(df):
    """Separate a frame into its feature columns and its ``Label`` column.

    Returns a ``(X, y)`` tuple: ``X`` is ``df`` without the ``Label`` column
    and ``y`` is the ``Label`` column itself. ``df`` is left unchanged.
    """
    target = df['Label']
    features = df.drop(columns=['Label'])
    return features, target

In [901]:
# Separate features from the target for the training and the test partition.
(X_train, y_train), (X_test, y_test) = split_Xy(train_data), split_Xy(test_data)

### Random forest

In [902]:
# Random forest of 20 trees; a node needs at least 5 samples to be split.
# The fixed random_state makes the fitted forest repeatable for a given
# training set.
rf = RandomForestClassifier(
    n_estimators=20,
    min_samples_split=5,
    random_state=42,
)
rf.fit(X_train, y_train)

In [903]:
# Class probabilities for every test row; "certainty" is the largest
# probability in each row.
probs = rf.predict_proba(X_test)
certainty = probs.max(axis=1)
mean = certainty.mean()
std = certainty.std()

# Certainty of every test row (NumPy abbreviates long arrays when printing).
print(certainty)

# Hard class predictions for the same rows.
y_pred = rf.predict(X_test)
print(y_pred)

# Accuracy = share of test rows whose predicted label equals the true label.
accuracy = (y_test == y_pred).mean()
print(f"Accuracy: {accuracy * 100:.2f}% | Certainty: {mean* 100:.2f} ±{std* 100:.2f} ")

[0.32166667 0.39071429 0.58708333 ... 0.33333333 0.39611111 0.40583333]
['Koler' 'Lockerpin' 'Koler' ... 'Simplocker' 'PornDroid' 'RansomBO']
Accuracy: 46.46% | Certainty: 52.11 ±23.64 


### Simple decision tree

In [904]:
# Single decision tree baseline, depth capped at 100, fixed seed.
dtc = DecisionTreeClassifier(
    random_state=42,
    max_depth=100,
)
dtc.fit(X_train,y_train)

In [905]:
# Score the decision tree on the held-out rows: accuracy is the share of
# test rows whose predicted label equals the true label.
y_pred = dtc.predict(X_test)
accuracy = (y_test == y_pred).mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 47.70%
