In [142]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
%matplotlib inline
import warnings
warnings.filterwarnings(action = 'ignore')

In [143]:
# Load the raw flow records. The first CSV column is dropped by position and
# the 'Flow ID', ' Timestamp' and ' Destination IP' columns are dropped by name
# (note that several column names in this file start with a space).
dataSet = (
    pd.read_csv('dataset/Android_Ransomeware.csv', encoding='unicode_escape')
    .iloc[:, 1:]
    .drop(columns=['Flow ID', ' Timestamp', ' Destination IP'])
)
dataSet.head()

Unnamed: 0,Source IP,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,10.42.0.211,51023,443,6,151054,6,8,1076.0,4575.0,821.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,10.42.0.211,51023,443,6,349,2,0,23.0,0.0,23.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,10.42.0.211,34259,443,6,119,2,0,23.0,0.0,23.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,10.42.0.211,55509,443,6,37055,1,1,31.0,0.0,31.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,10.42.0.211,44852,443,6,178727,6,7,1313.0,307.0,753.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


### Preprocessing

In [144]:
def modify_and_balance_labels(df, base = 'Benign', random_state = None):
    """Binarise ``Label`` and undersample the majority class to a 1:1 ratio.

    Rows whose ``Label`` equals ``base`` become 0, every other row becomes 1.
    Whichever class has more rows is then randomly undersampled until both
    classes have the same number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Label`` column. It is not modified; the
        relabelling is done on a copy.
    base : object, default 'Benign'
        Label value that is mapped to class 0.
    random_state : int or None, default None
        Seed for the row sampling. With ``None`` the global ``random`` module
        state is used, so ``random.seed(...)`` still controls the outcome.

    Returns
    -------
    pandas.DataFrame
        Relabelled frame with equal class counts. If one of the two classes
        is absent (or the counts already match) no rows are removed, since
        balancing against an empty class would discard every row.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    df = df.copy()
    df['Label'] = df['Label'].ne(base).astype('int64')

    num_label_1 = int((df['Label'] == 1).sum())
    num_label_0 = int((df['Label'] == 0).sum())

    # Nothing to balance: a class is missing or the counts already agree.
    if num_label_0 == 0 or num_label_1 == 0 or num_label_0 == num_label_1:
        return df

    majority = 1 if num_label_1 > num_label_0 else 0
    surplus = abs(num_label_1 - num_label_0)

    rng = random if random_state is None else random.Random(random_state)
    majority_indices = df.index[df['Label'] == majority].tolist()
    indices_to_remove = rng.sample(majority_indices, surplus)

    return df.drop(indices_to_remove)

Handle duplicates, infinite values and NaN values

In [145]:
def handle_dups_n_nans(dataSet):
    """Return ``dataSet`` without duplicate rows and without NaN/±inf rows.

    Duplicate rows are removed first, then positive and negative infinity are
    converted to NaN so that a single ``dropna`` discards every row holding a
    missing or infinite value. The original index labels are kept.
    """
    deduplicated = dataSet.drop_duplicates()
    without_inf = deduplicated.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna()

Normalise numerical features (z-score standardisation)

In [146]:
def normalise_features(dataSet):
    """Z-score standardise the ``float64``/``int64`` columns of ``dataSet``.

    Each selected column is transformed to ``(x - mean) / std`` using the
    sample standard deviation. Columns of any other dtype are passed through
    unchanged.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Input frame. It is not modified; a standardised copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataSet`` with the numerical columns standardised. Columns
        with zero (or undefined) standard deviation are centred only, so a
        constant column becomes all zeros instead of all NaN.
    """
    numerical_features = dataSet.select_dtypes(include=['float64', 'int64']).columns

    # Copy so the caller's frame is left untouched.
    normalised = dataSet.copy()
    values = dataSet[numerical_features]

    mean = values.mean()
    std = values.std()
    # A constant column has std == 0 (and a single-row frame has std == NaN);
    # dividing by that would turn the whole column into NaN, so divide by 1.
    std = std.mask(std.isna() | (std == 0), 1.0)

    normalised[numerical_features] = (values - mean) / std

    return normalised


Encode categorical feature columns as integers (the `Label` column is left unchanged)

In [147]:
def encode_categorical(dataSet):
    """Replace every non-numeric column except ``Label`` with integer codes.

    Columns whose dtype is neither ``float64`` nor ``int64`` are encoded in
    place: each distinct value receives the position at which it first
    appears in the column (0, 1, 2, ...). The ``Label`` column is skipped.
    The same frame object that was passed in is returned.
    """
    non_numeric = dataSet.select_dtypes(exclude=['float64', 'int64']).columns

    for name in non_numeric:
        if name == 'Label':
            continue
        categories = dataSet[name].unique()
        # Order of first appearance defines the integer code.
        codes = dict(zip(categories, range(len(categories))))
        dataSet[name] = dataSet[name].map(codes)

    return dataSet

Full preprocessing function

In [148]:
def perform_preprocessing(dataSet):
    """Run the preprocessing steps in order and return the resulting frame.

    The steps are: ``handle_dups_n_nans``, then ``normalise_features``, then
    ``encode_categorical``, each applied to the output of the previous one.
    """
    cleaned = handle_dups_n_nans(dataSet)
    scaled = normalise_features(cleaned)
    return encode_categorical(scaled)

In [149]:
def split_Xy(df):
    """Split ``df`` into a feature frame and the ``Label`` target series.

    Returns a ``(X, y)`` tuple where ``X`` is ``df`` without the ``Label``
    column and ``y`` is the ``Label`` column itself. ``df`` is not modified.
    """
    target = 'Label'
    feature_frame = df.drop(columns=[target])
    target_series = df[target]
    return feature_frame, target_series

In [150]:
# Disabled (left commented out): preprocess the frame and export it to
# 'ransomware_stock.csv'.
# dataSet = perform_preprocessing(dataSet)
# dataSet.to_csv('ransomware_stock.csv')

In [151]:
# Clean, standardise and encode the loaded frame. The result is assigned back
# to the same name, so re-running this cell preprocesses already-processed
# data; re-run the loading cell first to start from the raw values.
dataSet = perform_preprocessing(dataSet)
# Binary relabelling/balancing is disabled here, so 'Label' keeps its string values.
# dataSet = modify_and_balance_labels(dataSet,'Benign')
dataSet.head()

Unnamed: 0,Source IP,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0.662169,-0.3589,-0.515589,-0.479053,-0.032251,-0.008797,0.011524,-0.034388,1.02649,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
1,0,0.662169,-0.3589,-0.515589,-0.485911,-0.151886,-0.107622,-0.051825,-0.076007,-0.367444,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
2,0,-0.238853,-0.3589,-0.515589,-0.485922,-0.151886,-0.107622,-0.051825,-0.076007,-0.367444,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
3,0,0.90328,-0.3589,-0.515589,-0.484241,-0.181794,-0.095269,-0.051344,-0.076007,-0.35347,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
4,0,0.330493,-0.3589,-0.515589,-0.477793,-0.032251,-0.02115,0.025782,-0.073214,0.907709,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign


In [152]:
# (rows, columns) of the preprocessed frame
dataSet.shape

(392019, 82)

Split train and test data

In [153]:
# Fraction of rows assigned to the training set; the rest become the test set.
train_ratio = 0.5
n_samples = dataSet.shape[0]

# Shuffle row positions with a fixed seed so the split (and every metric
# computed from it) is identical on each Restart & Run All. 42 matches the
# random_state used for the models in this notebook.
indices = np.random.default_rng(42).permutation(n_samples)

train_size = int(train_ratio * n_samples)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

# Positional selection: `indices` are row positions, not index labels.
train_data = dataSet.iloc[train_indices]
test_data = dataSet.iloc[test_indices]

X_train, y_train = split_Xy(train_data)
X_test, y_test = split_Xy(test_data)

dataSet.head()

Unnamed: 0,Source IP,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0.662169,-0.3589,-0.515589,-0.479053,-0.032251,-0.008797,0.011524,-0.034388,1.02649,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
1,0,0.662169,-0.3589,-0.515589,-0.485911,-0.151886,-0.107622,-0.051825,-0.076007,-0.367444,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
2,0,-0.238853,-0.3589,-0.515589,-0.485922,-0.151886,-0.107622,-0.051825,-0.076007,-0.367444,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
3,0,0.90328,-0.3589,-0.515589,-0.484241,-0.181794,-0.095269,-0.051344,-0.076007,-0.35347,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign
4,0,0.330493,-0.3589,-0.515589,-0.477793,-0.032251,-0.02115,0.025782,-0.073214,0.907709,...,0.002823,-0.165351,-0.057795,-0.150163,-0.161693,-0.30659,-0.116142,-0.311546,-0.29547,Benign


### Random forest classifier 
_To identify important features_

In [154]:
# Take the feature names from the matrix the forest is fitted on, so names and
# importances always line up one-to-one. (dataSet.columns also contains the
# 'Label' target and only matched by position because 'Label' is the last column.)
features = X_train.columns.tolist()

# NOTE(review): ' Source IP' is part of X_train and ranks first in the recorded
# importances; confirm that a host identifier is meant to be a model feature.
rf = RandomForestClassifier(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_


In [155]:
threshold = 0.035  # not referenced anywhere in this cell
num_features = 20  # how many top-ranked features to keep

# Pair each feature name with its importance (zip stops at the shorter input).
feature_dict = dict(zip(features, feature_importances))

print(feature_dict)

# Rank from most to least important; dict preserves that insertion order.
ranked_items = sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)
sorted_features = dict(ranked_items)

# Names of the `num_features` highest-ranked features.
important_features = list(sorted_features)[:num_features]

print(sorted_features)
print(important_features)

{' Source IP': np.float64(0.18392064899201685), ' Source Port': np.float64(0.05874306454039822), ' Destination Port': np.float64(0.018269790127467846), ' Protocol': np.float64(0.00028331398910387323), ' Flow Duration': np.float64(0.04131366158923313), ' Total Fwd Packets': np.float64(0.003809553228361908), ' Total Backward Packets': np.float64(0.0032221742472360857), 'Total Length of Fwd Packets': np.float64(0.008438658017252507), ' Total Length of Bwd Packets': np.float64(0.008110249097013732), ' Fwd Packet Length Max': np.float64(0.008380800970971682), ' Fwd Packet Length Min': np.float64(0.0031992948473330693), ' Fwd Packet Length Mean': np.float64(0.008363887697431025), ' Fwd Packet Length Std': np.float64(0.00584927528939278), 'Bwd Packet Length Max': np.float64(0.005919143220760726), ' Bwd Packet Length Min': np.float64(0.0036572283143924502), ' Bwd Packet Length Mean': np.float64(0.008270073290576621), ' Bwd Packet Length Std': np.float64(0.005324178755191415), 'Flow Bytes/s': n

In [156]:
# Restrict the training matrix to the selected feature columns.
X_train_fs = X_train.loc[:, important_features]
X_train_fs.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,Bwd Packets/s,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size
79482,0,1.11139,-0.209757,-0.351043,-0.494544,-0.479886,-0.144677,-0.129519,-0.548025,-0.223484,-0.10862,-0.352436,-0.425053,-0.398442,-0.10234,-0.225976,-0.3589,-0.40879,-0.33805,-0.658338
143647,0,0.92494,-0.18995,-0.333208,-0.487552,-0.4742,-0.144816,-0.129595,-0.710996,-0.223484,-0.108905,-0.352436,-0.425053,-0.398442,-0.10234,-0.217998,-0.382595,-0.40879,-0.33805,-0.658338
265036,337,0.262395,-0.104333,-0.256115,-0.457328,-0.449619,-0.144916,-0.12965,-0.756149,-0.223484,-0.10911,-0.352436,-0.425053,-0.398442,-0.101883,-0.233953,-0.384357,-0.40879,-0.33805,0.417714
380555,337,-0.051329,-0.230797,0.128324,2.808168,2.215086,-0.144958,-0.129673,1.377379,-0.223483,-0.109197,0.50875,3.808513,2.769674,-0.102246,-0.220195,-0.3589,1.987849,2.955454,1.014427
34152,643,-2.056377,-0.230276,5.812048,4.345152,3.456259,-0.144963,-0.129676,-0.751038,-0.223484,-0.109206,-0.352436,-0.425053,-0.398442,-0.10234,0.212221,2.951399,9.824693,-0.33805,-0.658338


In [157]:
# Restrict the test matrix to the same selected feature columns.
X_test_fs = X_test.loc[:, important_features]
X_test_fs.head()

Unnamed: 0,Source IP,Source Port,Flow IAT Min,Flow IAT Mean,Flow IAT Max,Flow Duration,Flow Packets/s,Fwd Packets/s,Init_Win_bytes_forward,Fwd IAT Min,Bwd Packets/s,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Total,Flow Bytes/s,Init_Win_bytes_backward,Destination Port,Flow IAT Std,Fwd IAT Std,Average Packet Size
249069,337,1.049366,-0.230516,-0.369735,-0.501872,-0.485846,-0.123565,-0.106219,-0.705787,-0.223222,-0.109208,-0.352203,-0.424925,-0.398347,-0.10234,-0.233953,-0.3589,-0.40879,-0.33805,-0.658338
200613,0,1.03308,-0.130529,-0.279703,-0.466576,-0.45714,-0.144904,-0.129643,-0.756149,-0.223484,-0.109084,-0.352436,-0.425053,-0.398442,-0.102177,-0.233953,-0.384357,-0.40879,-0.33805,-0.293908
103109,0,0.22488,-0.23079,-0.369981,-0.501969,-0.485925,0.523664,0.236799,-0.752015,-0.223484,1.259846,-0.352436,-0.425053,-0.398442,0.394774,-0.21927,-0.3589,-0.40879,-0.33805,-0.505671
279259,337,1.071241,-0.214791,-0.355575,-0.496321,-0.481331,-0.144587,-0.129469,-0.756149,-0.223484,-0.108435,-0.352436,-0.425053,-0.398442,-0.100738,-0.233953,-0.384357,-0.40879,-0.33805,-0.151091
102220,0,-0.175055,-0.230797,-0.369988,-0.501971,-0.485927,3.666216,4.048137,-0.756117,-0.223483,-0.109208,-0.352434,-0.425052,-0.398442,-0.10234,-0.233953,-0.3589,-0.40879,-0.33805,-0.658338


In [158]:
# Add the target to the export columns only once, so re-running this cell does
# not write a duplicate 'Label' column to the CSV.
# After this cell `important_features` also contains 'Label', which is not a
# column of X_train/X_test: recompute the list before re-running the
# feature-selection cells.
if 'Label' not in important_features:
    important_features.append('Label')
dataSet[important_features].to_csv('ransom_with_ip.csv')

### Simple decision tree

In [104]:
# Fit a single decision tree (depth capped at 20) on the selected features.
dtc = DecisionTreeClassifier(max_depth=20,random_state=42)
dtc.fit(X_train_fs,y_train)

In [105]:
# Predict on the test split; accuracy is the share of predictions equal to y_test.
y_pred = dtc.predict(X_test_fs)

accuracy = (y_test == y_pred).mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%


### Bagging classifier

In [106]:
# Fit a bagging ensemble of 20 unconstrained decision trees on the selected features.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=20,  # no. of base models
    random_state=42
)
bagging_model.fit(X_train_fs, y_train)

In [107]:
# Predict on the test split; accuracy is the share of predictions equal to y_test.
y_pred = bagging_model.predict(X_test_fs)

accuracy = (y_test == y_pred).mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%


### XGBoost classifier

In [108]:
# Gradient-boosted trees on the selected features.
# NOTE(review): the relabelling call earlier in the notebook is commented out,
# so y_train presumably still holds string labels; XGBClassifier is understood
# to require integer class labels 0..n-1 — confirm this cell runs on a fresh kernel.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,  # Fraction of features used per tree
    random_state=42,
)
xgb_model.fit(X_train_fs, y_train)

In [109]:
# Predict on the test split; accuracy is the share of predictions equal to y_test.
y_pred = xgb_model.predict(X_test_fs)

accuracy = (y_test == y_pred).mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.99%
