In [12]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Names assigned to the five columns of every log file, in file order.
headers = ['client_hello_len','client_hello_ext_num','server_hello_len','server_hello_ext_num','tls_version']

log_directory_benign = "datasets/DoHBrw/benign/zeek_logs"
log_directory_malicious = "datasets/DoHBrw/malicious/zeek_logs"


def load_zeek_logs(log_directory, c2_label, source='DoH'):
    """Load every ``.log`` file in a directory into one labelled DataFrame.

    Args:
        log_directory: Directory containing the ``.log`` files to read.
        c2_label: Value written to the ``c2`` column of every row
            (0 for benign traffic, 1 for malicious traffic in this notebook).
        source: Value written to the ``source`` column of every row.

    Returns:
        pandas.DataFrame: The cleaned rows of all files, shuffled with a fixed
        seed, with columns ``headers`` plus ``c2`` and ``source`` and a fresh
        0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.log`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded shuffle below) reproducible across runs.
    log_files = sorted(f for f in os.listdir(log_directory) if f.endswith('.log'))
    if not log_files:
        raise ValueError(f"No .log files found in {log_directory!r}")

    frames = []
    for log_file in log_files:
        file_path = os.path.join(log_directory, log_file)
        # The first 8 lines are skipped and the last row is dropped --
        # presumably the Zeek metadata header and closing footer (TODO confirm).
        # Change `sep` if the log format uses a different delimiter.
        raw = pd.read_csv(file_path, sep=",", header=None, skiprows=8, low_memory=False)
        # "-" marks a missing field; rows containing one are discarded.
        frames.append(raw.iloc[:-1].replace("-", pd.NA).dropna())

    combined = pd.concat(frames, ignore_index=True)
    combined = shuffle(combined, random_state=42).reset_index(drop=True)
    combined = combined.set_axis(headers, axis=1)
    combined['c2'] = c2_label
    combined['source'] = source
    return combined


df_doh_benign = load_zeek_logs(log_directory_benign, c2_label=0)
df_doh_malicious = load_zeek_logs(log_directory_malicious, c2_label=1)

# ignore_index avoids duplicate index labels in the combined frame.
df_doh = pd.concat([df_doh_malicious, df_doh_benign], ignore_index=True)

In [14]:
# Shuffle the combined dataset with a fixed seed and renumber the rows 0..n-1.
df_all = shuffle(df_doh, random_state=42).reset_index(drop=True)
df_all

Unnamed: 0,client_hello_len,client_hello_ext_num,server_hello_len,server_hello_ext_num,tls_version,c2,source
0,508,17.0,66.0,4.0,3.0,0,DoH
1,508,17.0,72.0,4.0,3.0,0,DoH
2,252,10.0,118.0,2.0,3.0,1,DoH
3,508,17.0,61.0,4.0,3.0,0,DoH
4,508,12.0,118.0,2.0,3.0,0,DoH
...,...,...,...,...,...,...,...
502719,508,17.0,80.0,6.0,3.0,0,DoH
502720,508,12.0,94.0,3.0,3.0,0,DoH
502721,508,14.0,118.0,2.0,3.0,0,DoH
502722,508,12.0,93.0,4.0,3.0,0,DoH


# Split Data

In [15]:
# Feature matrix: the `headers` columns cast to float64.
X = df_all[headers].to_numpy().astype(np.float64)
# Target vector: the `c2` column (0 = benign, 1 = malicious).
y = df_all["c2"].to_numpy()
y

array([0, 0, 1, ..., 0, 0, 0])

In [16]:
# Hold out 25% of the rows for testing; the split is shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Training

In [17]:
# 5 trees, each limited to depth 20 and 500 leaves. With bootstrap=False every
# tree is built from the whole training set rather than a bootstrap sample.
rf_model = RandomForestClassifier(
    n_estimators=5,
    max_depth=20,
    max_leaf_nodes=500,
    bootstrap=False,
    n_jobs=4,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split; the reporting lines below are currently disabled.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy on test data: {accuracy}")
# print(classification_report(y_test,y_pred))
# importances = rf_model.feature_importances_
# feature_importances = pd.DataFrame({'feature': headers, 'importance': importances})
# feature_importances = feature_importances.sort_values('importance', ascending=False)
# feature_importances

In [19]:
# Persist the trained model to 'RF_DPDK.pkl' (currently disabled).
# NOTE(review): the JSON export cell loads 'RF_DPDK.pkl' from disk. While these
# lines stay commented out, that file is whatever an earlier run left behind and
# may not match the model trained above -- confirm this is intended.
# import pickle
# pickle.dump(rf_model, open("RF_DPDK.pkl", 'wb'))

# Transform the RF model into JSON format

In [20]:
import pickle
import json

# Function to extract information from each tree in the Random Forest
def tree_to_json(tree):
    """Flatten a fitted decision tree into a list of JSON-serializable node dicts.

    Args:
        tree: A fitted tree estimator exposing ``tree_`` (with ``node_count``,
            ``children_left``, ``children_right``, ``feature``, ``threshold``
            and ``value``) and ``classes_``. Class labels must be convertible
            to ``int``.

    Returns:
        list[dict]: One dict per node, in node-index order, with the keys
        ``is_leaf``, ``class_label`` (majority class for leaves, ``-10`` for
        internal nodes), ``feature``, ``threshold``, ``left_child`` and
        ``right_child``. All values are native Python types, so the result
        can be passed straight to ``json.dump``.
    """
    tree_struct = tree.tree_
    children_left = tree_struct.children_left
    children_right = tree_struct.children_right
    feature = tree_struct.feature
    threshold = tree_struct.threshold
    class_names = tree.classes_

    nodes = []
    for i in range(tree_struct.node_count):
        # A leaf has the same (sentinel) value for both child pointers.
        is_leaf = bool(children_left[i] == children_right[i])
        if is_leaf:
            # value[i, 0] holds the per-class counts/probabilities of the first
            # output at this node; the leaf predicts the class with the largest.
            class_counts = tree_struct.value[i, 0]
            class_label = int(class_names[class_counts.argmax()])
        else:
            class_label = -10  # placeholder: internal nodes predict nothing

        nodes.append({
            'is_leaf': is_leaf,
            'class_label': class_label,
            # The tree arrays contain numpy scalars; numpy integers are rejected
            # by the json module, so everything is cast to native Python types.
            'feature': int(feature[i]),  # -2 indicates no split (leaf)
            'threshold': float(threshold[i]),
            'left_child': int(children_left[i]),
            'right_child': int(children_right[i]),
        })

    return nodes

# Function to convert the Random Forest model to JSON format
def rf_to_json(rf_model):
    """Convert a fitted random forest classifier into a JSON-serializable dict.

    Args:
        rf_model: A fitted single-output forest exposing ``estimators_``,
            ``max_depth``, ``feature_importances_`` and ``classes_``. Class
            labels must be convertible to ``int``.

    Returns:
        dict: ``n_estimators``, ``max_depth``, ``feature_importances`` and
        ``estimators``. Each entry of ``estimators`` holds the tree's
        ``n_nodes`` and, as parallel per-node lists, ``children_left``,
        ``children_right``, ``feature``, ``threshold``, ``class_label``
        (majority class for leaves, ``-10`` for internal nodes) and ``leaves``
        (1 for a leaf, 0 otherwise).
    """
    # Sub-estimators of a forest are fit on label-encoded targets, so a tree's
    # argmax is an index into the *forest's* classes_. Looking it up there
    # exports the original label rather than its encoded position.
    forest_classes = rf_model.classes_

    rf_data = {
        'n_estimators': len(rf_model.estimators_),
        'max_depth': rf_model.max_depth,
        'feature_importances': rf_model.feature_importances_.tolist(),
        'estimators': []
    }

    for estimator in rf_model.estimators_:
        fitted_tree = estimator.tree_
        children_left = fitted_tree.children_left
        children_right = fitted_tree.children_right

        # A leaf has the same (sentinel) value for both child pointers.
        leaf_mask = children_left == children_right
        # value[:, 0] holds the per-class counts/probabilities of the first
        # output at every node; argmax picks each node's majority class index.
        majority_index = fitted_tree.value[:, 0].argmax(axis=1)

        rf_data['estimators'].append({
            'n_nodes': int(fitted_tree.node_count),
            'children_left': children_left.tolist(),
            'children_right': children_right.tolist(),
            'feature': fitted_tree.feature.tolist(),
            'threshold': fitted_tree.threshold.tolist(),
            'class_label': [
                int(forest_classes[class_idx]) if is_leaf else -10
                for is_leaf, class_idx in zip(leaf_mask, majority_index)
            ],
            'leaves': leaf_mask.astype(int).tolist(),
        })

    return rf_data

# Read the pickled Random Forest back from disk.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('RF_DPDK.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

# Build the JSON-friendly representation of the forest.
rf_json = rf_to_json(rf_model)

# Serialize it, pretty-printed with a 4-space indent.
with open('rf_model.json', 'w') as out_file:
    json.dump(rf_json, out_file, indent=4)

print("Random Forest model saved to rf_model.json")

Random Forest model saved to rf_model.json
