In [39]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, make_scorer
from sklearn.tree import DecisionTreeClassifier as dtc 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [40]:
# --- Load the "MS" C2 traffic logs (TLS 1.2 and TLS 1.3 captures) -------------
log_directory_tls12 = "/home/admin/C2_TLS/datasets/ms/TLS1.2/short_cert/"
log_directory_tls13 = "/home/admin/C2_TLS/datasets/ms/TLS1.3/"

# os.listdir() returns names in arbitrary order. Sorting makes the concatenation
# order -- and therefore the seeded shuffle and the row cap below -- reproducible
# across machines and file systems.
log_files12 = sorted(f for f in os.listdir(log_directory_tls12) if f.endswith('.log'))
log_files13 = sorted(f for f in os.listdir(log_directory_tls13) if f.endswith('.log'))

# Column names applied to every dataset in this notebook; also used later as the
# model's feature list.
headers = ['client_hello_len','client_hello_ext_num','server_hello_len','server_hello_ext_num','tls_version']

df_list = []
for directory, file_names in (
    (log_directory_tls12, log_files12),
    (log_directory_tls13, log_files13),
):
    for log_file in file_names:
        file_path = os.path.join(directory, log_file)
        # Comma-separated, no header row. The first 8 lines are skipped and the
        # last row is dropped (presumably log metadata -- TODO confirm).
        df = pd.read_csv(file_path, sep=",", header=None, skiprows=8, low_memory=False)
        # "-" marks a missing value; rows containing any are discarded.
        # Non-inplace chain: the original mutated an iloc slice in place, which
        # is chained assignment (SettingWithCopyWarning).
        df = df.iloc[:-1].replace("-", pd.NA).dropna()
        df_list.append(df)

# Shuffle all MS rows, keep at most the first 50000, name the columns and label
# every row as C2 traffic (c2 = 1).
df_ms = (
    shuffle(pd.concat(df_list, ignore_index=True), random_state=42)
    .reset_index(drop=True)
    .iloc[:50000]
    .set_axis(headers, axis=1)
    .assign(c2=1, source='MS')
)

# Rows 40001..40999 of df_ms relabelled as benign (c2 = 0). `.assign` returns a
# new frame, so df_ms itself keeps c2 = 1 regardless of pandas version; the
# original `.loc` write on a slice could silently write through to df_ms.
# NOTE(review): df_ms_noise is not used in the cells visible here -- confirm
# whether it is needed, and whether the 999-row range is intentional.
df_ms_noise = df_ms.iloc[40001:41000].assign(c2=0)

In [41]:
# --- Load the Tranco dataset (benign traffic) ---------------------------------
log_directory = "/home/admin/C2_TLS/datasets/tranco"

# Sorted listing of the .log files: os.listdir() returns names in arbitrary
# order, which would make the seeded shuffle below non-reproducible.
log_files = sorted(f for f in os.listdir(log_directory) if f.endswith('.log'))

# One cleaned DataFrame per log file.
df_list = []
for log_file in log_files:
    file_path = os.path.join(log_directory, log_file)
    # Comma-separated, no header row. The first 8 lines are skipped and the
    # last row is dropped (presumably log metadata -- TODO confirm).
    df = pd.read_csv(file_path, sep=",", header=None, skiprows=8, low_memory=False)
    # "-" marks a missing value; rows containing any are discarded.
    # Non-inplace chain avoids mutating an iloc slice (chained assignment).
    df = df.iloc[:-1].replace("-", pd.NA).dropna()
    df_list.append(df)

# Shuffle, name the columns and label every row as benign (c2 = 0).
df_tranco = (
    shuffle(pd.concat(df_list, ignore_index=True), random_state=42)
    .reset_index(drop=True)
    # .iloc[:100000]   # row cap left disabled: all Tranco rows are kept
    .set_axis(headers, axis=1)
    .assign(c2=0, source='tranco')
)

In [42]:
# --- Load the DoHBrw benign traffic --------------------------------------------
log_directory_benign = "/home/admin/C2_TLS/datasets/DoHBrw/benign"

# Sorted listing of the .log files: os.listdir() returns names in arbitrary
# order, which would make the seeded shuffle and the row cap non-reproducible.
log_files_benign = sorted(f for f in os.listdir(log_directory_benign) if f.endswith('.log'))

# One cleaned DataFrame per log file.
df_list = []
for log_file in log_files_benign:
    file_path = os.path.join(log_directory_benign, log_file)
    # Comma-separated, no header row. The first 8 lines are skipped and the
    # last row is dropped (presumably log metadata -- TODO confirm).
    df = pd.read_csv(file_path, sep=",", header=None, skiprows=8, low_memory=False)
    # "-" marks a missing value; rows containing any are discarded.
    # Non-inplace chain avoids mutating an iloc slice (chained assignment).
    df = df.iloc[:-1].replace("-", pd.NA).dropna()
    df_list.append(df)

# Shuffle, keep at most the first 50000 rows, name the columns and label every
# row as benign (c2 = 0).
df_doh_benign = (
    shuffle(pd.concat(df_list, ignore_index=True), random_state=42)
    .reset_index(drop=True)
    .iloc[:50000]
    .set_axis(headers, axis=1)
    .assign(c2=0, source='DoH')
)
###########################################################################################################################
# --- Load the DoHBrw malicious traffic -----------------------------------------
log_directory_malicious = "/home/admin/C2_TLS/datasets/DoHBrw/malicious/"

# Sorted listing of the .log files: os.listdir() returns names in arbitrary
# order, which would make the seeded shuffle and the row cap non-reproducible.
log_files_malicious = sorted(f for f in os.listdir(log_directory_malicious) if f.endswith('.log'))

# One cleaned DataFrame per log file.
df_list = []
for log_file in log_files_malicious:
    file_path = os.path.join(log_directory_malicious, log_file)
    # Comma-separated, no header row. The first 8 lines are skipped and the
    # last row is dropped (presumably log metadata -- TODO confirm).
    df = pd.read_csv(file_path, sep=",", header=None, skiprows=8, low_memory=False)
    # "-" marks a missing value; rows containing any are discarded.
    # Non-inplace chain avoids mutating an iloc slice (chained assignment).
    df = df.iloc[:-1].replace("-", pd.NA).dropna()
    df_list.append(df)

# Shuffle, keep at most the first 20000 rows, name the columns and label every
# row as malicious (c2 = 1).
df_doh_malicious = (
    shuffle(pd.concat(df_list, ignore_index=True), random_state=42)
    .reset_index(drop=True)
    .iloc[:20000]
    .set_axis(headers, axis=1)
    .assign(c2=1, source='DoH')
)

# Malicious rows first, then benign; the original per-frame indices are kept.
df_doh = pd.concat([df_doh_malicious,df_doh_benign])

In [43]:
# Stack the Tranco, MS and DoH frames under a fresh 0..n-1 index, shuffle the
# rows with a fixed seed, then renumber the index again.
df_all = (
    shuffle(
        pd.concat([df_tranco, df_ms, df_doh], ignore_index=True),
        random_state=42,
    )
    .reset_index(drop=True)
)

In [44]:
# Display the combined, shuffled dataset (the rendering is truncated to the
# first and last rows for a frame this large).
df_all

Unnamed: 0,client_hello_len,client_hello_ext_num,server_hello_len,server_hello_ext_num,tls_version,c2,source
0,512,17.0,108.0,6.0,3.0,0,DoH
1,589,14.0,122.0,2.0,3.0,0,tranco
2,256,10.0,122.0,2.0,3.0,1,DoH
3,256,10.0,122.0,2.0,3.0,1,DoH
4,512,10.0,122.0,2.0,3.0,1,MS
...,...,...,...,...,...,...,...
148560,256,10.0,122.0,2.0,3.0,1,DoH
148561,512,10.0,122.0,2.0,3.0,1,MS
148562,512,17.0,122.0,2.0,3.0,0,DoH
148563,512,17.0,78.0,6.0,3.0,0,DoH


# Split Data

In [45]:
# Feature matrix: the five columns named in `headers`, cast to float64.
X = df_all[headers].to_numpy().astype(np.float64)
# Target vector: the `c2` label column (1 = C2 traffic, 0 = benign).
y = df_all["c2"].to_numpy()
y

array([0, 0, 1, ..., 0, 0, 1])

In [46]:
# Hold out 25% of the rows for testing; rows are shuffled with a fixed seed
# before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Training

In [47]:
# Small random forest: 5 trees, each limited to depth 20 and 500 leaves, with
# bootstrapping disabled. fit() returns the estimator itself, so construction
# and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=5,
    max_depth=20,
    max_leaf_nodes=500,
    bootstrap=False,
    n_jobs=4,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy}")

# Per-feature importances, most important first.
importances = rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': headers, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances

Accuracy on test data: 0.993484465026116


Unnamed: 0,feature,importance
1,client_hello_ext_num,0.517863
0,client_hello_len,0.215466
3,server_hello_ext_num,0.209622
4,tls_version,0.051161
2,server_hello_len,0.005887


In [48]:
# import pickle
# pickle.dump(rf_model, open("RF.pkl", 'wb'))