### Importing libraries


In [60]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

### Import training data
###### Create a directory named 'Encrypted Training Data' in the same directory as this script. Download and extract the dataset from https://data.mendeley.com/datasets/ztyk4h3v6s/1 to the 'Encrypted Training Data' folder.

In [61]:
# Location of the extracted dataset CSV, relative to this notebook
file_path = 'Encrypted Training Data/Machin_learning_based_encrypted_malicious_and_legitimate_traffic_dataset.csv'

# Load the whole CSV into a dataframe
df = pd.read_csv(filepath_or_buffer=file_path)

# Leaving the frame as the cell's last expression renders a preview of it
df


Unnamed: 0,session,Flag_of_packets,Traffic_sequence,Payload_ratio,Length_of_IP_packets,Length_of_TCP_payload,Length_of_TCP_packet_header,Length_of_IP_packet_header,TCP_windows_size_value,Length_of_TCP_segment(packet),...,std_backward_pkt_length,duration_forward,duration_back,mean_of_backward_IP_header,mean_of_forward_IP_header,total_payload_per_session,IPratio,Goodput,source_IP_address,Destination_IP_address
0,1,24,1054741384,0.908612,569,517,32,20,1369,549,...,48.804901,1028.790806,1028.791156,20.0,20.0,1470.0,10.942308,2.808822,10.42.0.211,209.85.201.188
1,1,24,1450143304,0.727749,191,139,32,20,340,171,...,48.804901,1028.790806,1028.791156,20.0,20.0,1470.0,10.942308,2.808822,10.42.0.211,209.85.201.188
2,1,24,1054741901,0.452632,95,43,32,20,1386,75,...,48.804901,1028.790806,1028.791156,20.0,20.0,1470.0,10.942308,2.808822,10.42.0.211,209.85.201.188
3,1,24,1054741944,0.860215,372,320,32,20,1386,352,...,48.804901,1028.790806,1028.791156,20.0,20.0,1470.0,10.942308,2.808822,10.42.0.211,209.85.201.188
4,1,24,1450143443,0.297297,74,22,32,20,349,54,...,48.804901,1028.790806,1028.791156,20.0,20.0,1470.0,10.942308,2.808822,10.42.0.211,209.85.201.188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637012,117627,24,1996100887,0.841463,328,276,32,20,4368,308,...,524.408417,1.609708,1.618277,20.0,20.0,63857.0,37.500000,0.974260,192.168.1.165,184.73.174.14
1637013,117627,16,1996101163,0.965333,1500,1448,32,20,4368,1480,...,524.408417,1.609708,1.618277,20.0,20.0,63857.0,37.500000,0.974260,192.168.1.165,184.73.174.14
1637014,117627,24,1996102611,0.875300,417,365,32,20,4368,397,...,524.408417,1.609708,1.618277,20.0,20.0,63857.0,37.500000,0.974260,192.168.1.165,184.73.174.14
1637015,117627,24,1996102976,0.956631,1199,1147,32,20,4368,1179,...,524.408417,1.609708,1.618277,20.0,20.0,63857.0,37.500000,0.974260,192.168.1.165,184.73.174.14


### Cutting dataset to the Further Optimized Statistical (FOS) feature set 

###### (Let's assume that a label of '1' means malicious and a label of '0' means benign. The authors of the dataset did not state which is which)

In [62]:
# Columns kept for the FOS feature set; the last entry, 'label', is the target column
FOSfields = [
    'mean_TCP_windows_size_value',
    'Source_port',
    'max_Interval_of_arrival_time_of_forward_traffic',
    'max_Interval_of_arrival_time_of_backward_traffic',
    'flow duration',
    'std_Interval_of_arrival_time_of_backward_traffic',
    'Total_length_of_forward_payload',
    'std_Length_of_IP_packets',
    'max_Time_difference_between_packets_per_session',
    'std_forward_pkt_length',
    'max_Length_of_TCP_payload',
    'mean_time_to_live',
    'std_time_to_live',
    'duration_forward',
    'label',
]

# Restrict the frame to those columns, then summarise dtypes and non-null counts
df = df.loc[:, FOSfields]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1637017 entries, 0 to 1637016
Data columns (total 15 columns):
 #   Column                                            Non-Null Count    Dtype  
---  ------                                            --------------    -----  
 0   mean_TCP_windows_size_value                       1637017 non-null  float64
 1   Source_port                                       1637017 non-null  int64  
 2   max_Interval_of_arrival_time_of_forward_traffic   1637017 non-null  float64
 3   max_Interval_of_arrival_time_of_backward_traffic  1637017 non-null  float64
 4   flow duration                                     1637017 non-null  float64
 5   std_Interval_of_arrival_time_of_backward_traffic  1637017 non-null  float64
 6   Total_length_of_forward_payload                   1637017 non-null  float64
 7   std_Length_of_IP_packets                          1637017 non-null  float64
 8   max_Time_difference_between_packets_per_session   1637017 non-null  floa

### Set X and Y variables

In [63]:
# Target vector: the 'label' column
y = df['label']

# Feature matrix: every column except 'label'
X = df.drop(columns=['label'])

### Splitting the test/train data, fitting the model

In [64]:
# Fixed seed so that the split and the forest are reproducible across kernel restarts
RANDOM_STATE = 42

# Hold out 20% of the rows for testing; stratify on the label so that both splits
# keep the same class proportions (consistent with the stratified k-fold check below).
# NOTE(review): the preview of the raw frame shows several rows per `session` that share
# identical session-level values. If the FOS columns are per-session statistics as well,
# a row-level random split places near-duplicate rows in both train and test, which could
# explain the ~100% hold-out accuracy -- consider a group-wise split on `session`.
# TODO confirm against the dataset description.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

# Number of jobs to run in parallel; -1 lets scikit-learn use all available processors,
# so the notebook does not depend on the core count of one particular machine
NUMJOBS = -1

# Create the random forest classifier (200 trees) and fit it on the training split
RF = RandomForestClassifier(n_jobs=NUMJOBS, n_estimators=200, random_state=RANDOM_STATE)
RF.fit(X_train, y_train)

### Accuracy score: comes out at roughly 100%
###### However, the paper states it should be around 94%.


In [65]:
# Predict labels for the held-out rows, then report the fraction predicted correctly
y_prediction = RF.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_prediction)

0.9999938913391406

### Sanity check with Stratified K-fold cross-validation

In [66]:
# Define the number of folds
n_splits = 5

# Create an instance of StratifiedKFold (shuffle is left at its default, so fold
# membership is deterministic for a given row order)
skf = StratifiedKFold(n_splits=n_splits)

# Accuracy of each fold, kept so the spread can be summarised after the loop
fold_accuracies = []

# Perform stratified k-fold validation.
# Fold-local names are used on purpose: the hold-out split (X_train/X_test/...) and the
# fitted RF from the earlier cells must not be overwritten by the last fold.
for train_index, test_index in skf.split(X, y):
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Fresh, unfitted forest with the same hyper-parameters as RF
    fold_model = RandomForestClassifier(**RF.get_params(deep=False))

    # Fit the model
    fold_model.fit(X_fold_train, y_fold_train)

    # Make predictions
    y_pred = fold_model.predict(X_fold_test)

    # Calculate accuracy score
    accuracy = accuracy_score(y_fold_test, y_pred)
    fold_accuracies.append(accuracy)

    # Print the accuracy score
    print("Accuracy:", accuracy)

# Summarise the folds; a large spread means the score depends heavily on which rows
# end up in the test fold
fold_summary = pd.Series(fold_accuracies)
print("Mean accuracy:", fold_summary.mean(), "Std:", fold_summary.std())


Accuracy: 0.8676802971252642
Accuracy: 0.9837662337662337
Accuracy: 0.9948687092054745
Accuracy: 0.8156400521681231
Accuracy: 0.9486626573366768
