# More data and merged dos lables - different features

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
Friday_data = "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
Wednesday_data = "Wednesday-workingHours.pcap_ISCX.csv"
# Read the existing DataFrame from the first CSV file
df1 = pd.read_csv(Friday_data, sep=',')

# Read the new data from the second CSV file
df2 = pd.read_csv(Wednesday_data, sep=',')

# Append the new data to the end of the existing DataFrame
merged_df = pd.concat([df1, df2], ignore_index=True)


In [14]:
merged_df.shape

(918448, 79)

In [15]:
merged_df.isnull().any().any() # check if we have nan values.

True

In [16]:
#PREPROCCSING
# Replace infinite or very large values with NaN
merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)

#drop any instance that is missing a feture
merged_df = merged_df.dropna()
merged_df.isnull().any().any() # check if we have nan values.

False

In [17]:
merged_df[' Label'].unique()

array(['BENIGN', 'DDoS'], dtype=object)

In [18]:
merged_df.shape

(917117, 79)

In [20]:
#Selected fetaures from an article 
columns_to_keep = [
    ' Fwd IAT Min',
    'Init_Win_bytes_forward',
    ' Destination Port',
    ' Bwd Packet Length Min',
    ' Init_Win_bytes_backward',
    ' Subflow Fwd Bytes',
    ' Total Fwd Packets',
    ' Total Length of Bwd Packets',
    ' Bwd Packet Length Mean',
    ' Fwd Packet Length Min'
]
X = merged_df[columns_to_keep]
y = merged_df[' Label']

In [21]:
X

Unnamed: 0,Fwd IAT Min,Init_Win_bytes_forward,Destination Port,Bwd Packet Length Min,Init_Win_bytes_backward,Subflow Fwd Bytes,Total Fwd Packets,Total Length of Bwd Packets,Bwd Packet Length Mean,Fwd Packet Length Min
0,3,33,54865,0,-1,12,2,0,0.0,6
1,0,29,55054,6,256,6,1,6,6.0,6
2,0,29,55055,6,256,6,1,6,6.0,6
3,0,31,46236,6,329,6,1,6,6.0,6
4,3,32,54863,0,-1,12,2,0,0.0,6
...,...,...,...,...,...,...,...,...,...,...
918443,4,-1,53,76,-1,112,4,152,76.0,28
918444,2,-1,53,181,-1,84,2,362,181.0,42
918445,4,1006,58030,6,0,31,2,6,6.0,0
918446,1,-1,53,128,-1,192,6,256,128.0,32


In [None]:
#Random forest - split into train,test

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

#standardScaler normalization
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

rf = RandomForestClassifier(max_depth=100, random_state=45)
rf.fit(X_train_transformed, y_train)
y_predicted = rf.predict(X_test_transformed)


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Assuming y_predicted and y_test are your predicted and true labels, respectively.
accuracy = accuracy_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted, pos_label='DDoS')
recall = recall_score(y_test, y_predicted, pos_label='DDoS')
f1 = f1_score(y_test, y_predicted, pos_label='DDoS')
conf_matrix = confusion_matrix(y_test, y_predicted)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9997655704815074
Precision: 0.9996829506724088
Recall: 0.9997489860356968
F1 Score: 0.9997159672635758
Confusion Matrix:
 [[107707     24]
 [    19  75674]]


In [None]:
#XGboost - split into train,test

In [25]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

#Initialize LabelEncoder
label_encoder = LabelEncoder()

#create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Fit and transform the labels in both training and test sets
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

#standardScaler normalization
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

#Initialize XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_transformed, y_train_encoded)
y_predicted_encoded = xgb_model.predict(X_test_transformed)

#Inverse transform the encoded labels to get the original class labels
y_predicted = label_encoder.inverse_transform(y_predicted_encoded)

#Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted, average='macro')
recall = recall_score(y_test, y_predicted, average='macro')
f1 = f1_score(y_test, y_predicted, average='macro')
conf_matrix = confusion_matrix(y_test, y_predicted)

#Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9997982815771109
Precision: 0.9997742380609639
Recall: 0.9998104000612058
F1 Score: 0.9997923073615993
Confusion Matrix:
 [[107248     28]
 [     9  76139]]


In [168]:
from sklearn.feature_selection import RFE
rf = RandomForestClassifier(random_state=45)
# Initialize RFE with RandomForestClassifier
rfe = RFE(estimator=rf, n_features_to_select=15)
# Fit RFE and get the selected features
X_train_rfe = rfe.fit_transform(X_train_transformed, y_train)

In [170]:
selected_feature=rfe.support_
selected_feature_names=X.columns[selected_feature]
print(selected_feature_names)

Index([' Fwd IAT Min', 'Init_Win_bytes_forward', ' Destination Port',
       ' Bwd Packet Length Min', ' Init_Win_bytes_backward',
       ' Subflow Fwd Bytes', ' Total Fwd Packets',
       ' Total Length of Bwd Packets', ' Bwd Packet Length Mean',
       ' Fwd Packet Length Min'],
      dtype='object')
