<a href="https://colab.research.google.com/github/AbdulsemedShalo/DDOS-Detection-and-Mitigation/blob/main/FeatureOrder1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/AbdulsemedShalo/DDOS-Detection-and-Mitigation/main/Dataset.csv')

# because ip addresses are string can't be converted to int to train, drop them.
data = data.drop("Source.IP", axis=1)
data = data.drop("Destination.IP", axis=1)

label_encoder = preprocessing.LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

# Extraxt the features you want to base the foundation of our model training
features = ['Max_Packet_Length','Fwd_Packet_Length_Max','Flow_Packets_Sec','Flow_Bytes_Sec','Packet_Length_Std'
           ,'Packet_Length_Variance','Flow_IAT_Max','Fwd_IAT_Max','Subflow_Fwd_Bytes','Fwd_Packet_Length_Std'
           ,'Bwd_Packets_Sec','min_seg_size_forward','Init_Win_bytes_backward','Average_Packet_Size'
           ,'Packet_Length_Mean','Fwd_IAT_Total','Flow_IAT_Std','Fwd_IAT_Std','Avg_Fwd_Segment_Size'
           ,'Fwd_Packet_Length_Mean','Fwd_Header_Length','Fwd_IAT_Mean','Flow_IAT_Mean','Idle_Max'
           ,'Idle_Mean','Fwd_Packets_Sec']



# Replace missing values with the mean of the column
data['Fwd_Packets_Sec'].fillna(data['Fwd_Packets_Sec'].median(), inplace=True)
data['Idle_Mean'].fillna(data['Idle_Mean'].median(), inplace=True)
data['Idle_Max'].fillna(data['Idle_Max'].median(), inplace=True)
data['Flow_IAT_Mean'].fillna(data['Flow_IAT_Mean'].median(), inplace=True)

data['Fwd_IAT_Mean'].fillna(data['Fwd_IAT_Mean'].median(), inplace=True)
data['Fwd_Header_Length'].fillna(data['Fwd_Header_Length'].median(), inplace=True)
data['Fwd_Packet_Length_Mean'].fillna(data['Fwd_Packet_Length_Mean'].median(), inplace=True)
data['Avg_Fwd_Segment_Size'].fillna(data['Avg_Fwd_Segment_Size'].median(), inplace=True)

data['Fwd_IAT_Std'].fillna(data['Fwd_IAT_Std'].median(), inplace=True)
data['Fwd_IAT_Total'].fillna(data['Fwd_IAT_Total'].median(), inplace=True)
data['Flow_IAT_Std'].fillna(data['Flow_IAT_Std'].median(), inplace=True)
data['Packet_Length_Mean'].fillna(data['Packet_Length_Mean'].median(), inplace=True)

data['Average_Packet_Size'].fillna(data['Average_Packet_Size'].median(), inplace=True)
data['Init_Win_bytes_backward'].fillna(data['Init_Win_bytes_backward'].median(), inplace=True)
data['min_seg_size_forward'].fillna(data['min_seg_size_forward'].median(), inplace=True)
data['Bwd_Packets_Sec'].fillna(data['Bwd_Packets_Sec'].median(), inplace=True)

data['Fwd_Packet_Length_Std'].fillna(data['Fwd_Packet_Length_Std'].median(), inplace=True)
data['Subflow_Fwd_Bytes'].fillna(data['Subflow_Fwd_Bytes'].median(), inplace=True)
data['Fwd_IAT_Max'].fillna(data['Fwd_IAT_Max'].median(), inplace=True)
data['Flow_IAT_Max'].fillna(data['Flow_IAT_Max'].median(), inplace=True)

data['Packet_Length_Variance'].fillna(data['Packet_Length_Variance'].median(), inplace=True)
data['Packet_Length_Std'].fillna(data['Packet_Length_Std'].median(), inplace=True)
data['Flow_Bytes_Sec'].fillna(data['Flow_Bytes_Sec'].median(), inplace=True)
data['Flow_Packets_Sec'].fillna(data['Flow_Packets_Sec'].median(), inplace=True)

data['Fwd_Packet_Length_Max'].fillna(data['Fwd_Packet_Length_Max'].median(), inplace=True)
data['Max_Packet_Length'].fillna(data['Max_Packet_Length'].median(), inplace=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, data["Label"], test_size=0.2)

# Transform the negative feature values to positive values
features_positive = X_train.copy()
features_positive[features_positive < 0] = 0

# Create a logistic regression model
model = RandomForestRegressor()

# Re-fit the model to the transformed data
model.fit(features_positive, y_train)

# Make predictions on the test data
predictions = model.predict(features_positive)

# Evaluate the model performance
print('Mean absolute error:', np.mean(np.abs(predictions - y_train)))

# Calculate the mutual information between each feature and the labels
mi_scores = mutual_info_classif(X_train, y_train)

# Sort the features by their mutual information scores
sorted_features = np.argsort(mi_scores)

# Print the features in order of importance
print('Features in order of importance:')
for i in sorted_features[::-1]:
    print(data.columns[i])



Mean absolute error: 0.0
Features in order of importance:
Label
Max_Packet_Length
Fwd_Packet_Length_Max
Flow_Packets_Sec
Flow_Bytes_Sec
Packet_Length_Variance
Packet_Length_Std
Flow_IAT_Max
Subflow_Fwd_Bytes
Fwd_IAT_Max
min_seg_size_forward
Fwd_Packet_Length_Std
Bwd_Packets_Sec
Init_Win_bytes_backward
Packet_Length_Mean
Average_Packet_Size
Flow_IAT_Std
Fwd_IAT_Total
Fwd_IAT_Std
Avg_Fwd_Segment_Size
Fwd_Packet_Length_Mean
Fwd_Header_Length
Fwd_IAT_Mean
Flow_IAT_Mean
Idle_Max
Idle_Mean
Fwd_Packets_Sec
