<a href="https://colab.research.google.com/github/Arashi283/AIRepoOne/blob/main/DDoS_Attack_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. Load the Dataset ---
# Read the network-traffic samples from 'ddos_dataset.csv', resolved relative
# to the current working directory. (Per the original author's note, the data
# is synthetic.)
try:
    df = pd.read_csv('ddos_dataset.csv')
except FileNotFoundError:
    print("Error: 'ddos_dataset.csv' not found. Make sure the dataset file is in the same directory.")
    # Stop here with a non-zero status so callers can tell the run failed.
    # SystemExit is raised directly instead of calling `exit()`: that helper
    # is added by the `site` module for interactive convenience and is not
    # guaranteed to be defined in every interpreter setup.
    raise SystemExit(1) from None

# Preview the data before touching it: the leading rows first, then the
# per-column summary that DataFrame.info() writes to stdout.
print("--- First 5 rows of the dataset ---", df.head(), sep="\n")
print("\n--- Dataset Info ---")
df.info()

# --- 2. Preprocessing the Data ---

# 'protocol' and 'label' are the categorical columns; each gets its own
# LabelEncoder so the fitted encoders stay available afterwards (the label
# encoder is needed again to recover the original class names).
label_encoder_protocol, label_encoder_label = LabelEncoder(), LabelEncoder()

# Overwrite both columns in place with their integer codes.
df['protocol'] = label_encoder_protocol.fit_transform(df['protocol'])
df['label'] = label_encoder_label.fit_transform(df['label'])

# Target vector is the encoded 'label' column; every other column is a feature.
y = df['label']
X = df.drop(columns='label')

# Encoded value -> original label, following the order of the fitted classes_.
label_mapping = dict(enumerate(label_encoder_label.classes_))
print(f"\nLabel mapping: {label_mapping}")


# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Feature Scaling
# Standardize every feature to zero mean and unit variance. The scaler learns
# its statistics from the training partition only and is then applied to both
# partitions, so no information from the test set leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# --- 3. Train the Machine Learning Model ---

# A random forest (an ensemble of decision trees) is the classifier used here.
print("\n--- Training the RandomForest Classifier ---")

# Hyper-parameters gathered in one place: 100 trees, a fixed seed for
# reproducible runs, and class weights balanced by class frequency.
forest_params = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_params)

# Fit on the scaled training partition.
model.fit(X_train, y_train)
print("Model training complete.")


# --- 4. Evaluate the Model ---

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# LabelEncoder maps classes_[i] -> i, so the encoded class ids are simply
# 0..n_classes-1 and the display names are classes_ in that same order.
# Names are converted to str because the report formats them as text.
class_ids = list(range(len(label_encoder_label.classes_)))
target_names = label_encoder_label.classes_.astype(str)

# Display a detailed classification report.
# `labels` is passed explicitly so the rows always line up one-to-one with
# `target_names`, even if some class is absent from both y_test and y_pred
# (otherwise the label set is inferred from the data and can come up short).
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=target_names))

# Display the confusion matrix (rows = actual class, columns = predicted).
# The same explicit `labels` keeps the matrix n_classes x n_classes, so it
# always matches the tick labels used in the heatmap below.
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print(cm)

# Visualize the confusion matrix using a heatmap for better interpretation
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

# --- 5. Feature Importance ---
# Rank the input columns by the importance score the fitted forest assigns
# to each of them, highest first.
print("\n--- Feature Importance ---")
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=X.columns,
).sort_values(by='importance', ascending=False)
print(feature_importances)

# Horizontal bar chart of the same ranking: one bar per feature.
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances['importance'], y=feature_importances.index)
plt.title('Feature Importance')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.show()