In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np
import os

# Label used for normal traffic, followed by every attack label that gets
# its own "<attack>_vs_<benign>" comparison.
benign_type = "BENIGN"
attack_types = [
    "Bot",
    "DDoS",
    "DoS GoldenEye",
    "DoS Hulk",
    "DoS Slowhttptest",
    "DoS slowloris",
    "FTP-Patator",
    "Heartbleed",
    "Infiltration",
    "PortScan",
    "SSH-Patator",
    "Web Attack - Brute Force",
    "Web Attack - Sql Injection",
    "Web Attack - XSS",
]

# Function for feature selection and importance list creation
def perform_feature_selection(data):
    """Score every feature of an attack-vs-benign frame with a random forest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Label" column; every other column is used as a
        feature and is passed to the forest unchanged.

    Returns
    -------
    numpy.ndarray
        The fitted forest's ``feature_importances_``, in the column order of
        ``data`` with "Label" removed.

    Raises
    ------
    KeyError
        If ``data`` has no "Label" column.
    """
    X = data.drop(columns=["Label"])
    # Binary target: 1 for any label other than the benign one, 0 for benign.
    # A vectorized comparison replaces the per-row Python lambda.
    y = (data["Label"] != benign_type).astype(int)

    # NOTE(review): a regressor is fitted on a 0/1 target; kept as-is so the
    # importance values do not change - confirm whether a classifier was meant.
    clf = RandomForestRegressor(n_estimators=100, random_state=42)
    clf.fit(X, y)

    return clf.feature_importances_

# Iterate over attack types: rank the features of each attack-vs-benign file,
# save the ranking as CSV and plot the 20 highest-ranked features.
for attack_type in attack_types:
    # Load the attack vs. benign file
    input_filename = f"{attack_type}_vs_{benign_type}.csv"
    attack_data = pd.read_csv(input_filename, low_memory=False)

    # A file without rows cannot be fitted; skip it with a message so the
    # remaining attack types are still processed.
    if attack_data.empty:
        print(f"Skipping {attack_type}: {input_filename} contains no rows.")
        continue

    # Perform feature selection
    importances = perform_feature_selection(attack_data)

    # Pair each feature name with its score. Index.drop yields the same names
    # as dropping the column from the frame, without copying the data.
    importance_df = pd.DataFrame({"Feature": attack_data.columns.drop("Label"),
                                  "Importance": importances})

    # Share of the total importance per feature, in percent. If the scores sum
    # to zero the division would produce NaN, so report 0 % instead.
    total_importance = importance_df["Importance"].sum()
    if total_importance > 0:
        importance_df["Percentage"] = importance_df["Importance"] / total_importance * 100
    else:
        importance_df["Percentage"] = 0.0

    # Sort the DataFrame by importance in descending order
    importance_df = importance_df.sort_values(by="Importance", ascending=False)

    # Print the top 20 features and their percentages
    top_20_df = importance_df.head(20)
    print(f"\nTop 20 features and their percentages for {attack_type}:")
    print(top_20_df)

    # Save the full importance list to a CSV file
    importance_filename = f"{attack_type}_importance.csv"
    importance_df.to_csv(importance_filename, index=False)
    print(f"Saved importance list for {attack_type}")

    # Bar plot of the top 20 importances. The Axes is passed explicitly:
    # without ax=, DataFrame.plot opens its own figure, so the sized figure
    # would stay blank and the figsize would not apply to the real plot.
    fig, ax = plt.subplots(figsize=(10, 6))
    top_20_df.plot(kind="bar", x="Feature", y="Importance", legend=False, ax=ax)
    ax.set_title(f"Feature Importance for {attack_type}")
    ax.set_xlabel("Feature")
    ax.set_ylabel("Importance")
    # Right-align the rotated labels so each one ends under its own bar.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    plt.show()
    # One figure is created per attack type; release it once displayed.
    plt.close(fig)

print("Feature selection and visualization completed for all attack types.")

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Absolute, machine-specific Windows path to preprocessed_train_data2.csv.
# NOTE(review): nothing visible in this cell reads this global - the
# feature_selection() parameter of the same name shadows it, and the calls
# below pass entries of `files` - confirm whether other cells rely on it.
file_path = r"C:\Users\hp\Desktop\ML projects\CYBER_AI\preprocessed_train_data2.csv"

# Function to perform feature selection
def feature_selection(file_path, top_n=4, label_column=None):
    """Rank the features of one CSV dataset with a random forest classifier.

    The CSV is split 70/30 with a fixed seed, the classifier is fitted on the
    70 % training part only, and the features are ordered by the fitted
    model's ``feature_importances_``. The held-out 30 % is not used.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to load with ``pandas.read_csv``.
    top_n : int, default 4
        Number of highest-ranked features to return.
    label_column : str, optional
        Name of the target column. Defaults to the last column of the file.

    Returns
    -------
    pandas.DataFrame or None
        The ``top_n`` rows of a frame with columns 'Feature' and 'Importance',
        sorted by descending importance; ``None`` (after printing a message)
        when the file holds no data rows.

    Raises
    ------
    KeyError
        If ``label_column`` is given but is not a column of the file.
    """
    # A zero-byte file makes read_csv raise EmptyDataError before the
    # emptiness check below can run; treat it like any other empty dataset.
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame()

    # Check if the DataFrame is empty (this also covers header-only files)
    if df.empty:
        print(f"DataFrame from {file_path} is empty.")
        return None

    # Unless told otherwise, the last column is taken as the label
    if label_column is None:
        label_column = df.columns[-1]
    X = df.drop(columns=[label_column])  # Features
    y = df[label_column]  # Labels

    # Same split as before; only the training part is needed here, so the
    # test part is discarded instead of being bound to unused names.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fit a seeded Random Forest classifier on the training data
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Pair feature names with their importance scores, highest score first
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    return feature_importance_df.head(top_n)

# Attack-vs-benign CSV files to analyse (absolute Windows paths)
files = [
    r"C:\Users\hp\Desktop\ML projects\CYBER_AI\Bot_vs_BENIGN.csv",
    r"C:\Users\hp\Desktop\ML projects\CYBER_AI\DDoS_vs_BENIGN.csv",
    r"C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS GoldenEye_vs_BENIGN.csv",
    r"C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS Hulk_vs_BENIGN.csv",
    r"C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS slowloris_vs_BENIGN.csv",
]

# One delimited report section per file: a heading, then the selected
# features when there are any, then a separator line.
for file in files:
    print(f"Selected features for {file}:")
    top_features = feature_selection(file)
    if top_features is None:
        # Nothing was selected for this file, so there is no table to print.
        pass
    else:
        print(top_features)
    print("-----------------------------")


Selected features for C:\Users\hp\Desktop\ML projects\CYBER_AI\Bot_vs_BENIGN.csv:
DataFrame from C:\Users\hp\Desktop\ML projects\CYBER_AI\Bot_vs_BENIGN.csv is empty.
-----------------------------
Selected features for C:\Users\hp\Desktop\ML projects\CYBER_AI\DDoS_vs_BENIGN.csv:
DataFrame from C:\Users\hp\Desktop\ML projects\CYBER_AI\DDoS_vs_BENIGN.csv is empty.
-----------------------------
Selected features for C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS GoldenEye_vs_BENIGN.csv:
DataFrame from C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS GoldenEye_vs_BENIGN.csv is empty.
-----------------------------
Selected features for C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS Hulk_vs_BENIGN.csv:
DataFrame from C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS Hulk_vs_BENIGN.csv is empty.
-----------------------------
Selected features for C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS slowloris_vs_BENIGN.csv:
DataFrame from C:\Users\hp\Desktop\ML projects\CYBER_AI\DoS slowloris_vs_BENIGN.csv is empty