In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import os

# Input CSVs, one per attack class. Every file follows the same naming
# scheme, "<attack>_vs_BENIGN.csv", so the list is generated from the attack
# names. The "Web Attack" names contain an en dash (U+2013), not a hyphen.
file_names = [
    f"{attack}_vs_BENIGN.csv"
    for attack in (
        "Bot",
        "DDoS",
        "DoS GoldenEye",
        "DoS Hulk",
        "DoS Slowhttptest",
        "DoS slowloris",
        "FTP-Patator",
        "Heartbleed",
        "Infiltration",
        "PortScan",
        "SSH-Patator",
        "Web Attack – Brute Force",
        "Web Attack – Sql Injection",
        "Web Attack – XSS",
    )
]

# Per-attack feature ranking.
#
# For every file listed in `file_names`, fit a random forest on all columns
# except the label, rank the columns by the forest's feature importances, and
# write three artefacts into the same folder as the input:
#   <attack>_importance_list.txt     repr() of the 4 top-ranked column names
#   <attack>_feature_importance.png  bar chart of the 20 top-ranked columns
#   <attack>_feature_importance.csv  the full ranking (Feature, Importance, Percentage)
attacks_dir = "attacks"   # inputs are read from, and outputs written to, this folder
label_column = ' Label'   # target column; the name includes a leading space

for file_name in file_names:
    # Attack name is everything before "_vs_", e.g. "Bot_vs_BENIGN.csv" -> "Bot"
    attack_type = file_name.split("_vs_")[0]

    # Load the file, turn +/- infinity into NaN and then drop every row that
    # contains a NaN, so only finite, complete rows reach the classifier.
    # Built as a fresh frame instead of mutating in place.
    data = (
        pd.read_csv(os.path.join(attacks_dir, file_name))
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    # Separating features (X) and target variable (y)
    X = data.drop(columns=label_column)
    y = data[label_column]

    # Fixed random_state keeps the ranking reproducible between runs.
    # n_jobs=-1 only spreads tree building over all cores; with the seed fixed
    # the fitted forest is the same as with a single job.
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X, y)

    # One importance value per column of X, in column order
    feature_importances = clf.feature_importances_

    # Table of importances, most important first, plus the same value
    # expressed on a 0-100 scale
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    importance_df['Percentage'] = importance_df['Importance'] * 100

    # Print the top 20 features and their percentages for the current file
    print(f"\nTop 20 features and their percentages for {attack_type}:\n")
    print(importance_df.head(20))

    # Select the top 4 attributes and save them as a Python list literal
    top_4_attributes = importance_df.head(4)['Feature'].tolist()
    importance_file_path = os.path.join(attacks_dir, f"{attack_type}_importance_list.txt")
    # Explicit encoding so the file content does not depend on the platform default
    with open(importance_file_path, "w", encoding="utf-8") as f:
        f.write(repr(top_4_attributes))

    print(f"Saved importance list for {attack_type}")

    # Plot the 20 most important features on an explicit figure/axes pair
    top_20 = importance_df.head(20)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_20['Feature'], top_20['Importance'])
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title(f'Top 20 Feature Importances for {attack_type}')
    ax.invert_yaxis()  # Invert y-axis to show most important features at top
    fig.tight_layout()  # make room for long feature names so they are not cut off in the PNG
    fig.savefig(os.path.join(attacks_dir, f"{attack_type}_feature_importance.png"))
    plt.show()
    plt.close(fig)  # release the figure; one is created per file

    # Save the full ranking to a separate file
    importance_df.to_csv(os.path.join(attacks_dir, f"{attack_type}_feature_importance.csv"), index=False)



Top 20 features and their percentages for Bot:
                     Feature  Importance  Percentage
5                   Protocol    0.861703   86.170332
4           Destination Port    0.089629    8.962888
2                Source Port    0.042115    4.211482
43             Bwd Packets/s    0.005280    0.527998
41         Bwd Header Length    0.000207    0.020749
9     Total Backward Packets    0.000206    0.020642
70       Subflow Bwd Packets    0.000176    0.017640
73   Init_Win_bytes_backward    0.000175    0.017514
36             Fwd PSH Flags    0.000130    0.013037
42             Fwd Packets/s    0.000107    0.010695
50            SYN Flag Count    0.000092    0.009205
20              Flow Bytes/s    0.000069    0.006893
0                    Flow ID    0.000039    0.003870
18    Bwd Packet Length Mean    0.000030    0.003041
1                  Source IP    0.000015    0.001523
19     Bwd Packet Length Std    0.000012    0.001153
27              Fwd IAT Mean    0.000007    0.00074