In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt
import warnings
import os

from sklearn.preprocessing import StandardScaler

In [2]:
# 强制使用无GUI后端
matplotlib.use('Agg')

# 使用DejaVu Sans字体（大多数Linux系统都有）
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False

warnings.filterwarnings('ignore')


In [10]:
def load_and_preprocess_data(file_path):
    """
    Load and preprocess data
    """
    print("Loading data...")
    df = pd.read_parquet(file_path)
    print(f"Data shape: {df.shape}")

    print("\nLabel distribution:")
    print(df['Label'].value_counts())

    # Binary classification (Normal vs Anomaly)
    df['binary_label'] = df['Label'].apply(lambda x: 0 if x == 'Benign' else 1)

    # Get feature columns
    exclude_cols = ['Timestamp', 'Label', 'binary_label']
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    # Handle infinite and missing values
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)

    missing_counts = df[feature_cols].isnull().sum()
    if missing_counts.sum() > 0:
        print(f"\nFound missing values: {missing_counts.sum()}")
        df[feature_cols] = df[feature_cols].fillna(0)

    print(f"\nNumber of features: {len(feature_cols)}")
    
    # use standard normalization
    print("Applying scaler...")
    scaler = StandardScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
    
    #check success or not
    normalized_stats = df[feature_cols].describe()
    mean_check = abs(normalized_stats.loc['mean'].mean()) < 1e-10
    std_check = abs(normalized_stats.loc['std'].mean()-1.0) < 1e-10
    print(f"\nNormalization check - mean: {mean_check}, std: {std_check}")
    return df, feature_cols

In [4]:
def calculate_feature_importance(X, y, n_estimators=100, max_samples=50000):
    """
    Calculate feature importance using Random Forest
    """
    print(f"\nCalculating feature importance...")
    print(f"Parameters: n_estimators={n_estimators}")

    if len(X) > max_samples:
        print(f"Large dataset ({len(X)} samples), sampling {max_samples}...")
        _, X_sample, _, y_sample = train_test_split(
            X, y, test_size=max_samples / len(X),
            random_state=42, stratify=y
        )
    else:
        X_sample = X
        y_sample = y

    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    print("Training Random Forest model...")
    rf_classifier.fit(X_sample, y_sample)
    feature_importances = rf_classifier.feature_importances_

    return feature_importances

In [5]:
def analyze_and_visualize_importance(feature_cols, feature_importances, top_n=30):
    """
    Analyze and visualize feature importance
    """
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': feature_importances
    })

    importance_df = importance_df.sort_values('importance', ascending=False).reset_index(drop=True)
    importance_df['cumulative_importance'] = importance_df['importance'].cumsum()

    print(f"\nTop {top_n} most important features:")
    print("-" * 70)
    for idx, row in importance_df.head(top_n).iterrows():
        print(f"{idx + 1:3d}. {row['feature']:45s} {row['importance']:.6f}")

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    # Plot 1: Top N features importance bar chart
    top_features = importance_df.head(top_n)
    bars = ax1.barh(range(len(top_features)), top_features['importance'], color='skyblue')
    ax1.set_yticks(range(len(top_features)))
    ax1.set_yticklabels(top_features['feature'], fontsize=8)
    ax1.set_xlabel('Feature Importance', fontsize=12)
    ax1.set_title(f'Top {top_n} Most Important Features', fontsize=14, fontweight='bold')
    ax1.invert_yaxis()
    ax1.grid(axis='x', alpha=0.3)

    # Add value labels on bars
    for i, bar in enumerate(bars):
        width = bar.get_width()
        ax1.text(width + 0.001, bar.get_y() + bar.get_height()/2, 
                f'{width:.4f}', ha='left', va='center', fontsize=7)

    # Plot 2: Cumulative importance curve
    ax2.plot(range(1, len(importance_df) + 1), importance_df['cumulative_importance'], 
             'b-', linewidth=2, label='Cumulative Importance')
    ax2.axhline(y=0.95, color='r', linestyle='--', linewidth=2, label='95% Threshold')
    ax2.set_xlabel('Number of Features', fontsize=12)
    ax2.set_ylabel('Cumulative Importance', fontsize=12)
    ax2.set_title('Feature Cumulative Importance Curve', fontsize=14, fontweight='bold')
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    # Mark 95% point
    n_features_95 = (importance_df['cumulative_importance'] >= 0.95).idxmax() + 1
    ax2.axvline(x=n_features_95, color='g', linestyle='--', linewidth=2,
                label=f'{n_features_95} features reach 95%')
    ax2.legend(fontsize=10)

    plt.tight_layout()
    
    # Save the plot
    plt.savefig('feature_importance_analysis.png', dpi=300, bbox_inches='tight')
    plt.savefig('feature_importance_analysis.pdf', dpi=300, bbox_inches='tight')
    print("✓ Plots saved as 'feature_importance_analysis.png' and '.pdf'")
    plt.close()

    print(f"\n✓ {n_features_95} features needed to reach 95% cumulative importance")
    return importance_df

In [6]:
def create_distribution_plot(importance_df):
    """
    Create feature importance distribution plot
    """
    plt.figure(figsize=(12, 8))
    
    # Main histogram
    n, bins, patches = plt.hist(importance_df['importance'], bins=50, 
                               edgecolor='black', alpha=0.7, color='lightblue')
    
    # Color bars based on importance
    for i, patch in enumerate(patches):
        if bins[i] > importance_df['importance'].quantile(0.9):
            patch.set_facecolor('red')
        elif bins[i] > importance_df['importance'].quantile(0.7):
            patch.set_facecolor('orange')
    
    plt.xlabel('Feature Importance', fontsize=12)
    plt.ylabel('Number of Features', fontsize=12)
    plt.title('Feature Importance Distribution', fontsize=14, fontweight='bold')
    
    # Add statistics lines
    mean_val = importance_df['importance'].mean()
    median_val = importance_df['importance'].median()
    
    plt.axvline(x=mean_val, color='r', linestyle='--', linewidth=2,
                label=f'Mean: {mean_val:.6f}')
    plt.axvline(x=median_val, color='g', linestyle='--', linewidth=2,
                label=f'Median: {median_val:.6f}')
    
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    
    # Add text box with statistics
    stats_text = f"""Statistics:
    Mean: {mean_val:.6f}
    Median: {median_val:.6f}
    Std: {importance_df['importance'].std():.6f}
    Max: {importance_df['importance'].max():.6f}
    Min: {importance_df['importance'].min():.6f}"""
    
    plt.text(0.7, 0.7, stats_text, transform=plt.gca().transAxes,
             fontsize=9, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig('feature_importance_distribution.png', dpi=300, bbox_inches='tight')
    plt.savefig('feature_importance_distribution.pdf', dpi=300, bbox_inches='tight')
    print("✓ Distribution plots saved as 'feature_importance_distribution.png' and '.pdf'")
    plt.close()


In [7]:
def save_results(importance_df, output_path='feature_importance_results.csv'):
    """
    Save feature importance results
    """
    importance_df.to_csv(output_path, index=False)
    print(f"\n✓ Feature importance results saved to: {output_path}")

    # Save top features for 95% cumulative importance
    n_features_95 = (importance_df['cumulative_importance'] >= 0.95).idxmax() + 1
    important_features = importance_df.head(n_features_95)['feature'].tolist()

    with open('important_features_95.txt', 'w') as f:
        for feature in important_features:
            f.write(f"{feature}\n")

    print(f"✓ Top {n_features_95} important features saved to: important_features_95.txt")
    
    # Save different threshold versions
    thresholds = [0.8, 0.9, 0.95, 0.99]
    for threshold in thresholds:
        n_features = (importance_df['cumulative_importance'] >= threshold).idxmax() + 1
        features = importance_df.head(n_features)['feature'].tolist()
        
        filename = f'important_features_{int(threshold*100)}.txt'
        with open(filename, 'w') as f:
            for feature in features:
                f.write(f"{feature}\n")
        print(f"✓ Top {n_features} features ({threshold*100}%) saved to: {filename}")

In [8]:
def print_detailed_analysis(importance_df):
    """
    Print detailed analysis results
    """
    print("\n" + "="*80)
    print("DETAILED FEATURE IMPORTANCE ANALYSIS")
    print("="*80)
    
    print(f"\nBasic Statistics:")
    print(f"  Total features: {len(importance_df)}")
    print(f"  Mean importance: {importance_df['importance'].mean():.6f}")
    print(f"  Median importance: {importance_df['importance'].median():.6f}")
    print(f"  Standard deviation: {importance_df['importance'].std():.6f}")
    print(f"  Max importance: {importance_df['importance'].max():.6f}")
    print(f"  Min importance: {importance_df['importance'].min():.6f}")
    
    # Features needed for different thresholds
    print(f"\nFeatures needed for different cumulative importance thresholds:")
    for threshold in [0.5, 0.8, 0.9, 0.95, 0.99]:
        n_features = (importance_df['cumulative_importance'] >= threshold).idxmax() + 1
        percentage = (n_features / len(importance_df)) * 100
        print(f"  {threshold*100:2.0f}%: {n_features:4d} features ({percentage:5.1f}% of total)")
    
    # Low importance features
    low_thresholds = [0.0001, 0.001, 0.01]
    for threshold in low_thresholds:
        low_features = importance_df[importance_df['importance'] < threshold]
        percentage = (len(low_features) / len(importance_df)) * 100
        print(f"\nFeatures with importance < {threshold}: {len(low_features)} ({percentage:.1f}%)")
        
        if len(low_features) > 0 and threshold == 0.0001:
            print("  These features could be candidates for removal:")
            for _, row in low_features.head(5).iterrows():
                print(f"    - {row['feature']}: {row['importance']:.8f}")
            if len(low_features) > 5:
                print(f"    ... and {len(low_features) - 5} more")

In [14]:
"""
    Main function
    """
# Configuration
data_path = "../cicids2017/clean/all_data.parquet"
n_estimators = 50000  # Reduced for faster execution
max_samples = 100000
top_n_display = 30

print("="*80)
print("CICIDS2017 FEATURE IMPORTANCE ANALYSIS")
print("="*80)

try:
    # 1. Load and preprocess data
    df, feature_cols = load_and_preprocess_data(data_path)

    # 2. Prepare feature matrix and labels
    X = df[feature_cols].values
    y = df['binary_label'].values

    print(f"\n📊 Dataset Summary:")
    print(f"   Feature matrix shape: {X.shape}")
    print(f"   Normal samples: {(y == 0).sum():,} ({(y == 0).sum() / len(y) * 100:.2f}%)")
    print(f"   Anomalous samples: {(y == 1).sum():,} ({(y == 1).sum() / len(y) * 100:.2f}%)")

    # 3. Calculate feature importance
    feature_importances = calculate_feature_importance(
        X, y, n_estimators=n_estimators, max_samples=max_samples
    )

    # 4. Analyze and visualize results
    importance_df = analyze_and_visualize_importance(
        feature_cols, feature_importances, top_n=top_n_display
    )

    # 5. Create distribution plot
    create_distribution_plot(importance_df)

    # 6. Save results
    save_results(importance_df)

    # 7. Print detailed analysis
    print_detailed_analysis(importance_df)

    print(f"\n🎉 Analysis completed successfully!")
    print(f"📁 Check the following output files:")
    print(f"   - feature_importance_analysis.png/pdf")
    print(f"   - feature_importance_distribution.png/pdf")
    print(f"   - feature_importance_results.csv")
    print(f"   - important_features_*.txt")

except Exception as e:
    print(f"\n❌ Error: {str(e)}")
    import traceback
    traceback.print_exc()

CICIDS2017 FEATURE IMPORTANCE ANALYSIS
Loading data...
Data shape: (2497441, 70)

Label distribution:
Benign                        2071709
DoS Hulk                       172837
DDoS                           128014
PortScan                        90694
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1948
Web Attack  Brute Force         1470
Web Attack  XSS                  652
Infiltration                       36
Web Attack  Sql Injection         21
Heartbleed                         11
Name: Label, dtype: int64

Number of features: 68
Applying scaler...

Normalization check - mean: True, std: False

📊 Dataset Summary:
   Feature matrix shape: (2497441, 68)
   Normal samples: 2,071,709 (82.95%)
   Anomalous samples: 425,732 (17.05%)

Calculating feature importance...
Parameters: n_estimators=50000
Large datas

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 3986 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 4936 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done 5986 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 7136 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done 8386 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 9736 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 11186 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 12736 task


Top 30 most important features:
----------------------------------------------------------------------
  1. Bwd Packet Length Std                         0.081605
  2. Packet Length Variance                        0.073981
  3. Packet Length Std                             0.073657
  4. Avg Bwd Segment Size                          0.056573
  5. Bwd Packet Length Mean                        0.056257
  6. Avg Packet Size                               0.044413
  7. Bwd Packet Length Max                         0.041143
  8. Packet Length Max                             0.038322
  9. Packet Length Mean                            0.033564
 10. Subflow Bwd Bytes                             0.027837
 11. Bwd Packets Length Total                      0.027642
 12. Destination Port                              0.025845
 13. Fwd Packets Length Total                      0.021124
 14. Subflow Fwd Packets                           0.020820
 15. Subflow Fwd Bytes                             0.020