In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Set style for plots
# Start from matplotlib's built-in 'default' style sheet, then select
# seaborn's "husl" color palette for subsequent plots.
plt.style.use('default')
sns.set_palette("husl")

# Reached only when every import and style call above succeeded.
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the dataset.
# A single-shot pd.read_csv of this file previously ended in a MemoryError, so
# the CSV is parsed in fixed-size chunks and every numeric column of each chunk
# is downcast before the pieces are stitched together. This keeps the parser's
# peak allocation bounded by CHUNK_ROWS instead of by the file size.
DATA_PATH = '../data/combined_realistic.csv'
CHUNK_ROWS = 100_000  # rows parsed per chunk; lower this on very small machines


def _downcast_numeric(frame):
    """Shrink integer and float columns of ``frame`` to smaller dtypes.

    Uses ``pd.to_numeric(..., downcast=...)``. Integer downcasting is lossless;
    float columns may end up as float32, trading precision for memory.
    The frame is modified in place and returned for convenience.
    """
    for col in frame.select_dtypes(include=['integer']).columns:
        frame[col] = pd.to_numeric(frame[col], downcast='integer')
    for col in frame.select_dtypes(include=['floating']).columns:
        frame[col] = pd.to_numeric(frame[col], downcast='float')
    return frame


chunks = [
    _downcast_numeric(chunk)
    for chunk in pd.read_csv(DATA_PATH, chunksize=CHUNK_ROWS)
]
# Chunks may disagree on the downcast dtype of a column; concat upcasts those
# columns to a common dtype, so no values are lost here.
df = pd.concat(chunks, ignore_index=True)
del chunks  # release the per-chunk frames as soon as the full frame exists

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

MemoryError: Unable to allocate 64.0 KiB for an array with shape (8192,) and data type int64

In [None]:
# Dataset overview: size, memory footprint, dtypes, missing values, duplicates
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
dtype_counts = df.dtypes.value_counts()

print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")
print("\nData types:")
print(dtype_counts)

# Per-column count of null cells; only columns that actually have gaps are shown
print("\nMissing values:")
missing = df.isna().sum()
has_missing = missing.sum() > 0
print(missing[missing > 0] if has_missing else "No missing values found!")

duplicate_rows = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_rows}")

# Class balance of the target column
print("\n=== LABEL DISTRIBUTION ===")
label_counts = df['Label'].value_counts()
label_pct = (label_counts / len(df) * 100).round(2)
print(label_counts)
print("\nClass distribution percentages:")
print(label_pct)

# Side-by-side view of the same counts: shares on the left, absolute counts on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
title_style = {'fontsize': 14, 'fontweight': 'bold'}

# Left panel: pie chart
label_counts.plot(kind='pie', ax=ax1, autopct='%1.1f%%', startangle=90)
ax1.set_title('Label Distribution (Pie Chart)', **title_style)
ax1.set_ylabel('')

# Right panel: bar chart
label_counts.plot(kind='bar', ax=ax2, color='skyblue', edgecolor='black')
ax2.set_title('Label Distribution (Bar Chart)', **title_style)
ax2.set_xlabel('Labels')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Separate features and target
# Get numeric columns, excluding Label and Protocol if present. The exclusion
# has to be explicit: select_dtypes keeps both columns whenever they are stored
# as numbers, and a numeric 'Label' left in this list would later be fed to the
# feature selectors as a predictor of itself.
NON_FEATURE_COLUMNS = ('Label', 'Protocol')
numeric_columns = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in NON_FEATURE_COLUMNS
]
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"\nNumeric features: {len(numeric_columns)}")
print(f"Categorical features: {len(categorical_columns)}")

# Protocol is reported on its own (ten most frequent values) rather than as a feature
if 'Protocol' in df.columns:
    protocol_counts = df['Protocol'].value_counts()
    print(f"\nProtocol distribution:")
    print(protocol_counts.head(10))

In [None]:
# Summary statistics for the numeric feature columns
print("\n=== BASIC STATISTICS ===")
numeric_frame = df[numeric_columns]
numeric_stats = numeric_frame.describe()
print("Basic statistics computed. Shape:", numeric_stats.shape)

# Preview: describe() output restricted to the first five columns
print("\nSample statistics (first 5 features):")
print(numeric_stats.iloc[:, :5].round(4))

# Columns whose sample variance is exactly zero carry no information
zero_var_features = [col for col in numeric_columns if df[col].var() == 0]

print(f"\nZero variance features: {len(zero_var_features)}")
if zero_var_features:
    print(zero_var_features)

# Per-feature distribution descriptors
print("\n=== FEATURE DISTRIBUTION ANALYSIS ===")

# One row per numeric feature, one column per descriptor
feature_stats = pd.DataFrame(
    {
        'mean': numeric_frame.mean(),
        'std': numeric_frame.std(),
        'skewness': numeric_frame.skew(),
        'kurtosis': numeric_frame.kurtosis(),
        'zeros_pct': (numeric_frame == 0).sum() / len(df) * 100,
        'unique_values': numeric_frame.nunique(),
        'range': numeric_frame.max() - numeric_frame.min(),
    },
    index=numeric_columns,
)

mostly_zero_count = (feature_stats['zeros_pct'] > 50).sum()
highly_skewed_count = (feature_stats['skewness'].abs() > 2).sum()
print("Feature statistics calculated!")
print(f"\nFeatures with >50% zeros: {mostly_zero_count}")
print(f"Highly skewed features (|skew| > 2): {highly_skewed_count}")

# Histograms of the twelve features with the largest standard deviation
top_features_by_std = feature_stats.nlargest(12, 'std').index.tolist()

fig, axes = plt.subplots(3, 4, figsize=(20, 15))
axes = axes.reshape(-1)  # flat sequence of the 12 panels

for panel, feature in zip(axes, top_features_by_std):
    feature_std = feature_stats.loc[feature, 'std']
    df[feature].hist(bins=50, ax=panel, alpha=0.7, color='lightblue', edgecolor='black')
    panel.set_title(f'{feature}\n(std: {feature_std:.2f})', fontsize=10)
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')

plt.suptitle('Distribution of Top 12 Features by Standard Deviation', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between numeric features
print("\n=== CORRELATION ANALYSIS ===")

# With more than 50 numeric columns the matrix is limited to the 30 columns
# that have the largest standard deviation; otherwise every column is used.
corr_columns = numeric_columns
if len(numeric_columns) > 50:
    print("Large number of features detected. Using top 30 features by variance for correlation analysis.")
    top_var_features = feature_stats.nlargest(30, 'std').index.tolist()
    corr_columns = top_var_features
corr_matrix = df[corr_columns].corr()

# Heatmap of the lower triangle only: the mask hides the diagonal and everything above it
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Find highly correlated features
def find_correlated_features(corr_matrix, threshold=0.9):
    """List column pairs whose absolute correlation exceeds ``threshold``.

    Only cells strictly above the diagonal are inspected, so each unordered
    pair appears at most once and self-correlations are never reported.

    Parameters:
        corr_matrix: square DataFrame of pairwise correlations.
        threshold: cut-off compared (strictly) against the absolute value.

    Returns:
        A list of ``(column_a, column_b, correlation)`` tuples in row-major
        order of the upper triangle.
    """
    names = corr_matrix.columns
    size = len(names)
    upper_cells = (
        (row, col, corr_matrix.iloc[row, col])
        for row in range(size)
        for col in range(row + 1, size)
    )
    return [
        (names[row], names[col], value)
        for row, col, value in upper_cells
        if abs(value) > threshold
    ]

# Collect pairs with |correlation| above 0.8 and print the first ten of them
high_corr = find_correlated_features(corr_matrix, 0.8)
print(f"\nHighly correlated feature pairs (>0.8): {len(high_corr)}")
for left, right, coefficient in high_corr[:10]:
    print(f"{left} <-> {right}: {coefficient:.3f}")

In [None]:
# ============== FEATURE SELECTION ==============
print("\n" + "="*50)
print("FEATURE SELECTION FOR TOP 25 FEATURES")
print("="*50)

# Prepare data for feature selection
# The target must never sit among the predictors: when 'Label' is stored as a
# number it is part of numeric_columns, so it is filtered out explicitly here.
feature_columns = [col for col in numeric_columns if col != 'Label']
# +/-inf is turned into NaN first so that the fill below covers it too;
# NOTE(review): the scikit-learn scorers used later are expected to reject
# non-finite input - confirm against the installed version.
X = (
    df[feature_columns]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)  # Fill any NaN with 0
)
y = df['Label']

# Encode labels unless they are already numeric. Checking for "numeric" rather
# than for dtype 'object' also covers category and string dtypes.
le = LabelEncoder()
if pd.api.types.is_numeric_dtype(y):
    # Plain ndarray so that later positional indexing does not depend on y's index
    y_encoded = y.to_numpy()
else:
    y_encoded = le.fit_transform(y)
    print(f"Label encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y_encoded.shape}")

# Method 1: Statistical Feature Selection (F-score)
print("\n1. Statistical Feature Selection (ANOVA F-test)")
# k is capped at the number of available columns so the selector also works
# on frames with fewer than 25 features.
selector_f = SelectKBest(score_func=f_classif, k=min(25, X.shape[1]))
X_f_selected = selector_f.fit_transform(X, y_encoded)

# One row per feature with its F statistic and p-value, best feature first
f_scores = pd.DataFrame({
    'feature': X.columns,
    'f_score': selector_f.scores_,
    'p_value': selector_f.pvalues_
}).sort_values('f_score', ascending=False)

print("Top 25 features by F-score:")
top_25_f_score = f_scores.head(25)
print(top_25_f_score[['feature', 'f_score']].to_string(index=False))

# Method 2: Mutual Information
print("\n2. Mutual Information Feature Selection")


def _seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random_state.

    The fixed seed makes the scores, and therefore the ranking, repeatable
    from one run of the notebook to the next.
    """
    return mutual_info_classif(features, target, random_state=42)


# k is capped at the number of available columns so the selector also works
# on frames with fewer than 25 features.
selector_mi = SelectKBest(score_func=_seeded_mutual_info, k=min(25, X.shape[1]))
X_mi_selected = selector_mi.fit_transform(X, y_encoded)

# One row per feature with its mutual-information score, best feature first
mi_scores = pd.DataFrame({
    'feature': X.columns,
    'mi_score': selector_mi.scores_
}).sort_values('mi_score', ascending=False)

print("Top 25 features by Mutual Information:")
top_25_mi = mi_scores.head(25)
print(top_25_mi.to_string(index=False))

# Method 3: Random Forest Feature Importance
print("\n3. Random Forest Feature Importance")
# Use a sample if dataset is too large for faster computation
if len(X) > 10000:
    # Seeded generator: the forest below is seeded too, so without this the
    # sampled rows would be the only source of run-to-run variation.
    sample_rng = np.random.default_rng(42)
    sample_idx = sample_rng.choice(len(X), 5000, replace=False)
    X_sample = X.iloc[sample_idx]
    # sample_idx holds row positions; going through an ndarray keeps the
    # lookup positional even when y_encoded is a pandas Series.
    y_sample = np.asarray(y_encoded)[sample_idx]
    print("Using random sample of 5000 rows for RF feature importance")
else:
    X_sample = X
    y_sample = y_encoded

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_sample, y_sample)

# One row per feature with the fitted forest's importance, best feature first
rf_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 25 features by Random Forest Importance:")
top_25_rf = rf_importance.head(25)
print(top_25_rf.to_string(index=False))

# Method 4: rank features by their sample variance
print("\n4. Variance-based Feature Selection")
feature_variances = X.var().rename('variance')
variance_scores = (
    feature_variances
    .to_frame()
    .assign(feature=X.columns)          # same order as the variance index
    [['feature', 'variance']]
    .sort_values(by='variance', ascending=False)
)

print("Top 25 features by Variance:")
top_25_variance = variance_scores.iloc[:25]
print(top_25_variance.to_string(index=False))

# Method 5: Combined Score (Ensemble approach)
print("\n5. Combined Feature Selection (Ensemble)")


def _min_max_by_feature(scores, value_col):
    """Return ``scores[value_col]`` scaled to 0-1 as a Series indexed by feature name.

    Indexing by name matters: each score table is sorted by its own metric,
    so the tables can only be combined by label, never by row position.
    Missing scores and constant columns are mapped to 0.
    """
    values = scores.set_index('feature')[value_col]
    span = values.max() - values.min()
    if not span > 0:  # constant or all-NaN column: nothing to scale
        return pd.Series(0.0, index=values.index, name=value_col)
    return ((values - values.min()) / span).fillna(0.0)


# Normalize scores to 0-1 range
f_scores_norm = _min_max_by_feature(f_scores, 'f_score')
mi_scores_norm = _min_max_by_feature(mi_scores, 'mi_score')
rf_scores_norm = _min_max_by_feature(rf_importance, 'importance')
var_scores_norm = _min_max_by_feature(variance_scores, 'variance')

# Create combined score
# The four Series are aligned on feature name by the DataFrame constructor;
# reindex restores the column order of X before the names become a column.
combined_scores = (
    pd.DataFrame({
        'f_score_norm': f_scores_norm,
        'mi_score_norm': mi_scores_norm,
        'rf_score_norm': rf_scores_norm,
        'var_score_norm': var_scores_norm,
    })
    .reindex(X.columns)
    .rename_axis('feature')
    .reset_index()
)

# Weighted combination (you can adjust weights)
weights = {'f_score_norm': 0.3, 'mi_score_norm': 0.3, 'rf_score_norm': 0.3, 'var_score_norm': 0.1}
combined_scores['combined_score'] = sum(
    combined_scores[column] * weight for column, weight in weights.items()
)

combined_scores = combined_scores.sort_values('combined_score', ascending=False)

print("Top 25 features by Combined Score:")
top_25_combined = combined_scores.head(25)
print(top_25_combined[['feature', 'combined_score']].to_string(index=False))
