# In [None]:
## 1. Generate Synthetic Security Data

# Let's create a realistic dataset that mimics network security logs with both normal and anomalous behavior.
# Set random seed for reproducibility.
# This seeds NumPy's global random state, which is what the np.random.* draws
# inside generate_security_data (defined below) consume.
np.random.seed(42)

# Generate synthetic network security data
def generate_security_data(n_samples=10000, anomaly_fraction=0.05):
    """
    Generate a shuffled synthetic security dataset with normal and anomalous rows.

    All values are drawn from the global ``np.random`` state, so seeding NumPy
    before the call makes the feature draws repeatable. The anomalous rows are
    split into three groups (thirds, with the remainder going to the last
    group), each drawn from its own set of distributions.

    Parameters
    ----------
    n_samples : int
        Total number of rows to generate. Must be non-negative.
    anomaly_fraction : float
        Fraction of rows labelled anomalous, in ``[0, 1]``. The anomaly count
        is ``int(n_samples * anomaly_fraction)`` (truncated toward zero).

    Returns
    -------
    pandas.DataFrame
        Columns ``bytes_sent``, ``bytes_received``, ``connection_duration``,
        ``packets_per_second``, ``unique_ports``, ``failed_logins`` and the
        integer label ``is_anomaly`` (0 = normal, 1 = anomalous), with rows in
        shuffled order and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative or ``anomaly_fraction`` is outside [0, 1].
    """
    # Fail early with a clear message instead of NumPy's
    # "negative dimensions are not allowed" from deep inside the sampling code.
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0 <= anomaly_fraction <= 1:
        raise ValueError(
            f"anomaly_fraction must be between 0 and 1, got {anomaly_fraction}"
        )

    n_anomalies = int(n_samples * anomaly_fraction)
    n_normal = n_samples - n_anomalies

    # Calculate proper splits for anomaly data
    split1 = n_anomalies // 3
    split2 = n_anomalies // 3
    split3 = n_anomalies - split1 - split2  # Remainder goes here so all anomalies are accounted for

    print(f"Anomaly splits: {split1}, {split2}, {split3} (total: {split1 + split2 + split3})")

    # Normal traffic patterns
    normal_data = {
        'bytes_sent': np.random.normal(1500, 500, n_normal),  # Normal packet sizes
        'bytes_received': np.random.normal(1200, 400, n_normal),
        'connection_duration': np.random.exponential(30, n_normal),  # Typical connection times
        'packets_per_second': np.random.gamma(2, 5, n_normal),  # Normal packet rates
        'unique_ports': np.random.poisson(3, n_normal),  # Typical port usage
        'failed_logins': np.random.poisson(0.1, n_normal),  # Very few failed logins
        # Explicit integer array: an empty Python list would be converted to a
        # float64 array and silently promote the whole label column to float.
        'is_anomaly': np.zeros(n_normal, dtype=int),
    }

    # Anomalous traffic patterns (attacks, data exfiltration, etc.)
    anomaly_data = {
        'bytes_sent': np.concatenate([
            np.random.normal(50000, 10000, split1),  # Data exfiltration
            np.random.normal(100, 50, split2),       # Port scanning
            np.random.normal(1500, 500, split3),     # Subtle attacks
        ]),
        'bytes_received': np.concatenate([
            np.random.normal(1000, 300, split1),
            np.random.normal(5000, 1000, split2),    # Command injection responses
            np.random.normal(1200, 400, split3),
        ]),
        'connection_duration': np.concatenate([
            np.random.exponential(300, split1),      # Long-lived connections
            np.random.exponential(1, split2),        # Very short connections
            np.random.exponential(30, split3),
        ]),
        'packets_per_second': np.concatenate([
            np.random.gamma(20, 10, split1),         # High packet rates
            np.random.gamma(0.1, 1, split2),         # Very low rates
            np.random.gamma(2, 5, split3),
        ]),
        'unique_ports': np.concatenate([
            np.random.poisson(50, split1),           # Port scanning
            np.random.poisson(1, split2),            # Single port focus
            np.random.poisson(3, split3),
        ]),
        'failed_logins': np.concatenate([
            np.random.poisson(20, split1),           # Brute force attempts
            np.random.poisson(0.1, split2),
            np.random.poisson(5, split3),            # Moderate failed attempts
        ]),
        'is_anomaly': np.ones(n_anomalies, dtype=int),
    }

    # Verify all arrays have correct length
    print("Array lengths check:")
    for key, values in anomaly_data.items():
        print(f"  {key}: {len(values)}")

    # Combine normal and anomalous data (normal rows first, anomalies after)
    combined_data = {
        key: np.concatenate([normal_data[key], anomaly_data[key]])
        for key in normal_data
    }

    # Create DataFrame and shuffle so the two classes are interleaved
    df = pd.DataFrame(combined_data)
    df = df.sample(frac=1).reset_index(drop=True)

    # Ensure non-negative values: the normal draws can dip below zero, so they
    # are reflected to their absolute value (the Poisson columns never go negative).
    numeric_cols = ['bytes_sent', 'bytes_received', 'connection_duration', 'packets_per_second']
    df[numeric_cols] = df[numeric_cols].abs()

    return df

# Test the fixed function
if __name__ == "__main__":
    # Smoke test: build a 10,000-event dataset with 5% anomalies and report on it.
    print("Testing fixed data generation function...")
    security_data = generate_security_data(10000, 0.05)

    # Class balance summary
    print(f"\n📈 Generated {len(security_data)} security events")
    print(f"🚨 Anomalies: {security_data['is_anomaly'].sum()} ({security_data['is_anomaly'].mean()*100:.1f}%)")
    print(f"✅ Normal events: {(security_data['is_anomaly'] == 0).sum()}")

    # Each report is a heading line followed by the corresponding pandas object
    for heading, report in (
        ("\nFirst few rows:", security_data.head()),
        ("\nData types:", security_data.dtypes),
    ):
        print(heading)
        print(report)

    print("\nSuccess! The function works correctly.")


# Generate the dataset that is saved and plotted below: 1000 events, 8% anomalies
security_data = generate_security_data(1000, 0.08)

print(f"📈 Generated {len(security_data)} security events")
print(f"🚨 Anomalies: {security_data['is_anomaly'].sum()} ({security_data['is_anomaly'].mean()*100:.1f}%)")
print(f"✅ Normal events: {(security_data['is_anomaly'] == 0).sum()}")

# Display first few rows. A bare `security_data.head()` is only rendered when it
# is the last expression of a notebook cell, so its result was being discarded
# here; print it explicitly instead.
print(security_data.head())

# Save the dataset as CSV without the index column.
# NOTE(review): DATA_DIR is not defined in this section — it is expected to be
# set earlier in the file and to point at an existing directory; confirm.
security_data.to_csv(f"{DATA_DIR}/security_interesting_events_data.csv", index=False)
# Visualize the data distribution: a 2x3 grid, one panel per feature
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()  # flatten the grid so panels can be indexed 0..5

features = ['bytes_sent', 'bytes_received', 'connection_duration', 'packets_per_second', 'unique_ports', 'failed_logins']

# Histogram settings shared by both classes; density=True normalises each
# histogram separately so the much smaller anomaly class stays visible.
hist_style = dict(bins=30, alpha=0.7, density=True)

for i, feature in enumerate(features):
    panel = axes[i]

    # Split the current feature's values by label
    normal_data = security_data.loc[security_data['is_anomaly'] == 0, feature]
    anomaly_data = security_data.loc[security_data['is_anomaly'] == 1, feature]

    # Overlay normal (blue) and anomalous (red) distributions
    panel.hist(normal_data, label='Normal', color='blue', **hist_style)
    panel.hist(anomaly_data, label='Anomaly', color='red', **hist_style)

    panel.set_title(f'{feature.replace("_", " ").title()}')
    panel.legend()
    panel.set_xlabel('Value')
    panel.set_ylabel('Density')

# Lay out the panels first, then place the overall title slightly above the grid
plt.tight_layout()
plt.suptitle('Security Data: Normal vs Anomalous Patterns', y=1.02, fontsize=16)
plt.show()