In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

import random

# No seed is set, so both draws below differ from run to run.
rand_int = random.randint(0, 100)
rand_array = np.random.random_sample(3)


def generate_system_monitoring_data(n_samples=2000, anomaly_ratio=0.00):
    """
    Generate synthetic system monitoring data with gradually increasing
    metrics and optionally injected anomalies, logging the anomalies separately.

    Args:
        n_samples: Number of records to generate.
        anomaly_ratio: Fraction of records (0 to 1) to generate as anomalies.
            The anomaly count is ``int(n_samples * anomaly_ratio)``.

    Returns:
        A tuple ``(df, anomalies_df, anomaly_indices)``:
          * ``df`` - one row per record, sorted by ``created_at``.
          * ``anomalies_df`` - one row per injected anomaly, sorted by
            ``timestamp``; an empty DataFrame when no anomaly was injected.
          * ``anomaly_indices`` - set of row positions in ``df`` (after
            sorting) that hold anomalous records.

    Raises:
        ValueError: If ``n_samples`` is negative or ``anomaly_ratio`` is
            outside the range [0, 1].
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0 <= anomaly_ratio <= 1:
        raise ValueError(f"anomaly_ratio must be between 0 and 1, got {anomaly_ratio}")
    if n_samples == 0:
        # Nothing to generate; sorting an empty, column-less frame would fail.
        return pd.DataFrame(), pd.DataFrame(), set()

    # Base timestamp - start early so the progression over time is visible
    start_time = datetime(2024, 11, 6, 8, 0, 0)

    data = []
    anomaly_logs = []  # One dict per injected anomaly
    # Generation-order indices picked as anomalies (plain ints, not numpy scalars)
    chosen = {
        int(i)
        for i in np.random.choice(n_samples, int(n_samples * anomaly_ratio), replace=False)
    }

    # Base values that drift upwards gradually
    base_mem = 7910000
    base_root = 10093200
    base_log = 10308000
    base_fw_alloc = 646000000
    base_rx = 19000000
    base_tx = 1820000

    for i in range(n_samples):
        # Jitter of 30 seconds to 3 minutes on top of a fixed 2-minute grid
        time_offset = timedelta(
            seconds=random.randint(30, 180),
            microseconds=random.randint(0, 999999)
        )
        current_time = start_time + timedelta(minutes=i * 2) + time_offset

        if i in chosen:
            # Generate an anomalous record and keep its description for the log
            row, anomaly_info = generate_anomaly_row(current_time, i, n_samples, base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx)
            anomaly_logs.append({
                'record_index': i,  # remapped to the sorted position below
                'timestamp': current_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3],
                'anomaly_type': anomaly_info['type'],
                'description': anomaly_info['description'],
                'affected_metrics': ', '.join(anomaly_info['affected_metrics']),
                'severity': anomaly_info['severity']
            })
        else:
            row = generate_normal_row(current_time, i, n_samples, base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx)

        data.append(row)

    # The jitter (up to 180 s) exceeds the 120 s grid step, so neighbouring
    # records can swap places when sorted by timestamp. Sort while the index
    # still holds the generation order so that order can be mapped to the
    # final row positions.
    df = pd.DataFrame(data).sort_values('created_at', kind='mergesort')
    new_position = {int(gen_idx): pos for pos, gen_idx in enumerate(df.index)}
    df = df.reset_index(drop=True)

    # Make the anomaly bookkeeping refer to rows of the returned DataFrame
    for entry in anomaly_logs:
        entry['record_index'] = new_position[entry['record_index']]
    anomaly_indices = {new_position[i] for i in chosen}

    anomalies_df = pd.DataFrame(anomaly_logs)
    if not anomalies_df.empty:
        anomalies_df = anomalies_df.sort_values('timestamp').reset_index(drop=True)

    return df, anomalies_df, anomaly_indices

def generate_normal_row(timestamp, index, total_samples, base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx):
    """
    Build one non-anomalous metrics record.

    Each metric is its base value plus a drift proportional to
    ``index / total_samples`` plus random noise. Load averages are emitted as
    strings with a comma as the decimal separator; ``created_at`` is the
    timestamp formatted to millisecond precision.

    Returns:
        dict keyed by column name.
    """
    # Fraction of the timeline already covered by this record
    elapsed = index / total_samples

    record = {}

    # Load averages: (column, mean at start, mean drift, standard deviation).
    # Samples are clipped at zero, rounded to two decimals and rendered with
    # a decimal comma.
    load_specs = (
        ('fw_load_avg_1_min', 0.15, 0.1, 0.1),
        ('fw_load_avg_5_min', 0.18, 0.08, 0.08),
        ('fw_load_avg_15_min', 0.20, 0.05, 0.05),
    )
    for column, start_mean, drift, spread in load_specs:
        sample = np.random.normal(start_mean + elapsed * drift, spread)
        clipped = round(max(0, sample), 2)
        record[column] = f"{clipped:.2f}".replace('.', ',')

    # CPU: 0 when the uniform draw is below 0.7, otherwise 1 or 2 (1 twice as likely)
    if random.random() < 0.7:
        record['fw_cpu_used'] = 0
    else:
        record['fw_cpu_used'] = random.choice([1, 1, 2])

    # Memory: drifts up by 15000 over the timeline, Gaussian noise
    mem_drift = elapsed * 15000
    record['mem_used'] = int(base_mem + mem_drift + np.random.normal(0, 3000))

    # Root filesystem: no drift, small Gaussian noise only
    record['root_used'] = int(base_root + np.random.normal(0, 50))

    # Log usage: drifts up by 8000 over the timeline
    log_drift = elapsed * 8000
    record['log_used'] = int(base_log + log_drift + np.random.normal(0, 500))

    # Firewall allocation: drifts up by 2000000 over the timeline
    alloc_drift = elapsed * 2000000
    record['fw_total_alloc'] = int(base_fw_alloc + alloc_drift + np.random.normal(0, 200000))

    # Packet counters: upward drift plus a non-negative integer jitter
    rx_drift = elapsed * 150000
    tx_drift = elapsed * 4000
    record['total_rx_packets'] = int(base_rx + rx_drift + np.random.randint(0, 5000))
    record['total_tx_packets'] = int(base_tx + tx_drift + np.random.randint(0, 200))

    # Drop the last three microsecond digits to keep milliseconds
    record['created_at'] = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    return record

def generate_anomaly_row(timestamp, index, total_samples, base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx):
    """
    Build one anomalous metrics record together with a description of it.

    An anomaly type is picked at random; the metrics tied to that type are
    drawn from elevated ranges, while the remaining metrics follow the usual
    "base + drift + noise" pattern.

    Returns:
        Tuple ``(row_data, anomaly_info)``. ``row_data`` has the same keys as
        a normal record; ``anomaly_info`` holds ``type``, ``description``,
        ``affected_metrics`` and ``severity``.
    """
    # Fraction of the timeline already covered by this record
    elapsed = index / total_samples

    kind = random.choice([
        'high_load', 'high_cpu', 'memory_spike', 'disk_spike',
        'network_spike', 'system_stress', 'resource_leak'
    ])

    # --- Sampling helpers -------------------------------------------------
    # Every helper draws from the random generators when called, so the
    # branches below always sample in the same order: load averages, CPU,
    # memory, root, log, firewall allocation, RX, TX.

    def clipped_loads(*specs):
        # Gaussian load averages (mean, std), floored at zero
        return [round(max(0, np.random.normal(mean, std)), 2) for mean, std in specs]

    def uniform_loads(*bounds):
        # Uniformly drawn load averages (low, high)
        return [round(np.random.uniform(low, high), 2) for low, high in bounds]

    def drifted(base, slope, noise):
        # Base value plus timeline drift plus an already-sampled noise term
        return int(base + elapsed * slope + noise)

    def usual_mem(std=3000):
        return drifted(base_mem, 15000, np.random.normal(0, std))

    def usual_root():
        return int(base_root + np.random.normal(0, 50))

    def usual_log():
        return drifted(base_log, 8000, np.random.normal(0, 500))

    def usual_fw_alloc():
        return drifted(base_fw_alloc, 2000000, np.random.normal(0, 200000))

    def usual_rx():
        return drifted(base_rx, 150000, np.random.randint(0, 5000))

    def usual_tx():
        return drifted(base_tx, 4000, np.random.randint(0, 200))

    if kind == 'high_load':
        # Elevated load averages with high CPU; everything else as usual
        load_1, load_5, load_15 = uniform_loads((2.0, 8.0), (1.5, 4.0), (1.0, 2.5))
        cpu = random.randint(50, 95)

        description = f'High system load detected: 1min={load_1}, CPU={cpu}%'
        affected = ['fw_load_avg_1_min', 'fw_load_avg_5_min', 'fw_load_avg_15_min', 'fw_cpu_used']
        severity = 'high' if load_1 > 5.0 else 'medium'

        mem = usual_mem()
        root = usual_root()
        log = usual_log()
        fw_alloc = usual_fw_alloc()
        rx = usual_rx()
        tx = usual_tx()

    elif kind == 'high_cpu':
        # Load averages follow the normal-record distribution; only CPU is high
        load_1, load_5, load_15 = clipped_loads(
            (0.15 + elapsed * 0.1, 0.1),
            (0.18 + elapsed * 0.08, 0.08),
            (0.20 + elapsed * 0.05, 0.05),
        )
        cpu = random.randint(80, 100)

        description = f'High CPU usage detected: {cpu}% with normal load average'
        affected = ['fw_cpu_used']
        severity = 'high' if cpu > 90 else 'medium'

        mem = usual_mem()
        root = usual_root()
        log = usual_log()
        fw_alloc = usual_fw_alloc()
        rx = usual_rx()
        tx = usual_tx()

    elif kind == 'memory_spike':
        # Memory drawn from a fixed high range instead of base + drift
        load_1, load_5, load_15 = clipped_loads((0.4, 0.2), (0.35, 0.15), (0.25, 0.1))
        cpu = random.randint(5, 25)

        mem = int(np.random.uniform(15000000, 25000000))

        description = f'Memory spike detected: {mem/1000000:.1f}MB (2-3x normal)'
        affected = ['mem_used', 'fw_load_avg_1_min']
        severity = 'high' if mem > 20000000 else 'medium'

        root = usual_root()
        log = usual_log()
        fw_alloc = usual_fw_alloc()
        rx = usual_rx()
        tx = usual_tx()

    elif kind == 'disk_spike':
        # Root and log usage drawn from fixed high ranges
        load_1, load_5, load_15 = clipped_loads((0.8, 0.3), (0.6, 0.2), (0.4, 0.15))
        cpu = random.randint(15, 40)

        mem = usual_mem()
        root = int(np.random.uniform(15000000, 20000000))
        log = int(np.random.uniform(20000000, 30000000))

        description = f'Disk usage spike: Root={root/1000000:.1f}MB, Log={log/1000000:.1f}MB'
        affected = ['root_used', 'log_used', 'fw_load_avg_1_min']
        severity = 'critical' if root > 18000000 else 'high'

        fw_alloc = usual_fw_alloc()
        rx = usual_rx()
        tx = usual_tx()

    elif kind == 'network_spike':
        # Packet counters and firewall allocation drawn from fixed high ranges
        load_1, load_5, load_15 = clipped_loads((0.6, 0.2), (0.5, 0.15), (0.3, 0.1))
        cpu = random.randint(10, 30)

        mem = usual_mem(10000)  # wider memory noise than a normal record
        root = usual_root()
        log = usual_log()
        fw_alloc = int(np.random.uniform(800000000, 1200000000))
        rx = random.randint(25000000, 40000000)
        tx = random.randint(2500000, 4000000)

        description = f'Network traffic spike: RX={rx/1000000:.1f}M, TX={tx/1000000:.1f}M packets'
        affected = ['total_rx_packets', 'total_tx_packets', 'fw_total_alloc']
        severity = 'high' if rx > 35000000 else 'medium'

    elif kind == 'system_stress':
        # Every metric drawn from an elevated range
        load_1, load_5, load_15 = uniform_loads((3.0, 6.0), (2.0, 4.0), (1.5, 3.0))
        cpu = random.randint(70, 95)

        mem = int(np.random.uniform(12000000, 18000000))
        root = int(np.random.uniform(12000000, 15000000))
        log = int(np.random.uniform(15000000, 25000000))
        fw_alloc = int(np.random.uniform(750000000, 950000000))
        rx = random.randint(22000000, 35000000)
        tx = random.randint(2200000, 3500000)

        description = f'System stress: High load ({load_1}), CPU ({cpu}%), Memory ({mem/1000000:.1f}MB)'
        affected = ['fw_load_avg_1_min', 'fw_cpu_used', 'mem_used', 'root_used', 'log_used']
        severity = 'critical'

    else:  # resource_leak
        # Memory and firewall allocation sit above the usual level by a
        # uniformly drawn surplus
        load_1, load_5, load_15 = uniform_loads((1.0, 2.5), (1.2, 2.8), (1.5, 3.2))
        cpu = random.randint(25, 60)

        mem = drifted(base_mem, 15000, np.random.uniform(3000000, 6000000))
        root = usual_root()
        log = usual_log()
        fw_alloc = drifted(base_fw_alloc, 2000000, np.random.uniform(30000000, 100000000))
        rx = usual_rx()
        tx = usual_tx()

        description = f'Resource leak detected: Memory +{(mem-base_mem)/1000000:.1f}MB, FW alloc +{(fw_alloc-base_fw_alloc)/1000000:.1f}MB'
        affected = ['mem_used', 'fw_total_alloc', 'fw_load_avg_15_min']
        severity = 'medium'

    anomaly_info = {
        'type': kind,
        'description': description,
        'affected_metrics': affected,
        'severity': severity
    }

    # Load averages are emitted with a decimal comma; the timestamp keeps
    # millisecond precision.
    row_data = {
        'fw_load_avg_1_min': f"{load_1:.2f}".replace('.', ','),
        'fw_load_avg_5_min': f"{load_5:.2f}".replace('.', ','),
        'fw_load_avg_15_min': f"{load_15:.2f}".replace('.', ','),
        'fw_cpu_used': cpu,
        'mem_used': mem,
        'root_used': root,
        'log_used': log,
        'fw_total_alloc': fw_alloc,
        'total_rx_packets': rx,
        'total_tx_packets': tx,
        'created_at': timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    }

    return row_data, anomaly_info

# Generate the dataset.
# The run parameters are named here and passed explicitly so that every
# message below reports what was actually generated (the banner previously
# hard-coded "1000 records" and "~5% anomalies" regardless of the call).
N_SAMPLES = 2000
ANOMALY_RATIO = 0.00

print(f"🚀 Generating {N_SAMPLES} system monitoring records with gradual patterns and ~{ANOMALY_RATIO:.0%} anomalies...")
df, anomalies_df, anomaly_indices = generate_system_monitoring_data(
    n_samples=N_SAMPLES, anomaly_ratio=ANOMALY_RATIO
)

# Save complete dataset to CSV with semicolon separator
output_file = 'system_monitoring_unsupervised.csv'
df.to_csv(output_file, sep=';', index=False)

# Save anomalies log to separate CSV (only written when anomalies exist)
anomalies_file = 'anomalies_list.csv'
has_anomalies = not anomalies_df.empty
if has_anomalies:
    anomalies_df.to_csv(anomalies_file, sep=';', index=False)
    print(f"🎯 Anomalies log saved as: {anomalies_file}")
else:
    print("⚠️ No anomalies were generated in this run")

print("✅ Unsupervised dataset generated successfully!")
print(f"📁 Dataset saved as: {output_file}")
print(f"📊 Dataset shape: {df.shape}")
print(f"🎯 Contains {len(anomalies_df)} anomalous records (details logged separately)")

# Display sample data
print("\n📋 Sample of generated data (first 5 records):")
print(df.head(5).to_string(index=False))

print("\n📋 Sample of generated data (last 5 records - showing progression):")
print(df.tail(5).to_string(index=False))

# Display anomalies summary if any exist
if has_anomalies:
    print(f"\n🚨 Anomalies Summary ({len(anomalies_df)} total):")
    print("=" * 80)

    # Group by anomaly type
    anomaly_counts = anomalies_df['anomaly_type'].value_counts()
    for anomaly_type, count in anomaly_counts.items():
        print(f"  {anomaly_type}: {count} occurrences")

    # Group by severity
    severity_counts = anomalies_df['severity'].value_counts()
    print("\nSeverity Distribution:")
    for severity, count in severity_counts.items():
        print(f"  {severity}: {count} occurrences")

    print("\n📋 Sample anomalies (first 3):")
    print(anomalies_df[['timestamp', 'anomaly_type', 'severity', 'description']].head(3).to_string(index=False))

print("\n📈 Data statistics:")
print(f"Date range: {df['created_at'].min()} to {df['created_at'].max()}")
print(f"Total records: {len(df)}")

# Load averages are stored as strings with a decimal comma; convert them to
# floats on a copy for numeric analysis.
df_numeric = df.copy()
for col in ['fw_load_avg_1_min', 'fw_load_avg_5_min', 'fw_load_avg_15_min']:
    df_numeric[col] = df_numeric[col].str.replace(',', '.').astype(float)

print(f"Load avg 1min range: {df_numeric['fw_load_avg_1_min'].min():.2f} to {df_numeric['fw_load_avg_1_min'].max():.2f}")
print(f"CPU usage range: {df['fw_cpu_used'].min()} to {df['fw_cpu_used'].max()}")
print(f"Memory usage range: {df['mem_used'].min():,} to {df['mem_used'].max():,}")
print(f"RX packets range: {df['total_rx_packets'].min():,} to {df['total_rx_packets'].max():,}")
print(f"TX packets range: {df['total_tx_packets'].min():,} to {df['total_tx_packets'].max():,}")

# Compare the chronologically first and last records. Select the column
# first so the values keep their integer dtype.
first_mem, last_mem = df['mem_used'].iloc[0], df['mem_used'].iloc[-1]
first_rx, last_rx = df['total_rx_packets'].iloc[0], df['total_rx_packets'].iloc[-1]

print("\n📈 Showing gradual increase pattern:")
print(f"First record memory: {first_mem:,}")
print(f"Last record memory: {last_mem:,}")
print(f"Memory increase: {last_mem - first_mem:,}")

print(f"First record RX packets: {first_rx:,}")
print(f"Last record RX packets: {last_rx:,}")
print(f"RX packets increase: {last_rx - first_rx:,}")

print("\n💡 Dataset characteristics:")
print("✓ Purely unsupervised - no labels or anomaly indicators in main dataset")
print("✓ Gradual increase in memory usage over time")
print("✓ Gradual increase in network packets over time")
print("✓ Gradual increase in log usage over time")
print("✓ Load averages stay low with small variations")
print("✓ CPU usage mostly 0, occasionally 1-2")
print("✓ Realistic timestamp progression")
if has_anomalies:
    # Report the share of anomalies actually present, not a fixed figure
    print(f"✓ Contains hidden anomalies (~{len(anomalies_df) / len(df):.0%}) for detection algorithms")
    print(f"✓ Anomalies timeline logged separately in {anomalies_file}")
    print("📝 Both files use semicolon (;) as separator")
else:
    print("✓ Contains no injected anomalies in this run")
    print("📝 The dataset file uses semicolon (;) as separator")

🚀 Generating 1000 system monitoring records with gradual patterns and ~5% anomalies...
⚠️ No anomalies were generated in this run
✅ Unsupervised dataset generated successfully!
📁 Dataset saved as: system_monitoring_unsupervised.csv
📊 Dataset shape: (2000, 11)
🎯 Contains 0 anomalous records (details logged separately)

📋 Sample of generated data (first 5 records):
fw_load_avg_1_min fw_load_avg_5_min fw_load_avg_15_min  fw_cpu_used  mem_used  root_used  log_used  fw_total_alloc  total_rx_packets  total_tx_packets              created_at
             0,13              0,22               0,18            0   7909922   10093214  10308128       645970186          19001258           1820113 2024-11-06 08:01:10.657
             0,20              0,29               0,11            1   7908706   10093269  10308047       645873105          19003739           1820008 2024-11-06 08:03:37.727
             0,00              0,28               0,21            0   7910752   10093227  10308305       6461