In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import math

# ========== RANDOM SEED CONFIGURATION ==========
def set_random_seed(seed=42):
    """Seed the random number generators so results can be reproduced.

    The same value is applied to Python's ``random`` module and to NumPy's
    global ``np.random`` state, and a confirmation line is printed.

    Args:
        seed: Integer seed value (default: 42).
    """
    # Python's generator first, then NumPy's
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    print(f"🎲 Random seed set to: {seed}")

def generate_system_monitoring_data_cyclic(n_samples=1000, anomaly_ratio=0.05, cycle_type='weekly', random_seed=None):
    """Generate synthetic system-monitoring records that follow a cyclic pattern.

    Each record is stamped two minutes after the previous one plus a random
    30-180 s jitter. Metric values swing around fixed baselines following a
    sine wave over the chosen cycle, plus a daytime-only bump derived from the
    hour of the timestamp. A randomly chosen subset of records is generated as
    anomalies and described in a separate log.

    Args:
        n_samples: Number of records to generate.
        anomaly_ratio: Fraction of records generated as anomalies; the count
            is ``int(n_samples * anomaly_ratio)``.
        cycle_type: ``'weekly'`` for a 7-day cycle; any other value selects
            the 14-day cycle.
        random_seed: When not ``None``, passed to ``set_random_seed`` before
            any random draw.

    Returns:
        A tuple ``(df, anomalies_df, anomaly_indices)``:

        * ``df`` - all records sorted by ``created_at`` with a fresh 0..n-1
          index, including the ``cycle_phase`` and ``cycle_factor`` columns.
        * ``anomalies_df`` - one row per anomaly, sorted by ``timestamp``
          (empty when no anomalies were generated).
        * ``anomaly_indices`` - set of row positions in ``df`` that hold
          anomalous records.

        ``record_index`` in ``anomalies_df`` and the members of
        ``anomaly_indices`` refer to positions in the *sorted* ``df``.
    """
    # Seed the generators only when the caller supplied a seed
    if random_seed is not None:
        set_random_seed(random_seed)

    # Reference instant; every record is stamped relative to it
    start_time = datetime(2024, 11, 6, 8, 0, 0)

    data = []
    anomaly_logs = []
    # Generation-order indices of the records that will be anomalous
    anomaly_indices = set(np.random.choice(n_samples, int(n_samples * anomaly_ratio), replace=False))

    # Baselines around which each metric oscillates.
    # NOTE(review): the original notes describe these as midpoints of the
    # ranges shown, but the values are offset from the exact midpoints -
    # confirm the intended baselines.
    base_mem = 7756000           # noted range: 7794756 - 7917172
    base_root = 10076680         # noted range: 10080064 - 10093296
    base_log = 10135430          # noted range: 10154048 - 10316812
    base_fw_alloc = 490508444    # noted range: 372230365 - 648786522
    base_rx = 10088046           # noted range: 9034753 - 19141339
    base_tx = 1172478            # noted range: 560546 - 1824410

    # Swing of each metric around its baseline
    mem_amplitude = 61208        # (7917172 - 7794756) / 2
    root_amplitude = 6616        # (10093296 - 10080064) / 2
    log_amplitude = 81382        # (10316812 - 10154048) / 2
    fw_alloc_amplitude = 138278079  # (648786522 - 372230365) / 2, rounded up
    # NOTE(review): (19141339 - 9034753) / 2 is 5053293, not 4053293 - confirm
    rx_amplitude = 4053293
    tx_amplitude = 631932        # (1824410 - 560546) / 2
    load_amplitude = 0.3         # load average may rise/fall by 0.3

    # Cycle length in samples: one sample every 2 minutes -> 30 per hour
    if cycle_type == 'weekly':
        cycle_period = 7 * 24 * 30
        cycle_label = "mingguan"
    else:  # biweekly
        cycle_period = 14 * 24 * 30
        cycle_label = "2 mingguan"

    print(f"🔄 Menggunakan pola siklik {cycle_label} dengan periode {cycle_period} samples")

    # Positional arguments shared by both row generators, in their order
    shared_args = (
        base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx,
        mem_amplitude, root_amplitude, log_amplitude,
        fw_alloc_amplitude, rx_amplitude, tx_amplitude, load_amplitude,
    )

    for i in range(n_samples):
        # Fixed 2-minute step plus random jitter; the jitter can exceed the
        # step, so consecutive timestamps are not guaranteed to be ordered
        time_offset = timedelta(
            seconds=random.randint(30, 180),
            microseconds=random.randint(0, 999999)
        )
        current_time = start_time + timedelta(minutes=i * 2) + time_offset

        # Position within the cycle mapped onto one sine period (-1 .. +1)
        cycle_progress = (i % cycle_period) / cycle_period
        cycle_factor = math.sin(2 * math.pi * cycle_progress)

        # Daytime bump: peaks at hour 12, clipped to 0 outside hours 6-18,
        # scaled to at most 0.3
        hour_of_day = current_time.hour
        daily_factor = math.sin(math.pi * (hour_of_day - 6) / 12)
        daily_factor = max(0, daily_factor) * 0.3

        # The row generators receive the sum of both effects
        combined_factor = cycle_factor + daily_factor

        if i in anomaly_indices:
            row, anomaly_info = generate_anomaly_row_cyclic(
                current_time, i, combined_factor, *shared_args
            )
            anomaly_logs.append({
                'record_index': i,  # remapped to the sorted position below
                'timestamp': current_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3],
                'anomaly_type': anomaly_info['type'],
                'description': anomaly_info['description'],
                'affected_metrics': ', '.join(anomaly_info['affected_metrics']),
                'severity': anomaly_info['severity'],
                'cycle_phase': f"{cycle_progress:.2f}",
                'cycle_factor': f"{cycle_factor:.2f}"
            })
        else:
            row = generate_normal_row_cyclic(
                current_time, i, combined_factor, *shared_args
            )

        # Keep the pure cycle information (without the daily bump) for analysis
        row['cycle_phase'] = f"{cycle_progress:.3f}"
        row['cycle_factor'] = f"{cycle_factor:.3f}"

        data.append(row)

    df = pd.DataFrame(data)
    if df.empty:
        # Nothing was generated, so there is no 'created_at' column to sort on
        return df, pd.DataFrame(anomaly_logs), set()

    # Sort chronologically. Until the index is reset it still carries each
    # row's generation index, which lets us translate anomaly indices into
    # positions in the sorted frame.
    df = df.sort_values('created_at')
    position_of = {int(generated): pos for pos, generated in enumerate(df.index)}
    df = df.reset_index(drop=True)

    anomaly_indices = {position_of[int(generated)] for generated in anomaly_indices}
    for entry in anomaly_logs:
        entry['record_index'] = position_of[entry['record_index']]

    # Anomaly log as a DataFrame, in chronological order
    anomalies_df = pd.DataFrame(anomaly_logs)
    if not anomalies_df.empty:
        anomalies_df = anomalies_df.sort_values('timestamp').reset_index(drop=True)

    return df, anomalies_df, anomaly_indices

def generate_normal_row_cyclic(timestamp, index, cycle_factor, base_mem, base_root, base_log,
                               base_fw_alloc, base_rx, base_tx, mem_amplitude, root_amplitude,
                               log_amplitude, fw_alloc_amplitude, rx_amplitude, tx_amplitude,
                               load_amplitude):
    """Build one non-anomalous metrics record for a given cycle position.

    Every metric is its baseline shifted by ``cycle_factor * amplitude`` plus
    random noise. Size/packet metrics are never allowed below
    ``base - amplitude``; load averages are never allowed below 0.

    Args:
        timestamp: ``datetime`` written to ``created_at`` (millisecond precision).
        index: Sequence number of the record; not used by this function.
        cycle_factor: Multiplier applied to each amplitude.
        base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx:
            Baseline value of the corresponding metric.
        mem_amplitude, root_amplitude, log_amplitude, fw_alloc_amplitude,
        rx_amplitude, tx_amplitude, load_amplitude:
            Swing of the corresponding metric per unit of ``cycle_factor``.

    Returns:
        dict with the load averages as strings using a decimal comma, the
        integer metrics, and the formatted ``created_at`` timestamp.
    """

    def decimal_comma(value):
        # Two decimals, with ',' as the decimal separator
        return f"{value:.2f}".replace('.', ',')

    def cyclic_level(base, amplitude, noise):
        # Baseline + cyclic swing + noise, floored at (base - amplitude)
        level = int(base + (cycle_factor * amplitude) + noise)
        return max(base - amplitude, level)

    # Load averages: (baseline, cyclic swing, noise sigma) for 1/5/15 minutes;
    # longer windows react less strongly to the cycle
    load_swing = cycle_factor * load_amplitude
    load_specs = (
        (0.15, load_swing, 0.05),
        (0.18, load_swing * 0.8, 0.04),
        (0.20, load_swing * 0.6, 0.03),
    )
    load_1, load_5, load_15 = (
        round(max(0, baseline + swing + np.random.normal(0, sigma)), 2)
        for baseline, swing, sigma in load_specs
    )

    # CPU: usually 0; the chance of a non-zero reading grows with the cycle
    # factor, but the zero probability never drops below 0.3
    zero_probability = max(0.3, 0.7 - (cycle_factor * 0.2))
    if random.random() < zero_probability:
        cpu_reading = 0
    else:
        cpu_reading = random.choice([1, 1, 2, 3])

    # Memory, root filesystem, log volume and firewall allocation: Gaussian noise
    memory = cyclic_level(base_mem, mem_amplitude, np.random.normal(0, 10000))
    root_fs = cyclic_level(base_root, root_amplitude, np.random.normal(0, 1000))
    log_fs = cyclic_level(base_log, log_amplitude, np.random.normal(0, 5000))
    fw_alloc = cyclic_level(base_fw_alloc, fw_alloc_amplitude, np.random.normal(0, 5000000))

    # Packet counters: uniform integer noise
    rx_packets = cyclic_level(base_rx, rx_amplitude, np.random.randint(-50000, 50000))
    tx_packets = cyclic_level(base_tx, tx_amplitude, np.random.randint(-5000, 5000))

    return {
        'fw_load_avg_1_min': decimal_comma(load_1),
        'fw_load_avg_5_min': decimal_comma(load_5),
        'fw_load_avg_15_min': decimal_comma(load_15),
        'fw_cpu_used': cpu_reading,
        'mem_used': memory,
        'root_used': root_fs,
        'log_used': log_fs,
        'fw_total_alloc': fw_alloc,
        'total_rx_packets': rx_packets,
        'total_tx_packets': tx_packets,
        'created_at': timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    }

def generate_anomaly_row_cyclic(timestamp, index, cycle_factor, base_mem, base_root, base_log,
                                base_fw_alloc, base_rx, base_tx, mem_amplitude, root_amplitude,
                                log_amplitude, fw_alloc_amplitude, rx_amplitude, tx_amplitude,
                                load_amplitude):
    """Build one anomalous metrics record positioned relative to the cycle.

    An anomaly type is drawn at random. ``high_load``, ``memory_spike`` and
    ``network_spike`` have dedicated value profiles; every other type
    (``high_cpu``, ``disk_spike``, ``system_stress``, ``resource_leak``)
    shares one generic profile in which all metrics are raised.

    Args:
        timestamp: ``datetime`` written to ``created_at`` (millisecond precision).
        index: Sequence number of the record; not used by this function.
        cycle_factor: Multiplier applied to each amplitude to obtain the
            noise-free on-cycle value the anomaly is added on top of.
        base_mem, base_root, base_log, base_fw_alloc, base_rx, base_tx:
            Baseline value of the corresponding metric.
        mem_amplitude, root_amplitude, log_amplitude, fw_alloc_amplitude,
        rx_amplitude, tx_amplitude, load_amplitude:
            Swing of the corresponding metric per unit of ``cycle_factor``.

    Returns:
        ``(row_data, anomaly_info)`` where ``row_data`` has the same keys as a
        normal record and ``anomaly_info`` holds ``type``, ``description``,
        ``affected_metrics`` (list of column names) and ``severity``.
    """
    kind = random.choice([
        'high_load', 'high_cpu', 'memory_spike', 'disk_spike',
        'network_spike', 'system_stress', 'resource_leak'
    ])

    def on_cycle(base, amplitude):
        # Noise-free value the cycle alone would give
        return int(base + (cycle_factor * amplitude))

    def drawn_loads(*ranges):
        # Load averages sampled uniformly from explicit (low, high) ranges
        return tuple(round(np.random.uniform(low, high), 2) for low, high in ranges)

    def bumped_loads(*bumps):
        # Load averages sitting a fixed amount above the on-cycle load
        cycle_load = 0.15 + (cycle_factor * load_amplitude)
        return tuple(round(max(0, cycle_load + bump), 2) for bump in bumps)

    cycle_mem = on_cycle(base_mem, mem_amplitude)
    cycle_root = on_cycle(base_root, root_amplitude)
    cycle_log = on_cycle(base_log, log_amplitude)
    cycle_rx = on_cycle(base_rx, rx_amplitude)
    cycle_tx = on_cycle(base_tx, tx_amplitude)
    cycle_fw_alloc = on_cycle(base_fw_alloc, fw_alloc_amplitude)

    if kind == 'memory_spike':
        # Memory far above the on-cycle value; network metrics stay on-cycle
        load_1, load_5, load_15 = bumped_loads(0.5, 0.3, 0.2)
        cpu = random.randint(5, 25)
        mem = cycle_mem + random.randint(8000000, 15000000)
        root = cycle_root + random.randint(0, 100000)
        log = cycle_log + random.randint(0, 200000)
        fw_alloc, rx, tx = cycle_fw_alloc, cycle_rx, cycle_tx

        description = f'Memory spike: {mem/1000000:.1f}MB (spike above cycle)'
        affected = ['mem_used', 'fw_load_avg_1_min']
        severity = 'medium'
        if mem > cycle_mem + 12000000:
            severity = 'high'

    elif kind == 'network_spike':
        # Packet counters and firewall allocation far above the on-cycle value
        load_1, load_5, load_15 = bumped_loads(0.4, 0.3, 0.2)
        cpu = random.randint(10, 30)
        mem = cycle_mem + random.randint(1000000, 3000000)
        root = cycle_root + random.randint(0, 100000)
        log = cycle_log + random.randint(0, 200000)
        fw_alloc = cycle_fw_alloc + random.randint(100000000, 300000000)
        rx = cycle_rx + random.randint(8000000, 15000000)
        tx = cycle_tx + random.randint(800000, 1500000)

        description = f'Network spike: RX={rx/1000000:.1f}M, TX={tx/1000000:.1f}M'
        affected = ['total_rx_packets', 'total_tx_packets', 'fw_total_alloc']
        severity = 'medium'
        if rx > cycle_rx + 12000000:
            severity = 'high'

    elif kind == 'high_load':
        # Load averages and CPU are high; network metrics stay on-cycle
        load_1, load_5, load_15 = drawn_loads((2.0, 8.0), (1.5, 4.0), (1.0, 2.5))
        cpu = random.randint(50, 95)
        mem = cycle_mem + random.randint(0, 1000000)
        root = cycle_root + random.randint(0, 100000)
        log = cycle_log + random.randint(0, 200000)
        fw_alloc, rx, tx = cycle_fw_alloc, cycle_rx, cycle_tx

        description = f'High system load: 1min={load_1}, CPU={cpu}%'
        affected = ['fw_load_avg_1_min', 'fw_load_avg_5_min', 'fw_load_avg_15_min', 'fw_cpu_used']
        severity = 'medium'
        if load_1 > 5.0:
            severity = 'high'

    else:
        # Generic profile for the remaining types: everything is elevated
        load_1, load_5, load_15 = drawn_loads((1.5, 4.0), (1.2, 3.0), (1.0, 2.5))
        cpu = random.randint(30, 80)
        mem = cycle_mem + random.randint(2000000, 8000000)
        root = cycle_root + random.randint(2000000, 5000000)
        log = cycle_log + random.randint(1000000, 4000000)
        fw_alloc = cycle_fw_alloc + random.randint(50000000, 200000000)
        rx = cycle_rx + random.randint(2000000, 8000000)
        tx = cycle_tx + random.randint(200000, 800000)

        description = f'{kind}: Multiple metrics elevated above cycle normal'
        affected = ['fw_load_avg_1_min', 'fw_cpu_used', 'mem_used']
        severity = 'medium'

    def decimal_comma(value):
        # Two decimals, with ',' as the decimal separator
        return f"{value:.2f}".replace('.', ',')

    record = {
        'fw_load_avg_1_min': decimal_comma(load_1),
        'fw_load_avg_5_min': decimal_comma(load_5),
        'fw_load_avg_15_min': decimal_comma(load_15),
        'fw_cpu_used': cpu,
        'mem_used': mem,
        'root_used': root,
        'log_used': log,
        'fw_total_alloc': fw_alloc,
        'total_rx_packets': rx,
        'total_tx_packets': tx,
        'created_at': timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    }
    details = {
        'type': kind,
        'description': description,
        'affected_metrics': affected,
        'severity': severity
    }

    return record, details


# Driver script: generate the cyclic dataset, write it to CSV files and print
# a short summary of what was produced.
print("🚀 Generating system monitoring data dengan pola siklik...")
print("Pilihan: 'weekly' untuk pola mingguan, 'biweekly' untuk pola 2 mingguan")

# ========== PARAMETER CONFIGURATION ==========
CYCLE_TYPE = 'weekly'        # or 'biweekly'
N_SAMPLES = 10000            # enough samples to span more than one cycle
ANOMALY_RATIO = 0.00         # fraction of anomalous records (0.05 = 5%)
# A new seed is drawn on every run and printed in the summary below; hard-code
# that integer to repeat a run, or set to None to skip seeding altogether.
RANDOM_SEED = random.randint(0,100)

# Generate the dataset with the cyclic pattern
df, anomalies_df, anomaly_indices = generate_system_monitoring_data_cyclic(
    n_samples=N_SAMPLES,
    anomaly_ratio=ANOMALY_RATIO,
    cycle_type=CYCLE_TYPE,
    random_seed=RANDOM_SEED
)

# Main dataset: cycle bookkeeping columns removed
output_file = 'system_monitoring_unsupervised.csv'
df_output = df.drop(['cycle_phase', 'cycle_factor'], axis=1)
df_output.to_csv(output_file, sep=';', index=False)

# Analysis dataset: same records including the cycle columns
analysis_file = f'system_monitoring_with_cycle_info_{CYCLE_TYPE}.csv'
df.to_csv(analysis_file, sep=';', index=False)

# Anomaly log: only written when at least one anomaly was generated
anomalies_file = 'anomalies_list.csv'
anomalies_written = not anomalies_df.empty
if anomalies_written:
    anomalies_df.to_csv(anomalies_file, sep=';', index=False)

print(f"✅ Dataset dengan pola {CYCLE_TYPE} berhasil dibuat!")
print(f"📁 Main dataset: {output_file}")
print(f"📁 Dataset with cycle info: {analysis_file}")
if anomalies_written:
    print(f"📁 Anomalies log: {anomalies_file}")
else:
    # Do not advertise a file that this run did not create
    print("📁 Anomalies log: not written (no anomalous records)")
print(f"📊 Dataset shape: {df.shape}")
print(f"🎯 Contains {len(anomalies_df)} anomalous records")

# Show the first few records with their cycle position
print("\n📈 Cycle Pattern Analysis:")
print("Sample cycle phases and factors:")
sample_cycles = df[['created_at', 'cycle_phase', 'cycle_factor', 'mem_used']].head(10)
for _, row in sample_cycles.iterrows():
    print(f"  {row['created_at'][:19]} | Phase: {row['cycle_phase']} | Factor: {row['cycle_factor']} | Memory: {row['mem_used']:,}")

# Load averages are stored as decimal-comma strings; convert a copy to floats
# so their min/max can be reported
df_numeric = df.copy()
for col in ['fw_load_avg_1_min', 'fw_load_avg_5_min', 'fw_load_avg_15_min']:
    df_numeric[col] = df_numeric[col].str.replace(',', '.').astype(float)

print("\n📊 Data Ranges (showing cyclic variation):")
print(f"Memory usage: {df['mem_used'].min():,} to {df['mem_used'].max():,}")
print(f"RX packets: {df['total_rx_packets'].min():,} to {df['total_rx_packets'].max():,}")
print(f"TX packets: {df['total_tx_packets'].min():,} to {df['total_tx_packets'].max():,}")
print(f"Load avg 1min: {df_numeric['fw_load_avg_1_min'].min():.2f} to {df_numeric['fw_load_avg_1_min'].max():.2f}")

print("\n💡 Dataset characteristics:")
print(f"✓ Pola siklik {CYCLE_TYPE} dengan naik-turun teratur")
print("✓ Baseline values konsisten untuk setiap metric")
print("✓ Variasi harian tambahan (pagi rendah, siang tinggi)")
print("✓ Anomali yang mempertimbangkan posisi dalam cycle")
print("✓ Range data terbatas dan dapat diprediksi")
print("✓ Suitable untuk time series analysis dan pattern detection")
if RANDOM_SEED is not None:
    print(f"✓ Reproducible results dengan random seed: {RANDOM_SEED}")
else:
    print("✓ Random seed tidak diset - hasil akan berbeda setiap run")

🚀 Generating system monitoring data dengan pola siklik...
Pilihan: 'weekly' untuk pola mingguan, 'biweekly' untuk pola 2 mingguan
🎲 Random seed set to: 78
🔄 Menggunakan pola siklik mingguan dengan periode 5040 samples
✅ Dataset dengan pola weekly berhasil dibuat!
📁 Main dataset: system_monitoring_unsupervised.csv
📁 Dataset with cycle info: system_monitoring_with_cycle_info_weekly.csv
📁 Anomalies log: anomalies_list.csv
📊 Dataset shape: (10000, 13)
🎯 Contains 0 anomalous records

📈 Cycle Pattern Analysis:
Sample cycle phases and factors:
  2024-11-06 08:01:18 | Phase: 0.000 | Factor: 0.000 | Memory: 7,771,874
  2024-11-06 08:03:35 | Phase: 0.000 | Factor: 0.001 | Memory: 7,751,567
  2024-11-06 08:04:38 | Phase: 0.000 | Factor: 0.002 | Memory: 7,770,417
  2024-11-06 08:08:12 | Phase: 0.001 | Factor: 0.004 | Memory: 7,772,731
  2024-11-06 08:10:30 | Phase: 0.001 | Factor: 0.005 | Memory: 7,774,161
  2024-11-06 08:11:30 | Phase: 0.001 | Factor: 0.006 | Memory: 7,770,041
  2024-11-06 08:14: