## 🔧 Setup and Imports

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from ml_models.data_preprocessing import WorkloadDataLoader

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

print("âœ… Imports completed successfully!")

## 📥 Load Workload Data

In [None]:
# Build the loader that supplies workload traces to the rest of the notebook.
data_loader = WorkloadDataLoader()

# Demonstration run on a synthetic trace.
# To use real data instead: data_loader.load_planetlab_trace('../data/planetlab/trace.txt')
synthetic_config = {
    'num_hosts': 10,
    'num_timesteps': 1000,
    'pattern': 'mixed',
}
workload_data = data_loader.generate_synthetic_workload(**synthetic_config)

# The report below reads axis 0 as timesteps and axis 1 as hosts.
print(f"Loaded workload data: {workload_data.shape}")
print(f"Hosts: {workload_data.shape[1]}, Timesteps: {workload_data.shape[0]}")

## 📈 Basic Statistics

In [None]:
# Wrap the raw workload matrix in a DataFrame: one 'Host_<i>' column per
# column of workload_data, one row per sample.
host_columns = [f'Host_{i}' for i in range(workload_data.shape[1])]
df = pd.DataFrame(workload_data, columns=host_columns)

# Index the rows by synthetic timestamps: 5-minute spacing from 2024-01-01.
df['Timestamp'] = pd.date_range(start='2024-01-01', periods=len(df), freq='5min')
df.set_index('Timestamp', inplace=True)

# Display summary statistics
# (the header emoji was previously mis-encoded: UTF-8 bytes read as cp1252)
print("\n📊 Summary Statistics (CPU Utilization %):\n")
print(df.describe())

## 📉 Workload Distribution Analysis

In [None]:
# Four-panel overview of the workload: pooled distribution, per-host spread,
# raw time series for a few hosts, and cross-host correlation.
graphs_dir = '../results/graphs'
# savefig does not create missing directories; make sure the target exists so
# the cell does not fail on a fresh checkout.
os.makedirs(graphs_dir, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Distribution of utilization values (every column of df pooled together).
# Flatten and average once; the mean is reused for the marker line and label.
all_values = df.values.flatten()
overall_mean = all_values.mean()
axes[0, 0].hist(all_values, bins=50, color='skyblue', edgecolor='black')
axes[0, 0].set_title('CPU Utilization Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Utilization (%)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(overall_mean, color='red', linestyle='--', label=f'Mean: {overall_mean:.2f}%')
axes[0, 0].legend()

# 2. Box plot per host
df.boxplot(ax=axes[0, 1])
axes[0, 1].set_title('CPU Utilization by Host', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Utilization (%)')
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Time series plot (first 3 columns of df)
df.iloc[:, :3].plot(ax=axes[1, 0], linewidth=1.5)
axes[1, 0].set_title('CPU Utilization Over Time (Sample Hosts)', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Time')
axes[1, 0].set_ylabel('Utilization (%)')
axes[1, 0].legend(loc='upper right')

# 4. Correlation heatmap (pairwise correlation between df columns)
correlation = df.corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', ax=axes[1, 1], cbar_kws={'label': 'Correlation'})
axes[1, 1].set_title('Host Utilization Correlation Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{graphs_dir}/data_exploration.png', dpi=300, bbox_inches='tight')
plt.show()

# The status marker was previously mis-encoded (UTF-8 bytes read as cp1252).
print("✅ Workload distribution analysis completed!")

## 🕒 Temporal Pattern Analysis

In [None]:
# Two-panel view of temporal structure: mean utilization per hour of day, and
# a rolling mean for the first three columns of df.
graphs_dir = '../results/graphs'
# savefig does not create missing directories; make sure the target exists.
os.makedirs(graphs_dir, exist_ok=True)

fig, axes = plt.subplots(2, 1, figsize=(15, 8))

# 1. Hourly average utilization
# Group on the index's hour directly rather than storing an 'Hour' column in
# df: a helper column would stay in the shared frame and leak into later
# statistics, the correlation matrix, and the CSV written by the export cell.
# The first mean() averages within each hour per column, the second across
# columns.
hourly_avg = df.groupby(df.index.hour).mean().mean(axis=1)
axes[0].plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2, markersize=8, color='teal')
axes[0].set_title('Average CPU Utilization by Hour of Day', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Hour')
axes[0].set_ylabel('Avg Utilization (%)')
axes[0].grid(True, alpha=0.3)
axes[0].set_xticks(range(24))

# 2. Rolling average (window=50)
# The first 49 rows of each series are NaN because the window is not yet full.
rolling_avg = df.iloc[:, :3].rolling(window=50).mean()
rolling_avg.plot(ax=axes[1], linewidth=2)
axes[1].set_title('CPU Utilization - Rolling Average (Window=50)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Time')
axes[1].set_ylabel('Utilization (%)')
axes[1].legend(loc='upper right')

plt.tight_layout()
plt.savefig(f'{graphs_dir}/temporal_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

# The status marker was previously mis-encoded (UTF-8 bytes read as cp1252).
print("✅ Temporal pattern analysis completed!")

## 🎲 Synthetic Workload Generation

In [None]:
# Generate one small synthetic workload per pattern and plot them side by
# side, one subplot per pattern.
patterns = ['sine', 'random', 'spike', 'mixed']
# Single source of truth for the host count, used both when generating the
# data and when plotting it.
hosts_per_pattern = 3

graphs_dir = '../results/graphs'
# savefig does not create missing directories; make sure the target exists.
os.makedirs(graphs_dir, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()  # 2x2 grid -> flat sequence, one Axes per pattern

for ax, pattern in zip(axes, patterns):
    synthetic_data = data_loader.generate_synthetic_workload(
        num_hosts=hosts_per_pattern,
        num_timesteps=500,
        pattern=pattern
    )

    # Plot each host; the indexing treats each column of the result as one host.
    for host_idx in range(hosts_per_pattern):
        ax.plot(synthetic_data[:, host_idx], label=f'Host {host_idx}', linewidth=1.5)

    ax.set_title(f'Pattern: {pattern.upper()}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Timestep')
    ax.set_ylabel('Utilization (%)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{graphs_dir}/synthetic_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

# The status marker was previously mis-encoded (UTF-8 bytes read as cp1252).
print("✅ Synthetic workload generation completed!")

## 📊 Host-Level Aggregation

In [None]:
# Simulate VM-to-Host mapping: generate per-VM workloads, assign each VM to a
# random host, then aggregate to host level and plot the result.
num_vms = 50
num_hosts = 10  # number of hosts VMs can be assigned to
# The generator's num_hosts argument is reused here to produce one series per VM.
vm_data = data_loader.generate_synthetic_workload(
    num_hosts=num_vms,
    num_timesteps=500,
    pattern='mixed'
)

# Random VM-to-Host assignment
# NOTE(review): the seed is set after vm_data was generated, so it fixes only
# the assignment below; whether vm_data itself is reproducible depends on the
# loader's RNG handling — confirm.
np.random.seed(42)
vm_to_host = {f'VM_{i}': f'Host_{np.random.randint(0, num_hosts)}' for i in range(num_vms)}

# Aggregate to host level
host_data = data_loader.aggregate_to_host_level(vm_data, vm_to_host)

print(f"VM data shape: {vm_data.shape}")
print(f"Host aggregated data shape: {host_data.shape}")
print(f"\nHost utilization summary (%):\n{pd.DataFrame(host_data).describe()}")

# Visualize aggregation
graphs_dir = '../results/graphs'
# savefig does not create missing directories; make sure the target exists.
os.makedirs(graphs_dir, exist_ok=True)

# NOTE(review): columns are labelled positionally as Host_0..Host_{k-1}; this
# presumes aggregate_to_host_level returns columns in that host order and does
# not drop hosts that received no VMs — confirm against its implementation.
host_labels = [f'Host_{i}' for i in range(host_data.shape[1])]
fig, ax = plt.subplots(figsize=(15, 6))
pd.DataFrame(host_data, columns=host_labels).plot(ax=ax, linewidth=2)
ax.set_title('Aggregated Host CPU Utilization', fontsize=14, fontweight='bold')
ax.set_xlabel('Timestep')
ax.set_ylabel('Utilization (%)')
ax.legend(loc='upper right', ncol=5)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{graphs_dir}/host_aggregation.png', dpi=300, bbox_inches='tight')
plt.show()

# The status marker was previously mis-encoded (UTF-8 bytes read as cp1252).
print("✅ Host-level aggregation completed!")

## 💾 Export Processed Data

In [None]:
# Save processed data: the raw workload matrix and the host-aggregated matrix
# as .npy files, and the timestamp-indexed DataFrame as CSV.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Save workload data
np.save(f'{processed_dir}/workload_data.npy', workload_data)
np.save(f'{processed_dir}/host_aggregated_data.npy', host_data)

# Save DataFrame as CSV (the index is written as the first column)
df.to_csv(f'{processed_dir}/workload_timeseries.csv')

# The status marker was previously mis-encoded (UTF-8 bytes read as cp1252).
print("✅ Data exported successfully!")
for exported_name in (
    'workload_data.npy',
    'host_aggregated_data.npy',
    'workload_timeseries.csv',
):
    print(f"   - {exported_name}")

## 📋 Summary

**Key Findings:**
- ✅ Workload data loaded and preprocessed successfully
- ✅ Temporal patterns identified (hourly variations)
- ✅ Synthetic workload generators validated
- ✅ Host-level aggregation implemented
- ✅ Data exported for model training

**Next Steps:**
1. Train LSTM workload predictors (see `02_lstm_analysis.ipynb`)
2. Train DQN consolidation agent (see `03_dqn_training.ipynb`)
3. Evaluate and visualize results (see `04_results_visualization.ipynb`)