In [None]:
%pip install pandas matplotlib seaborn numpy glob

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
import os
from matplotlib.gridspec import GridSpec

def load_metrics(scenario):
    """Load every CPU/memory metrics CSV recorded for one scenario.

    Reads all files matching ``performance_logs/<scenario>/*_metrics.csv``
    (relative to the current working directory), tags each row with the
    scenario name in a ``Scenario`` column and stacks the rows into a single
    frame.

    Args:
        scenario: Name of the scenario sub-directory to read.

    Returns:
        A ``pandas.DataFrame`` with a fresh ``0..n-1`` index, or ``None``
        when no metrics file exists for the scenario.
    """
    # Sort the matches: glob returns them in arbitrary, OS-dependent order,
    # which would make the combined row order differ between runs.
    files = sorted(glob.glob(f"performance_logs/{scenario}/*_metrics.csv"))
    if not files:
        return None

    frames = [pd.read_csv(file).assign(Scenario=scenario) for file in files]

    # ignore_index avoids repeating each file's own 0..k row labels in the
    # result; duplicate labels break index-aligned operations downstream.
    return pd.concat(frames, ignore_index=True)

def _classify_agents(agent_ids):
    """Map raw agent identifiers to a coarse agent-type label.

    The first matching substring wins, in this order: ``'Profesor'`` ->
    ``'Professor'``, ``'Sala'`` -> ``'Room'``, ``'Supervisor'`` ->
    ``'Supervisor'``; anything else (including missing ids) -> ``'Other'``.

    Args:
        agent_ids: ``pandas.Series`` of agent identifiers.

    Returns:
        A NumPy array of labels, positionally aligned with ``agent_ids``.
    """
    # Cast to str so a missing id becomes the text 'nan' (-> 'Other') instead
    # of raising TypeError on the substring test.
    ids = agent_ids.astype(str)
    conditions = [
        ids.str.contains(marker, regex=False).to_numpy(dtype=bool)
        for marker in ('Profesor', 'Sala', 'Supervisor')
    ]
    return np.select(conditions, ['Professor', 'Room', 'Supervisor'], default='Other')


def _mean_memory_per_time_bin(scenario_data, n_edges=100):
    """Average ``MemoryUsagePercent`` per time bin and agent type.

    Time is measured in seconds from the scenario's earliest ``Timestamp``
    and split into ``n_edges - 1`` equal-width bins.

    Args:
        scenario_data: Rows of a single scenario with ``Timestamp``
            (datetime), ``AgentType`` and ``MemoryUsagePercent`` columns.
        n_edges: Number of bin edges spread over the scenario duration.

    Returns:
        A frame with columns ``BinMid`` (bin midpoint in seconds),
        ``AgentType`` and ``MemoryUsagePercent`` (mean), sorted by ``BinMid``.
        Only bins that actually contain samples are present.
    """
    elapsed = (scenario_data['Timestamp'] - scenario_data['Timestamp'].min()).dt.total_seconds()
    duration = elapsed.max()

    if duration > 0:
        edges = np.linspace(0, duration, n_edges)
        midpoints = (edges[:-1] + edges[1:]) / 2
        # include_lowest keeps the very first sample (elapsed == 0), which a
        # default right-closed binning would silently drop. Labelling bins
        # with their numeric midpoint gives a plain float x-axis.
        bin_mid = pd.cut(elapsed, edges, labels=midpoints, include_lowest=True).astype(float)
    else:
        # All samples share one instant (or no timestamp is valid): identical
        # bin edges would make pd.cut raise, so use the elapsed time as-is.
        bin_mid = elapsed

    return (
        scenario_data.assign(BinMid=bin_mid)
        .groupby(['BinMid', 'AgentType'])['MemoryUsagePercent']
        .mean()
        .reset_index()
    )


def analyze_memory_consumption(scenarios=("small", "medium", "full")):
    """Analyze and visualize memory consumption across scenarios.

    Loads the metrics of every scenario through ``load_metrics``, draws a
    five-panel figure (memory over time, memory vs. CPU, memory distribution,
    used memory per agent type, memory efficiency) and saves it as
    ``memory_analysis.png`` in the current working directory.

    The metrics must provide the columns ``Timestamp``, ``AgentId``,
    ``MemoryUsagePercent``, ``CPUUsage`` and ``UsedMemory``. The efficiency
    panel is only drawn when a ``MessagesProcessed`` column is present.

    Args:
        scenarios: Iterable of scenario names to load.

    Returns:
        None. Prints ``"No data found!"`` and returns early when no scenario
        yields any metrics.
    """
    frames = [df for df in map(load_metrics, scenarios) if df is not None]
    if not frames:
        print("No data found!")
        return

    # A fresh index is required: the per-scenario frames each start at 0 and
    # duplicate labels break index-aligned operations in pandas and seaborn.
    all_data = pd.concat(frames, ignore_index=True)
    all_data['Timestamp'] = pd.to_datetime(all_data['Timestamp'])
    all_data['AgentType'] = _classify_agents(all_data['AgentId'])

    fig = plt.figure(figsize=(15, 12))
    gs = GridSpec(3, 2, figure=fig)

    # 1. Memory usage time series (one line per scenario / agent type)
    ax1 = fig.add_subplot(gs[0, :])
    for scenario, scenario_data in all_data.groupby('Scenario'):
        avg_mem = _mean_memory_per_time_bin(scenario_data)
        for agent_type, agent_data in avg_mem.groupby('AgentType'):
            ax1.plot(
                agent_data['BinMid'],
                agent_data['MemoryUsagePercent'],
                label=f"{scenario} - {agent_type}"
            )

    ax1.set_xlabel('Time (seconds)')
    ax1.set_ylabel('Memory Usage (%)')
    ax1.set_title('Memory Usage Over Time by Agent Type and Scenario')
    ax1.legend()
    ax1.grid(True)

    # 2. Memory vs CPU usage scatter plot, coloured by scenario
    ax2 = fig.add_subplot(gs[1, 0])
    scenario_cat = all_data['Scenario'].astype('category')
    scatter = ax2.scatter(
        all_data['CPUUsage'],
        all_data['MemoryUsagePercent'],
        c=scenario_cat.cat.codes,
        alpha=0.6,
        s=50,
        cmap='viridis'
    )

    # Legend handles come back in ascending code order, so the labels must
    # follow the category order too; order of first appearance would attach
    # the wrong scenario name to a colour.
    handles, _ = scatter.legend_elements()
    ax2.legend(handles, list(scenario_cat.cat.categories), title="Scenario")

    ax2.set_xlabel('CPU Usage (%)')
    ax2.set_ylabel('Memory Usage (%)')
    ax2.set_title('Memory Usage vs CPU Usage')
    ax2.grid(True)

    # 3. Memory usage distribution per scenario
    ax3 = fig.add_subplot(gs[1, 1])
    for scenario, scenario_data in all_data.groupby('Scenario'):
        sns.kdeplot(
            data=scenario_data,
            x='MemoryUsagePercent',
            label=scenario,
            ax=ax3
        )

    ax3.set_xlabel('Memory Usage (%)')
    ax3.set_ylabel('Density')
    ax3.set_title('Memory Usage Distribution by Scenario')
    ax3.legend()
    ax3.grid(True)

    # 4. Used memory by agent type
    ax4 = fig.add_subplot(gs[2, 0])
    sns.barplot(
        data=all_data,
        x='AgentType',
        y='UsedMemory',
        hue='Scenario',
        ax=ax4
    )

    ax4.set_xlabel('Agent Type')
    ax4.set_ylabel('Used Memory (bytes)')
    ax4.set_title('Memory Usage by Agent Type')
    ax4.grid(True, axis='y')

    # 5. Memory efficiency: messages processed per MB of used memory
    # (higher is better). Placeholder metric; it only applies when the
    # metrics carry a MessagesProcessed column.
    ax5 = fig.add_subplot(gs[2, 1])
    if 'MessagesProcessed' in all_data.columns:
        used_mb = all_data['UsedMemory'] / (1024 * 1024)
        # A zero UsedMemory yields +/-inf; turn it into NaN so the box plot
        # skips the row instead of stretching the axis to infinity.
        all_data['MemoryEfficiency'] = (
            all_data['MessagesProcessed'] / used_mb
        ).replace([np.inf, -np.inf], np.nan)

        sns.boxplot(
            data=all_data,
            x='Scenario',
            y='MemoryEfficiency',
            ax=ax5
        )

        ax5.set_xlabel('Scenario')
        ax5.set_ylabel('Messages per MB')
        ax5.set_title('Memory Efficiency by Scenario')
        ax5.grid(True, axis='y')
    else:
        ax5.text(0.5, 0.5, 'Efficiency metric not available',
                 horizontalalignment='center', verticalalignment='center',
                 transform=ax5.transAxes)

    fig.tight_layout()
    fig.savefig('memory_analysis.png')
    print("Memory analysis visualizations saved.")

# Extension for thread bottleneck analysis
def analyze_thread_bottlenecks(scenarios=("small", "medium", "full")):
    """Analyze thread-level bottlenecks across scenarios.

    Reads every ``performance_logs/<scenario>/*_thread.csv`` file (relative
    to the current working directory), assigns each thread to a category
    based on its name, draws a five-panel figure and saves it as
    ``thread_bottleneck_analysis.png``.

    The thread logs must provide the columns ``Timestamp``, ``ThreadName``,
    ``CPUTime_ns``, ``CPUPercent`` and ``UserTime_ns``.

    Args:
        scenarios: Iterable of scenario names to load.

    Returns:
        None. Prints ``"No thread data found!"`` and returns early when no
        thread log exists for any scenario.
    """
    thread_data = []
    for scenario in scenarios:
        # Sorted for a reproducible row order; glob order is arbitrary.
        for file in sorted(glob.glob(f"performance_logs/{scenario}/*_thread.csv")):
            thread_data.append(pd.read_csv(file).assign(Scenario=scenario))

    if not thread_data:
        print("No thread data found!")
        return

    # A fresh index is required: every file starts at row label 0 and
    # duplicate labels break index-aligned operations in pandas and seaborn.
    all_thread_data = pd.concat(thread_data, ignore_index=True)
    all_thread_data['Timestamp'] = pd.to_datetime(all_thread_data['Timestamp'])

    # Ordered rules: the first rule with a matching substring wins.
    category_rules = (
        (('Profesor',), 'Professor Agent'),
        (('Sala',), 'Room Agent'),
        (('Supervisor',), 'Supervisor Agent'),
        (('Messaging', 'ACL'), 'Messaging'),
        (('DF', 'Directory'), 'Directory Services'),
        (('GC',), 'Garbage Collection'),
    )

    def categorize_thread(name):
        """Return the category of a thread name, or 'Other' if none matches."""
        # str() keeps a missing name (NaN) from raising on the substring test.
        text = str(name)
        for markers, category in category_rules:
            if any(marker in text for marker in markers):
                return category
        return 'Other'

    all_thread_data['ThreadCategory'] = all_thread_data['ThreadName'].apply(categorize_thread)

    fig = plt.figure(figsize=(15, 15))
    gs = GridSpec(3, 2, figure=fig)

    # 1. Top 10 threads by summed CPU time
    ax1 = fig.add_subplot(gs[0, :])
    top_threads = (
        all_thread_data.groupby(['ThreadName', 'ThreadCategory'])['CPUTime_ns']
        .sum()
        .reset_index()
        .sort_values('CPUTime_ns', ascending=False)
        .head(10)
    )

    sns.barplot(
        data=top_threads,
        x='CPUTime_ns',
        y='ThreadName',
        hue='ThreadCategory',
        palette='viridis',
        # hue only colours the bars here; without dodge=False each bar is
        # shrunk and shifted to leave room for the other categories.
        dodge=False,
        ax=ax1
    )

    ax1.set_title('Top 10 CPU Consuming Threads')
    ax1.set_xlabel('Total CPU Time (ns)')
    ax1.set_ylabel('Thread Name')
    ax1.grid(True, axis='x')

    # 2. CPU time by thread category
    ax2 = fig.add_subplot(gs[1, 0])
    category_cpu = (
        all_thread_data.groupby(['Scenario', 'ThreadCategory'])['CPUTime_ns']
        .sum()
        .reset_index()
    )

    sns.barplot(
        data=category_cpu,
        x='ThreadCategory',
        y='CPUTime_ns',
        hue='Scenario',
        ax=ax2
    )

    ax2.set_title('CPU Time by Thread Category')
    ax2.set_xlabel('Thread Category')
    ax2.set_ylabel('Total CPU Time (ns)')
    ax2.grid(True, axis='y')
    # Rotate on the axes object rather than through plt.xticks, which acts on
    # whatever axes happens to be "current".
    ax2.tick_params(axis='x', labelrotation=45)

    # 3. CPU utilization over time, summed per minute and category
    ax3 = fig.add_subplot(gs[1, 1])
    all_thread_data['TimeMinute'] = all_thread_data['Timestamp'].dt.floor('1min')
    time_series = (
        all_thread_data.groupby(['TimeMinute', 'ThreadCategory'])['CPUPercent']
        .sum()
        .reset_index()
    )

    for category, cat_data in time_series.groupby('ThreadCategory'):
        ax3.plot(
            cat_data['TimeMinute'],
            cat_data['CPUPercent'],
            label=category,
            marker='o',
            markersize=4
        )

    ax3.set_title('CPU Utilization by Thread Category Over Time')
    ax3.set_xlabel('Time')
    ax3.set_ylabel('CPU Utilization (%)')
    ax3.legend()
    ax3.grid(True)

    # 4. Bottleneck score = CPU time (ns) * CPU % / 1e9; a higher score means
    # the category is more likely a bottleneck.
    ax4 = fig.add_subplot(gs[2, 0])
    all_thread_data['BottleneckScore'] = (
        all_thread_data['CPUTime_ns'] * all_thread_data['CPUPercent'] / 1e9
    )
    bottleneck_data = (
        all_thread_data.groupby(['Scenario', 'ThreadCategory'])['BottleneckScore']
        .mean()
        .reset_index()
    )

    sns.barplot(
        data=bottleneck_data,
        x='ThreadCategory',
        y='BottleneckScore',
        hue='Scenario',
        ax=ax4
    )

    ax4.set_title('Thread Bottleneck Analysis')
    ax4.set_xlabel('Thread Category')
    ax4.set_ylabel('Bottleneck Score')
    ax4.grid(True, axis='y')
    ax4.tick_params(axis='x', labelrotation=45)

    # 5. Share of CPU time spent in user mode
    ax5 = fig.add_subplot(gs[2, 1])
    # Assign the cleaned series back instead of calling replace(inplace=True)
    # on df['UserRatio']: that chained in-place call does not update the
    # frame under pandas copy-on-write, leaving the infinities in place.
    all_thread_data['UserRatio'] = (
        all_thread_data['UserTime_ns'] / all_thread_data['CPUTime_ns']
    ).replace([np.inf, -np.inf], np.nan)
    ratio_data = all_thread_data.dropna(subset=['UserRatio'])

    sns.boxplot(
        data=ratio_data,
        x='ThreadCategory',
        y='UserRatio',
        ax=ax5
    )

    ax5.set_title('User Time vs CPU Time by Thread Category')
    ax5.set_xlabel('Thread Category')
    ax5.set_ylabel('User Time / CPU Time Ratio')
    ax5.grid(True, axis='y')
    ax5.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    fig.savefig('thread_bottleneck_analysis.png')
    print("Thread bottleneck analysis visualizations saved.")

# Run the memory analysis first, then the thread bottleneck analysis,
# when this file is executed directly.
if __name__ == "__main__":
    for run_analysis in (analyze_memory_consumption, analyze_thread_bottlenecks):
        run_analysis()