In [5]:
import os
import re
import glob

# Base directory
base_dir = 'Encoder-BS32-SL512'

# Find all files matching the pattern
pattern = os.path.join(base_dir, 'flame-moe-*_layer*_*.txt')
# Sorted so that, when two files map to the same target name, the one that
# gets renamed (the first) is the same on every run.
files = sorted(glob.glob(pattern))


def rename_layer_files(file_paths, target_dir):
    """Rename each file to ``layer<N>.txt`` inside ``target_dir``.

    ``<N>`` is the first run of digits that follows the text ``layer`` in the
    file's base name. A file is left untouched when no layer number can be
    extracted or when the target name already exists: ``os.rename`` replaces
    an existing target without warning on POSIX, which would destroy data if
    two source files share a layer number.

    Args:
        file_paths: iterable of paths to rename.
        target_dir: directory in which the ``layer<N>.txt`` files are created.

    Returns:
        A ``(renamed, skipped, failed)`` tuple of lists. ``renamed`` holds the
        new paths; ``skipped`` and ``failed`` hold the untouched source paths.
    """
    renamed, skipped, failed = [], [], []

    for file_path in file_paths:
        # Extract layer number from filename
        filename = os.path.basename(file_path)
        match = re.search(r'layer(\d+)', filename)

        if not match:
            print(f"  Could not extract layer number from: {filename}")
            print()
            skipped.append(file_path)
            continue

        new_filename = f"layer{match.group(1)}.txt"
        new_path = os.path.join(target_dir, new_filename)

        print(f"  {filename}")
        print(f"    -> {new_filename}")

        if os.path.exists(new_path):
            # Refuse to clobber a file produced by an earlier rename or run.
            print(f"    ✗ SKIPPED: {new_filename} already exists (not overwriting)")
            skipped.append(file_path)
        else:
            try:
                os.rename(file_path, new_path)
                print(f"    ✓ Renamed successfully")
                renamed.append(new_path)
            except OSError as e:
                print(f"    ✗ ERROR: {e}")
                failed.append(file_path)
        print()

    return renamed, skipped, failed


print("="*80)
print("Renaming layer files")
print("="*80)
print()

if not files:
    print("No files found matching the pattern")
    renamed_files, skipped_files, failed_files = [], [], []
else:
    print(f"Found {len(files)} file(s) to rename:\n")
    renamed_files, skipped_files, failed_files = rename_layer_files(files, base_dir)

print("="*80)
if skipped_files or failed_files:
    # Do not claim success when some files were left behind.
    print(f"⚠ Renaming finished with problems: {len(renamed_files)} renamed, "
          f"{len(skipped_files)} skipped, {len(failed_files)} failed")
else:
    print("✓ Renaming complete!")
print("="*80)


Renaming layer files

No files found matching the pattern
✓ Renaming complete!


In [7]:
import pandas as pd
import os
import re

# Base directory
base_dir = 'Decoder-BS32-deep'

# Numbered folders to process
numbered_folders = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

# Experiment sub-folders scanned under each <folder>/npu/ directory (2..12 inclusive).
NPU_EXPERIMENT_IDS = range(2, 13)

# Expert whose "total compute" column is added on top of the summed param_load values.
# NOTE(review): if this expert has no row in a file, its compute silently counts as 0.
NPU_FINAL_EXPERT_ID = 63

# Substrings identifying title/header lines of SA_stage_E.txt; those lines are not data rows.
NPU_HEADER_MARKERS = ('Expert Number', 'Expert Operations', 'NPU Expert', 'Decoder')


def parse_npu_only_table(content, final_expert=NPU_FINAL_EXPERT_ID):
    """Extract the NPU-only cycle components from the text of an SA_stage_E.txt file.

    A data row is a line with at least six whitespace-separated fields whose
    fields 0, 1 and 5 parse as integers (expert number, param_load, total
    compute). Blank lines, lines containing ``=`` and lines containing one of
    ``NPU_HEADER_MARKERS`` are ignored, as are rows that fail integer parsing.

    Args:
        content: full text of the report file.
        final_expert: expert number whose total compute is returned.

    Returns:
        ``(param_load_sum, final_expert_compute)``: the sum of param_load over
        all data rows, and the total compute of ``final_expert`` (0 if that
        expert has no row; the last matching row wins if it appears twice).
    """
    param_load_sum = 0
    final_expert_compute = 0

    for line in content.split('\n'):
        if not line.strip() or '=' in line:
            continue
        if any(marker in line for marker in NPU_HEADER_MARKERS):
            continue

        parts = line.split()
        if len(parts) < 6:
            continue

        try:
            expert_num = int(parts[0])
            param_load = int(parts[1])
            total_compute = int(parts[5])
        except ValueError:
            # Not a numeric data row; leave the running totals untouched.
            continue

        param_load_sum += param_load
        if expert_num == final_expert:
            final_expert_compute = total_compute

    return param_load_sum, final_expert_compute


# Store results for each experiment and numbered folder
all_experiment_results = []
numbered_folder_totals = []
grand_total = 0

print("="*80)
# The banner is derived from base_dir so it cannot drift from the data actually read.
print(f"NPU-ONLY Mode Analysis - {base_dir}")
print("="*80)
print()

# Process each numbered folder
for folder_name in numbered_folders:
    npu_folder = os.path.join(base_dir, folder_name, 'npu')

    if not os.path.exists(npu_folder):
        print(f"⚠ {folder_name}/npu/ does not exist, skipping...")
        continue

    print(f"\n{folder_name}/:")
    print("-"*80)

    folder_total_cycles = 0
    experiment_count = 0

    for exp_num in NPU_EXPERIMENT_IDS:
        txt_file = os.path.join(npu_folder, str(exp_num), 'SA_stage_E.txt')

        # Missing experiments are skipped silently.
        if not os.path.exists(txt_file):
            continue

        with open(txt_file, 'r') as f:
            content = f.read()

        param_load_sum, expert_63_total_compute = parse_npu_only_table(content)

        # Experiment total = all parameter loads + compute of the final expert
        exp_total = param_load_sum + expert_63_total_compute
        folder_total_cycles += exp_total
        experiment_count += 1

        # Store experiment result
        all_experiment_results.append({
            'Numbered Folder': folder_name,
            'Experiment': exp_num,
            'Sum of param_load': param_load_sum,
            'Expert 63 total compute': expert_63_total_compute,
            'Experiment Total': exp_total
        })

        print(f"  Exp {exp_num}: {exp_total:,} cycles")

    # Store numbered folder total (only when at least one experiment was read)
    if experiment_count > 0:
        numbered_folder_totals.append({
            'Numbered Folder': folder_name,
            'Experiments Processed': experiment_count,
            'Folder Total': folder_total_cycles
        })
        grand_total += folder_total_cycles

        print(f"  {'='*76}")
        print(f"  {folder_name} Total ({experiment_count} experiments): {folder_total_cycles:,} cycles")

# Create summary DataFrames
experiments_df = pd.DataFrame(all_experiment_results)
folders_df = pd.DataFrame(numbered_folder_totals)

print("\n" + "="*80)
print("Numbered Folder Totals")
print("="*80)
print(folders_df.to_string(index=False))

print("\n" + "="*80)
print(f"GRAND TOTAL (All NPU experiments): {grand_total:,} cycles")
print("="*80)

# Display folders DataFrame
folders_df


NPU-ONLY Mode Analysis - Decoder-BS64


1st/:
--------------------------------------------------------------------------------
  Exp 2: 86,265,524 cycles
  Exp 3: 84,850,871 cycles
  Exp 4: 84,851,796 cycles
  Exp 5: 87,679,156 cycles
  Exp 6: 83,436,288 cycles
  Exp 7: 82,021,888 cycles
  Exp 8: 79,193,897 cycles
  Exp 9: 82,022,190 cycles
  Exp 10: 86,265,524 cycles
  Exp 11: 84,850,871 cycles
  Exp 12: 84,851,796 cycles
  1st Total (11 experiments): 926,289,801 cycles

2nd/:
--------------------------------------------------------------------------------
  Exp 2: 86,265,269 cycles
  Exp 3: 80,608,046 cycles
  Exp 4: 80,608,553 cycles
  Exp 5: 80,608,294 cycles
  Exp 6: 79,194,549 cycles
  Exp 7: 82,022,696 cycles
  Exp 8: 77,780,012 cycles
  Exp 9: 79,194,414 cycles
  Exp 10: 86,265,269 cycles
  Exp 11: 80,608,046 cycles
  Exp 12: 80,608,553 cycles
  2nd Total (11 experiments): 893,763,701 cycles

3rd/:
--------------------------------------------------------------------------------

Unnamed: 0,Numbered Folder,Experiments Processed,Folder Total
0,1st,11,926289801
1,2nd,11,893763701
2,3rd,11,899419978
3,4th,11,958814893
4,5th,11,930532324
5,6th,11,923461309
6,7th,11,940431966
7,8th,11,950328982
8,9th,11,964472376
9,10th,11,914975161


In [8]:
import pandas as pd
import os
import re

# Base directory
base_dir = 'Decoder-BS32-deep'

# Numbered folders to process
numbered_folders = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

# Experiment sub-folders scanned under each <folder>/pim/ directory (2..12 inclusive).
PIM_EXPERIMENT_IDS = range(2, 13)

# Substrings identifying non-data lines (titles, headers, notes) of SA_stage_E.txt.
PIM_SKIP_MARKERS = ('Expert Number', 'Expert Operations', 'Note:', 'Activation',
                    'PIM Expert', 'Decoder', 'BS')


def pim_only_read_cycles(line, current):
    """Return the integer cycle count written before the word ``cycles`` in ``line``.

    The number may be written as an int or a float; it is truncated to int.
    ``current`` is returned unchanged when the line has no such number or the
    captured text is not a valid number (the pattern also matches strings such
    as ``"."`` or ``"1.2.3"``, which ``float`` rejects).
    """
    match = re.search(r'([\d.]+)\s+cycles', line)
    if not match:
        return current
    try:
        return int(float(match.group(1)))
    except (ValueError, OverflowError):
        return current


def parse_pim_only_report(content):
    """Extract the PIM-only cycle components from the text of an SA_stage_E.txt file.

    Activation movements are read from lines containing both ``cycles`` and
    ``activation_movement_1`` / ``activation_movement_2`` (the last valid
    occurrence wins). Expert rows are lines with at least five
    whitespace-separated fields whose fields 0 and 4 parse as integers; lines
    that are blank, contain ``=``, start with ``-`` within their first three
    characters, or contain one of ``PIM_SKIP_MARKERS`` are ignored.

    Returns:
        ``(total_compute_sum, activation_1, activation_2)``.
    """
    activation_1 = 0
    activation_2 = 0
    total_compute_sum = 0

    for line in content.split('\n'):
        # --- activation movement lines ---
        if 'cycles' in line:
            if 'activation_movement_1' in line:
                activation_1 = pim_only_read_cycles(line, activation_1)
            elif 'activation_movement_2' in line:
                activation_2 = pim_only_read_cycles(line, activation_2)

        # --- expert table rows ---
        if not line.strip() or '=' in line or '-' in line[:3]:
            continue
        if any(marker in line for marker in PIM_SKIP_MARKERS):
            continue

        parts = line.split()
        if len(parts) < 5:
            continue

        try:
            int(parts[0])                   # first field must be an expert number
            total_compute = int(parts[4])   # fifth field is total compute
        except ValueError:
            continue

        total_compute_sum += total_compute

    return total_compute_sum, activation_1, activation_2


# Store results for each experiment and numbered folder
all_experiment_results = []
numbered_folder_totals = []
grand_total = 0

print("="*80)
# The banner is derived from base_dir so it cannot drift from the data actually read.
print(f"PIM-ONLY Mode Analysis - {base_dir}")
print("="*80)
print()

# Process each numbered folder
for folder_name in numbered_folders:
    pim_folder = os.path.join(base_dir, folder_name, 'pim')

    if not os.path.exists(pim_folder):
        print(f"⚠ {folder_name}/pim/ does not exist, skipping...")
        continue

    print(f"\n{folder_name}/:")
    print("-"*80)

    folder_total_cycles = 0
    experiment_count = 0

    for exp_num in PIM_EXPERIMENT_IDS:
        txt_file = os.path.join(pim_folder, str(exp_num), 'SA_stage_E.txt')

        # Missing experiments are skipped silently.
        if not os.path.exists(txt_file):
            continue

        with open(txt_file, 'r') as f:
            content = f.read()

        total_compute_sum, activation_1, activation_2 = parse_pim_only_report(content)

        # Experiment total = compute of every expert + both activation movements
        exp_total = total_compute_sum + activation_1 + activation_2
        folder_total_cycles += exp_total
        experiment_count += 1

        # Store experiment result
        all_experiment_results.append({
            'Numbered Folder': folder_name,
            'Experiment': exp_num,
            'Sum of total compute': total_compute_sum,
            'Activation 1': activation_1,
            'Activation 2': activation_2,
            'Experiment Total': exp_total
        })

        print(f"  Exp {exp_num}: {exp_total:,} cycles (compute: {total_compute_sum:,} + act: {activation_1 + activation_2:,})")

    # Store numbered folder total (only when at least one experiment was read)
    if experiment_count > 0:
        numbered_folder_totals.append({
            'Numbered Folder': folder_name,
            'Experiments Processed': experiment_count,
            'Folder Total': folder_total_cycles
        })
        grand_total += folder_total_cycles

        print(f"  {'='*76}")
        print(f"  {folder_name} Total ({experiment_count} experiments): {folder_total_cycles:,} cycles")

# Create summary DataFrames
experiments_df = pd.DataFrame(all_experiment_results)
folders_df = pd.DataFrame(numbered_folder_totals)

print("\n" + "="*80)
print("Numbered Folder Totals")
print("="*80)
print(folders_df.to_string(index=False))

print("\n" + "="*80)
print(f"GRAND TOTAL (All PIM experiments): {grand_total:,} cycles")
print("="*80)

# Display folders DataFrame
folders_df


PIM-ONLY Mode Analysis - Decoder-BS64


1st/:
--------------------------------------------------------------------------------
  Exp 2: 1,670,373 cycles (compute: 1,661,405 + act: 8,968)
  Exp 3: 1,597,617 cycles (compute: 1,588,649 + act: 8,968)
  Exp 4: 1,664,896 cycles (compute: 1,655,928 + act: 8,968)
  Exp 5: 1,783,903 cycles (compute: 1,774,935 + act: 8,968)
  Exp 6: 1,665,740 cycles (compute: 1,656,772 + act: 8,968)
  Exp 7: 1,712,284 cycles (compute: 1,703,316 + act: 8,968)
  Exp 8: 1,592,879 cycles (compute: 1,583,911 + act: 8,968)
  Exp 9: 1,688,320 cycles (compute: 1,679,352 + act: 8,968)
  Exp 10: 1,670,373 cycles (compute: 1,661,405 + act: 8,968)
  Exp 11: 1,597,617 cycles (compute: 1,588,649 + act: 8,968)
  Exp 12: 1,664,896 cycles (compute: 1,655,928 + act: 8,968)
  1st Total (11 experiments): 18,308,898 cycles

2nd/:
--------------------------------------------------------------------------------
  Exp 2: 1,667,450 cycles (compute: 1,658,482 + act: 8,968)
  Exp 3: 1,662

Unnamed: 0,Numbered Folder,Experiments Processed,Folder Total
0,1st,11,18308898
1,2nd,11,18485493
2,3rd,11,18488265
3,4th,11,18822798
4,5th,11,18249032
5,6th,11,18219003
6,7th,11,18164694
7,8th,11,18941435
8,9th,11,18811375
9,10th,11,18361702


In [11]:
import pandas as pd
import os
import re
import math

# Base directory
base_dir = 'Decoder-BS64-switch'

# Numbered folders to process
numbered_folders = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

# Experiment sub-folders scanned under <folder>/npu/ and <folder>/pim/ (2..26 inclusive).
HYBRID_EXPERIMENT_IDS = range(2, 27)

# Fraction of the active experts placed on the NPU (rounded up, at least one).
# The original analysis describes it as a bandwidth ratio; numerically it is about 1/17.
NPU_EXPERT_FRACTION = 0.05882

# Substrings identifying non-data lines of the NPU and PIM SA_stage_E.txt reports.
HYBRID_NPU_SKIP_MARKERS = ('Expert Number', 'Expert Operations', 'NPU Expert', 'Decoder')
HYBRID_PIM_SKIP_MARKERS = ('Expert Number', 'Expert Operations', 'Note:', 'Activation',
                           'PIM Expert', 'Decoder', 'BS')


def hybrid_active_experts(layer_content):
    """Return the expert numbers with a token count > 0, in file order.

    A line contributes when it contains ``Expert <number> <count>``.
    NOTE(review): callers treat the first entries as the "top" experts, which
    presumes the layer file is already sorted by usage — confirm.
    """
    active = []
    for line in layer_content.split('\n'):
        match = re.search(r'Expert\s+(\d+)\s+(\d+)', line)
        if match and int(match.group(2)) > 0:
            active.append(int(match.group(1)))
    return active


def hybrid_npu_share(total_active, fraction=NPU_EXPERT_FRACTION):
    """Number of experts to run on the NPU: ceil(fraction * total_active),
    capped at total_active and never below 1."""
    return max(1, min(math.ceil(fraction * total_active), total_active))


def hybrid_npu_time_parts(npu_content, npu_experts):
    """Read the NPU report for the experts assigned to the NPU.

    Returns ``(param_load_sum, last_expert_compute)``: the summed param_load
    (field 1) of rows whose expert is in ``npu_experts``, and the total
    compute (field 5) of the row for ``npu_experts[-1]`` (0 if absent).
    """
    chosen = set(npu_experts)
    last_expert = npu_experts[-1] if npu_experts else None
    param_load_sum = 0
    last_expert_compute = 0

    for line in npu_content.split('\n'):
        if not line.strip() or '=' in line:
            continue
        if any(marker in line for marker in HYBRID_NPU_SKIP_MARKERS):
            continue

        parts = line.split()
        if len(parts) < 6:
            continue

        try:
            expert_num = int(parts[0])
            if expert_num not in chosen:
                continue
            param_load = int(parts[1])
            total_compute = int(parts[5])
        except ValueError:
            continue

        param_load_sum += param_load
        if expert_num == last_expert:
            last_expert_compute = total_compute

    return param_load_sum, last_expert_compute


def hybrid_read_cycles(line, current):
    """Return the integer (truncated) number preceding ``cycles`` in ``line``.

    ``current`` is returned when nothing matches or the captured text is not a
    valid number (the pattern also matches e.g. ``"."``, which ``float`` rejects).
    """
    match = re.search(r'([\d.]+)\s+cycles', line)
    if not match:
        return current
    try:
        return int(float(match.group(1)))
    except (ValueError, OverflowError):
        return current


def hybrid_pim_time_parts(pim_content, npu_experts):
    """Read the PIM report for the experts that are NOT assigned to the NPU.

    Returns ``(compute_sum, activation_1, activation_2)``: the summed total
    compute (field 4) of every data row whose expert is not in
    ``npu_experts``, plus the two activation-movement cycle counts.
    """
    excluded = set(npu_experts)
    activation_1 = 0
    activation_2 = 0
    compute_sum = 0

    for line in pim_content.split('\n'):
        # --- activation movement lines ---
        if 'cycles' in line:
            if 'activation_movement_1' in line:
                activation_1 = hybrid_read_cycles(line, activation_1)
            elif 'activation_movement_2' in line:
                activation_2 = hybrid_read_cycles(line, activation_2)

        # --- expert table rows ---
        if not line.strip() or '=' in line or '-' in line[:3]:
            continue
        if any(marker in line for marker in HYBRID_PIM_SKIP_MARKERS):
            continue

        parts = line.split()
        if len(parts) < 5:
            continue

        try:
            expert_num = int(parts[0])
            if expert_num in excluded:
                continue
            compute_sum += int(parts[4])
        except ValueError:
            continue

    return compute_sum, activation_1, activation_2


# Store results for each experiment
all_experiment_results = []
numbered_folder_totals = []
grand_total = 0

print("="*80)
# The banner is derived from base_dir so it cannot drift from the data actually read.
print(f"HYBRID MODE Analysis (Top H on NPU, Rest on PIM) - {base_dir}")
print("="*80)
print()

# Process each numbered folder
for folder_name in numbered_folders:
    folder_path = os.path.join(base_dir, folder_name)

    if not os.path.exists(folder_path):
        print(f"⚠ {folder_name}/ not found, skipping...")
        continue

    print(f"\n{'='*80}")
    print(f"Processing: {folder_name}/")
    print('='*80)

    folder_total_cycles = 0
    experiment_count = 0

    for exp_num in HYBRID_EXPERIMENT_IDS:
        # Determine which layer file to use (cycling through 2-9)
        layer_num = ((exp_num - 2) % 8) + 2
        layer_file = os.path.join(folder_path, f'layer{layer_num}.txt')

        npu_txt = os.path.join(folder_path, 'npu', str(exp_num), 'SA_stage_E.txt')
        pim_txt = os.path.join(folder_path, 'pim', str(exp_num), 'SA_stage_E.txt')

        # Check if files exist
        if not os.path.exists(layer_file):
            print(f"  Exp {exp_num}: layer{layer_num}.txt not found, skipping...")
            continue
        if not os.path.exists(npu_txt):
            print(f"  Exp {exp_num}: NPU file not found, skipping...")
            continue
        if not os.path.exists(pim_txt):
            print(f"  Exp {exp_num}: PIM file not found, skipping...")
            continue

        # Read layer file to identify the active experts
        with open(layer_file, 'r') as f:
            all_active_experts = hybrid_active_experts(f.read())

        total_active_experts = len(all_active_experts)

        if total_active_experts == 0:
            print(f"  Exp {exp_num}: No active experts found in layer{layer_num}.txt, skipping...")
            continue

        # First N active experts go to the NPU, the rest stay on PIM
        num_experts_on_npu = hybrid_npu_share(total_active_experts)
        top_npu_experts = all_active_experts[:num_experts_on_npu]
        num_experts_on_pim = total_active_experts - num_experts_on_npu

        with open(npu_txt, 'r') as f:
            npu_param_load_sum, last_expert_total_compute = hybrid_npu_time_parts(f.read(), top_npu_experts)

        with open(pim_txt, 'r') as f:
            pim_total_compute_sum, activation_1, activation_2 = hybrid_pim_time_parts(f.read(), top_npu_experts)

        # Calculate experiment total (NPU and PIM run in PARALLEL, so take MAX)
        npu_time = npu_param_load_sum + last_expert_total_compute
        pim_time = pim_total_compute_sum + activation_1 + activation_2
        exp_total = max(npu_time, pim_time)

        folder_total_cycles += exp_total
        experiment_count += 1

        # Store experiment result
        all_experiment_results.append({
            'Numbered Folder': folder_name,
            'Experiment': exp_num,
            'Layer File': f'layer{layer_num}.txt',
            'Total Active': total_active_experts,
            'Experts on NPU': num_experts_on_npu,
            'Experts on PIM': num_experts_on_pim,
            'Top NPU Experts': str(top_npu_experts),
            'NPU param_load': npu_param_load_sum,
            'Last expert compute': last_expert_total_compute,
            'PIM compute': pim_total_compute_sum,
            'Activations': activation_1 + activation_2,
            'Experiment Total': exp_total
        })

        # Print detailed result
        bottleneck = "NPU" if npu_time >= pim_time else "PIM"

        print(f"\n  Experiment {exp_num} (using layer{layer_num}.txt):")
        print(f"    Active experts: {total_active_experts} total → {num_experts_on_npu} on NPU, {num_experts_on_pim} on PIM")
        print(f"    Top {num_experts_on_npu} experts (NPU): {top_npu_experts}")
        print(f"    NPU Time (parallel with PIM):")
        print(f"      - Param load (top {num_experts_on_npu}): {npu_param_load_sum:,} cycles")
        print(f"      - Last expert (#{num_experts_on_npu}) compute: {last_expert_total_compute:,} cycles")
        print(f"      - NPU Total: {npu_time:,} cycles")
        print(f"    PIM Time (parallel with NPU):")
        print(f"      - Total compute ({num_experts_on_pim} experts): {pim_total_compute_sum:,} cycles")
        print(f"      - Activation 1: {activation_1:,} cycles")
        print(f"      - Activation 2: {activation_2:,} cycles")
        print(f"      - PIM Total: {pim_time:,} cycles")
        print(f"    TOTAL (MAX): {exp_total:,} cycles [{bottleneck} is bottleneck]")

    # Store numbered folder total (only when at least one experiment was read)
    if experiment_count > 0:
        numbered_folder_totals.append({
            'Numbered Folder': folder_name,
            'Experiments Processed': experiment_count,
            'Folder Total': folder_total_cycles
        })
        grand_total += folder_total_cycles

        print(f"  {'='*76}")
        print(f"  {folder_name} Total ({experiment_count} experiments): {folder_total_cycles:,} cycles")

# Create summary DataFrames
experiments_df = pd.DataFrame(all_experiment_results)
folders_df = pd.DataFrame(numbered_folder_totals)

print("\n" + "="*80)
print("Numbered Folder Totals")
print("="*80)
print(folders_df.to_string(index=False))

print("\n" + "="*80)
print(f"GRAND TOTAL (Hybrid Mode - All experiments): {grand_total:,} cycles")
print("="*80)

# Display folders DataFrame
folders_df


HYBRID MODE Analysis (Top H on NPU, Rest on PIM) - Decoder-BS16


Processing: 1st/

  Experiment 2 (using layer2.txt):
    Active experts: 39 total → 3 on NPU, 36 on PIM
    Top 3 experts (NPU): [20, 12, 26]
    NPU Time (parallel with PIM):
      - Param load (top 3): 1,070,976 cycles
      - Last expert (#3) compute: 341 cycles
      - NPU Total: 1,071,317 cycles
    PIM Time (parallel with NPU):
      - Total compute (36 experts): 669,331 cycles
      - Activation 1: 3,379 cycles
      - Activation 2: 3,365 cycles
      - PIM Total: 676,075 cycles
    TOTAL (MAX): 1,071,317 cycles [NPU is bottleneck]

  Experiment 3 (using layer3.txt):
    Active experts: 38 total → 3 on NPU, 35 on PIM
    Top 3 experts (NPU): [62, 31, 60]
    NPU Time (parallel with PIM):
      - Param load (top 3): 1,071,104 cycles
      - Last expert (#3) compute: 342 cycles
      - NPU Total: 1,071,446 cycles
    PIM Time (parallel with NPU):
      - Total compute (35 experts): 651,112 cycles
      - Activation 

Unnamed: 0,Numbered Folder,Experiments Processed,Folder Total
0,1st,25,24643283
1,2nd,25,22145637
2,3rd,25,25716101
3,4th,25,25358502
4,5th,25,24644307
5,6th,25,26786390
6,7th,25,25359431
7,8th,25,25715270
8,9th,25,23573288
9,10th,25,24288044


In [27]:
import pandas as pd
import os
import re

# Base directory
base_dir = 'Decoder-BS32'

# Experiment sub-folders scanned under <folder>/npu/ and <folder>/pim/ (2..9 inclusive).
INCR_EXPERIMENT_IDS = range(2, 10)

# Substrings identifying non-data lines of the NPU and PIM SA_stage_E.txt reports.
INCR_NPU_SKIP_MARKERS = ('Expert Number', 'Expert Operations', 'NPU Expert', 'Decoder')
INCR_PIM_SKIP_MARKERS = ('Expert Number', 'Expert Operations', 'Note:', 'Activation',
                         'PIM Expert', 'Decoder', 'BS')


def incr_parse_expert_order(layer_content):
    """Return the expert numbers with a token count > 0, in file order.

    A line contributes when it contains ``Expert <number> <count>``.
    NOTE(review): the analysis treats this order as "top to bottom by usage",
    which presumes the layer file is already sorted — confirm.
    """
    order = []
    for line in layer_content.split('\n'):
        match = re.search(r'Expert\s+(\d+)\s+(\d+)', line)
        if match and int(match.group(2)) > 0:
            order.append(int(match.group(1)))
    return order


def incr_parse_npu_table(npu_content):
    """Map expert number -> ``{'param_load', 'total_compute'}`` from an NPU report.

    Data rows need at least six fields with fields 0, 1 and 5 parsing as
    integers; header/blank/``=`` lines and unparsable rows are ignored.
    """
    npu_data = {}
    for line in npu_content.split('\n'):
        if not line.strip() or '=' in line:
            continue
        if any(marker in line for marker in INCR_NPU_SKIP_MARKERS):
            continue

        parts = line.split()
        if len(parts) < 6:
            continue

        try:
            expert_num = int(parts[0])
            param_load = int(parts[1])
            total_compute = int(parts[5])
        except ValueError:
            continue

        npu_data[expert_num] = {'param_load': param_load, 'total_compute': total_compute}
    return npu_data


def incr_read_cycles(line, current):
    """Return the integer (truncated) number preceding ``cycles`` in ``line``.

    ``current`` is returned when nothing matches or the captured text is not a
    valid number (the pattern also matches e.g. ``"."``, which ``float`` rejects).
    """
    match = re.search(r'([\d.]+)\s+cycles', line)
    if not match:
        return current
    try:
        return int(float(match.group(1)))
    except (ValueError, OverflowError):
        return current


def incr_parse_pim_report(pim_content):
    """Parse a PIM report.

    Returns ``(pim_data, activation_1, activation_2)`` where ``pim_data`` maps
    expert number -> ``{'total_compute'}`` (field 4 of each data row) and the
    activations are the two activation-movement cycle counts (0 if absent).
    """
    pim_data = {}
    activation_1 = 0
    activation_2 = 0

    for line in pim_content.split('\n'):
        # --- activation movement lines ---
        if 'cycles' in line:
            if 'activation_movement_1' in line:
                activation_1 = incr_read_cycles(line, activation_1)
            elif 'activation_movement_2' in line:
                activation_2 = incr_read_cycles(line, activation_2)

        # --- expert table rows ---
        if not line.strip() or '=' in line or '-' in line[:3]:
            continue
        if any(marker in line for marker in INCR_PIM_SKIP_MARKERS):
            continue

        parts = line.split()
        if len(parts) < 5:
            continue

        try:
            expert_num = int(parts[0])
            total_compute = int(parts[4])
        except ValueError:
            continue

        pim_data[expert_num] = {'total_compute': total_compute}

    return pim_data, activation_1, activation_2


def incr_enumerate_splits(expert_order, npu_data, pim_data, activation_1, activation_2):
    """Evaluate every split "first k experts on NPU, rest on PIM" for k = 0..len(expert_order).

    NPU time is the summed param_load of the NPU experts plus the total
    compute of the last of them. PIM time is the summed total compute of the
    remaining experts plus both activation movements, or 0 when every expert
    is on the NPU. Experts missing from a table contribute 0. The two devices
    are modelled as running in parallel, so the split's cost is the larger time.

    Returns:
        A list of dicts with keys ``'Experts on NPU'``, ``'NPU Time'``,
        ``'PIM Time'`` and ``'Total Cycles'``, ordered by increasing k.
    """
    splits = []
    total_active = len(expert_order)

    for num_on_npu in range(0, total_active + 1):
        top_experts = expert_order[:num_on_npu]
        remaining_experts = expert_order[num_on_npu:]

        if num_on_npu == 0:
            npu_time = 0
        else:
            npu_param_load = sum(npu_data[e]['param_load'] for e in top_experts if e in npu_data)
            last_expert = top_experts[-1]
            npu_last_compute = npu_data[last_expert]['total_compute'] if last_expert in npu_data else 0
            npu_time = npu_param_load + npu_last_compute

        if num_on_npu > 0 and num_on_npu == total_active:
            # All on NPU, no PIM usage (and therefore no activation movement)
            pim_time = 0
        else:
            pim_compute = sum(pim_data[e]['total_compute'] for e in remaining_experts if e in pim_data)
            pim_time = pim_compute + activation_1 + activation_2

        total_cycles = pim_time if num_on_npu == 0 else max(npu_time, pim_time)

        splits.append({
            'Experts on NPU': num_on_npu,
            'NPU Time': npu_time,
            'PIM Time': pim_time,
            'Total Cycles': total_cycles
        })

    return splits


print("="*80)
# The banner is derived from base_dir so it cannot drift from the data actually read.
print(f"INCREMENTAL OPTIMIZATION Analysis - {base_dir}")
print("Finding optimal number of experts to move from PIM to NPU")
print("="*80)
print()

# Numbered folders to process
numbered_folders = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

# Store results
all_folder_results = []
optimization_summary = []
numbered_folder_summaries = []

# Process each numbered folder
for folder_name in numbered_folders:
    folder_path = os.path.join(base_dir, folder_name)

    if not os.path.exists(folder_path):
        print(f"⚠ {folder_name}/ not found, skipping...")
        continue

    print(f"\n{'='*80}")
    print(f"Processing: {folder_name}/")
    print('='*80)
    print()

    folder_optimal_total = 0
    experiments_processed = 0

    for exp_num in INCR_EXPERIMENT_IDS:
        # Determine which layer file to use (cycling through 2-9)
        layer_num = ((exp_num - 2) % 8) + 2
        layer_file = os.path.join(folder_path, f'layer{layer_num}.txt')

        npu_txt = os.path.join(folder_path, 'npu', str(exp_num), 'SA_stage_E.txt')
        pim_txt = os.path.join(folder_path, 'pim', str(exp_num), 'SA_stage_E.txt')

        # Experiments with any missing input file are skipped silently
        if not (os.path.exists(layer_file) and os.path.exists(npu_txt) and os.path.exists(pim_txt)):
            continue

        with open(layer_file, 'r') as f:
            expert_order = incr_parse_expert_order(f.read())

        total_active_experts = len(expert_order)

        if total_active_experts == 0:
            continue

        with open(npu_txt, 'r') as f:
            npu_data = incr_parse_npu_table(f.read())

        with open(pim_txt, 'r') as f:
            pim_data, activation_1, activation_2 = incr_parse_pim_report(f.read())

        # Test configurations from 0 to total_active_experts experts on the NPU
        configurations = [
            {'Numbered Folder': folder_name, 'Experiment': exp_num, **split}
            for split in incr_enumerate_splits(expert_order, npu_data, pim_data,
                                               activation_1, activation_2)
        ]

        # Optimal configuration = minimum total cycles (ties go to fewer NPU experts)
        min_config = min(configurations, key=lambda x: x['Total Cycles'])

        print(f"  Exp {exp_num} (layer{layer_num}.txt, {total_active_experts} active experts):")
        print(f"    Optimal: {min_config['Experts on NPU']} experts on NPU")
        print(f"    NPU Time: {min_config['NPU Time']:,} | PIM Time: {min_config['PIM Time']:,}")
        print(f"    Minimum Total (MAX): {min_config['Total Cycles']:,} cycles")

        # Store all configurations for this experiment
        all_folder_results.extend(configurations)

        # Store optimization summary for this experiment
        optimization_summary.append({
            'Numbered Folder': folder_name,
            'Experiment': exp_num,
            'Layer File': f'layer{layer_num}.txt',
            'Total Active Experts': total_active_experts,
            'Optimal Experts on NPU': min_config['Experts on NPU'],
            'NPU Time': min_config['NPU Time'],
            'PIM Time': min_config['PIM Time'],
            'Optimal Total Cycles': min_config['Total Cycles']
        })

        folder_optimal_total += min_config['Total Cycles']
        experiments_processed += 1

    # Store numbered folder summary (only when at least one experiment was read)
    if experiments_processed > 0:
        numbered_folder_summaries.append({
            'Numbered Folder': folder_name,
            'Experiments Processed': experiments_processed,
            'Optimal Total': folder_optimal_total
        })

        print(f"\n  {'='*76}")
        print(f"  {folder_name} Optimal Total ({experiments_processed} experiments): {folder_optimal_total:,} cycles")

# Create DataFrames
all_configs_df = pd.DataFrame(all_folder_results)
optimization_df = pd.DataFrame(optimization_summary)
folders_summary_df = pd.DataFrame(numbered_folder_summaries)

print("\n" + "="*80)
print("Numbered Folder Summaries")
print("="*80)
print(folders_summary_df.to_string(index=False))

# Summed from the records rather than from the DataFrame column: with no
# processed folder the DataFrame has no 'Optimal Total' column at all.
optimal_grand_total = sum(summary['Optimal Total'] for summary in numbered_folder_summaries)

print("\n" + "="*80)
print(f"GRAND TOTAL (All Optimal Configurations): {optimal_grand_total:,} cycles")
print("="*80)

# Display folder summaries
folders_summary_df


INCREMENTAL OPTIMIZATION Analysis - Decoder-BS16
Finding optimal number of experts to move from PIM to NPU


Processing: 1st/

  Exp 2 (layer2.txt, 61 active experts):
    Optimal: 1 experts on NPU
    NPU Time: 353,788 | PIM Time: 411,532
    Minimum Total (MAX): 411,532 cycles
  Exp 3 (layer3.txt, 60 active experts):
    Optimal: 1 experts on NPU
    NPU Time: 353,788 | PIM Time: 392,368
    Minimum Total (MAX): 392,368 cycles
  Exp 4 (layer4.txt, 60 active experts):
    Optimal: 1 experts on NPU
    NPU Time: 353,892 | PIM Time: 403,948
    Minimum Total (MAX): 403,948 cycles
  Exp 5 (layer5.txt, 62 active experts):
    Optimal: 1 experts on NPU
    NPU Time: 353,890 | PIM Time: 434,376
    Minimum Total (MAX): 434,376 cycles
  Exp 6 (layer6.txt, 59 active experts):
    Optimal: 1 experts on NPU
    NPU Time: 353,788 | PIM Time: 409,621
    Minimum Total (MAX): 409,621 cycles
  Exp 7 (layer7.txt, 58 active experts):
    Optimal: 1 experts on NPU
    NPU Time: 353,891 | PIM Time: 415

Unnamed: 0,Numbered Folder,Experiments Processed,Optimal Total
0,1st,8,3268902
1,2nd,8,3279204
2,3rd,8,3284624
3,4th,8,3354621
4,5th,8,3261083
5,6th,8,3269147
6,7th,8,3274886
7,8th,8,3374205
8,9th,8,3357252
9,10th,8,3247134


In [22]:
import pandas as pd
import os
import re
from collections import defaultdict

# Base directory containing one sub-folder per iteration ('1st', '2nd', ...)
base_dir = 'Decoder-BS32-deep'

# Numbered folders to process (iterated in this order by the main loop)
numbered_folders = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

# Cache management (persistent across iterations)
# layer_cache[exp_num] = set of cached expert numbers (at most CACHE_SIZE per layer)
# layer_freq[exp_num][expert_num] = usage frequency
NUM_EXPERTS_PER_LAYER = 64  # experts per layer, as stated by the original notes
# NOTE(review): earlier comments/banner said "16 experts (25%)" while the value
# actually used was 10. The value is kept so results stay reproducible; set it
# to 16 if a 25% cache is what is intended.
CACHE_SIZE = 10  # 10 of 64 experts ≈ 15.6%
layer_cache = defaultdict(set)
layer_freq = defaultdict(lambda: defaultdict(int))

# Banner is derived from CACHE_SIZE so it can never disagree with the constant.
print("="*80)
print("INCREMENTAL OPTIMIZATION Analysis - REUSE AWARE (LFU Cache)")
print("Finding optimal number of experts to move from PIM to NPU")
print(f"Cache: {CACHE_SIZE} experts per layer ({CACHE_SIZE / NUM_EXPERTS_PER_LAYER:.1%}), LFU eviction policy")
print("="*80)
print()

# Store results (one dict per row; converted to DataFrames after the main loop)
all_folder_results = []
optimization_summary = []
numbered_folder_summaries = []

# Process each numbered folder (these are ITERATIONS executed sequentially).
# layer_cache / layer_freq are carried over from one folder to the next, so
# the processing order affects the results.
for folder_name in numbered_folders:
    folder_path = os.path.join(base_dir, folder_name)

    if not os.path.exists(folder_path):
        print(f"⚠ {folder_name}/ not found, skipping...")
        continue

    print(f"\n{'='*80}")
    print(f"Processing: {folder_name}/")
    print('='*80)
    print()

    folder_optimal_total = 0
    experiments_processed = 0

    # Process experiment folders 2-12
    for exp_num in range(2, 13):
        # Determine which layer file to use (cycling through 2-9)
        layer_num = ((exp_num - 2) % 8) + 2
        layer_file = os.path.join(folder_path, f'layer{layer_num}.txt')

        npu_txt = os.path.join(folder_path, 'npu', str(exp_num), 'SA_stage_E.txt')
        pim_txt = os.path.join(folder_path, 'pim', str(exp_num), 'SA_stage_E.txt')

        # Experiments with any missing input file are skipped silently
        if not (os.path.exists(layer_file) and os.path.exists(npu_txt) and os.path.exists(pim_txt)):
            continue

        # Read layer file to get the list of experts in file order
        # (the file is expected to list them top to bottom by usage)
        with open(layer_file, 'r') as f:
            layer_content = f.read()

        expert_order = []
        for line in layer_content.split('\n'):
            match = re.search(r'Expert\s+(\d+)\s+(\d+)', line)
            if match:
                expert_num = int(match.group(1))
                token_count = int(match.group(2))
                # Only include active experts (token_count > 0)
                if token_count > 0:
                    expert_order.append(expert_num)

        total_active_experts = len(expert_order)

        if total_active_experts == 0:
            continue

        # Read NPU data: column 0 = expert number, column 1 = param load,
        # column 5 = total compute. Header/separator lines are filtered out.
        with open(npu_txt, 'r') as f:
            npu_content = f.read()

        npu_data = {}
        for line in npu_content.split('\n'):
            if not line.strip() or '=' in line or 'Expert Number' in line or 'Expert Operations' in line or 'NPU Expert' in line or 'Decoder' in line:
                continue

            parts = line.split()
            if len(parts) >= 6:
                try:
                    expert_num = int(parts[0])
                    param_load = int(parts[1])
                    total_compute = int(parts[5])
                    npu_data[expert_num] = {'param_load': param_load, 'total_compute': total_compute}
                except (ValueError, IndexError):
                    # Non-numeric row: not a data line
                    continue

        # Read PIM data
        with open(pim_txt, 'r') as f:
            pim_content = f.read()

        # Extract activation movements (cycle counts; truncated to int)
        activation_1 = 0
        activation_2 = 0

        for line in pim_content.split('\n'):
            if 'activation_movement_1' in line and 'cycles' in line:
                match = re.search(r'([\d.]+)\s+cycles', line)
                if match:
                    activation_1 = int(float(match.group(1)))
            elif 'activation_movement_2' in line and 'cycles' in line:
                match = re.search(r'([\d.]+)\s+cycles', line)
                if match:
                    activation_2 = int(float(match.group(1)))

        # Per-expert PIM rows: column 0 = expert number, column 4 = total compute
        pim_data = {}
        for line in pim_content.split('\n'):
            if not line.strip() or '=' in line or 'Expert Number' in line or 'Expert Operations' in line or 'Note:' in line or 'Activation' in line or '-' in line[:3] or 'PIM Expert' in line or 'Decoder' in line or 'BS' in line:
                continue

            parts = line.split()
            if len(parts) >= 5:
                try:
                    expert_num = int(parts[0])
                    total_compute = int(parts[4])
                    pim_data[expert_num] = {'total_compute': total_compute}
                except (ValueError, IndexError):
                    # Non-numeric row: not a data line
                    continue

        # Only experts with BOTH NPU and PIM measurements can be scheduled.
        candidate_experts = [exp for exp in expert_order if exp in npu_data and exp in pim_data]
        num_candidates = len(candidate_experts)

        if num_candidates == 0:
            # Previously this case crashed with IndexError on top_experts[-1].
            print(f"  ⚠ Exp {exp_num}: no active expert has both NPU and PIM data, skipping...")
            continue
        if num_candidates < total_active_experts:
            print(f"  ⚠ Exp {exp_num}: {total_active_experts - num_candidates} active expert(s) lack NPU/PIM data and are ignored")

        # Cache for this experiment (use exp_num as cache key, not layer_num)
        exp_cache = layer_cache[exp_num]

        # CACHE-AWARE BENEFIT CALCULATION
        # Calculate benefit score for each active expert
        # Benefit = PIM_time - (NPU_time + param_load_cost)
        # Higher benefit = more worth moving to NPU
        expert_benefits = []

        for exp in candidate_experts:
            is_cached = exp in exp_cache
            # Cached experts pay no parameter-load cost
            param_load_cost = 0 if is_cached else npu_data[exp]['param_load']
            npu_time = npu_data[exp]['total_compute'] + param_load_cost
            benefit = pim_data[exp]['total_compute'] - npu_time

            expert_benefits.append({
                'expert': exp,
                'benefit': benefit,
                'cached': is_cached
            })

        # Sort experts by benefit (descending - highest benefit first)
        # This is the DynaNDE algorithm: prioritize experts with highest benefit
        expert_benefits.sort(key=lambda x: x['benefit'], reverse=True)
        benefit_sorted_experts = [e['expert'] for e in expert_benefits]

        # Store top 5 benefits for debugging/display
        top_5_benefits = expert_benefits[:5]

        # Now test configurations from 0 to num_candidates experts on NPU,
        # using BENEFIT-SORTED ORDER (not layer.txt order).
        # The sweep is bounded by the number of schedulable experts so that no
        # duplicate configurations are produced and "nothing left on PIM" is
        # detected correctly.
        configurations = []

        for num_on_npu in range(0, num_candidates + 1):
            if num_on_npu == 0:
                # All on PIM
                npu_time = 0
                pim_compute = sum(pim_data[exp]['total_compute'] for exp in benefit_sorted_experts)
                pim_time = pim_compute + activation_1 + activation_2
                total_cycles = pim_time
            else:
                # Top num_on_npu experts on NPU (by BENEFIT, not layer.txt order)
                top_experts = benefit_sorted_experts[:num_on_npu]
                remaining_experts = benefit_sorted_experts[num_on_npu:]

                # NPU PARALLEL EXECUTION MODEL:
                # - Param loading (non-cached) and compute (cached) happen in PARALLEL
                # - Last expert compute happens after (sequential)
                total_param_load = 0
                sum_cached_compute = 0

                for exp in top_experts:
                    if exp not in exp_cache:
                        # Non-cached: add param_load
                        total_param_load += npu_data[exp]['param_load']
                    else:
                        # Cached: add compute (happens during param loading phase)
                        sum_cached_compute += npu_data[exp]['total_compute']

                # Last expert compute (always sequential at the end)
                last_expert = top_experts[-1]
                last_expert_compute = npu_data[last_expert]['total_compute']

                # If last expert is cached, subtract it from sum_cached_compute to avoid double counting
                if last_expert in exp_cache:
                    sum_cached_compute -= last_expert_compute

                # Calculate NPU time based on execution model
                if total_param_load == 0:
                    # ALL cached: no loading, all compute sequentially
                    npu_time = sum(npu_data[exp]['total_compute'] for exp in top_experts)
                else:
                    # Mixed: param loading and cached compute in parallel, then last compute
                    npu_time = max(total_param_load, sum_cached_compute) + last_expert_compute

                if remaining_experts:
                    # PIM is active: compute of remaining experts plus activation movement
                    pim_compute = sum(pim_data[exp]['total_compute'] for exp in remaining_experts)
                    pim_time = pim_compute + activation_1 + activation_2
                else:
                    # All on NPU, no PIM usage
                    pim_time = 0

                # Total = MAX because NPU and PIM run simultaneously (PARALLEL)
                total_cycles = max(npu_time, pim_time)

            configurations.append({
                'Numbered Folder': folder_name,
                'Experiment': exp_num,
                'Experts on NPU': num_on_npu,
                'NPU Time': npu_time,
                'PIM Time': pim_time,
                'Total Cycles': total_cycles
            })

        # Find optimal configuration (minimum total cycles; first one wins ties)
        min_config = min(configurations, key=lambda x: x['Total Cycles'])

        # UPDATE CACHE: Add optimal NPU experts to cache and update frequencies
        optimal_npu_count = min_config['Experts on NPU']
        # Track which selected experts are newly added vs already cached
        newly_cached = []
        already_cached = []

        if optimal_npu_count > 0:
            # Use BENEFIT-SORTED order (not layer.txt order)
            optimal_npu_experts = benefit_sorted_experts[:optimal_npu_count]

            for exp in optimal_npu_experts:
                # Increment usage frequency (use exp_num as key)
                layer_freq[exp_num][exp] += 1

                # Add to cache if not already there
                if exp not in exp_cache:
                    exp_cache.add(exp)
                    newly_cached.append(exp)
                else:
                    already_cached.append(exp)

            # LFU EVICTION: If cache exceeds CACHE_SIZE, evict least frequently used
            if len(exp_cache) > CACHE_SIZE:
                # Get all cached experts with their frequencies
                cached_experts_freq = [(exp, layer_freq[exp_num][exp]) for exp in exp_cache]
                # Sort by frequency (ascending) - least frequent first
                cached_experts_freq.sort(key=lambda x: x[1])

                # Evict until cache size = CACHE_SIZE
                num_to_evict = len(exp_cache) - CACHE_SIZE
                evicted = [exp for exp, _ in cached_experts_freq[:num_to_evict]]
                for exp_to_evict in evicted:
                    exp_cache.remove(exp_to_evict)

                cache_status = f"Cache FULL → Evicted {num_to_evict} LFU: {evicted}"
            else:
                cache_status = f"Cache: {len(exp_cache)}/{CACHE_SIZE}"
        else:
            # No NPU experts in optimal config
            cache_status = f"Cache: {len(exp_cache)}/{CACHE_SIZE} (no NPU experts)"

        print(f"  Exp {exp_num} (uses layer{layer_num}.txt, {total_active_experts} active experts):")
        print(f"    [Layer {exp_num} has unique 64 experts, cache key = {exp_num}]")

        # Show top 5 experts by benefit (for verification); C = cached, M = miss
        top_5_str = ", ".join([f"E{e['expert']}({'C' if e['cached'] else 'M'}:{e['benefit']:+,})" for e in top_5_benefits])
        print(f"    Top 5 by benefit: {top_5_str}")

        print(f"    Optimal: {min_config['Experts on NPU']} experts on NPU")
        if optimal_npu_count > 0:
            # Show which experts were selected (benefit-based order)
            print(f"    Selected experts (by benefit): {optimal_npu_experts}")
            print(f"    Cache hits: {len(already_cached)} (experts: {already_cached})")
            print(f"    Cache misses: {len(newly_cached)} (experts: {newly_cached})")
        print(f"    NPU Time: {min_config['NPU Time']:,} | PIM Time: {min_config['PIM Time']:,}")
        print(f"    Minimum Total (MAX): {min_config['Total Cycles']:,} cycles")
        print(f"    {cache_status}")

        # Store all configurations for this experiment
        all_folder_results.extend(configurations)

        # Store optimization summary for this experiment
        optimization_summary.append({
            'Numbered Folder': folder_name,
            'Experiment': exp_num,
            'Layer File': f'layer{layer_num}.txt',
            'Total Active Experts': total_active_experts,
            'Optimal Experts on NPU': min_config['Experts on NPU'],
            'Cache Hits': len(already_cached),
            'Cache Misses': len(newly_cached),
            'Cache Size': len(exp_cache),
            'NPU Time': min_config['NPU Time'],
            'PIM Time': min_config['PIM Time'],
            'Optimal Total Cycles': min_config['Total Cycles']
        })

        folder_optimal_total += min_config['Total Cycles']
        experiments_processed += 1

    # Store numbered folder summary
    if experiments_processed > 0:
        numbered_folder_summaries.append({
            'Numbered Folder': folder_name,
            'Experiments Processed': experiments_processed,
            'Optimal Total': folder_optimal_total
        })

        print(f"\n  {'='*76}")
        print(f"  {folder_name} Optimal Total ({experiments_processed} experiments): {folder_optimal_total:,} cycles")

# Create DataFrames. Columns are listed explicitly (same names and order as the
# record dicts built in the loop) so the frames keep their schema even when no
# folder or experiment was processed; without this, the column lookups below
# raise KeyError on an empty run.
all_configs_df = pd.DataFrame(all_folder_results, columns=[
    'Numbered Folder', 'Experiment', 'Experts on NPU',
    'NPU Time', 'PIM Time', 'Total Cycles',
])
optimization_df = pd.DataFrame(optimization_summary, columns=[
    'Numbered Folder', 'Experiment', 'Layer File', 'Total Active Experts',
    'Optimal Experts on NPU', 'Cache Hits', 'Cache Misses', 'Cache Size',
    'NPU Time', 'PIM Time', 'Optimal Total Cycles',
])
folders_summary_df = pd.DataFrame(numbered_folder_summaries, columns=[
    'Numbered Folder', 'Experiments Processed', 'Optimal Total',
])

# int(...) normalises the sum (0 for an empty column) before ',' formatting.
grand_total_reuse = int(folders_summary_df['Optimal Total'].sum())

print("\n" + "="*80)
print("Numbered Folder Summaries")
print("="*80)
print(folders_summary_df.to_string(index=False))

print("\n" + "="*80)
print(f"GRAND TOTAL (All Optimal Configurations): {grand_total_reuse:,} cycles")
print("="*80)

# Cache Statistics Summary
print("\n" + "="*80)
print("Cache Statistics After All Iterations")
print("="*80)
cache_stats = []
for exp_num in sorted(layer_cache.keys()):
    cached_experts = layer_cache[exp_num]
    freq_by_expert = layer_freq[exp_num]
    # Total Uses counts every selection ever made for this layer, including
    # experts that were later evicted.
    total_uses = sum(freq_by_expert.values())
    # Avg Frequency is taken over the experts currently in the cache only;
    # dividing total_uses by the cache size would inflate it after evictions.
    cached_uses = sum(freq_by_expert.get(exp, 0) for exp in cached_experts)
    avg_freq = cached_uses / len(cached_experts) if cached_experts else 0
    # Determine which layer.txt file this experiment uses
    display_layer_num = ((exp_num - 2) % 8) + 2
    cache_stats.append({
        'Exp (Layer)': f'Exp {exp_num} (layer{display_layer_num}.txt)',
        'Cached Experts': len(cached_experts),
        'Total Uses': total_uses,
        'Avg Frequency': f'{avg_freq:.1f}'
    })

cache_stats_df = pd.DataFrame(
    cache_stats,
    columns=['Exp (Layer)', 'Cached Experts', 'Total Uses', 'Avg Frequency'],
)
print(cache_stats_df.to_string(index=False))

print("\n" + "="*80)
total_cache_hits = int(optimization_df['Cache Hits'].sum())
total_cache_misses = int(optimization_df['Cache Misses'].sum())
total_cache_lookups = total_cache_hits + total_cache_misses
# Guard against division by zero when nothing was ever placed on the NPU
hit_rate = (total_cache_hits / total_cache_lookups * 100) if total_cache_lookups > 0 else 0
print(f"Total Cache Hits: {total_cache_hits} | Total Cache Misses: {total_cache_misses}")
print(f"Cache Hit Rate: {hit_rate:.2f}%")
print("="*80)

# Final Summary
print("\n" + "="*80)
print("FINAL SUMMARY - REUSE AWARE OPTIMIZATION")
print("="*80)
print(f"GRAND TOTAL (All Iterations with Cache): {grand_total_reuse:,} cycles")
print(f"Total Experiments Processed: {optimization_df.shape[0]}")
print(f"Cache Hit Rate: {hit_rate:.2f}%")
print("="*80)

# Display folder summaries
folders_summary_df


INCREMENTAL OPTIMIZATION Analysis - REUSE AWARE (LFU Cache)
Finding optimal number of experts to move from PIM to NPU
Cache: 16 experts per layer (25%), LFU eviction policy


Processing: 1st/

  Exp 2 (uses layer2.txt, 61 active experts):
    [Layer 2 has unique 64 experts, cache key = 2]
    Top 5 by benefit: E21(M:-1,368,762), E63(M:-1,369,282), E54(M:-1,369,326), E19(M:-1,369,346), E12(M:-1,369,347)
    Optimal: 1 experts on NPU
    Selected experts (by benefit): [21]
    Cache hits: 0 (experts: [])
    Cache misses: 1 (experts: [21])
    NPU Time: 1,415,092 | PIM Time: 1,624,043
    Minimum Total (MAX): 1,624,043 cycles
    Cache: 1/10
  Exp 3 (uses layer3.txt, 60 active experts):
    [Layer 3 has unique 64 experts, cache key = 3]
    Top 5 by benefit: E26(M:-1,368,834), E48(M:-1,368,950), E6(M:-1,368,998), E4(M:-1,369,557), E15(M:-1,369,654)
    Optimal: 1 experts on NPU
    Selected experts (by benefit): [26]
    Cache hits: 0 (experts: [])
    Cache misses: 1 (experts: [26])
   

Unnamed: 0,Numbered Folder,Experiments Processed,Optimal Total
0,1st,11,17690677
1,2nd,11,17586976
2,3rd,11,17229681
3,4th,11,17321750
4,5th,11,16539276
5,6th,11,16210414
6,7th,11,16115195
7,8th,11,16364310
8,9th,11,15887652
9,10th,11,15555431
