# Performance Analysis - Validation Rule Search System

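This notebook benchmarks the validation rule search system (startup time, memory usage, search latency per mode, filter impact, and database/index performance) and visualizes the results.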

In [None]:
# Import required libraries
import time
import psutil
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Add project to path
sys.path.append('..')

from app import boot
from rag.search.config import SearchMode
from db.manager import DatabaseManager
from config import SQLITE_DB_PATH, SQLITE_TABLE_NAME

print("Libraries imported successfully")

## 1. System Initialization

In [None]:
# Measure startup time.
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by wall-clock adjustments the way time.time()
# can; it is also the clock used for the latency measurements in this notebook.
print("Initializing system...")
start_time = time.perf_counter()
app, retriever = boot()
startup_time = time.perf_counter() - start_time

print(f"✓ System initialized in {startup_time:.3f} seconds")

# Initialize database manager
db_manager = DatabaseManager(SQLITE_DB_PATH, SQLITE_TABLE_NAME)

## 2. Memory Usage Analysis

In [None]:
def get_memory_stats():
    """Return memory usage figures for the current Python process.

    Returns:
        dict with three keys:
            ``rss_mb``  - ``memory_info().rss`` divided by 1024 twice,
            ``vms_mb``  - ``memory_info().vms`` divided by 1024 twice,
            ``percent`` - the value of ``Process.memory_percent()``.
    """
    current_process = psutil.Process(os.getpid())
    usage = current_process.memory_info()

    stats = {}
    stats['rss_mb'] = usage.rss / 1024 / 1024
    stats['vms_mb'] = usage.vms / 1024 / 1024
    stats['percent'] = current_process.memory_percent()
    return stats

memory_stats = get_memory_stats()

# Create memory usage plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Memory breakdown: the two sizes reported by get_memory_stats(), in MB
memory_data = pd.DataFrame({
    'Type': ['RSS', 'VMS'],
    'MB': [memory_stats['rss_mb'], memory_stats['vms_mb']]
})
sns.barplot(data=memory_data, x='Type', y='MB', ax=ax1, palette='viridis')
ax1.set_title('Memory Usage by Type')
ax1.set_ylabel('Memory (MB)')

# Memory percentage.
# memory_stats['percent'] comes from psutil's Process.memory_percent(), i.e.
# the share of system memory held by *this process*. The remainder is therefore
# "everything else" (other processes plus free memory), not "available"
# memory, so the slices are labelled accordingly.
process_share = memory_stats['percent']
ax2.pie([process_share, 100 - process_share],
        labels=['This Process', 'Rest of System'],
        autopct='%1.1f%%',
        colors=['#ff9999', '#66b3ff'])
ax2.set_title('Process Share of System Memory')

plt.tight_layout()
plt.show()

print(f"\nMemory Statistics:")
print(f"  RSS Memory: {memory_stats['rss_mb']:.1f} MB")
print(f"  Virtual Memory: {memory_stats['vms_mb']:.1f} MB")
print(f"  Memory Percent: {memory_stats['percent']:.2f}%")

## 3. Search Latency Analysis

In [None]:
def measure_search_performance(num_queries=30):
    """Time ``retriever.search_rules`` once per query for every search mode.

    A fixed pool of sample queries is repeated cyclically until
    ``num_queries`` queries are available. Each query is then executed with
    ``top_k=10`` in each mode and timed with ``time.perf_counter``.

    Args:
        num_queries: Number of queries to time per search mode.

    Returns:
        pandas.DataFrame with one column per mode ('Hybrid', 'Keyword',
        'Semantic', 'Fuzzy') and one row per query; values are latencies in
        milliseconds.
    """
    sample_pool = [
        "IBAN", "BIC", "currency", "amount", "date",
        "validate IBAN", "check currency", "payment amount",
        "MT103", "MT202", "pacs.008", "pain.001", "ISO20022",
        "BANSTA error", "validation failed", "invalid format"
    ]

    # Cycle through the pool until exactly num_queries queries are collected
    pool_size = len(sample_pool)
    workload = [sample_pool[i % pool_size] for i in range(num_queries)]

    modes_by_label = {
        'Hybrid': SearchMode.HYBRID,
        'Keyword': SearchMode.KEYWORD,
        'Semantic': SearchMode.SEMANTIC,
        'Fuzzy': SearchMode.FUZZY,
    }

    timings = {}
    for label, search_mode in modes_by_label.items():
        print(f"Testing {label} mode...")
        samples_ms = []

        for text in workload:
            began = time.perf_counter()
            _ = retriever.search_rules(query=text, mode=search_mode, top_k=10)
            elapsed_ms = (time.perf_counter() - began) * 1000
            samples_ms.append(elapsed_ms)

        timings[label] = samples_ms

    return pd.DataFrame(timings)

# Run the latency benchmark with 30 queries per search mode
latency_df = measure_search_performance(num_queries=30)
print("\nSearch performance measured successfully")

In [None]:
# Create latency visualizations: four views of latency_df on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: box plot per search mode, log-scaled latency axis
latency_df.boxplot(ax=axes[0, 0])
axes[0, 0].set(
    title='Latency Distribution by Search Mode',
    ylabel='Latency (ms)',
    yscale='log',
)

# Top-right: violin plot over the long-format (Mode, Latency) table
melted_df = latency_df.melt(var_name='Mode', value_name='Latency')
sns.violinplot(data=melted_df, x='Mode', y='Latency', ax=axes[0, 1])
axes[0, 1].set(
    title='Latency Distribution (Violin Plot)',
    ylabel='Latency (ms)',
    yscale='log',
)

# Bottom-left: median / P95 / P99 per mode as grouped bars
percentiles = latency_df.describe(percentiles=[.5, .75, .95, .99]).transpose()
percentiles.loc[:, ['50%', '95%', '99%']].plot(kind='bar', ax=axes[1, 0])
axes[1, 0].set(
    title='Latency Percentiles Comparison',
    ylabel='Latency (ms)',
    xlabel='Search Mode',
)
axes[1, 0].legend(title='Percentile')

# Bottom-right: mean and standard deviation per mode
mode_means = latency_df.mean()
mode_stds = latency_df.std()
stats_df = pd.DataFrame({'Mean': mode_means, 'Std': mode_stds})
stats_df.plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set(
    title='Mean Latency and Standard Deviation',
    ylabel='Latency (ms)',
    xlabel='Search Mode',
)

plt.tight_layout()
plt.show()

# Print statistics as a fixed-width table: one row per search mode
print("\nLatency Statistics (milliseconds):")
print("="*60)
print(" ".join([f"{'Mode':<10}"] + [f"{heading:>8}" for heading in ('Mean', 'Median', 'P95', 'P99', 'Max')]))
print("-"*60)
for col in latency_df.columns:
    series = latency_df[col]
    figures = (
        series.mean(),
        series.median(),
        series.quantile(0.95),
        series.quantile(0.99),
        series.max(),
    )
    print(" ".join([f"{col:<10}"] + [f"{value:>8.1f}" for value in figures]))

## 4. Filter Performance Impact

In [None]:
def measure_filter_impact():
    """Measure how search filters affect hybrid-search latency.

    The query "payment validation" is run 20 times per configuration in
    ``SearchMode.HYBRID`` with ``top_k=10``:

    * 'No Filters'       - always measured.
    * 'Single Filter'    - first ``rule_type`` option, measured only when
      ``retriever.filter_options`` has a non-empty 'rule_type' entry.
    * 'Multiple Filters' - first ``rule_type`` and first ``country`` option,
      measured only when both entries are non-empty.

    Returns:
        pandas.DataFrame with one row per measured configuration and the
        columns 'Configuration', 'Mean' and 'P95' (latencies in milliseconds).
    """
    runs_per_config = 20
    test_query = "payment validation"
    filter_options = retriever.filter_options

    def _summarize(configuration, **filters):
        """Time repeated searches with ``filters`` applied; return a result row."""
        latencies = []
        for _ in range(runs_per_config):
            start = time.perf_counter()
            retriever.search_rules(
                query=test_query,
                mode=SearchMode.HYBRID,
                top_k=10,
                **filters
            )
            latencies.append((time.perf_counter() - start) * 1000)
        return {
            'Configuration': configuration,
            'Mean': np.mean(latencies),
            'P95': np.percentile(latencies, 95)
        }

    # Baseline without any filter
    results = [_summarize('No Filters')]

    rule_types = filter_options.get('rule_type')
    if rule_types:
        results.append(_summarize('Single Filter', rule_type=[rule_types[0]]))

        # The multi-filter case needs a rule_type as well, hence the nesting
        countries = filter_options.get('country')
        if countries:
            results.append(_summarize(
                'Multiple Filters',
                rule_type=[rule_types[0]],
                country=[countries[0]],
            ))

    return pd.DataFrame(results)

# Measure filter impact
filter_df = measure_filter_impact()

# Visualize: one bar chart per metric, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

by_configuration = filter_df.set_index('Configuration')
for metric_ax, metric, bar_color in ((ax1, 'Mean', 'skyblue'), (ax2, 'P95', 'salmon')):
    by_configuration[metric].plot(kind='bar', ax=metric_ax, color=bar_color)
    metric_ax.set_title(f'{metric} Latency by Filter Configuration')
    metric_ax.set_ylabel('Latency (ms)')
    # The configuration names on the ticks are self-explanatory; drop the axis label
    metric_ax.set_xlabel('')

plt.tight_layout()
plt.show()

print("\nFilter Performance Impact:")
print(filter_df.to_string(index=False))

## 5. Database and Index Performance

In [None]:
# Database read performance: 20 timed calls to db_manager.get_rules()
db_read_times = []
for _ in range(20):
    began = time.perf_counter()
    rules = db_manager.get_rules()
    elapsed_ms = (time.perf_counter() - began) * 1000
    db_read_times.append(elapsed_ms)

# Index build performance: 5 timed calls to retriever._build_indices()
index_build_times = []
for _ in range(5):
    began = time.perf_counter()
    retriever._build_indices()
    elapsed_ms = (time.perf_counter() - began) * 1000
    index_build_times.append(elapsed_ms)

# Summary figures shared by the plot annotations and the printout below
db_read_mean = np.mean(db_read_times)
db_read_p95 = np.percentile(db_read_times, 95)
index_build_mean = np.mean(index_build_times)
index_build_std = np.std(index_build_times)

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left: histogram of read times with mean and P95 markers
ax1.hist(db_read_times, bins=15, edgecolor='black', alpha=0.7)
ax1.axvline(db_read_mean, color='red', linestyle='--', label=f'Mean: {db_read_mean:.1f}ms')
ax1.axvline(db_read_p95, color='orange', linestyle='--', label=f'P95: {db_read_p95:.1f}ms')
ax1.set(title='Database Read Performance', xlabel='Time (ms)', ylabel='Frequency')
ax1.legend()

# Right: histogram of index build times with mean marker
ax2.hist(index_build_times, bins=10, edgecolor='black', alpha=0.7, color='green')
ax2.axvline(index_build_mean, color='red', linestyle='--', label=f'Mean: {index_build_mean:.1f}ms')
ax2.set(title='Index Build Performance', xlabel='Time (ms)', ylabel='Frequency')
ax2.legend()

plt.tight_layout()
plt.show()

print(f"\nDatabase Performance:")
print(f"  Read Mean: {db_read_mean:.2f} ms")
print(f"  Read P95: {db_read_p95:.2f} ms")
print(f"  Rules Count: {len(rules)}")
print(f"\nIndex Build Performance:")
print(f"  Mean: {index_build_mean:.2f} ms")
print(f"  Std: {index_build_std:.2f} ms")

## 6. Performance Summary Report

In [None]:
# Generate comprehensive report.
# Summary figures are computed once and reused for both the text report and
# the JSON export so the two outputs cannot disagree.
rule_count = len(rules)
read_mean_ms = np.mean(db_read_times)
read_p95_ms = np.percentile(db_read_times, 95)
build_mean_ms = np.mean(index_build_times)
build_std_ms = np.std(index_build_times)

report_header = f"""
PERFORMANCE ANALYSIS REPORT
{'='*60}

1. SYSTEM INITIALIZATION
   - Startup Time: {startup_time:.3f} seconds
   - Total Rules: {rule_count}

2. MEMORY USAGE
   - RSS Memory: {memory_stats['rss_mb']:.1f} MB
   - Virtual Memory: {memory_stats['vms_mb']:.1f} MB
   - System Memory %: {memory_stats['percent']:.2f}%

3. SEARCH LATENCY (ms)
   Mode      Mean    Median   P95      P99      Max
   {'-'*55}
"""

# One fixed-width row per search mode; joined once instead of repeated "+="
latency_rows = []
for col in latency_df.columns:
    series = latency_df[col]
    latency_rows.append(
        f"   {col:<9} {series.mean():>6.1f}  {series.median():>6.1f}  "
        f"{series.quantile(0.95):>6.1f}  {series.quantile(0.99):>6.1f}  "
        f"{series.max():>6.1f}\n"
    )

report_footer = f"""
4. FILTER PERFORMANCE IMPACT
{filter_df.to_string(index=False, float_format='%.1f')}

5. DATABASE PERFORMANCE
   - Read Mean: {read_mean_ms:.2f} ms
   - Read P95: {read_p95_ms:.2f} ms

6. INDEX BUILD PERFORMANCE
   - Mean Time: {build_mean_ms:.2f} ms
   - Std Dev: {build_std_ms:.2f} ms

{'='*60}
Report generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

report = report_header + "".join(latency_rows) + report_footer

print(report)

# Save report to file.
# An explicit encoding keeps the output independent of the platform's locale.
with open('performance_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)
print("\n✓ Report saved to performance_report.txt")

# Save data to JSON
performance_data = {
    'startup_time': startup_time,
    'memory': memory_stats,
    'latency_stats': latency_df.describe().to_dict(),
    'filter_impact': filter_df.to_dict('records'),
    'database': {
        'read_mean_ms': read_mean_ms,
        'read_p95_ms': read_p95_ms,
        'num_rules': rule_count
    },
    'index_build': {
        'mean_ms': build_mean_ms,
        'std_ms': build_std_ms
    }
}

# default=str is a fallback: any value json cannot encode natively is written
# as its string representation instead of raising.
with open('performance_data.json', 'w', encoding='utf-8') as f:
    json.dump(performance_data, f, indent=2, default=str)
print("✓ Data saved to performance_data.json")

## 7. Performance Recommendations

In [None]:
# Analyze and provide recommendations
recommendations = []

# Each entry: (threshold exceeded?, message when exceeded, message otherwise).
# Covers startup time, resident memory, hybrid P95 latency and keyword mean latency.
threshold_checks = [
    (
        startup_time > 1.0,
        "⚠️ Startup time exceeds 1 second. Consider lazy loading or caching indices.",
        "✓ Startup time is acceptable.",
    ),
    (
        memory_stats['rss_mb'] > 1024,
        "⚠️ Memory usage exceeds 1GB. Consider optimizing data structures.",
        "✓ Memory usage is within acceptable limits.",
    ),
    (
        latency_df['Hybrid'].quantile(0.95) > 1000,
        "⚠️ Hybrid search P95 exceeds 1 second. Consider caching or optimization.",
        "✓ Hybrid search latency is acceptable.",
    ),
    (
        latency_df['Keyword'].mean() > 10,
        "⚠️ Keyword search is slower than expected. Check BM25 implementation.",
        "✓ Keyword search performance is excellent.",
    ),
]
for exceeded, warning_text, ok_text in threshold_checks:
    recommendations.append(warning_text if exceeded else ok_text)

# Check filter impact: compare the first (unfiltered) row against the last
# measured configuration; skipped when only one configuration was measured.
if len(filter_df) > 1:
    baseline_mean = filter_df.iloc[0]['Mean']
    filtered_mean = filter_df.iloc[-1]['Mean']
    speedup = baseline_mean / filtered_mean
    recommendations.append(
        f"✓ Filters provide {speedup:.1f}x speedup."
        if speedup > 1.5
        else "⚠️ Filters don't significantly improve performance."
    )

print("\nPERFORMANCE RECOMMENDATIONS")
print("="*60)
for position, message in enumerate(recommendations, start=1):
    print(f"{position}. {message}")

print("\n" + "="*60)
print("Analysis complete!")