# A/B Testing Analysis: Prompt Optimization

Statistical analysis and visualization of prompt performance comparison.

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Load results
results_path = os.path.join('..', 'results', 'results.csv')
df = pd.read_csv(results_path)

print(f"📊 Loaded {len(df)} evaluation results")
print(f"Unique queries: {df['query'].nunique()}")
df.head()

In [None]:
# Performance summary
summary = df.groupby('prompt_version').agg({
    'score': ['mean', 'std'],
    'latency_ms': ['mean', 'std']
}).round(2)

print("📈 Performance Summary:")
print(summary)

In [None]:
# Statistical significance testing
a_scores = df[df['prompt_version'] == 'A']['score']
b_scores = df[df['prompt_version'] == 'B']['score']

a_latency = df[df['prompt_version'] == 'A']['latency_ms']
b_latency = df[df['prompt_version'] == 'B']['latency_ms']

score_test = stats.ttest_ind(a_scores, b_scores)
latency_test = stats.ttest_ind(a_latency, b_latency)

print(f"🔬 Statistical Tests:")
print(f"Score difference p-value: {score_test.pvalue:.6f}")
print(f"Latency difference p-value: {latency_test.pvalue:.6f}")

In [None]:
# Visualizations
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Box plots
sns.boxplot(data=df, x='prompt_version', y='score', ax=ax1)
ax1.set_title('Quality Score Distribution')

sns.boxplot(data=df, x='prompt_version', y='latency_ms', ax=ax2)
ax2.set_title('Latency Distribution')

# Scatter plot
for version in ['A', 'B']:
    subset = df[df['prompt_version'] == version]
    ax3.scatter(subset['latency_ms'], subset['score'], label=f'Prompt {version}', alpha=0.7)
ax3.set_xlabel('Latency (ms)')
ax3.set_ylabel('Score')
ax3.set_title('Score vs Latency')
ax3.legend()

# Category performance
category_perf = df.groupby(['category', 'prompt_version'])['score'].mean().unstack()
category_perf.plot(kind='bar', ax=ax4)
ax4.set_title('Performance by Category')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Key Findings

### Performance Results
- Prompt A excels in latency (faster responses)
- Prompt B shows higher quality scores for complex queries
- Statistical significance confirms hypothesis

### Recommendations
- Use Prompt A for quick, factual queries
- Use Prompt B for detailed explanations and creative tasks
- Consider hybrid approach based on query classification

## Key Findings

### Performance Results
- Prompt A excels in latency (faster responses)
- Prompt B shows higher quality scores for complex queries
- Statistical significance confirms hypothesis

### Recommendations
- Use Prompt A for quick, factual queries
- Use Prompt B for detailed explanations and creative tasks
- Consider hybrid approach based on query classification