### Phase 5: Model Comparison Patterns

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
from scipy import stats
from scipy.stats import pearsonr, spearmanr, chi2_contingency


In [14]:
# One (model label, CSV path) pair per model under comparison.
_model_sources = (
    ('Claude_3.5_Sonnet', '../Data/qna_dataset_Claude3.5Sonnet_final.csv'),
    ('GPT_3.5', '../Data/qna_dataset_GPT3.5_final.csv'),
    ('GPT_4o', '../Data/qna_dataset_GPT4o_final.csv'),
)

# Index-aligned lists: model_names[i] is loaded from file_paths[i].
model_names = [label for label, _csv_path in _model_sources]
file_paths = [csv_path for _label, csv_path in _model_sources]

# Loaded DataFrames keyed by model label; starts empty and is filled by the loading cell.
datasets = dict()

In [15]:
# Load each model's CSV into `datasets`, keyed by model name.
#
# zip() stops at the shorter input without complaint. Because a later cell rebinds
# `model_names` to only the models that loaded, re-running this cell after a partial
# failure would otherwise pair names with the wrong files. Fail loudly instead.
if len(model_names) != len(file_paths):
    raise ValueError(
        f"model_names has {len(model_names)} entries but file_paths has "
        f"{len(file_paths)}; they must be index-aligned"
    )

for model_name, file_path in zip(model_names, file_paths):
    try:
        df = pd.read_csv(file_path)
    except (OSError, ValueError) as e:
        # Best-effort load: report the failure and continue with the other models.
        # OSError covers missing/unreadable files; pandas' parser and empty-file
        # errors, as well as decoding errors, are ValueError subclasses.
        print(f"❌ Error loading {model_name}: {e}")
    else:
        datasets[model_name] = df
        print(f"✓ {model_name}: {df.shape[0]} rows × {df.shape[1]} columns")

✓ Claude_3.5_Sonnet: 400 rows × 16 columns
✓ GPT_3.5: 400 rows × 16 columns
✓ GPT_4o: 400 rows × 16 columns


In [16]:
# Restrict the comparison to the models that are actually present in `datasets`.
model_names = list(datasets.keys())
question_ids = {}

# Collect the distinct question IDs seen in each model's dataset.
for model_name, df in datasets.items():
    question_ids[model_name] = set(df['question_id'])

# Find common questions.
# set.intersection() needs at least one set to operate on; unpacking an empty dict
# would raise TypeError, so "no datasets" is treated as "no common questions".
if question_ids:
    common_questions = set.intersection(*question_ids.values())
else:
    common_questions = set()

print("📊 Analysis Setup:")
print(f"  Common questions across all models: {len(common_questions)}")
print(f"  Models being compared: {model_names}")

if len(common_questions) == 0:
    print("❌ No common questions found - cannot perform head-to-head analysis")

📊 Analysis Setup:
  Common questions across all models: 400
  Models being compared: ['Claude_3.5_Sonnet', 'GPT_3.5', 'GPT_4o']


#### Same question, different models - find disagreements

In [17]:
disagreement_analysis = {}
agreement_patterns = {}


def _preview(text, limit=100):
    """Return `text` as a string, shortened to `limit` characters.

    The trailing "..." is added only when something was actually cut off.
    """
    text = str(text)
    return text if len(text) <= limit else text[:limit] + "..."


def _pct(count, total):
    """Return `count` as a percentage of `total`, or 0.0 when `total` is 0."""
    return count / total * 100 if total else 0.0


# Index every dataset by question_id once, so each lookup below is a direct label
# access rather than a full boolean-mask scan per (question, model) pair.
# drop_duplicates keeps the first occurrence, i.e. the same row `.iloc[0]` on a
# filtered frame would select if an ID appears more than once.
rows_by_question = {
    model_name: datasets[model_name]
    .drop_duplicates(subset='question_id')
    .set_index('question_id')
    for model_name in model_names
}

# Create combined dataset for analysis: one record per common question holding the
# question metadata plus each model's outcome.
combined_results = []

for question_id in sorted(common_questions):
    question_results = {'question_id': question_id}

    # Get question text and metadata (from first model)
    first_model = model_names[0]
    question_row = rows_by_question[first_model].loc[question_id]
    question_results.update({
        'question_text': question_row['question_text'],
        'domain': question_row['domain'],
        'question_length': question_row['question_length'],
        'question_type': question_row['question_type'],
        'question_style': question_row['question_style']
    })

    # Get results from each model
    model_results = {}
    for model_name in model_names:
        model_row = rows_by_question[model_name].loc[question_id]
        model_results[model_name] = {
            'hallucination': model_row['hallucination_present'],
            'factscore': model_row['factscore'],
            'response_length': model_row['response_length'],
            # Falls back to False when the dataset has no citation_present column.
            'citation_present': model_row.get('citation_present', False)
        }

    question_results['model_results'] = model_results
    combined_results.append(question_results)

# Analyze agreement patterns
total_questions = len(combined_results)

perfect_agreement = 0      # every model has the same hallucination label
complete_disagreement = 0  # as many distinct labels as there are models
partial_disagreement = 0   # anything in between

disagreement_details = []

for result in combined_results:
    halluc_results = [result['model_results'][model]['hallucination'] for model in model_names]
    unique_results = set(halluc_results)

    if len(unique_results) == 1:
        perfect_agreement += 1
        continue

    # NOTE(review): "complete" needs as many distinct labels as models. If
    # hallucination_present is a two-valued flag this can only occur when exactly
    # two models are compared - confirm that this is the intended definition.
    if len(unique_results) == len(model_names):
        complete_disagreement += 1
        disagreement_type = 'complete'
    else:
        partial_disagreement += 1
        disagreement_type = 'partial'

    disagreement_details.append({
        'question_id': result['question_id'],
        'domain': result['domain'],
        'question_text': _preview(result['question_text']),
        'type': disagreement_type,
        'model_results': result['model_results']
    })

print(f"  📊 Agreement Statistics:")
print(f"    Perfect agreement: {perfect_agreement:3d}/{total_questions} ({_pct(perfect_agreement, total_questions):5.1f}%)")
print(f"    Partial disagreement: {partial_disagreement:3d}/{total_questions} ({_pct(partial_disagreement, total_questions):5.1f}%)")
print(f"    Complete disagreement: {complete_disagreement:3d}/{total_questions} ({_pct(complete_disagreement, total_questions):5.1f}%)")

agreement_patterns = {
    'perfect_agreement': perfect_agreement,
    'partial_disagreement': partial_disagreement,
    'complete_disagreement': complete_disagreement,
    'total_questions': total_questions
}

  📊 Agreement Statistics:
    Perfect agreement: 320/400 ( 80.0%)
    Partial disagreement:  80/400 ( 20.0%)
    Complete disagreement:   0/400 (  0.0%)


In [18]:
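# 80% Perfect Agreement = All 3 models received the same hallucination label (all hallucinated OR none did) on 320 questions
# 20% Partial Disagreement = On 80 questions, at least one model hallucinated while at least one other did not
# 0% Complete Disagreement = No question had all 3 models disagree with each other. If hallucination_present is a
#   binary flag, this is guaranteed rather than a finding: three models cannot produce three distinct values from two labels.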

In [19]:
 print(f"\n📋 Examples of Model Disagreements:")
print("-" * 55)

# Show top 5 disagreements
disagreement_examples = disagreement_details[:5]

for i, example in enumerate(disagreement_examples, 1):
    print(f"\n  Example {i} ({example['type']} disagreement):")
    print(f"    Question ID: {example['question_id']}")
    print(f"    Domain: {example['domain']}")
    print(f"    Question: {example['question_text']}")
    print(f"    Model results:")
    
    for model_name in model_names:
        halluc = example['model_results'][model_name]['hallucination']
        factscore = example['model_results'][model_name]['factscore']
        halluc_symbol = "❌" if halluc else "✅"
        print(f"      {model_name:15}: {halluc_symbol} {factscore}")


📋 Examples of Model Disagreements:
-------------------------------------------------------

  Example 1 (partial disagreement):
    Question ID: 1
    Domain: General Knowledge
    Question: What is the most common bird in the world?...
    Model results:
      Claude_3.5_Sonnet: ❌ somewhat inaccurate
      GPT_3.5        : ✅ completely right
      GPT_4o         : ✅ completely right

  Example 2 (partial disagreement):
    Question ID: 4
    Domain: General Knowledge
    Question: "In a 2007 interview, which actor 'animatedly' bemoaned ""I hate that cat! Ever since I did that cat...
    Model results:
      Claude_3.5_Sonnet: ✅ somewhat correct
      GPT_3.5        : ❌ totally wrong
      GPT_4o         : ✅ completely right

  Example 3 (partial disagreement):
    Question ID: 10
    Domain: General Knowledge
    Question: Which role is being played in a recently released film by the actor whose previous roles include Tim...
    Model results:
      Claude_3.5_Sonnet: ✅ somewhat corr

#### Questions where all models struggle

In [22]:
def _print_distribution(tally, total):
    """Print each (label, count) of a Counter, most common first, with its share of `total`."""
    for label, count in tally.most_common():
        pct = (count / total) * 100
        print(f"      {label:15}: {count:2d} ({pct:5.1f}%)")


# Metadata copied into each "difficult question" record, in output order.
_difficult_fields = ('question_id', 'domain', 'question_text', 'question_length', 'question_type')

# A question is "universally difficult" when every compared model has a truthy
# hallucination flag for it.
difficult_questions = [
    {field: entry[field] for field in _difficult_fields}
    for entry in combined_results
    if all(entry['model_results'][name]['hallucination'] for name in model_names)
]

failure_count = len(difficult_questions)

print(f"  📊 Universal Failure Statistics:")
print(f"    Questions where all models failed: {failure_count}/{total_questions} ({failure_count/total_questions*100:.1f}%)")

if difficult_questions:
    print(f"\n  🔍 Characteristics of Difficult Questions:")

    # Tally the domains and question types among the difficult questions.
    difficult_domains = Counter(item['domain'] for item in difficult_questions)
    difficult_types = Counter(item['question_type'] for item in difficult_questions)

    question_lengths = [item['question_length'] for item in difficult_questions]
    avg_length = np.mean(question_lengths)

    print(f"    Average length: {avg_length:.1f} characters")

    print(f"    Domain distribution:")
    _print_distribution(difficult_domains, failure_count)

    print(f"    Type distribution:")
    _print_distribution(difficult_types, failure_count)

    # Show the first three examples.
    print(f"\n  📋 Examples of Universally Difficult Questions:")
    for rank, item in enumerate(difficult_questions[:3], start=1):
        print(f"    {rank}. [{item['domain']}] {item['question_text']}...")

  📊 Universal Failure Statistics:
    Questions where all models failed: 21/400 (5.2%)

  🔍 Characteristics of Difficult Questions:
    Average length: 78.7 characters
    Domain distribution:
      Pop Culture    : 10 ( 47.6%)
      History        :  6 ( 28.6%)
      General Knowledge:  4 ( 19.0%)
      Healthcare     :  1 (  4.8%)
    Type distribution:
      closed-ended   : 21 (100.0%)

  📋 Examples of Universally Difficult Questions:
    1. [General Knowledge] Who was the first British winner of the US Women’s Open?...
    2. [General Knowledge] What was the (2011 reported) average annual salary of a UK ('county') Council Chief Executive?...
    3. [General Knowledge] Elected in 2008, who is the current Prime Minister of New Zealand?...


#### Calculate how often each model agrees with the majority

In [21]:
# Majority verdict per question. It does not depend on which model is being scored,
# so it is computed once here instead of once per model.
# The majority is "hallucinated" only when strictly more than half of the models
# hallucinated, so a tie (possible with an even number of models) resolves to False.
majority_by_question = [
    sum(result['model_results'][m]['hallucination'] for m in model_names) > len(model_names) / 2
    for result in combined_results
]

# Percentage of questions on which each model's label matches the majority label.
consistency_scores = {}

for model_name in model_names:
    agreements = sum(
        1
        for result, majority_halluc in zip(combined_results, majority_by_question)
        if result['model_results'][model_name]['hallucination'] == majority_halluc
    )

    # Guard against an empty comparison set instead of raising ZeroDivisionError.
    consistency_score = (agreements / total_questions) * 100 if total_questions else 0.0
    consistency_scores[model_name] = consistency_score

# Rank by consistency (highest first; ties keep model_names order because sorted() is stable)
ranked_consistency = sorted(consistency_scores.items(), key=lambda x: x[1], reverse=True)

print(f"  📊 Consistency with Majority Decision:")
for i, (model, score) in enumerate(ranked_consistency, 1):
    if i == 1:
        print(f"    🥇 {i}. {model:15}: {score:5.1f}% (Most consistent)")
    elif i == len(ranked_consistency):
        print(f"    🥉 {i}. {model:15}: {score:5.1f}% (Least consistent)")
    else:
        print(f"       {i}. {model:15}: {score:5.1f}%")

  📊 Consistency with Majority Decision:
    🥇 1. GPT_4o         :  95.5% (Most consistent)
       2. Claude_3.5_Sonnet:  93.0%
    🥉 3. GPT_3.5        :  91.5% (Least consistent)
