### Phase 4: Response Characteristics

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
from scipy import stats
from scipy.stats import pearsonr, spearmanr, chi2_contingency


In [2]:
# Registry of the per-model result files analysed in this notebook. Insertion
# order fixes the order in which models are loaded and reported.
_model_sources = {
    'Claude_3.5_Sonnet': '../Data/qna_dataset_Claude3.5Sonnet_final.csv',
    'GPT_3.5': '../Data/qna_dataset_GPT3.5_final.csv',
    'GPT_4o': '../Data/qna_dataset_GPT4o_final.csv',
}

# Parallel lists (same order, same length) derived from the registry.
model_names = list(_model_sources)
file_paths = list(_model_sources.values())

# Model name -> DataFrame; starts empty and is populated when the CSVs are read.
datasets = {}

In [3]:
# Read every model's CSV into `datasets`. A file that cannot be read is
# reported and skipped, so the remaining models are still loaded.
for model_name, file_path in zip(model_names, file_paths):
    try:
        df = pd.read_csv(file_path)
        datasets[model_name] = df
        n_rows, n_cols = df.shape
        print(f"✓ {model_name}: {n_rows} rows × {n_cols} columns")
    except Exception as err:
        print(f"❌ Error loading {model_name}: {err}")

✓ Claude_3.5_Sonnet: 400 rows × 16 columns
✓ GPT_3.5: 400 rows × 16 columns
✓ GPT_4o: 400 rows × 16 columns


In [5]:
# Convert hallucination_present to boolean for all datasets.
#
# Only object-dtype columns are touched. Text flags are matched after trimming
# whitespace and lower-casing, so 'True', ' true' and 'TRUE' all map to True.
# Anything that still cannot be mapped is left as NaN and reported, instead of
# silently turning into a missing value.
_HALLUCINATION_FLAGS = {'true': True, 'false': False, True: True, False: False}


def _to_hallucination_flag(value):
    """Map one raw cell to True/False, or NaN when it is not a recognised flag."""
    key = value.strip().lower() if isinstance(value, str) else value
    try:
        return _HALLUCINATION_FLAGS.get(key, np.nan)
    except TypeError:
        # Unhashable cell contents can never be a valid flag.
        return np.nan


for model_name, df in datasets.items():
    raw_flags = df['hallucination_present']
    if raw_flags.dtype == 'object':
        converted = raw_flags.map(_to_hallucination_flag)

        # Cells that held a value before conversion but are missing afterwards.
        unrecognised = converted.isna() & raw_flags.notna()
        if unrecognised.any():
            examples = raw_flags[unrecognised].map(repr).unique()[:5].tolist()
            warnings.warn(
                f"{model_name}: {int(unrecognised.sum())} hallucination_present "
                f"value(s) could not be converted and were left as NaN "
                f"(examples: {examples})"
            )

        datasets[model_name]['hallucination_present'] = converted

#### Response length vs accuracy relationship

In [8]:
def _significance_label(p_value):
    """Return the star-coded significance label for a p-value.

    Thresholds are < 0.001, < 0.01 and < 0.05. Anything else, including NaN
    (which fails every comparison), is labelled as not significant.
    """
    if p_value < 0.001:
        return "*** (highly significant)"
    if p_value < 0.01:
        return "** (significant)"
    if p_value < 0.05:
        return "* (marginally significant)"
    return "(not significant)"


def _pearson_or_default(x, y):
    """Return (Pearson r, p-value) for two equal-length series.

    Falls back to the neutral pair (0, 1) when fewer than two observations are
    available, because `pearsonr` raises on inputs shorter than 2.
    """
    if len(x) < 2:
        return 0, 1
    corr, p_value = pearsonr(x, y)
    return corr, p_value


length_accuracy_results = {}

# Define FactScore numeric mapping (higher = more accurate)
factscore_numeric = {
    'completely right': 4,
    'somewhat correct': 3,
    'somewhat inaccurate': 2,
    'totally wrong': 1
}

for model_name, df in datasets.items():
    print(f"\n  📈 {model_name}:")

    # Work on a copy so the loaded dataset is not modified.
    df_analysis = df.copy()
    # Labels missing from the mapping become NaN and are dropped below.
    df_analysis['factscore_numeric'] = df_analysis['factscore'].map(factscore_numeric)

    # Correlation with binary hallucination.
    # Rows missing either value are dropped first, mirroring the graduated
    # correlation, so a NaN flag cannot break the integer cast.
    binary_data = df_analysis.dropna(subset=['response_length', 'hallucination_present'])
    binary_corr, binary_p = _pearson_or_default(
        binary_data['response_length'],
        binary_data['hallucination_present'].astype(int))

    # Correlation with graduated score
    # Remove any NaN values for correlation
    clean_data = df_analysis.dropna(subset=['response_length', 'factscore_numeric'])
    graduated_corr, graduated_p = _pearson_or_default(
        clean_data['response_length'],
        clean_data['factscore_numeric'])

    print(f"    Binary correlation (length vs hallucination):")
    print(f"      Correlation: {binary_corr:6.3f}")
    print(f"      P-value: {binary_p:8.4f}")

    binary_sig = _significance_label(binary_p)
    print(f"      Significance: {binary_sig}")

    print(f"    Graduated correlation (length vs quality score):")
    print(f"      Correlation: {graduated_corr:6.3f}")
    print(f"      P-value: {graduated_p:8.4f}")

    graduated_sig = _significance_label(graduated_p)
    print(f"      Significance: {graduated_sig}")

    # Interpretation is based on the binary correlation only.
    if abs(binary_corr) > 0.3:
        strength = "strong"
    elif abs(binary_corr) > 0.1:
        strength = "moderate"
    else:
        strength = "weak"

    # A correlation of exactly zero is reported as "negative".
    direction = "positive" if binary_corr > 0 else "negative"
    print(f"    Interpretation: {strength} {direction} relationship")

    if binary_corr > 0 and binary_p < 0.05:
        print(f"    📝 Longer responses tend to have MORE hallucinations")
    elif binary_corr < 0 and binary_p < 0.05:
        print(f"    📝 Longer responses tend to have FEWER hallucinations")
    else:
        print(f"    📝 Response length doesn't significantly predict hallucinations")

    length_accuracy_results[model_name] = {
        'binary_correlation': binary_corr,
        'binary_p_value': binary_p,
        'binary_significance': binary_sig,
        'graduated_correlation': graduated_corr,
        'graduated_p_value': graduated_p,
        'graduated_significance': graduated_sig,
        'strength': strength,
        'direction': direction
    }
    


  📈 Claude_3.5_Sonnet:
    Binary correlation (length vs hallucination):
      Correlation: -0.097
      P-value:   0.0536
      Significance: (not significant)
    Graduated correlation (length vs quality score):
      Correlation:  0.001
      P-value:   0.9877
      Significance: (not significant)
    Interpretation: weak negative relationship
    📝 Response length doesn't significantly predict hallucinations

  📈 GPT_3.5:
    Binary correlation (length vs hallucination):
      Correlation: -0.087
      P-value:   0.0821
      Significance: (not significant)
    Graduated correlation (length vs quality score):
      Correlation: -0.081
      P-value:   0.1068
      Significance: (not significant)
    Interpretation: weak negative relationship
    📝 Response length doesn't significantly predict hallucinations

  📈 GPT_4o:
    Binary correlation (length vs hallucination):
      Correlation: -0.077
      P-value:   0.1260
      Significance: (not significant)
    Graduated correlation

#### Domain differences in response verbosity

In [9]:
# Per-model, per-domain summary of response length.
domain_verbosity = {}

for model_name, df in datasets.items():
    print(f"\n  📊 {model_name}:")

    model_domain_stats = {}

    # Domains are visited in order of first appearance in the data.
    for domain in df['domain'].unique():
        lengths = df.loc[df['domain'] == domain, 'response_length']
        mean_len = lengths.mean()
        median_len = lengths.median()
        std_len = lengths.std()

        print(f"    {domain:15}: mean={mean_len:6.1f}, "
              f"median={median_len:6.1f}, "
              f"std={std_len:6.1f}")

        model_domain_stats[domain] = {
            'mean_length': mean_len,
            'median_length': median_len,
            'std_length': std_len,
            'count': len(lengths)
        }

    # Rank domains by mean length; on ties the first domain encountered wins.
    domain_means = {name: entry['mean_length']
                    for name, entry in model_domain_stats.items()}

    most_verbose = max(domain_means.items(), key=lambda pair: pair[1])[0]
    least_verbose = min(domain_means.items(), key=lambda pair: pair[1])[0]

    longest_mean = domain_means[most_verbose]
    shortest_mean = domain_means[least_verbose]

    print(f"    🗣️  Most verbose: {most_verbose} ({longest_mean:.1f} chars)")
    print(f"    🤐 Least verbose: {least_verbose} ({shortest_mean:.1f} chars)")

    domain_verbosity[model_name] = {
        'domain_stats': model_domain_stats,
        'most_verbose': most_verbose,
        'least_verbose': least_verbose,
        'verbosity_range': longest_mean - shortest_mean
    }


  📊 Claude_3.5_Sonnet:
    General Knowledge: mean= 316.0, median= 313.5, std= 131.4
    Science        : mean= 426.3, median= 377.0, std= 206.7
    History        : mean= 454.3, median= 414.5, std= 207.6
    Pop Culture    : mean= 248.4, median= 229.5, std= 130.8
    Healthcare     : mean= 513.0, median= 481.0, std= 186.5
    🗣️  Most verbose: Healthcare (513.0 chars)
    🤐 Least verbose: Pop Culture (248.4 chars)

  📊 GPT_3.5:
    General Knowledge: mean=  59.5, median=  48.0, std=  53.4
    Science        : mean= 107.7, median=  88.0, std=  81.1
    History        : mean= 115.0, median=  84.0, std=  83.1
    Pop Culture    : mean=  86.1, median=  77.5, std=  65.3
    Healthcare     : mean= 123.8, median= 105.0, std=  91.6
    🗣️  Most verbose: Healthcare (123.8 chars)
    🤐 Least verbose: General Knowledge (59.5 chars)

  📊 GPT_4o:
    General Knowledge: mean= 149.4, median= 106.5, std= 121.6
    Science        : mean= 326.5, median= 296.5, std= 266.8
    History        : mean= 237

## **Simple Summary:**

### **📊 Model Verbosity Ranking:**
1. **Claude**: Most verbose (248-513 chars average, depending on domain)
2. **GPT-4o**: Medium verbose (149-397 chars average)  
3. **GPT-3.5**: Least verbose (60-124 chars average)

### **🏥 Domain Pattern (All Models):**
- **Most verbose domain**: Healthcare
- **Least verbose domain**: General Knowledge or Pop Culture

### **🎯 Key Insights:**

**Claude gives long, detailed responses** (by domain mean, roughly 3-5x longer than GPT-3.5 and up to about 2x longer than GPT-4o)

**GPT-3.5 gives very short responses** (often just 1-2 sentences)

**GPT-4o is in the middle** (moderate length responses)

**Healthcare questions make all models verbose** - probably because medical topics require detailed explanations

**Pop Culture and General Knowledge get shorter answers** - probably because they're simpler factual questions

### **📝 Bottom Line:**
If you want **detailed explanations** → Choose Claude
If you want **concise answers** → Choose GPT-3.5  
If you want **balanced responses** → Choose GPT-4o

Healthcare consistently gets the longest responses regardless of model, suggesting domain complexity drives verbosity.

#### Length categories and hallucination analysis

In [10]:
# Ordered length categories, shortest to longest. A category's index in this
# list is its position on the ordinal scale used by the trend test.
LENGTH_LABELS = ['Short', 'Medium-Short', 'Medium-Long', 'Long']

length_category_results = {}

for model_name, df in datasets.items():
    print(f"\n  📊 {model_name}:")

    # Define length categories based on quartiles
    q1 = df['response_length'].quantile(0.25)
    q2 = df['response_length'].quantile(0.50)
    q3 = df['response_length'].quantile(0.75)

    # Build strictly increasing bin edges. When quartiles coincide (or q1 is 0)
    # the duplicate edge and its label are skipped, because pd.cut rejects
    # repeated edges. With four distinct positive quartile edges this yields
    # exactly [0, q1, q2, q3, inf] and all four labels.
    bin_edges, bin_labels = [0], []
    for upper_edge, label in zip([q1, q2, q3, float('inf')], LENGTH_LABELS):
        if upper_edge > bin_edges[-1]:
            bin_edges.append(upper_edge)
            bin_labels.append(label)

    # Create categories on a copy so the loaded dataset is not modified.
    df_cat = df.copy()
    df_cat['length_category'] = pd.cut(df_cat['response_length'],
                                       bins=bin_edges,
                                       labels=bin_labels,
                                       include_lowest=True)

    category_stats = {}
    categories_numeric = []  # ordinal position of each category kept in category_stats

    for position, category in enumerate(LENGTH_LABELS):
        if category not in bin_labels:
            continue

        cat_data = df_cat[df_cat['length_category'] == category]

        if len(cat_data) > 0:
            total = len(cat_data)
            # int() keeps the ':2d' format below valid whatever dtype the sum has.
            hallucinated = int(cat_data['hallucination_present'].sum())
            rate = (hallucinated / total) * 100

            category_stats[category] = {
                'total': total,
                'hallucinated': hallucinated,
                'rate': rate,
                'length_range': f"{cat_data['response_length'].min():.0f}-{cat_data['response_length'].max():.0f}"
            }
            categories_numeric.append(position)

            print(f"    {category:12}: {hallucinated:2d}/{total:3d} ({rate:5.1f}%) "
                  f"[{category_stats[category]['length_range']} chars]")

    # Test for trend.
    # NOTE: this correlates at most four category-level rates, so it has very
    # little statistical power and ignores the per-category sample sizes.
    if len(category_stats) >= 3:
        rates = [entry['rate'] for entry in category_stats.values()]

        # x uses each category's true ordinal position, so a skipped category
        # does not shift the categories that follow it.
        trend_corr, trend_p = pearsonr(categories_numeric, rates)

        print(f"    Trend analysis:")
        print(f"      Correlation: {trend_corr:6.3f}")
        print(f"      P-value: {trend_p:8.4f}")

        if trend_p < 0.05:
            if trend_corr > 0:
                print(f"      ✓ Significant upward trend (longer → more hallucinations)")
            else:
                print(f"      ✓ Significant downward trend (longer → fewer hallucinations)")
        else:
            print(f"      ✗ No significant trend")

    length_category_results[model_name] = category_stats


  📊 Claude_3.5_Sonnet:
    Short       : 17/100 ( 17.0%) [6-250 chars]
    Medium-Short: 10/101 (  9.9%) [251-349 chars]
    Medium-Long : 16/ 99 ( 16.2%) [350-501 chars]
    Long        :  6/100 (  6.0%) [504-1075 chars]
    Trend analysis:
      Correlation: -0.659
      P-value:   0.3415
      ✗ No significant trend

  📊 GPT_3.5:
    Short       : 22/102 ( 21.6%) [4-47 chars]
    Medium-Short: 18/ 99 ( 18.2%) [49-83 chars]
    Medium-Long : 22/ 99 ( 22.2%) [84-126 chars]
    Long        : 13/100 ( 13.0%) [127-416 chars]
    Trend analysis:
      Correlation: -0.663
      P-value:   0.3370
      ✗ No significant trend

  📊 GPT_4o:
    Short       :  9/102 (  8.8%) [8-101 chars]
    Medium-Short: 18/ 98 ( 18.4%) [102-207 chars]
    Medium-Long : 15/100 ( 15.0%) [208-349 chars]
    Long        :  5/100 (  5.0%) [352-1487 chars]
    Trend analysis:
      Correlation: -0.319
      P-value:   0.6814
      ✗ No significant trend


## **Simple Summary:**

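### **🎯 Key Finding: The Longest Responses Have the Lowest Hallucination Rates**

All three models share **one consistent pattern**, but the relationship is not monotonic:
- **Short responses**: 9-22% hallucination rate; this is the highest-rate category only for Claude (17.0%). GPT-3.5 peaks at Medium-Long (22.2%) and GPT-4o at Medium-Short (18.4%)
- **Long responses**: Lowest hallucination rate in every model (5-13%)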

### **📊 Model Performance by Length:**

**Claude:**
- Short (6-250 chars): 17% hallucinations ❌
- Long (504+ chars): 6% hallucinations ✅

**GPT-3.5:** 
- Short (4-47 chars): 22% hallucinations ❌
- Long (127+ chars): 13% hallucinations ✅

**GPT-4o:**
- Short (8-101 chars): 9% hallucinations ❌  
- Long (352+ chars): 5% hallucinations ✅

### **🔍 What This Means:**

**When models give short answers, they're more likely to be wrong**
- Maybe they're being brief because they're uncertain
- Maybe they're oversimplifying complex topics

**When models give long answers, they're more likely to be accurate**
- More detail = more opportunity to get it right
- Longer responses might indicate confidence

### **⚠️ Important Note:**
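The trend isn't statistically significant for any model (p = 0.34-0.68), and each test is based on only four category-level rates, so the pattern may well be due to chance. The row-level correlations between length and hallucination were also not significant (p > 0.05).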

### **📝 Bottom Line:**
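**Be suspicious of short answers!** When any of these models gives you a very brief response, be extra cautious - in this sample brief responses contained errors more often than the longest ones, although the difference is not statistically significant and GPT-4o's mid-length responses had the highest rates.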

#### Citation usage rates by model and domain

In [11]:
# Citation rate per model, overall and broken down by domain.
citation_usage = {}


def _citation_rate(item):
    """Sort key: the 'rate' field of a (domain, stats) pair."""
    return item[1]['rate']


for model_name, df in datasets.items():
    print(f"\n  📊 {model_name}:")

    # Overall citation rate
    total_responses = len(df)
    citations_used = df['citation_present'].sum()
    overall_rate = (citations_used / total_responses) * 100

    print(f"    Overall citation rate: {citations_used:3d}/{total_responses} ({overall_rate:5.1f}%)")

    # By domain, in order of first appearance in the data
    domain_citation_stats = {}
    print(f"    By domain:")

    for domain in df['domain'].unique():
        domain_flags = df.loc[df['domain'] == domain, 'citation_present']
        domain_total = len(domain_flags)
        domain_citations = domain_flags.sum()
        if domain_total > 0:
            domain_rate = (domain_citations / domain_total) * 100
        else:
            domain_rate = 0

        domain_citation_stats[domain] = {
            'total': domain_total,
            'citations': domain_citations,
            'rate': domain_rate
        }

        print(f"      {domain:15}: {domain_citations:2d}/{domain_total:2d} ({domain_rate:5.1f}%)")

    # Find highest/lowest citation domains; ties go to the first domain seen.
    top_name = bottom_name = None
    if domain_citation_stats:
        highest_cite_domain = max(domain_citation_stats.items(), key=_citation_rate)
        lowest_cite_domain = min(domain_citation_stats.items(), key=_citation_rate)
        top_name, bottom_name = highest_cite_domain[0], lowest_cite_domain[0]

        print(f"    📈 Highest citation domain: {top_name} ({highest_cite_domain[1]['rate']:.1f}%)")
        print(f"    📉 Lowest citation domain: {bottom_name} ({lowest_cite_domain[1]['rate']:.1f}%)")

    citation_usage[model_name] = {
        'overall_rate': overall_rate,
        'total_citations': citations_used,
        'domain_stats': domain_citation_stats,
        'highest_cite_domain': top_name,
        'lowest_cite_domain': bottom_name
    }


  📊 Claude_3.5_Sonnet:
    Overall citation rate:  26/400 (  6.5%)
    By domain:
      General Knowledge:  3/80 (  3.8%)
      Science        :  1/80 (  1.2%)
      History        : 13/80 ( 16.2%)
      Pop Culture    :  4/80 (  5.0%)
      Healthcare     :  5/80 (  6.2%)
    📈 Highest citation domain: History (16.2%)
    📉 Lowest citation domain: Science (1.2%)

  📊 GPT_3.5:
    Overall citation rate:   1/400 (  0.2%)
    By domain:
      General Knowledge:  0/80 (  0.0%)
      Science        :  0/80 (  0.0%)
      History        :  1/80 (  1.2%)
      Pop Culture    :  0/80 (  0.0%)
      Healthcare     :  0/80 (  0.0%)
    📈 Highest citation domain: History (1.2%)
    📉 Lowest citation domain: General Knowledge (0.0%)

  📊 GPT_4o:
    Overall citation rate:  14/400 (  3.5%)
    By domain:
      General Knowledge:  2/80 (  2.5%)
      Science        :  1/80 (  1.2%)
      History        :  5/80 (  6.2%)
      Pop Culture    :  1/80 (  1.2%)
      Healthcare     :  5/80 (  6.2%)
   