# RAG-based Li-Paper Analysis Notebook

This notebook analyzes the results of the RAG-based Li-Paper checklist validation process. It compares two extractor-validator configurations: OpenAI-Claude (OpenAI extracts, Claude validates) and Claude-OpenAI (Claude extracts, OpenAI validates).

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import re

# Configure the global plot appearance: base matplotlib style first, then the
# seaborn theme, then the notebook-wide figure size and font size.
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

print("Starting RAG-based Li-Paper analysis...")

In [None]:
# Directory containing the RAG paper results.
# This is a relative path, so it is resolved against the current working
# directory of the process running the notebook.
base_dir = 'output/reports_rag'

# Function to extract data from a report file
def _dict_or_empty(value):
    """Return *value* if it is a dict, otherwise an empty dict (e.g. for JSON null)."""
    return value if isinstance(value, dict) else {}


def extract_report_data(file_path):
    """Load one JSON report file and return the fields used by the analysis.

    Args:
        file_path: Path to a JSON report file.

    Returns:
        A dict with the keys ``paper_id``, ``config``, ``summary``,
        ``model_info``, ``items`` and ``file_path``, or ``None`` when the file
        cannot be read, is not valid JSON, or does not contain a JSON object.
        ``config`` is ``'openai_claude'``, ``'claude_openai'`` or ``'unknown'``
        depending on the extractor/validator names found in the model info.
    """
    # Keep the try body limited to the I/O and parsing that can actually fail.
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error processing {file_path}: {e}")
        return None

    if not isinstance(data, dict):
        print(
            f"Error processing {file_path}: "
            f"expected a JSON object, got {type(data).__name__}"
        )
        return None

    # Extract paper ID (a null or missing value becomes an empty string)
    paper_id = str(data.get('paper') or '').replace('.pdf', '')

    # Extract validation summary
    summary = _dict_or_empty(data.get('validation_summary'))

    # Extract model information: top level first, then inside the summary
    model_info = (
        _dict_or_empty(data.get('model_info'))
        or _dict_or_empty(summary.get('model_info'))
    )

    # Extract items data
    items = _dict_or_empty(data.get('items'))

    # Determine configuration based on model info; null or non-string names
    # are coerced so one odd field does not discard the whole report.
    extractor = str(model_info.get('extractor') or '').lower()
    validator = str(model_info.get('validator') or '').lower()

    if 'openai' in extractor and 'claude' in validator:
        config = 'openai_claude'
    elif 'claude' in extractor and 'openai' in validator:
        config = 'claude_openai'
    else:
        config = 'unknown'

    return {
        'paper_id': paper_id,
        'config': config,
        'summary': summary,
        'model_info': model_info,
        'items': items,
        'file_path': file_path
    }

# Load all report data
all_reports = []

# Check that the directory exists. isdir (rather than exists) also rejects a
# regular file at this path, which would otherwise make os.listdir raise.
if not os.path.isdir(base_dir):
    print(f"Error: Directory {base_dir} does not exist")
else:
    # List all files in the directory
    print(f"Scanning directory: {base_dir}")
    files = os.listdir(base_dir)
    # os.listdir returns entries in arbitrary order; sort so the load order
    # (and therefore the DataFrame row order) is reproducible between runs.
    report_files = sorted(
        f for f in files
        if 'report' in f and f.endswith('.json') and '_Li-Paper' in f
    )
    print(f"Found {len(report_files)} report files")

    # Process each report file; extract_report_data returns None on failure,
    # in which case the file is skipped.
    for file in report_files:
        file_path = os.path.join(base_dir, file)
        report_data = extract_report_data(file_path)
        if report_data:
            all_reports.append(report_data)

    print(f"Loaded {len(all_reports)} reports")

In [None]:
# Build one summary row per loaded report and collect the rows in a DataFrame.
# Fields copied from the validation summary; each defaults to 0 when missing.
summary_fields = (
    'total_items',
    'agree_with_extractor',
    'disagree_with_extractor',
    'unknown',
    'agreement_rate',
)

summary_data = []
for report in all_reports:
    summary = report['summary']
    row = {'paper_id': report['paper_id'], 'config': report['config']}
    row.update((field, summary.get(field, 0)) for field in summary_fields)
    # Model names default to an empty string when missing
    row['extractor'] = report['model_info'].get('extractor', '')
    row['validator'] = report['model_info'].get('validator', '')
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("Summary DataFrame created with shape:", summary_df.shape)
summary_df.head()

In [None]:
# Per-configuration mean, standard deviation and number of reports for the
# agreement rate, with display-friendly column names.
agreement_stats = summary_df.groupby('config')['agreement_rate'].agg(['mean', 'std', 'count'])
avg_agreement = agreement_stats.reset_index().rename(columns={
    'config': 'Configuration',
    'mean': 'Mean Agreement Rate (%)',
    'std': 'Std Dev',
    'count': 'Count',
})
avg_agreement

In [None]:
# Perform Mann-Whitney U test (non-parametric test for independent samples)
# NOTE(review): if both configurations were run on the same set of papers the
# samples are paired, and a paired test (e.g. Wilcoxon signed-rank) may be more
# appropriate - confirm against the study design.
openai_claude = summary_df.loc[summary_df['config'] == 'openai_claude', 'agreement_rate']
claude_openai = summary_df.loc[summary_df['config'] == 'claude_openai', 'agreement_rate']

if openai_claude.empty or claude_openai.empty:
    # The test is undefined without observations in both groups; report that
    # explicitly instead of raising or printing a misleading "No".
    u_stat = p_value = float('nan')
    print(
        "Mann-Whitney U test skipped: need at least one report per configuration "
        f"(openai_claude={len(openai_claude)}, claude_openai={len(claude_openai)})"
    )
else:
    u_stat, p_value = stats.mannwhitneyu(openai_claude, claude_openai, alternative='two-sided')

    print("Mann-Whitney U test results:")
    print(f"U statistic: {u_stat}")
    print(f"p-value: {p_value}")
    print(f"Significant difference at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")

In [None]:
# Collect the 'correct_answer' of every item into a nested mapping:
#   model_outputs[paper_id][item_id][config] -> answer ('' when the key is missing)
model_outputs = {}

for report in all_reports:
    paper_id, config = report['paper_id'], report['config']

    # Create the per-paper entry on first sight, even if the report has no items
    paper_entry = model_outputs.setdefault(paper_id, {})

    for item_id, item in report['items'].items():
        item_entry = paper_entry.setdefault(item_id, {})
        item_entry[config] = item.get('correct_answer', '')

# Function to compare model outputs and determine if they agree
_UNKNOWN_PATTERNS = ('unknown', 'not enough information', 'cannot determine')

# Negative answers, matched as whole words/phrases so that 'no' does not fire
# inside 'unknown' or 'not'.
_NEGATIVE_RE = re.compile(
    r"\bno\b"
    r"|\b(?:does|do|did)\s+not\s+comply\b"
    r"|\bnon[-\s]?compliant\b"
    r"|\bnot\s+(?:compliant|fulfilled)\b"
)

# Positive answers, matched as whole words.
_POSITIVE_RE = re.compile(r"\b(?:yes|complies|compliant|fulfilled)\b")


def _classify_answer(text):
    """Return ``(is_unknown, is_yes, is_no)`` flags for a lower-cased answer."""
    is_unknown = any(pattern in text for pattern in _UNKNOWN_PATTERNS)
    is_no = _NEGATIVE_RE.search(text) is not None
    # Strip negative phrases before looking for positive words, otherwise the
    # 'compliant' in 'non-compliant' or the 'fulfilled' in 'not fulfilled'
    # would be counted as a yes.
    is_yes = _POSITIVE_RE.search(_NEGATIVE_RE.sub(' ', text)) is not None
    return is_unknown, is_yes, is_no


def outputs_agree(output1, output2):
    """Decide whether two model answers express the same verdict.

    Args:
        output1: Answer produced under the first configuration.
        output2: Answer produced under the second configuration.

    Returns:
        ``True`` when the answers are identical, when both indicate an
        unknown/undeterminable result, when both are affirmative, or when both
        are negative. ``False`` otherwise, including when either answer is
        empty or missing.
    """
    if not output1 or not output2:
        return False

    # Check for exact match
    if output1 == output2:
        return True

    # Non-string answers (e.g. booleans or numbers from JSON) are compared by
    # their text form instead of crashing on .lower().
    unknown1, yes1, no1 = _classify_answer(str(output1).lower())
    unknown2, yes2, no2 = _classify_answer(str(output2).lower())

    # Check for 'unknown' or similar values
    if unknown1 and unknown2:
        return True

    # Check for yes/no agreement
    return (yes1 and yes2) or (no1 and no2)

# Calculate model output agreement
output_agreement_data = []

for paper_id, items in model_outputs.items():
    for item_id, configs in items.items():
        # Only items answered under both configurations can be compared
        if 'openai_claude' in configs and 'claude_openai' in configs:
            openai_claude_output = configs['openai_claude']
            claude_openai_output = configs['claude_openai']

            agreement = outputs_agree(openai_claude_output, claude_openai_output)

            output_agreement_data.append({
                'paper_id': paper_id,
                'item_id': item_id,
                'openai_claude_output': openai_claude_output,
                'claude_openai_output': claude_openai_output,
                'models_agree': agreement
            })

# Explicit columns keep the frame's schema even when no item was available
# under both configurations; without them the 'models_agree' lookup below
# raises KeyError on an empty frame.
output_agreement_df = pd.DataFrame(
    output_agreement_data,
    columns=[
        'paper_id',
        'item_id',
        'openai_claude_output',
        'claude_openai_output',
        'models_agree',
    ],
)

if output_agreement_df.empty:
    overall_output_agreement = float('nan')
    print("No items were found under both configurations; model output agreement rate is undefined")
else:
    overall_output_agreement = output_agreement_df['models_agree'].mean() * 100
    print(f"Overall model output agreement rate: {overall_output_agreement:.2f}%")

In [None]:
# Calculate model output agreement by paper (share of agreeing items, in percent)
paper_output_agreement = (
    output_agreement_df.groupby('paper_id')['models_agree'].mean() * 100
).reset_index()
paper_output_agreement.columns = ['Paper ID', 'Model Output Agreement Rate (%)']
# Reassign instead of sorting in place so the sorted frame is an explicit result
paper_output_agreement = paper_output_agreement.sort_values(
    'Model Output Agreement Rate (%)', ascending=False
)

# A bare expression is only rendered when it is the last statement of a
# notebook cell, so the top-5 table has to be printed explicitly; otherwise
# only its heading would appear.
print("Top 5 papers by model output agreement rate:")
print(paper_output_agreement.head().to_string(index=False))

print("\nBottom 5 papers by model output agreement rate:")
paper_output_agreement.tail()

In [None]:
# Calculate paper-level agreement rates (one row per paper, one column per configuration)
paper_agreement = summary_df.groupby(['paper_id', 'config'])['agreement_rate'].mean().reset_index()

# Select and rename the two configurations by name. Assigning a positional
# list of column names breaks (length mismatch) as soon as a report with an
# 'unknown' configuration is present or one configuration is missing;
# reindex instead drops extra configurations and fills a missing one with NaN.
paper_agreement_pivot = (
    paper_agreement
    .pivot(index='paper_id', columns='config', values='agreement_rate')
    .reindex(columns=['claude_openai', 'openai_claude'])
    .rename(columns={
        'claude_openai': 'claude_openai_agreement',
        'openai_claude': 'openai_claude_agreement',
    })
    .rename_axis(None, axis=1)
    .reset_index()
)

# Merge with model output agreement data (only papers present in both frames)
merged_agreement = pd.merge(paper_agreement_pivot, paper_output_agreement, left_on='paper_id', right_on='Paper ID', how='inner')

# Calculate correlation coefficients (Series.corr defaults to Pearson)
corr_openai_claude = merged_agreement['openai_claude_agreement'].corr(merged_agreement['Model Output Agreement Rate (%)'])
corr_claude_openai = merged_agreement['claude_openai_agreement'].corr(merged_agreement['Model Output Agreement Rate (%)'])

print(f"Correlation between OpenAI-Claude validator agreement and model output agreement: {corr_openai_claude:.4f}")
print(f"Correlation between Claude-OpenAI validator agreement and model output agreement: {corr_claude_openai:.4f}")