In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from weaviate_rag.rag_system import GraphRAGSystem
import ollama
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall
import re
from ragas import EvaluationDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

In [None]:
# Load the MCQ evaluation set. The records must expose at least the
# 'question' and 'answer' fields, which are the only ones kept below.
DATASET_PATH = '/Users/alexlecu/PycharmProjects/LLMKGraph/backend/evaluation/data/grok_evaluation_datasets/2_Hop_MCQ_Questions.json'

# Explicit encoding: the platform default used by open() is not guaranteed
# to be UTF-8, which is what JSON files are expected to use.
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame for analysis.
# `.copy()` makes `df` an independent frame; without it the column
# assignments below operate on a column subset of another frame and can
# trigger pandas' SettingWithCopyWarning.
df = pd.DataFrame(data)
df = df[['question', 'answer']].copy()

# Placeholder columns, filled in by the processing cell
df['contexts'] = None  # Will be filled with retrieved passages
df['with_context_answer'] = None  # Will be filled with RAG responses
df['without_context_answer'] = None  # Will be filled with model-only responses

In [None]:
# Run every question through `process_question` using a pool of worker processes.
from weaviate_rag.rag_utils_mp.evaluation_MCQ import process_question

# The question texts, in DataFrame row order
questions = df['question'].tolist()

print("Processing questions in parallel...")
with ProcessPoolExecutor(max_workers=8) as executor:
    # executor.map yields results in input order, so they line up with the rows of df
    progress = tqdm(executor.map(process_question, questions), total=len(questions))
    results = list(progress)

# Each result is a (contexts, with-context answer, without-context answer) triple;
# transpose the list of triples into three parallel sequences.
contexts, with_context_answers, without_context_answers = zip(*results)

# Store each sequence in its DataFrame column
df['contexts'] = contexts
df['with_context_answer'] = with_context_answers
df['without_context_answer'] = without_context_answers

# A result whose first element is None is counted as an error
error_count = sum(ctx is None for ctx, _with_answer, _without_answer in results)
print(f"Processed {len(results)} questions, with {error_count} errors.")

In [None]:
# Parse responses to extract the selected multiple choice option
def parse_mcq_answer(response):
    """
    Extract the selected option letter (A, B, C or D) from a free-text MCQ response.

    Strategies are tried from most to least explicit, and the first hit wins:

    1. The whole response is an option token ("b", "B)", "(c)", "**d**").
    2. The response starts with an option marker ("a) Yes", "B. Because ...").
    3. An explicit phrase ("the best answer is C", "Option B", "Answer: d",
       "B is correct") or a line consisting only of an option token.
    4. The first standalone capital A-D in the original-case text.
    5. The first standalone lowercase b-d (a lone lowercase "a" is almost
       always the English article, so it is not treated as an answer here).

    Args:
        response: Raw model output. Empty or non-string values yield None.

    Returns:
        "A", "B", "C" or "D", or None when no option can be identified.
        The function never guesses from letters that merely occur inside words.
    """
    if not response or not isinstance(response, str):
        return None

    original = response.strip()
    lowered = original.lower()

    # 1. The entire response is just an option token, possibly decorated.
    bare = re.fullmatch(r'[\(\[\*]*([a-d])[\)\]\*.:]*', lowered)
    if bare:
        return bare.group(1).upper()

    # 2. Leading option marker: "a) ...", "(a) ...", or "a." / "a:" followed by
    #    whitespace (so abbreviations such as "a.m." are not mistaken for one).
    leading = re.match(r'\(?([a-d])(?:\)|[.:](?=\s))', lowered)
    if leading:
        return leading.group(1).upper()

    # 3. Explicit phrases. The letter must end at a word boundary, otherwise
    #    "answer is clearly B" would yield the "c" of "clearly".
    letter = r'[\(\[\*]*([a-d])\b'
    phrase_patterns = [
        r'\bbest\s+answer\s+is\s+' + letter,              # "the best answer is A"
        r'\bthe\s+answer\s+is\s+' + letter,               # "The answer is A"
        r'\boption\s+' + letter,                          # "Option A"
        r'\bchoice\s+' + letter,                          # "Choice A"
        r'\bselect\s+' + letter,                          # "Select A"
        r'\banswer[:\s]+' + letter,                       # "Answer: A"
        r'^[ \t]*[\(\[\*]*([a-d])[\)\]\*.:]*[ \t]*$',     # just "A" on a line
        r'\banswer\s+is\s+' + letter,                     # "Answer is A"
        r'\b([a-d])\s+is\s+correct',                      # "A is correct"
    ]
    for pattern in phrase_patterns:
        # MULTILINE lets the "letter on its own line" pattern see every line.
        found = re.search(pattern, lowered, flags=re.MULTILINE)
        if found:
            return found.group(1).upper()

    # Letters attached to an apostrophe ("I'd", "A's") are not standalone options.
    not_in_word_before = r"(?<![\w'\u2019])"
    not_in_word_after = r"(?![\w'\u2019])"

    # 4. Standalone capital letter. This must run on the original-case text:
    #    the lowercased copy can never contain a capital.
    capital = re.search(not_in_word_before + r'([A-D])' + not_in_word_after, original)
    if capital:
        return capital.group(1)

    # 5. Standalone lowercase letter, earliest position first. "a" is skipped
    #    because in running text it is the article, not an answer.
    lowercase = re.search(not_in_word_before + r'([b-d])' + not_in_word_after, lowered)
    if lowercase:
        return lowercase.group(1).upper()

    # No clear answer found. Do not guess from arbitrary characters.
    return None

# Reduce each raw model response to its option letter, one parsed column per response column
for raw_column, parsed_column in (
    ('with_context_answer', 'with_context_parsed'),
    ('without_context_answer', 'without_context_parsed'),
):
    df[parsed_column] = df[raw_column].apply(parse_mcq_answer)

# Convert reference answers to a standard format
def standardize_reference_answer(answer):
    """
    Standardize a reference answer to just its option letter (A, B, C or D).

    Recognized forms (case-insensitive, surrounding whitespace ignored):
    "A", "a)", "(a)", "a.", "a) Yes", "B: text", and the same preceded by a
    label such as "Answer:", "Option", "Choice" or "The correct answer is".

    Args:
        answer: Reference answer from the dataset. Empty or non-string
            values yield None.

    Returns:
        The upper-case option letter, or None when the text does not start
        with a recognizable option. A letter is never guessed from characters
        inside ordinary words, so an unparseable reference is reported as
        missing instead of silently becoming a wrong ground truth.
    """
    if not answer or not isinstance(answer, str):
        return None

    normalized = answer.strip().lower()

    option = re.match(
        r'''
        (?:                                   # optional leading label
            (?:the\s+)?(?:correct\s+)?
            (?:answer|option|choice)
            \s*(?:is\b)?\s*[:\-]?\s*
        )?
        [\(\[]?([a-d])                        # the option letter itself
        (?:[\)\].:]|\s|$)                     # must end here, not run into a word
        ''',
        normalized,
        flags=re.VERBOSE,
    )
    return option.group(1).upper() if option else None

# Reduce every reference answer to its option letter
df['standardized_answer'] = df['answer'].apply(standardize_reference_answer)

# Per-question report: ground truth next to both parsed predictions
print("Parsed Results:")
report_rows = zip(
    df.index,
    df['question'],
    df['answer'],
    df['standardized_answer'],
    df['with_context_parsed'],
    df['without_context_parsed'],
)
for index, question_text, raw_answer, truth, with_pred, without_pred in report_rows:
    print(f"Question {index+1}: {question_text}")
    print(f"Ground Truth: {truth} (from '{raw_answer}')")
    print(f"With Context: {with_pred}")
    print(f"Without Context: {without_pred}\n")

# Parallel lists used for scoring
references = df['standardized_answer'].tolist()
with_context_responses = df['with_context_parsed'].tolist()
without_context_responses = df['without_context_parsed'].tolist()

# A prediction is correct when it equals a non-missing reference
with_context_correct = sum(
    ref is not None and ref == pred
    for ref, pred in zip(references, with_context_responses)
)
without_context_correct = sum(
    ref is not None and ref == pred
    for ref, pred in zip(references, without_context_responses)
)

# Only questions that have a usable reference enter the denominator
valid_references = sum(ref is not None for ref in references)

if valid_references == 0:
    print("No valid references found for evaluation.")
else:
    with_context_accuracy = with_context_correct / valid_references
    without_context_accuracy = without_context_correct / valid_references

    print(f"With Context Accuracy: {with_context_accuracy:.2%} ({with_context_correct}/{valid_references})")
    print(f"Without Context Accuracy: {without_context_accuracy:.2%} ({without_context_correct}/{valid_references})")
    print(f"Improvement: {(with_context_accuracy - without_context_accuracy):.2%}")

In [None]:
# Compare errors between with-context and without-context approaches.
# Only questions with a usable ground truth are classified, so the four
# counters below always add up to `answerable_questions` and the
# "% of answerable" figures add up to 100%.

# Column widths shared by the header and every row so the table stays aligned
QUESTION_WIDTH, TRUTH_WIDTH, WITH_WIDTH, WITHOUT_WIDTH = 50, 5, 8, 11


def _table_cell(value, width):
    """Render a possibly-missing value left-justified to `width` characters."""
    return ("None" if pd.isna(value) else str(value)).ljust(width)


def _share_of(count, total):
    """Format count/total as a percentage, or 'n/a' when total is 0."""
    return f"{count/total*100:.1f}%" if total else "n/a"


print("\nError Analysis:")
print("-" * 100)
print(f"{'Question':{QUESTION_WIDTH}} | {'Truth':{TRUTH_WIDTH}} | {'With Ctx':{WITH_WIDTH}} | "
      f"{'Without Ctx':{WITHOUT_WIDTH}} | {'Notes'}")
print("-" * 100)

# Count different outcome types
context_helped_count = 0
context_misled_count = 0
both_incorrect_count = 0
both_correct_count = 0

for _, row in df.iterrows():
    ground_truth = row['standardized_answer']
    with_context = row['with_context_parsed']
    without_context = row['without_context_parsed']

    # Without a ground truth neither approach can be judged; skip the row
    # instead of counting it as "both incorrect".
    if pd.isna(ground_truth):
        continue

    # A missing prediction never equals the ground truth, i.e. counts as wrong.
    # Local names are deliberately different from the integer counts
    # `with_context_correct` / `without_context_correct` of the accuracy cell.
    with_ok = bool(with_context == ground_truth)
    without_ok = bool(without_context == ground_truth)

    if with_ok and without_ok:
        both_correct_count += 1
        continue  # nothing to report for fully correct questions

    if with_ok:
        context_helped_count += 1
        notes = "Context helped"
    elif without_ok:
        context_misled_count += 1
        notes = "Context misled"
    else:
        both_incorrect_count += 1
        notes = "Both incorrect"

    # Truncate long questions so every row has the same width
    question = row['question']
    if len(question) > QUESTION_WIDTH:
        question = question[:QUESTION_WIDTH - 3] + "..."

    print(f"{question.ljust(QUESTION_WIDTH)} | {_table_cell(ground_truth, TRUTH_WIDTH)} | "
          f"{_table_cell(with_context, WITH_WIDTH)} | {_table_cell(without_context, WITHOUT_WIDTH)} | {notes}")

# Print summary statistics
total_questions = len(df)
answerable_questions = int(df['standardized_answer'].notna().sum())

print("\nSummary Statistics:")
print(f"Total questions: {total_questions}")
print(f"Answerable questions (with valid ground truth): {answerable_questions}")
print(f"Both approaches correct: {both_correct_count} ({_share_of(both_correct_count, answerable_questions)} of answerable)")
print(f"Context helped: {context_helped_count} ({_share_of(context_helped_count, answerable_questions)} of answerable)")
print(f"Context misled: {context_misled_count} ({_share_of(context_misled_count, answerable_questions)} of answerable)")
print(f"Both approaches incorrect: {both_incorrect_count} ({_share_of(both_incorrect_count, answerable_questions)} of answerable)")

# Net impact of context: questions gained minus questions lost by adding context
net_impact = context_helped_count - context_misled_count
print(f"\nNet impact of context: {'+' if net_impact > 0 else ''}{net_impact} questions " +
      f"({_share_of(net_impact, answerable_questions)} of answerable)")

In [None]:
# Label every question with the effect that retrieved context had on its answer
def _classify_context_impact(truth, with_ctx, without_ctx):
    """Return the impact label for one question.

    'Unknown' when the ground truth or either prediction is missing; otherwise
    'Positive' (only the with-context answer is right), 'Negative' (only the
    without-context answer is right), 'Neutral' (both right) or 'No Help'
    (both wrong).
    """
    if truth is None or with_ctx is None or without_ctx is None:
        return 'Unknown'
    with_correct = with_ctx == truth
    without_correct = without_ctx == truth
    if with_correct:
        return 'Neutral' if without_correct else 'Positive'
    return 'Negative' if without_correct else 'No Help'


df['context_impact'] = pd.Series(
    [
        _classify_context_impact(truth, with_ctx, without_ctx)
        for truth, with_ctx, without_ctx in zip(
            df['standardized_answer'], df['with_context_parsed'], df['without_context_parsed']
        )
    ],
    index=df.index,
    dtype=object,
)

# Summarize context impact (percentages are relative to all questions)
impact_counts = df['context_impact'].value_counts()
print("\nContext Impact Summary:")
for impact, count in impact_counts.items():
    percentage = count/len(df)*100
    print(f"{impact}: {count} questions ({percentage:.1f}%)")

# Net positive impact: questions gained minus questions lost by adding context
positive_count = impact_counts.get('Positive', 0)
negative_count = impact_counts.get('Negative', 0)
net_impact = positive_count - negative_count
net_percentage = net_impact / len(df) * 100
print(f"\nNet Positive Impact: {net_impact} questions ({net_percentage:.1f}%)")

# Fixed display order and colour for each impact label
impact_order = ['Positive', 'Neutral', 'Negative', 'No Help', 'Unknown']
impact_colors = {
    'Positive': '#60BD68',  # Green
    'Neutral': '#5DA5DA',   # Blue
    'Negative': '#F15854',  # Red
    'No Help': '#FAA43A',   # Orange
    'Unknown': '#CCCCCC'    # Gray
}

# Keep only the labels that actually occur, in display order
ordered_labels = [label for label in impact_order if label in impact_counts.index]
ordered_counts = [impact_counts[label] for label in ordered_labels]

# Bar chart of the number of questions per impact label
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(ordered_labels, ordered_counts, color=[impact_colors[label] for label in ordered_labels])
ax.set_title('Impact of Context on Multiple Choice Questions', fontsize=14, fontweight='bold')
ax.set_xlabel('Impact Type', fontsize=12)
ax.set_ylabel('Number of Questions', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate each bar with its count and its share of all questions
for bar in bars:
    height = bar.get_height()
    percentage = height/len(df)*100
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.5,
            f'{int(height)}\n({percentage:.1f}%)',
            ha='center', va='bottom', fontweight='bold')

fig.tight_layout()
plt.show()

# Additional analysis: When does context help vs. hurt?
# Create a function to extract key features from questions
def extract_features(question):
    """Flag coarse, keyword-based properties of a question.

    Matching is a case-insensitive substring test on the question text.

    Returns a dict of booleans with the keys 'is_cause_effect',
    'mentions_specific_condition', 'mentions_therapy' and
    'about_disease_mechanism'.
    """
    text = question.lower()

    def mentions_any(*keywords):
        # True as soon as one keyword occurs anywhere in the question
        return any(keyword in text for keyword in keywords)

    return {
        'is_cause_effect': mentions_any('cause', 'because', 'due to'),
        'mentions_specific_condition': mentions_any('diabetes', 'hypertension', 'obesity', 'smoking'),
        'mentions_therapy': mentions_any('therapy', 'treatment', 'surgery', 'injection'),
        'about_disease_mechanism': mentions_any('mechanism', 'pathway'),
    }

# Apply feature extraction.
# All flags are computed in one pass and attached as whole boolean columns.
# Creating the columns cell-by-cell with `df.at` would first make an all-NaN
# float column and then force booleans into it, which yields a
# pandas-version-dependent dtype and a deprecation warning on recent pandas.
features = ['is_cause_effect', 'mentions_specific_condition', 'mentions_therapy', 'about_disease_mechanism']
feature_flags = pd.DataFrame(
    [extract_features(question) for question in df['question']],
    index=df.index,
    columns=features,
)
for feature in features:
    # Positional assignment (plain array) avoids any index alignment
    df[feature] = feature_flags[feature].to_numpy(dtype=bool)

# Per-feature impact rates, computed once and reused for the report and the plot.
# Features that no question exhibits are skipped (nothing to divide by).
is_positive = df['context_impact'] == 'Positive'
is_negative = df['context_impact'] == 'Negative'

feature_stats = []
for feature in features:
    has_feature = df[feature]
    total_with_feature = int(has_feature.sum())
    if total_with_feature == 0:
        continue
    positive_with_feature = int((has_feature & is_positive).sum())
    negative_with_feature = int((has_feature & is_negative).sum())
    feature_stats.append({
        'name': feature,
        'total': total_with_feature,
        'positive_rate': positive_with_feature / total_with_feature * 100,
        'negative_rate': negative_with_feature / total_with_feature * 100,
        # Everything that is neither Positive nor Negative: Neutral, No Help and Unknown
        'neutral_rate': (total_with_feature - positive_with_feature - negative_with_feature)
                        / total_with_feature * 100,
    })

# Analyze which features correlate with positive context impact
print("\nFeature Analysis for Context Impact:")
for stats in feature_stats:
    print(f"\nFeature: {stats['name']}")
    print(f"  Questions with this feature: {stats['total']}")
    print(f"  Positive impact rate: {stats['positive_rate']:.1f}%")
    print(f"  Negative impact rate: {stats['negative_rate']:.1f}%")
    print(f"  Net positive rate: {stats['positive_rate'] - stats['negative_rate']:.1f}%")

# Visualize the feature analysis
feature_data = [
    {
        'feature': stats['name'].replace('_', ' ').title(),
        'positive_rate': stats['positive_rate'],
        'negative_rate': stats['negative_rate'],
        'neutral_rate': stats['neutral_rate'],
    }
    for stats in feature_stats
]

if feature_data:
    feature_df = pd.DataFrame(feature_data)

    # Grouped bars: three rates side by side for every feature
    plt.figure(figsize=(12, 6))

    x = np.arange(len(feature_df))
    width = 0.25

    plt.bar(x - width, feature_df['positive_rate'], width, label='Positive Impact', color='#60BD68')
    plt.bar(x, feature_df['negative_rate'], width, label='Negative Impact', color='#F15854')
    plt.bar(x + width, feature_df['neutral_rate'], width, label='Neutral/No Help', color='#5DA5DA')

    plt.xlabel('Question Features', fontsize=12)
    plt.ylabel('Percentage of Questions', fontsize=12)
    plt.title('Impact of Context by Question Feature', fontsize=14, fontweight='bold')
    plt.xticks(x, feature_df['feature'])
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

In [None]:
# Persist the evaluated frame, without the index column, to a CSV in the working directory
df.to_csv(path_or_buf='output_2hop_MCQ.csv', index=False)