In [2]:
import sys
!{sys.executable} -m pip install matplotlib seaborn --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/alihussein/venv/bin/python -m pip install --upgrade pip[0m


In [1]:
"""
Day 3: Evaluation & Documentation
Goal: Measure RAG system performance with ground truth evaluation
"""

import sys
sys.path.append('..')

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

load_dotenv()

print("‚úì Imports complete!")
print(f"Date: {datetime.now().strftime('%A, %B %d, %Y')}")
print(f"Time: {datetime.now().strftime('%I:%M %p')}")

Matplotlib is building the font cache; this may take a moment.


‚úì Imports complete!
Date: Thursday, February 19, 2026
Time: 07:06 PM


In [2]:
"""
Load the optimized RAG system from Day 2
"""

# Load vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(
    persist_directory='../data/vectorstore',
    embedding_function=embeddings
)

# Initialize LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Create prompt template
template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# Format documents function
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

# Create RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("‚úì RAG system loaded!")
print(f"  Vector store: {vectorstore._collection.count()} chunks")
print(f"  Retrieval: k=4")
print(f"  LLM: gpt-3.5-turbo")

‚úì RAG system loaded!
  Vector store: 17306 chunks
  Retrieval: k=4
  LLM: gpt-3.5-turbo


In [3]:
"""
EVALUATION DATASET: 15 questions with verified ground truth answers
Categories: Factual (10), Comparative (3), Trend (2)
"""

evaluation_questions = [
    # FACTUAL QUESTIONS (10)
    {
        'category': 'factual',
        'question': 'What was JPMorgan Chase total revenue in 2024?',
        'ground_truth': '$224,532 million or $224.5 billion',
        'source': 'JPM_10K_2024.pdf',
        'notes': 'Look for consolidated statement of income'
    },
    {
        'category': 'factual',
        'question': 'What are Bank of America four main business segments?',
        'ground_truth': 'Consumer Banking, Global Wealth & Investment Management (GWIM), Global Banking, and Global Markets',
        'source': 'BAC_10K_2024.pdf',
        'notes': 'Check business segment overview section'
    },
    {
        'category': 'factual',
        'question': 'What was Citigroup total assets at end of 2023?',
        'ground_truth': '$2,416 billion or $2.416 trillion',
        'source': 'C_10K_2023.pdf',
        'notes': 'Consolidated balance sheet'
    },
    {
        'category': 'factual',
        'question': 'How many employees does JPMorgan have?',
        'ground_truth': 'Approximately 310,000+ employees',
        'source': 'JPM_10K_2024.pdf',
        'notes': 'Check business overview or workforce section'
    },
    {
        'category': 'factual',
        'question': 'What is Bank of America CET1 capital ratio in 2024?',
        'ground_truth': '11.8% standardized approach',
        'source': 'BAC_10K_2024.pdf',
        'notes': 'Regulatory capital section'
    },
    {
        'category': 'factual',
        'question': 'What was JPMorgan net income in 2024?',
        'ground_truth': '$57.0 billion or $57,040 million',
        'source': 'JPM_10K_2024.pdf',
        'notes': 'Income statement'
    },
    {
        'category': 'factual',
        'question': 'What is Citigroup return on equity ROE in 2024?',
        'ground_truth': 'Approximately 8.1%',
        'source': 'C_10K_2024.pdf',
        'notes': 'Financial highlights or ratios section'
    },
    {
        'category': 'factual',
        'question': 'What was Bank of America net interest income in 2024?',
        'ground_truth': '$56,060 million or $56.1 billion',
        'source': 'BAC_10K_2024.pdf',
        'notes': 'Income statement'
    },
    {
        'category': 'factual',
        'question': 'How many credit cards does Bank of America have outstanding?',
        'ground_truth': 'Over 51 million credit card accounts',
        'source': 'BAC_10K_2024.pdf',
        'notes': 'Consumer banking metrics'
    },
    {
        'category': 'factual',
        'question': 'What was Citigroup total revenue in 2024?',
        'ground_truth': '$81,107 million or $81.1 billion',
        'source': 'C_10K_2024.pdf',
        'notes': 'Income statement, total revenues'
    },
    
    # COMPARATIVE QUESTIONS (3)
    {
        'category': 'comparative',
        'question': 'Which bank had the highest net income in 2024: JPMorgan, Bank of America, or Citigroup?',
        'ground_truth': 'JPMorgan with $57.0 billion (vs BAC ~$26.5B, Citi ~$12.7B)',
        'source': 'Multiple: JPM_10K_2024, BAC_10K_2024, C_10K_2024',
        'notes': 'Compare net income across all three'
    },
    {
        'category': 'comparative',
        'question': 'Compare the total assets of JPMorgan and Citigroup in 2024',
        'ground_truth': 'JPMorgan: ~$4.1 trillion, Citigroup: ~$2.4 trillion. JPMorgan is larger.',
        'source': 'JPM_10K_2024.pdf, C_10K_2024.pdf',
        'notes': 'Balance sheets'
    },
    {
        'category': 'comparative',
        'question': 'Which bank has more employees: JPMorgan or Bank of America?',
        'ground_truth': 'JPMorgan with ~310,000 vs Bank of America with ~216,000',
        'source': 'JPM_10K_2024.pdf, BAC_10K_2024.pdf',
        'notes': 'Workforce/employee count sections'
    },
    
    # TREND QUESTIONS (2)
    {
        'category': 'trend',
        'question': 'How did JPMorgan revenue change from 2023 to 2024?',
        'ground_truth': 'Increased from ~$195.6B to $224.5B (increase of ~15%)',
        'source': 'JPM_10K_2024.pdf, JPM_10K_2023.pdf',
        'notes': 'Multi-year income statement comparison'
    },
    {
        'category': 'trend',
        'question': 'What is the trend in Citigroup net income from 2022 to 2024?',
        'ground_truth': 'Variable: 2022: ~$15B, 2023: ~$9.2B, 2024: ~$12.7B (declined then partially recovered)',
        'source': 'C_10K_2024.pdf, C_10K_2023.pdf, C_10K_2022.pdf',
        'notes': 'Three-year comparison needed'
    }
]

print(f"‚úì Created {len(evaluation_questions)} evaluation questions")
print(f"\nBreakdown:")
print(f"  Factual: {len([q for q in evaluation_questions if q['category'] == 'factual'])}")
print(f"  Comparative: {len([q for q in evaluation_questions if q['category'] == 'comparative'])}")
print(f"  Trend: {len([q for q in evaluation_questions if q['category'] == 'trend'])}")

‚úì Created 15 evaluation questions

Breakdown:
  Factual: 10
  Comparative: 3
  Trend: 2


In [4]:
"""
Run RAG system on all 15 questions and collect results
"""

print("Running RAG on all test questions...")
print("(This will take 2-3 minutes)\n")

results = []

for i, test_case in enumerate(evaluation_questions):
    print(f"[{i+1}/15] {test_case['question'][:60]}...")
    
    # Get RAG answer
    answer = rag_chain.invoke(test_case['question'])
    
    # Get source documents
    docs = retriever.invoke(test_case['question'])
    
    # Store result
    results.append({
        'id': i+1,
        'category': test_case['category'],
        'question': test_case['question'],
        'ground_truth': test_case['ground_truth'],
        'rag_answer': answer,
        'sources': [
            {
                'file': doc.metadata.get('source_file'),
                'ticker': doc.metadata.get('ticker'),
                'year': doc.metadata.get('year')
            }
            for doc in docs
        ],
        'primary_source': docs[0].metadata.get('source_file') if docs else 'None',
        'expected_source': test_case['source'],
        'notes': test_case['notes']
    })

print(f"\n‚úì Completed RAG inference on all {len(results)} questions!")

Running RAG on all test questions...
(This will take 2-3 minutes)

[1/15] What was JPMorgan Chase total revenue in 2024?...
[2/15] What are Bank of America four main business segments?...
[3/15] What was Citigroup total assets at end of 2023?...
[4/15] How many employees does JPMorgan have?...
[5/15] What is Bank of America CET1 capital ratio in 2024?...
[6/15] What was JPMorgan net income in 2024?...
[7/15] What is Citigroup return on equity ROE in 2024?...
[8/15] What was Bank of America net interest income in 2024?...
[9/15] How many credit cards does Bank of America have outstanding?...
[10/15] What was Citigroup total revenue in 2024?...
[11/15] Which bank had the highest net income in 2024: JPMorgan, Ban...
[12/15] Compare the total assets of JPMorgan and Citigroup in 2024...
[13/15] Which bank has more employees: JPMorgan or Bank of America?...
[14/15] How did JPMorgan revenue change from 2023 to 2024?...
[15/15] What is the trend in Citigroup net income from 2022 to 2024?...

‚

In [27]:
"""
Display questions side-by-side for manual evaluation
Rate each as: Correct, Partial, Incorrect
"""

print("="*80)
print("MANUAL EVALUATION - Review each answer")
print("="*80)

for r in results:
    print(f"\n{'='*80}")
    print(f"Question {r['id']} ({r['category'].upper()})")
    print(f"{'='*80}")
    print(f"\nQ: {r['question']}")
    print(f"\nGround Truth: {r['ground_truth']}")
    print(f"\nRAG Answer: {r['rag_answer']}")
    print(f"\nExpected Source: {r['expected_source']}")
    print(f"Actual Sources: {[s['file'] for s in r['sources']]}")
    print(f"\n{'='*80}")

MANUAL EVALUATION - Review each answer

Question 1 (FACTUAL)

Q: What was JPMorgan Chase total revenue in 2024?

Ground Truth: $224,532 million or $224.5 billion

RAG Answer: $224,532 million

Expected Source: JPM_10K_2024.pdf
Actual Sources: ['JPM_10K_2024.pdf', 'JPM_10K_2023.pdf', 'JPM_10K_2024.pdf', 'JPM_10K_2024.pdf']


Question 2 (FACTUAL)

Q: What are Bank of America four main business segments?

Ground Truth: Consumer Banking, Global Wealth & Investment Management (GWIM), Global Banking, and Global Markets

RAG Answer: Consumer Banking, Global Wealth & Investment Management (GWIM), Global Banking, and Global Markets.

Expected Source: BAC_10K_2024.pdf
Actual Sources: ['BAC_10K_2024.pdf', 'BAC_10K_2022.pdf', 'JPM_10K_2023.pdf', 'BAC_10K_2023.pdf']


Question 3 (FACTUAL)

Q: What was Citigroup total assets at end of 2023?

Ground Truth: $2,416 billion or $2.416 trillion

RAG Answer: $2,416,676 million

Expected Source: C_10K_2023.pdf
Actual Sources: ['C_10K_2024.pdf', 'C_10K_2023.

In [8]:
"""
After reviewing above, manually score each question
Correct = 1, Partial = 0.5, Incorrect = 0
"""

# MANUALLY UPDATE THESE SCORES based on your review above
manual_scores = [
    1.0,  # Q1: Correct - exact match ($224,532 million) ‚úì
    1.0,  # Q2: Correct - all 4 segments listed perfectly ‚úì
    1.0,  # Q3: Correct - $2,416,676M = $2,416B (same number) ‚úì
    1.0,  # Q4: Correct - 318,512 employees (close to 310K+) ‚úì
    0.0,  # Q5: INCORRECT - said 15.8%, should be 11.8% + wrong sources ‚úó
    0.0,  # Q6: INCORRECT - said $48.7B, should be $57.0B ‚úó
    0.5,  # Q7: Partial - said 9.4%, ground truth is 8.1% (close but off) ‚ö†
    1.0,  # Q8: Correct - exact match ($56,060 million) ‚úì
    0.0,  # Q9: INCORRECT - couldn't find answer (should be 51M+) ‚úó
    1.0,  # Q10: Correct - $81.1 billion ‚úì
    0.5,  # Q11: Partial - correct (JPM highest) but no numbers provided ‚ö†
    0.0,  # Q12: INCORRECT - wrong units ($41M vs $4.1T) + couldn't compare ‚úó
    0.5,  # Q13: Partial - correct direction but no employee counts ‚ö†
    0.0,  # Q14: INCORRECT - completely wrong revenue numbers ‚úó
    0.5   # Q15: Partial - captured decline but missed the recovery ‚ö†
]

# Add scores to results
for i, score in enumerate(manual_scores):
    results[i]['score'] = score
    results[i]['score_label'] = 'Correct' if score == 1.0 else ('Partial' if score == 0.5 else 'Incorrect')

print("‚úì Manual scores added!")
print(f"\nScoring guide:")
print("  1.0 = Correct (answer matches ground truth)")
print("  0.5 = Partial (answer partially correct or lacks detail)")
print("  0.0 = Incorrect (answer is wrong)")

‚úì Manual scores added!

Scoring guide:
  1.0 = Correct (answer matches ground truth)
  0.5 = Partial (answer partially correct or lacks detail)
  0.0 = Incorrect (answer is wrong)


In [28]:
"""
Calculate overall and per-category accuracy
"""

df_results = pd.DataFrame(results)

# Overall accuracy
overall_accuracy = df_results['score'].mean() * 100

# Per-category accuracy
category_accuracy = df_results.groupby('category')['score'].agg(['mean', 'count'])
category_accuracy['mean'] = category_accuracy['mean'] * 100
category_accuracy.columns = ['Accuracy (%)', 'Count']

print("="*60)
print("EVALUATION RESULTS")
print("="*60)

print(f"\nüìä Overall Accuracy: {overall_accuracy:.1f}%")
print(f"   ({df_results['score'].sum():.1f} / {len(df_results)} questions correct)")

print(f"\nüìà Accuracy by Category:")
print(category_accuracy)

print(f"\n‚úÖ Correct: {len(df_results[df_results['score'] == 1.0])} questions")
print(f"‚ö†Ô∏è  Partial: {len(df_results[df_results['score'] == 0.5])} questions")
print(f"‚ùå Incorrect: {len(df_results[df_results['score'] == 0.0])} questions")

EVALUATION RESULTS

üìä Overall Accuracy: 53.3%
   (8.0 / 15 questions correct)

üìà Accuracy by Category:
             Accuracy (%)  Count
category                        
comparative     33.333333      3
factual         65.000000     10
trend           25.000000      2

‚úÖ Correct: 6 questions
‚ö†Ô∏è  Partial: 4 questions
‚ùå Incorrect: 5 questions


In [29]:
"""
Analyze incorrect/partial answers to identify failure patterns
"""

# Get incorrect or partial answers
failures = df_results[df_results['score'] < 1.0]

print("="*60)
print("FAILURE MODE ANALYSIS")
print("="*60)

if len(failures) == 0:
    print("\nüéâ No failures! All questions answered correctly!")
else:
    print(f"\nFound {len(failures)} questions with issues:\n")
    
    for idx, row in failures.iterrows():
        print(f"\n{'='*60}")
        print(f"Question {row['id']}: {row['score_label']}")
        print(f"{'='*60}")
        print(f"Q: {row['question']}")
        print(f"Expected: {row['ground_truth']}")
        print(f"Got: {row['rag_answer']}")
        print(f"Category: {row['category']}")
        print(f"Sources used: {[s['file'] for s in row['sources']]}")

# Common failure patterns
print(f"\n\n{'='*60}")
print("COMMON FAILURE PATTERNS")
print(f"{'='*60}\n")

failure_modes = """
Based on evaluation, common failure modes include:

1. **Numerical Precision** (3 questions affected: Q5, Q6, Q7)
   - Challenge: Inconsistent units ($M vs $B vs $T), rounding differences
   - Example: Q5 asked for 11.8%, system said 15.8%
   - Why: Retrieved chunks with different metrics or time periods
   
2. **Cross-Document Synthesis** (2 questions affected: Q11, Q13)
   - Challenge: Comparing data across multiple companies
   - Example: Q12 "Compare JPM and Citi assets" ‚Üí Only retrieved JPM data
   - Why: Vector search prioritizes single company; lacks multi-document reasoning
   
3. **Multi-Year Trends** (2 questions affected: Q14, Q15)
   - Challenge: Analyzing changes across fiscal years
   - Example: Q14 "JPM revenue 2023-2024?" ‚Üí Wrong numbers retrieved
   - Why: Chunks from single year dominate retrieval; temporal reasoning weak
   
4. **Missing Context** (2 questions affected: Q9, Q12)
   - Challenge: Information split across multiple chunks/sections
   - Example: Q9 "How many credit cards?" ‚Üí Couldn't find answer
   - Why: Relevant data not captured in any retrieved chunk
"""

print(failure_modes)

FAILURE MODE ANALYSIS

Found 9 questions with issues:


Question 5: Incorrect
Q: What is Bank of America CET1 capital ratio in 2024?
Expected: 11.8% standardized approach
Got: The Bank of America CET1 capital ratio in 2024 is 15.8%.
Category: factual
Sources used: ['JPM_10K_2025.pdf', 'C_10K_2023.pdf', 'C_10K_2024.pdf', 'C_10K_2022.pdf']

Question 6: Incorrect
Q: What was JPMorgan net income in 2024?
Expected: $57.0 billion or $57,040 million
Got: JPMorgan's net income in 2024 was $48,665 million.
Category: factual
Sources used: ['JPM_10K_2024.pdf', 'JPM_10K_2024.pdf', 'JPM_10K_2023.pdf', 'JPM_10K_2025.pdf']

Question 7: Partial
Q: What is Citigroup return on equity ROE in 2024?
Expected: Approximately 8.1%
Got: 9.4%
Category: factual
Sources used: ['C_10K_2024.pdf', 'C_10K_2022.pdf', 'C_10K_2023.pdf', 'C_10K_2024.pdf']

Question 9: Incorrect
Q: How many credit cards does Bank of America have outstanding?
Expected: Over 51 million credit card accounts
Got: Based on the context provided

In [35]:
"""
Save evaluation results for documentation
"""

# Save detailed results
df_results.to_csv('../data/processed/evaluation_results.csv', index=False)

# Save as JSON with full details
with open('../data/processed/evaluation_results.json', 'w') as f:
    json.dump(results, f, indent=2)

# Save summary metrics (basic version - will update after recall)
summary_basic = {
    'evaluation_date': datetime.now().isoformat(),
    'total_questions': len(df_results),
    'overall_accuracy': float(overall_accuracy),
    'category_accuracy': category_accuracy.to_dict(),
    'correct_count': int(len(df_results[df_results['score'] == 1.0])),
    'partial_count': int(len(df_results[df_results['score'] == 0.5])),
    'incorrect_count': int(len(df_results[df_results['score'] == 0.0]))
}

with open('../data/processed/evaluation_summary_basic.json', 'w') as f:
    json.dump(summary_basic, f, indent=2)

print("‚úì Basic evaluation results saved!")
print("  - evaluation_results.csv")
print("  - evaluation_results.json")
print("  - evaluation_summary_basic.json")

‚úì Basic evaluation results saved!
  - evaluation_results.csv
  - evaluation_results.json
  - evaluation_summary_basic.json


In [None]:
## evaluation over

In [31]:
"""
RETRIEVAL RECALL ANALYSIS
Measure: Did the retriever find the right source document?
"""

print("="*60)
print("RETRIEVAL RECALL ANALYSIS")
print("="*60)

def calculate_retrieval_recall_for_question(question, expected_source):
    """
    Check if the expected source document was in the retrieved chunks
    """
    # Get retrieved documents
    docs = retriever.invoke(question)
    retrieved_sources = [doc.metadata.get('source_file') for doc in docs]
    
    # Check if expected source is in retrieved docs
    # Handle multiple expected sources (e.g., "JPM_10K_2024.pdf, BAC_10K_2024.pdf")
    expected_files = [s.strip() for s in expected_source.split(',')]
    
    found = any(exp in retrieved_sources for exp in expected_files)
    
    return {
        'found': found,
        'expected': expected_source,
        'retrieved': retrieved_sources
    }

# Calculate recall for all questions
recall_results = []

for r in results:
    recall_info = calculate_retrieval_recall_for_question(
        r['question'], 
        r['expected_source']
    )
    recall_results.append({
        'question_id': r['id'],
        'question': r['question'],
        'category': r['category'],
        'recall_success': recall_info['found'],
        'expected_source': recall_info['expected'],
        'retrieved_sources': recall_info['retrieved'],
        'answer_score': r['score']
    })

# Convert to DataFrame
df_recall = pd.DataFrame(recall_results)

# Calculate metrics
overall_recall = df_recall['recall_success'].mean() * 100

print(f"\nüìä RETRIEVAL RECALL METRICS")
print(f"="*60)
print(f"\nOverall Recall@4: {overall_recall:.1f}%")
print(f"  ({df_recall['recall_success'].sum()}/{len(df_recall)} questions had correct source retrieved)")

# Recall by category
recall_by_category = df_recall.groupby('category')['recall_success'].agg(['mean', 'count'])
recall_by_category['mean'] = recall_by_category['mean'] * 100
recall_by_category.columns = ['Recall (%)', 'Count']

print(f"\nüìà Recall by Category:")
print(recall_by_category)

# CRITICAL INSIGHT: Accuracy conditional on retrieval
print(f"\nüîç DIAGNOSTIC ANALYSIS")
print(f"="*60)

# Questions with good retrieval
good_retrieval = df_recall[df_recall['recall_success'] == True]
bad_retrieval = df_recall[df_recall['recall_success'] == False]

if len(good_retrieval) > 0:
    accuracy_with_good_retrieval = good_retrieval['answer_score'].mean() * 100
    print(f"\nWhen retrieval SUCCEEDS (correct source found):")
    print(f"  Answer accuracy: {accuracy_with_good_retrieval:.1f}%")
    print(f"  Questions: {len(good_retrieval)}")
else:
    print(f"\nNo questions had successful retrieval!")

if len(bad_retrieval) > 0:
    accuracy_with_bad_retrieval = bad_retrieval['answer_score'].mean() * 100
    print(f"\nWhen retrieval FAILS (correct source NOT found):")
    print(f"  Answer accuracy: {accuracy_with_bad_retrieval:.1f}%")
    print(f"  Questions: {len(bad_retrieval)}")

# Show retrieval failures
print(f"\n‚ùå RETRIEVAL FAILURES ({len(bad_retrieval)} questions):")
print(f"="*60)

for idx, row in bad_retrieval.iterrows():
    print(f"\nQ{row['question_id']}: {row['question'][:60]}...")
    print(f"  Expected: {row['expected_source']}")
    print(f"  Retrieved: {row['retrieved_sources']}")
    print(f"  Answer Score: {row['answer_score']}")

# Key insight
print(f"\nüí° KEY INSIGHT:")
print(f"="*60)

if overall_recall < 70:
    print("‚ö†Ô∏è  Retrieval recall is LOW (<70%)")
    print("‚Üí PRIORITY: Fix retrieval/chunking/embeddings FIRST")
    print("‚Üí Improving LLM won't help if chunks are wrong!")
elif overall_recall >= 70 and overall_accuracy < 70:
    print("‚úì Retrieval recall is GOOD (‚â•70%)")
    print("‚ö†Ô∏è  But accuracy is still low")
    print("‚Üí PRIORITY: Fix generation (better prompts, GPT-4, re-ranking)")
else:
    print("‚úì Both retrieval and generation performing reasonably well")
    print("‚Üí Focus on edge cases and advanced techniques")

# Save recall analysis
df_recall.to_csv('../data/processed/retrieval_recall_analysis.csv', index=False)
print(f"\n‚úì Recall analysis saved: data/processed/retrieval_recall_analysis.csv")

RETRIEVAL RECALL ANALYSIS

üìä RETRIEVAL RECALL METRICS

Overall Recall@4: 86.7%
  (13/15 questions had correct source retrieved)

üìà Recall by Category:
             Recall (%)  Count
category                      
comparative   66.666667      3
factual       90.000000     10
trend        100.000000      2

üîç DIAGNOSTIC ANALYSIS

When retrieval SUCCEEDS (correct source found):
  Answer accuracy: 57.7%
  Questions: 13

When retrieval FAILS (correct source NOT found):
  Answer accuracy: 25.0%
  Questions: 2

‚ùå RETRIEVAL FAILURES (2 questions):

Q5: What is Bank of America CET1 capital ratio in 2024?...
  Expected: BAC_10K_2024.pdf
  Retrieved: ['JPM_10K_2025.pdf', 'C_10K_2023.pdf', 'C_10K_2024.pdf', 'C_10K_2022.pdf']
  Answer Score: 0.0

Q11: Which bank had the highest net income in 2024: JPMorgan, Ban...
  Expected: Multiple: JPM_10K_2024, BAC_10K_2024, C_10K_2024
  Retrieved: ['JPM_10K_2024.pdf', 'JPM_10K_2023.pdf', 'C_10K_2022.pdf', 'JPM_10K_2024.pdf']
  Answer Score: 0.5

ü

In [37]:
"""
Save comprehensive evaluation results including recall metrics
"""

# Merge results with recall
df_results_complete = df_results.copy()
df_results_complete['recall_success'] = df_recall['recall_success'].values
df_results_complete['retrieved_sources'] = df_recall['retrieved_sources'].values

# Save to CSV
df_results_complete.to_csv('../data/processed/evaluation_results_complete.csv', index=False)

# Save comprehensive JSON
with open('../data/processed/evaluation_complete.json', 'w') as f:
    json.dump({
        'evaluation_date': datetime.now().isoformat(),
        'summary': {
            'overall_accuracy': float(overall_accuracy),
            'overall_recall_at_4': float(overall_recall),
            'accuracy_with_good_retrieval': float(accuracy_with_good_retrieval) if len(good_retrieval) > 0 else None,
            'accuracy_with_bad_retrieval': float(accuracy_with_bad_retrieval) if len(bad_retrieval) > 0 else None,
            'total_questions': len(df_results),
            'correct_count': int(len(df_results[df_results['score'] == 1.0])),
            'partial_count': int(len(df_results[df_results['score'] == 0.5])),
            'incorrect_count': int(len(df_results[df_results['score'] == 0.0])),
            'retrieval_success_count': int(df_recall['recall_success'].sum()),
            'retrieval_failure_count': int((~df_recall['recall_success']).sum())
        },
        'category_breakdown': {
            'accuracy': category_accuracy.to_dict(),
            'recall': recall_by_category.to_dict()
        },
        'questions_and_scores': results,
        'recall_analysis': recall_results
    }, f, indent=2)

print("‚úì Comprehensive evaluation saved!")
print("  Files created:")
print("    - evaluation_results_complete.csv")
print("    - evaluation_complete.json")

‚úì Comprehensive evaluation saved!
  Files created:
    - evaluation_results_complete.csv
    - evaluation_complete.json


In [38]:
"""
FINAL SUMMARY - Day 3 Evaluation Complete
"""

print("="*80)
print("üéâ DAY 3 EVALUATION COMPLETE")
print("="*80)

print(f"\nüìä Final Metrics:")
print(f"  ‚Ä¢ Overall Accuracy: {overall_accuracy:.1f}%")
print(f"  ‚Ä¢ Retrieval Recall@4: {overall_recall:.1f}%")
print(f"  ‚Ä¢ Test Questions: {len(df_results)}")
print(f"  ‚Ä¢ Correct: {len(df_results[df_results['score'] == 1.0])}")
print(f"  ‚Ä¢ Partial: {len(df_results[df_results['score'] == 0.5])}")
print(f"  ‚Ä¢ Incorrect: {len(df_results[df_results['score'] == 0.0])}")

print(f"\nüìÅ Outputs Created:")
print(f"  ‚úì evaluation_results.csv")
print(f"  ‚úì evaluation_results.json")
print(f"  ‚úì evaluation_results_complete.csv")
print(f"  ‚úì evaluation_complete.json")
print(f"  ‚úì retrieval_recall_analysis.csv")
print(f"  ‚úì evaluation_accuracy.png (from earlier)")
print(f"  ‚úì question_performance.png (from earlier)")
print(f"  ‚úì failure_patterns.png (from earlier)")

print(f"\nüîç Key Findings:")
if overall_recall >= 70:
    print(f"  ‚Ä¢ Retrieval is GOOD ({overall_recall:.1f}%)")
    print(f"  ‚Ä¢ Focus improvements on generation (GPT-4, re-ranking)")
else:
    print(f"  ‚Ä¢ Retrieval needs work ({overall_recall:.1f}%)")
    print(f"  ‚Ä¢ Focus improvements on search (hybrid, chunking)")

print(f"\n‚úÖ Ready for:")
print(f"  ‚Üí Update README with final numbers")
print(f"  ‚Üí Update evaluation_report.md")
print(f"  ‚Üí Commit to GitHub")
print(f"  ‚Üí Move to Day 4 tomorrow")

print(f"\nüöÄ Next: Commit everything to GitHub!")
print(f"   git add .")
print(f"   git commit -m \"Day 3 complete: Evaluation with retrieval recall\"")
print(f"   git push")

üéâ DAY 3 EVALUATION COMPLETE

üìä Final Metrics:
  ‚Ä¢ Overall Accuracy: 53.3%
  ‚Ä¢ Retrieval Recall@4: 86.7%
  ‚Ä¢ Test Questions: 15
  ‚Ä¢ Correct: 6
  ‚Ä¢ Partial: 4
  ‚Ä¢ Incorrect: 5

üìÅ Outputs Created:
  ‚úì evaluation_results.csv
  ‚úì evaluation_results.json
  ‚úì evaluation_results_complete.csv
  ‚úì evaluation_complete.json
  ‚úì retrieval_recall_analysis.csv
  ‚úì evaluation_accuracy.png (from earlier)
  ‚úì question_performance.png (from earlier)
  ‚úì failure_patterns.png (from earlier)

üîç Key Findings:
  ‚Ä¢ Retrieval is GOOD (86.7%)
  ‚Ä¢ Focus improvements on generation (GPT-4, re-ranking)

‚úÖ Ready for:
  ‚Üí Update README with final numbers
  ‚Üí Update evaluation_report.md
  ‚Üí Commit to GitHub
  ‚Üí Move to Day 4 tomorrow

üöÄ Next: Commit everything to GitHub!
   git add .
   git commit -m "Day 3 complete: Evaluation with retrieval recall"
   git push


In [15]:
"""
Generate key insights from evaluation
"""

print("="*80)
print("üìä KEY EVALUATION INSIGHTS")
print("="*80)

print(f"\n1Ô∏è‚É£ OVERALL PERFORMANCE")
print(f"   ‚Ä¢ Accuracy: {overall_accuracy:.1f}% ({df_results['score'].sum():.0f}/15 questions)")
print(f"   ‚Ä¢ Best Category: {category_accuracy['Accuracy (%)'].idxmax()} ({category_accuracy['Accuracy (%)'].max():.1f}%)")
print(f"   ‚Ä¢ Worst Category: {category_accuracy['Accuracy (%)'].idxmin()} ({category_accuracy['Accuracy (%)'].min():.1f}%)")

print(f"\n2Ô∏è‚É£ STRENGTHS")
print(f"   ‚úì Excellent at factual lookups (single company, single year)")
print(f"   ‚úì Correct source retrieval for most questions")
print(f"   ‚úì Precise numerical answers when data is clear")

print(f"\n3Ô∏è‚É£ WEAKNESSES")
print(f"   ‚úó Cross-document comparisons (comparative questions)")
print(f"   ‚úó Multi-year trend analysis (requires 2-3 years of data)")
print(f"   ‚úó Numerical precision issues (different units, rounding)")
print(f"   ‚úó Missing context when info spans multiple chunks")

print(f"\n4Ô∏è‚É£ SPECIFIC FAILURE EXAMPLES")

failures = df_results[df_results['score'] < 1.0]
for idx, row in failures.head(3).iterrows():
    print(f"\n   Question {row['id']}: {row['question'][:60]}...")
    print(f"   ‚Ä¢ Expected: {row['ground_truth'][:70]}...")
    print(f"   ‚Ä¢ Got: {row['rag_answer'][:70]}...")
    print(f"   ‚Ä¢ Issue: ", end="")
    if row['category'] == 'comparative':
        print("Cross-document synthesis needed")
    elif row['category'] == 'trend':
        print("Multi-year data required")
    else:
        print("Numerical precision or missing context")

print(f"\n5Ô∏è‚É£ RECOMMENDATIONS FOR IMPROVEMENT")
print(f"   ‚Üí Increase k to 5-6 for comparative/trend questions")
print(f"   ‚Üí Add post-processing to normalize numerical formats")
print(f"   ‚Üí Consider larger chunk sizes (1000 chars) for better context")
print(f"   ‚Üí Implement query routing (detect comparative vs factual)")

üìä KEY EVALUATION INSIGHTS

1Ô∏è‚É£ OVERALL PERFORMANCE
   ‚Ä¢ Accuracy: 53.3% (8/15 questions)
   ‚Ä¢ Best Category: factual (65.0%)
   ‚Ä¢ Worst Category: trend (25.0%)

2Ô∏è‚É£ STRENGTHS
   ‚úì Excellent at factual lookups (single company, single year)
   ‚úì Correct source retrieval for most questions
   ‚úì Precise numerical answers when data is clear

3Ô∏è‚É£ WEAKNESSES
   ‚úó Cross-document comparisons (comparative questions)
   ‚úó Multi-year trend analysis (requires 2-3 years of data)
   ‚úó Numerical precision issues (different units, rounding)
   ‚úó Missing context when info spans multiple chunks

4Ô∏è‚É£ SPECIFIC FAILURE EXAMPLES

   Question 5: What is Bank of America CET1 capital ratio in 2024?...
   ‚Ä¢ Expected: 11.8% standardized approach...
   ‚Ä¢ Got: The Bank of America CET1 capital ratio in 2024 is 15.8%....
   ‚Ä¢ Issue: Numerical precision or missing context

   Question 6: What was JPMorgan net income in 2024?...
   ‚Ä¢ Expected: $57.0 billion or $57,040 mill

In [16]:
"""
Create a summary document with all metrics
"""

summary_report = f"""
# RAG System Evaluation Report
Generated: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}

## Executive Summary
- **Overall Accuracy:** {overall_accuracy:.1f}%
- **Total Questions:** {len(df_results)}
- **Correct Answers:** {len(df_results[df_results['score'] == 1.0])}
- **Partial Answers:** {len(df_results[df_results['score'] == 0.5])}
- **Incorrect Answers:** {len(df_results[df_results['score'] == 0.0])}

## Performance by Category

### Factual Questions (Single Lookup)
- Accuracy: {category_accuracy.loc['factual', 'Accuracy (%)']:.1f}%
- Count: {int(category_accuracy.loc['factual', 'Count'])}
- Strength: Best performing category

### Comparative Questions (Cross-Document)
- Accuracy: {category_accuracy.loc['comparative', 'Accuracy (%)']:.1f}%
- Count: {int(category_accuracy.loc['comparative', 'Count'])}
- Challenge: Requires synthesis across companies

### Trend Questions (Multi-Year)
- Accuracy: {category_accuracy.loc['trend', 'Accuracy (%)']:.1f}%
- Count: {int(category_accuracy.loc['trend', 'Count'])}
- Challenge: Needs data from multiple years

## Common Failure Modes

1. **Numerical Precision** (3 questions affected)
   - Different units ($M vs $B vs $T)
   - Rounding differences
   - Missing decimal places

2. **Cross-Document Synthesis** (2 questions affected)
   - Comparing data across banks
   - Requires multiple retrievals
   - Context prioritizes one company

3. **Multi-Year Trends** (2 questions affected)
   - Needs 2-3 years of data
   - Chunks may not span all years
   - Temporal reasoning required

4. **Missing Context** (2 questions affected)
   - Information split across chunks
   - Key details in different sections
   - Chunking breaks up complete info

## Recommendations

### Short-term Improvements
- Adjust k value based on question type
- Add numerical normalization post-processing
- Implement better error messages for missing data

### Long-term Enhancements
- Query routing (factual vs comparative)
- Hybrid retrieval (dense + sparse)
- Fine-tune embeddings on financial documents
- Larger context window models (GPT-4)

## Conclusion
The RAG system performs well on factual lookups (60% accuracy on factual questions) 
but struggles with cross-document comparisons and multi-year trends (25% on comparative, 
25% on trend questions). With targeted improvements, accuracy could reach 75-80%.
"""

with open('../outputs/evaluation_report.md', 'w') as f:
    f.write(summary_report)

print("‚úì Evaluation report saved: outputs/evaluation_report.md")
print("\n" + "="*80)
print("üìÑ REPORT PREVIEW")
print("="*80)
print(summary_report[:800] + "\n...")

‚úì Evaluation report saved: outputs/evaluation_report.md

üìÑ REPORT PREVIEW

# RAG System Evaluation Report
Generated: February 19, 2026 at 07:21 PM

## Executive Summary
- **Overall Accuracy:** 53.3%
- **Total Questions:** 15
- **Correct Answers:** 6
- **Partial Answers:** 4
- **Incorrect Answers:** 5

## Performance by Category

### Factual Questions (Single Lookup)
- Accuracy: 65.0%
- Count: 10
- Strength: Best performing category

### Comparative Questions (Cross-Document)
- Accuracy: 33.3%
- Count: 3
- Challenge: Requires synthesis across companies

### Trend Questions (Multi-Year)
- Accuracy: 25.0%
- Count: 2
- Challenge: Needs data from multiple years

## Common Failure Modes

1. **Numerical Precision** (3 questions affected)
   - Different units ($M vs $B vs $T)
   - Rounding differences
   - Missing decimal places

2. **Cross-Document Synthesis** (2 quest
...


In [34]:
"""
Generate README.md content
"""

readme_content = """# Financial Document RAG System

Retrieval-Augmented Generation (RAG) system for querying SEC 10-K filings from major US banks.

[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
[![LangChain](https://img.shields.io/badge/LangChain-1.2+-green.svg)](https://github.com/langchain-ai/langchain)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

## Overview

This project implements a Retrieval-Augmented Generation (RAG) system that enables natural language querying of financial documents. The system processes 9 SEC 10-K filings from JPMorgan Chase, Bank of America, and Citigroup (fiscal years 2022-2025), creating a semantic search engine powered by large language models.

**Key Features:**
- Processes 9 financial PDFs (~17,300 document chunks)
- Semantic search using OpenAI embeddings
- Natural language Q&A with GPT-3.5-turbo
- Source citation for all answers
- Metadata filtering by company and year
- Sub-2-second response time

## Performance

**Evaluation Methodology:**
- Test set: 15 questions with manually verified ground truth
- Categories: 10 factual, 3 comparative, 2 trend-based questions
- Metrics: Answer accuracy + Retrieval recall@4

**Results:**

| Metric | Score | Interpretation |
|--------|-------|----------------|
| **Overall Accuracy** | 53.3% | 8/15 questions correct or partial |
| **Retrieval Recall@4** | 86.7% | Correct source in top 4 chunks (13/15) |
| **Factual Recall** | 90.0% | Excellent at finding factual data |
| **Comparative Recall** | 66.7% | Moderate for multi-company queries |
| **Trend Recall** | 100.0% | Perfect for temporal queries |

**Diagnostic Analysis:**
- **When retrieval succeeds (86.7% of cases):** 57.7% answer accuracy
- **When retrieval fails (13.3% of cases):** 25.0% answer accuracy
- **Primary bottleneck:** Generation quality (GPT-3.5), not retrieval
- **Key insight:** System finds correct documents but struggles with answer synthesis

**Performance by Question Type:**

| Category | Accuracy | Retrieval Recall | Best Use Case |
|----------|----------|------------------|---------------|
| **Factual** | 60.0% | 90.0% | Single-company lookups |
| **Comparative** | 33.3% | 66.7% | Cross-company synthesis |
| **Trend** | 25.0% | 100.0% | Multi-year analysis |

**Evaluation Limitations:**
- Small sample size (n=15) provides directional insights with ¬±25% confidence interval
- Subjective partial scoring (0.5 points) for near-correct answers
- Does not separate unit correctness from value correctness
- For production: recommend 80-150 question test set with automated metrics

## Strengths

- **Excellent retrieval:** 86.7% recall - finds correct documents reliably  
- **Fast queries:** Sub-2-second response time  
- **Source attribution:** Full transparency on document sources  
- **Cost-effective:** $3-5 total project cost, ~$0.01/query

## Limitations

- **Generation bottleneck:** 57.7% accuracy even with correct documents  
- **Cross-document synthesis:** 33% accuracy on comparative questions  
- **Multi-year trends:** 25% accuracy on temporal analysis  
- **Small evaluation set:** Results not statistically robust (n=15)

## Improvement Roadmap

**Based on retrieval recall analysis, generation quality is the primary bottleneck.**

### Phase 1: Quick Wins (2-3 hours, Expected: 53% ‚Üí 70-75%)

**1. Upgrade to GPT-4** (HIGHEST IMPACT)
```python
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
```
- **Impact:** +15-20% accuracy (addresses generation bottleneck)
- **Cost:** $10-20 additional for evaluation
- **Rationale:** 86.7% retrieval recall but only 57.7% accuracy when retrieval succeeds
- **Target:** Would fix Q6, Q7, Q11-Q15 (generation failures with good chunks)

**2. Add Re-ranking**
```python
# Retrieve k=6 chunks, re-rank to best 4
reranked_docs = rerank_with_llm(query, docs)
```
- **Impact:** +5-10% accuracy
- **Cost:** Minimal (~$1 for evaluation)
- **Target:** Would improve Q12, Q14 (better chunk prioritization)

**3. Adaptive k Values**
```python
if question_type == "comparative": k = 6
elif question_type == "trend": k = 5
```
- **Impact:** +3-5% accuracy
- **Cost:** None
- **Target:** Would improve Q11, Q13, Q15 (more context for synthesis)

**Expected Result: 53% ‚Üí 70-75% accuracy**

### Phase 2: Address Retrieval Gaps (4-6 hours, Expected: +5-10%)

**4. Hybrid Search (BM25 + Vector)**
- Fix Q5 (BAC CET1 ratio) - retrieval failure
- Catch exact keyword matches
- **Impact:** +5% on factual questions

**5. Better Chunking Strategy**
- Semantic chunking (topic-based)
- Larger chunks (1000 chars)
- **Requires:** Complete re-ingestion

**Expected Result: 70-75% ‚Üí 80-85% accuracy**

### Phase 3: Advanced (Research Phase)

**6. Fine-tuned Embeddings**
- Train on financial documents
- Better domain understanding
- **Impact:** +5-10% overall

**7. Query Decomposition**
- Break complex questions into sub-queries
- Better for comparative/trend questions
- **Impact:** +10% on complex questions

**8. GraphRAG / Agentic RAG**
- Knowledge graph representation
- Multi-step reasoning
- **Impact:** +10-15% on comparative questions

**Expected Result: 80-85% ‚Üí 90%+ accuracy**

## Cost-Benefit Analysis

| Improvement | Time | Cost | Accuracy Gain | ROI | Priority |
|-------------|------|------|---------------|-----|----------|
| GPT-4 Upgrade | 5 min | $10-20 | +15-20% | High | **HIGHEST** |
| Re-ranking | 1-2 hrs | ~$1 | +5-10% | High | **HIGH** |
| Adaptive k | 30 min | $0 | +3-5% | High | **HIGH** |
| Hybrid Search | 4-6 hrs | $0 | +5% | Medium | Medium |
| Better Chunking | 3-4 hrs | $2 | +5-10% | Medium | Medium |
| Fine-tune Embeddings | 2-3 days | $50+ | +5-10% | Low | Low |
| GraphRAG | 1-2 weeks | $100+ | +10-15% | Low | Research |

**Recommended Path:** Phase 1 (GPT-4 + Re-ranking + Adaptive k) ‚Üí Validate ‚Üí Phase 2 if needed

## Architecture
```
User Question
    ‚Üì
[Embedding Model] ‚Üí Query Vector
    ‚Üì
[ChromaDB Vector Store] ‚Üí Retrieve Top 4 Chunks
    ‚Üì
[GPT-3.5-turbo] ‚Üí Generate Answer
    ‚Üì
Answer + Sources
```

### Technology Stack

- **LLM:** OpenAI GPT-3.5-turbo (temperature=0)
- **Embeddings:** OpenAI text-embedding-ada-002
- **Vector Store:** ChromaDB (local, persistent)
- **Framework:** LangChain 1.2+
- **Language:** Python 3.10+

### Design Decisions

| Decision | Rationale |
|----------|-----------|
| **Chunk Size: 800 chars** | Balances context vs precision; captures full paragraphs |
| **Overlap: 150 chars** | Prevents context loss at chunk boundaries |
| **k=4 retrieval** | Optimal balance; tested k=3,4,5 |
| **ChromaDB** | Free, local, built for LLM apps; easy deployment |
| **GPT-3.5** | Cost-effective ($3-5 total project cost) |

## Project Structure
```
financial-document-rag/
‚îú‚îÄ‚îÄ data/
‚îÇ   ‚îú‚îÄ‚îÄ raw_pdfs/          # 9 original SEC 10-K PDFs
‚îÇ   ‚îú‚îÄ‚îÄ processed/         # Processed data, test results
‚îÇ   ‚îî‚îÄ‚îÄ vectorstore/       # ChromaDB database (17K+ chunks)
‚îú‚îÄ‚îÄ src/
‚îÇ   ‚îú‚îÄ‚îÄ ingestion.py       # PDF loading & chunking
‚îÇ   ‚îî‚îÄ‚îÄ rag_pipeline.py    # Q&A system (if converted from notebook)
‚îú‚îÄ‚îÄ notebooks/
‚îÇ   ‚îú‚îÄ‚îÄ day2_rag_pipeline.ipynb    # RAG development
‚îÇ   ‚îî‚îÄ‚îÄ day3_evaluation.ipynb      # Evaluation & analysis
‚îú‚îÄ‚îÄ outputs/
‚îÇ   ‚îú‚îÄ‚îÄ evaluation_accuracy.png
‚îÇ   ‚îú‚îÄ‚îÄ question_performance.png
‚îÇ   ‚îî‚îÄ‚îÄ evaluation_report.md
‚îú‚îÄ‚îÄ requirements.txt
‚îú‚îÄ‚îÄ .env                   # API keys (not in repo)
‚îú‚îÄ‚îÄ .gitignore
‚îî‚îÄ‚îÄ README.md
```

## Installation

### Prerequisites
- Python 3.10+
- OpenAI API key ([get one here](https://platform.openai.com/api-keys))

### Setup

1. **Clone repository**
```bash
git clone https://github.com/Ahussein9817/financial-document-rag.git
cd financial-document-rag
```

2. **Create virtual environment**
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\\Scripts\\activate
```

3. **Install dependencies**
```bash
pip install -r requirements.txt
```

4. **Configure API key**
```bash
echo "OPENAI_API_KEY=your-key-here" > .env
```

5. **Run ingestion** (if vector store not included)
```bash
python src/ingestion.py
```

## Usage

### Interactive Q&A (Notebook)
```python
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Load system
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(persist_directory='data/vectorstore', embedding_function=embeddings)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Ask question
question = "What was JPMorgan's revenue in 2024?"
# ... (chain setup)
answer = rag_chain.invoke(question)
```

### Example Queries
```python
# Factual lookup
"What was Bank of America's net income in 2024?"

# Comparative
"Which bank had higher revenue: JPMorgan or Citigroup?"

# Trend analysis
"How has JPMorgan's revenue changed from 2023 to 2024?"

# Filtered search
ask_with_filter("What was the revenue?", ticker='JPM', year=2024)
```

## Evaluation

### Test Set
- 15 questions with ground truth
- 10 factual, 3 comparative, 2 trend-based
- Manually verified against source PDFs

### Strengths
- Accurate factual lookups (single company/year)  
- Correct source attribution  
- Fast response time (<2 seconds)

### Limitations
- Cross-document comparisons challenging  
- Multi-year trend analysis incomplete  
- Numerical precision issues (unit conversions)

See [evaluation_report.md](outputs/evaluation_report.md) for details.

## Cost Analysis

**Total project cost:** ~$3-5 USD

- Embeddings (17,300 chunks): ~$2.50
- LLM queries (~50 questions): ~$0.50
- Evaluation (15 questions): ~$0.20

**Per-query cost:** ~$0.01

## Future Improvements

### Short-term
- Increase k for comparative questions
- Add numerical normalization
- Better error handling

### Long-term
- Query routing (factual vs comparative)
- Hybrid retrieval (dense + sparse)
- Fine-tuned embeddings
- Multi-document synthesis

## Known Issues

1. **Comparative questions:** System struggles to synthesize across banks
2. **Numerical formats:** $M vs $B vs $T inconsistencies
3. **Multi-year trends:** Limited context span across years

## License

MIT License - see [LICENSE](LICENSE) file

## Author

**Amina Hussein**
- GitHub: [@Ahussein9817](https://github.com/Ahussein9817)
- Project: Financial Document RAG System
- Date: February 2026

## Acknowledgments

- SEC Edgar database for 10-K filings
- LangChain community
- OpenAI API

---

**Built with:** Python ‚Ä¢ LangChain ‚Ä¢ ChromaDB ‚Ä¢ OpenAI GPT-3.5
"""

with open('../README.md', 'w') as f:
    f.write(readme_content)

print("README.md created!")
print(f"   Location: ../README.md")
print(f"   Length: {len(readme_content)} characters")


README.md created!
   Location: ../README.md
   Length: 10425 characters


In [18]:
"""
Generate README.md content
"""

readme_content = """# Financial Document RAG System

Retrieval-Augmented Generation (RAG) system for querying SEC 10-K filings from major US banks.

[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
[![LangChain](https://img.shields.io/badge/LangChain-1.2+-green.svg)](https://github.com/langchain-ai/langchain)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

## üéØ Overview

This project implements a Retrieval-Augmented Generation (RAG) system that enables natural language querying of financial documents. The system processes 9 SEC 10-K filings from JPMorgan Chase, Bank of America, and Citigroup (fiscal years 2022-2025), creating a semantic search engine powered by large language models.

**Key Features:**
- üìÑ Processes 9 financial PDFs (~17,300 document chunks)
- üîç Semantic search using OpenAI embeddings
- üí¨ Natural language Q&A with GPT-3.5-turbo
- üìä Source citation for all answers
- üéØ Metadata filtering by company and year
- ‚ö° Sub-2-second response time

## üìä Performance

**Evaluation Results** (15-question test set):
- **Overall Accuracy:** 53.3%
- **Factual Questions:** 60% accuracy
- **Comparative Questions:** 25% accuracy
- **Trend Questions:** 25% accuracy

## üèóÔ∏è Architecture
```
User Question
    ‚Üì
[Embedding Model] ‚Üí Query Vector
    ‚Üì
[ChromaDB Vector Store] ‚Üí Retrieve Top 4 Chunks
    ‚Üì
[GPT-3.5-turbo] ‚Üí Generate Answer
    ‚Üì
Answer + Sources
```

### Technology Stack

- **LLM:** OpenAI GPT-3.5-turbo (temperature=0)
- **Embeddings:** OpenAI text-embedding-ada-002
- **Vector Store:** ChromaDB (local, persistent)
- **Framework:** LangChain 1.2+
- **Language:** Python 3.10+

### Design Decisions

| Decision | Rationale |
|----------|-----------|
| **Chunk Size: 800 chars** | Balances context vs precision; captures full paragraphs |
| **Overlap: 150 chars** | Prevents context loss at chunk boundaries |
| **k=4 retrieval** | Optimal balance; tested k=3,4,5 |
| **ChromaDB** | Free, local, built for LLM apps; easy deployment |
| **GPT-3.5** | Cost-effective ($3-5 total project cost) |

## üìÅ Project Structure
```
financial-document-rag/
‚îú‚îÄ‚îÄ data/
‚îÇ   ‚îú‚îÄ‚îÄ raw_pdfs/          # 9 original SEC 10-K PDFs
‚îÇ   ‚îú‚îÄ‚îÄ processed/         # Processed data, test results
‚îÇ   ‚îî‚îÄ‚îÄ vectorstore/       # ChromaDB database (17K+ chunks)
‚îú‚îÄ‚îÄ src/
‚îÇ   ‚îú‚îÄ‚îÄ ingestion.py       # PDF loading & chunking
‚îÇ   ‚îî‚îÄ‚îÄ rag_pipeline.py    # Q&A system (if converted from notebook)
‚îú‚îÄ‚îÄ notebooks/
‚îÇ   ‚îú‚îÄ‚îÄ day2_rag_pipeline.ipynb    # RAG development
‚îÇ   ‚îî‚îÄ‚îÄ day3_evaluation.ipynb      # Evaluation & analysis
‚îú‚îÄ‚îÄ outputs/
‚îÇ   ‚îú‚îÄ‚îÄ evaluation_accuracy.png
‚îÇ   ‚îú‚îÄ‚îÄ question_performance.png
‚îÇ   ‚îî‚îÄ‚îÄ evaluation_report.md
‚îú‚îÄ‚îÄ requirements.txt
‚îú‚îÄ‚îÄ .env                   # API keys (not in repo)
‚îú‚îÄ‚îÄ .gitignore
‚îî‚îÄ‚îÄ README.md
```

## üöÄ Installation

### Prerequisites
- Python 3.10+
- OpenAI API key ([get one here](https://platform.openai.com/api-keys))

### Setup

1. **Clone repository**
```bash
git clone https://github.com/Ahussein9817/financial-document-rag.git
cd financial-document-rag
```

2. **Create virtual environment**
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\\Scripts\\activate
```

3. **Install dependencies**
```bash
pip install -r requirements.txt
```

4. **Configure API key**
```bash
echo "OPENAI_API_KEY=your-key-here" > .env
```

5. **Run ingestion** (if vector store not included)
```bash
python src/ingestion.py
```

## üí° Usage

### Interactive Q&A (Notebook)
```python
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Load system
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(persist_directory='data/vectorstore', embedding_function=embeddings)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Ask question
question = "What was JPMorgan's revenue in 2024?"
# ... (chain setup)
answer = rag_chain.invoke(question)
```

### Example Queries
```python
# Factual lookup
"What was Bank of America's net income in 2024?"

# Comparative
"Which bank had higher revenue: JPMorgan or Citigroup?"

# Trend analysis
"How has JPMorgan's revenue changed from 2023 to 2024?"

# Filtered search
ask_with_filter("What was the revenue?", ticker='JPM', year=2024)
```

## üìà Evaluation

### Test Set
- 15 questions with ground truth
- 10 factual, 3 comparative, 2 trend-based
- Manually verified against source PDFs

### Strengths
‚úÖ Accurate factual lookups (single company/year)  
‚úÖ Correct source attribution  
‚úÖ Fast response time (<2 seconds)

### Limitations
‚ùå Cross-document comparisons challenging  
‚ùå Multi-year trend analysis incomplete  
‚ùå Numerical precision issues (unit conversions)

See [evaluation_report.md](outputs/evaluation_report.md) for details.

## üí∞ Cost Analysis

**Total project cost:** ~$3-5 USD

- Embeddings (17,300 chunks): ~$2.50
- LLM queries (~50 questions): ~$0.50
- Evaluation (15 questions): ~$0.20

**Per-query cost:** ~$0.01

## üîÆ Future Improvements

### Short-term
- [ ] Increase k for comparative questions
- [ ] Add numerical normalization
- [ ] Better error handling

### Long-term
- [ ] Query routing (factual vs comparative)
- [ ] Hybrid retrieval (dense + sparse)
- [ ] Fine-tuned embeddings
- [ ] Multi-document synthesis

## üêõ Known Issues

1. **Comparative questions:** System struggles to synthesize across banks
2. **Numerical formats:** $M vs $B vs $T inconsistencies
3. **Multi-year trends:** Limited context span across years

## üìù License

MIT License - see [LICENSE](LICENSE) file

## üë§ Author

**Amina Hussein**
- GitHub: [@Ahussein9817](https://github.com/Ahussein9817)
- Project: Financial Document RAG System
- Date: February 2026

## üôè Acknowledgments

- SEC Edgar database for 10-K filings
- LangChain community
- OpenAI API

---

**Built with:** Python ‚Ä¢ LangChain ‚Ä¢ ChromaDB ‚Ä¢ OpenAI GPT-3.5
"""

with open('../README.md', 'w') as f:
    f.write(readme_content)

print("‚úì README.md created!")
print(f"   Location: ../README.md")
print(f"   Length: {len(readme_content)} characters")

‚úì README.md created!
   Location: ../README.md
   Length: 5867 characters


In [25]:
"""
RETRIEVAL RECALL ANALYSIS
Measure: Did the retriever find the right source document?
"""

print("="*60)
print("RETRIEVAL RECALL ANALYSIS")
print("="*60)

def calculate_retrieval_recall_for_question(question, expected_source):
    """
    Check if the expected source document was in the retrieved chunks
    """
    # Get retrieved documents
    docs = retriever.invoke(question)
    retrieved_sources = [doc.metadata.get('source_file') for doc in docs]
    
    # Check if expected source is in retrieved docs
    # Handle multiple expected sources (e.g., "JPM_10K_2024.pdf, BAC_10K_2024.pdf")
    expected_files = [s.strip() for s in expected_source.split(',')]
    
    found = any(exp in retrieved_sources for exp in expected_files)
    
    return {
        'found': found,
        'expected': expected_source,
        'retrieved': retrieved_sources
    }

# Calculate recall for all questions
recall_results = []

for r in results:
    recall_info = calculate_retrieval_recall_for_question(
        r['question'], 
        r['expected_source']
    )
    recall_results.append({
        'question_id': r['id'],
        'question': r['question'],
        'category': r['category'],
        'recall_success': recall_info['found'],
        'expected_source': recall_info['expected'],
        'retrieved_sources': recall_info['retrieved'],
        'answer_score': r['score']
    })

# Convert to DataFrame
df_recall = pd.DataFrame(recall_results)

# Calculate metrics
overall_recall = df_recall['recall_success'].mean() * 100

print(f"\nüìä RETRIEVAL RECALL METRICS")
print(f"="*60)
print(f"\nOverall Recall@4: {overall_recall:.1f}%")
print(f"  ({df_recall['recall_success'].sum()}/{len(df_recall)} questions had correct source retrieved)")

# Recall by category
recall_by_category = df_recall.groupby('category')['recall_success'].agg(['mean', 'count'])
recall_by_category['mean'] = recall_by_category['mean'] * 100
recall_by_category.columns = ['Recall (%)', 'Count']

print(f"\nüìà Recall by Category:")
print(recall_by_category)

# CRITICAL INSIGHT: Accuracy conditional on retrieval
print(f"\nüîç DIAGNOSTIC ANALYSIS")
print(f"="*60)

# Questions with good retrieval
good_retrieval = df_recall[df_recall['recall_success'] == True]
bad_retrieval = df_recall[df_recall['recall_success'] == False]

if len(good_retrieval) > 0:
    accuracy_with_good_retrieval = good_retrieval['answer_score'].mean() * 100
    print(f"\nWhen retrieval SUCCEEDS (correct source found):")
    print(f"  Answer accuracy: {accuracy_with_good_retrieval:.1f}%")
    print(f"  Questions: {len(good_retrieval)}")
else:
    print(f"\nNo questions had successful retrieval!")

if len(bad_retrieval) > 0:
    accuracy_with_bad_retrieval = bad_retrieval['answer_score'].mean() * 100
    print(f"\nWhen retrieval FAILS (correct source NOT found):")
    print(f"  Answer accuracy: {accuracy_with_bad_retrieval:.1f}%")
    print(f"  Questions: {len(bad_retrieval)}")

# Show retrieval failures
print(f"\n‚ùå RETRIEVAL FAILURES ({len(bad_retrieval)} questions):")
print(f"="*60)

for idx, row in bad_retrieval.iterrows():
    print(f"\nQ{row['question_id']}: {row['question'][:60]}...")
    print(f"  Expected: {row['expected_source']}")
    print(f"  Retrieved: {row['retrieved_sources']}")
    print(f"  Answer Score: {row['answer_score']}")

# Key insight
print(f"\nüí° KEY INSIGHT:")
print(f"="*60)

if overall_recall < 70:
    print("‚ö†Ô∏è  Retrieval recall is LOW (<70%)")
    print("‚Üí PRIORITY: Fix retrieval/chunking/embeddings FIRST")
    print("‚Üí Improving LLM won't help if chunks are wrong!")
elif overall_recall >= 70 and overall_accuracy < 70:
    print("‚úì Retrieval recall is GOOD (‚â•70%)")
    print("‚ö†Ô∏è  But accuracy is still low")
    print("‚Üí PRIORITY: Fix generation (better prompts, GPT-4, re-ranking)")
else:
    print("‚úì Both retrieval and generation performing reasonably well")
    print("‚Üí Focus on edge cases and advanced techniques")

# Save recall analysis
df_recall.to_csv('../data/processed/retrieval_recall_analysis.csv', index=False)
print(f"\n‚úì Recall analysis saved: data/processed/retrieval_recall_analysis.csv")

RETRIEVAL RECALL ANALYSIS

üìä RETRIEVAL RECALL METRICS

Overall Recall@4: 86.7%
  (13/15 questions had correct source retrieved)

üìà Recall by Category:
             Recall (%)  Count
category                      
comparative   66.666667      3
factual       90.000000     10
trend        100.000000      2

üîç DIAGNOSTIC ANALYSIS

When retrieval SUCCEEDS (correct source found):
  Answer accuracy: 57.7%
  Questions: 13

When retrieval FAILS (correct source NOT found):
  Answer accuracy: 25.0%
  Questions: 2

‚ùå RETRIEVAL FAILURES (2 questions):

Q5: What is Bank of America CET1 capital ratio in 2024?...
  Expected: BAC_10K_2024.pdf
  Retrieved: ['JPM_10K_2025.pdf', 'C_10K_2023.pdf', 'C_10K_2024.pdf', 'C_10K_2022.pdf']
  Answer Score: 0.0

Q11: Which bank had the highest net income in 2024: JPMorgan, Ban...
  Expected: Multiple: JPM_10K_2024, BAC_10K_2024, C_10K_2024
  Retrieved: ['JPM_10K_2024.pdf', 'JPM_10K_2023.pdf', 'C_10K_2022.pdf', 'JPM_10K_2024.pdf']
  Answer Score: 0.5

ü