In [1]:
# Install the notebook's dependencies with pip, invoked through the interpreter
# that backs this kernel (`sys.executable`) rather than whichever `pip` is on PATH.
# NOTE(review): no versions are pinned here; consider pinning for reproducibility.
import sys
!{sys.executable} -m pip install langchain langchain-community langchain-openai langchain-text-splitters openai chromadb pypdf python-dotenv pandas --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/alihussein/venv/bin/python -m pip install --upgrade pip[0m


In [4]:
# Print pip's metadata for the installed langchain package (version, location,
# dependencies), using the kernel's own interpreter.
import sys
!{sys.executable} -m pip show langchain

Name: langchain
Version: 1.2.10
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: /Users/alihussein/venv/lib/python3.10/site-packages
Requires: langchain-core, langgraph, pydantic
Required-by: 


In [2]:
# Install langchain on its own, through the kernel's interpreter.
# NOTE(review): langchain is already listed in the first install cell, and this
# cell's execution count (In [2]) is lower than the cell above it (In [4]) —
# it looks like a leftover from out-of-order execution; confirm before removing.
import sys
!{sys.executable} -m pip install langchain --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/alihussein/venv/bin/python -m pip install --upgrade pip[0m


In [5]:
"""
Day 2: RAG Pipeline Development
Using modern LangChain 1.x approach
"""

import sys
# Add the parent directory to the module search path.
# presumably so project-local modules one level up can be imported — TODO confirm.
sys.path.append('..')

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')

# Load variables from a .env file into the process environment.
# presumably this provides the OpenAI credentials used by the clients below — verify.
load_dotenv()

# Confirmation banner plus the wall-clock time (12-hour format) at which the cell ran.
print("‚úì Imports complete!")
print(f"Current time: {datetime.now().strftime('%I:%M %p')}")

‚úì Imports complete!
Current time: 08:37 PM


In [6]:
"""
Load the vector store we created yesterday
"""

print("Loading vector store...")

# Re-open the persisted Chroma collection with the OpenAI embedding function.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory='../data/vectorstore',
)

# Smoke test: run one small similarity search, then report collection size
# and how many hits the test query produced.
test_results = vectorstore.similarity_search("revenue", k=2)
print("‚úì Vector store loaded!")
print("‚úì Total documents: {}".format(vectorstore._collection.count()))
print("‚úì Test query returned {} results".format(len(test_results)))

Loading vector store...
‚úì Vector store loaded!
‚úì Total documents: 17306
‚úì Test query returned 2 results


In [7]:
"""
Create RAG chain using modern LangChain approach
"""

# Chat model used to generate answers; temperature=0 is passed to the client.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Default retriever over the vector store: asks for the top 4 chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Prompt text with two placeholders: {context} (the formatted retrieved chunks)
# and {question} (the user's question). The wording instructs the model to
# answer only from the supplied context.
template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer:"""

# Turn the raw template string into a chat prompt object for use in the chain.
prompt = ChatPromptTemplate.from_template(template)

# Format documents function
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` is empty.
    """
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)

# Create RAG chain
# Inputs for the prompt: the question is passed through unchanged, while the
# context is produced by retrieving chunks and formatting them into one string.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> fill prompt -> call LLM -> plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

print("‚úì RAG chain created!")
print("  LLM: gpt-3.5-turbo")
print("  Retrieval: k=4 chunks")

‚úì RAG chain created!
  LLM: gpt-3.5-turbo
  Retrieval: k=4 chunks


In [8]:
"""
Test with one question
"""

question = "What was JPMorgan's total revenue in 2024?"
print("Question: {}\n".format(question))

# Run the full chain to get the answer text.
answer = rag_chain.invoke(question)
print("Answer: {}".format(answer))

# Query the retriever separately to list which chunks match the question.
docs = retriever.invoke(question)
print("\nSources:")
for i, doc in enumerate(docs):
    src_name = doc.metadata.get('source_file')
    src_year = doc.metadata.get('year')
    print(f"  [{i+1}] {src_name} (Year: {src_year})")

Question: What was JPMorgan's total revenue in 2024?

Answer: $224,532 million

Sources:
  [1] JPM_10K_2024.pdf (Year: 2024)
  [2] JPM_10K_2024.pdf (Year: 2024)
  [3] JPM_10K_2024.pdf (Year: 2024)
  [4] JPM_10K_2023.pdf (Year: 2023)


In [9]:
"""
Create helper function to ask questions easily
"""

def ask_question(question, k=4, verbose=True):
    """Answer a question with the RAG pipeline and report the chunks that were used.

    Retrieval runs exactly once per call: the retrieved chunks are formatted
    into the prompt context, and those same chunks are reported as sources.
    (Invoking the retriever a second time just to list sources would double
    the retrieval work and would not guarantee that the listed sources are the
    ones the answer was generated from.)

    Args:
        question: the question text.
        k: number of chunks to retrieve from the vector store.
        verbose: when True, print the question, answer and source list.

    Returns:
        dict with keys 'question', 'answer', 'sources' (one dict per retrieved
        chunk with 'file', 'ticker', 'year' taken from the chunk metadata) and 'k'.
    """
    # Retrieve the top-k chunks once.
    retriever_temp = vectorstore.as_retriever(search_kwargs={"k": k})
    docs = retriever_temp.invoke(question)

    # Generate the answer from exactly those chunks.
    answer_chain = prompt | llm | StrOutputParser()
    answer = answer_chain.invoke({
        "context": format_docs(docs),
        "question": question,
    })

    sources = [
        {
            'file': doc.metadata.get('source_file'),
            'ticker': doc.metadata.get('ticker'),
            'year': doc.metadata.get('year'),
        }
        for doc in docs
    ]

    if verbose:
        print(f"\n{'='*60}")
        print(f"Q: {question}")
        print(f"{'='*60}")
        print(f"\nA: {answer}\n")
        print("Sources:")
        for i, src in enumerate(sources):
            print(f"  [{i+1}] {src['file']} "
                  f"(Ticker: {src['ticker']}, "
                  f"Year: {src['year']})")

    return {
        'question': question,
        'answer': answer,
        'sources': sources,
        'k': k
    }

print("‚úì Helper function ready!")

‚úì Helper function ready!


In [10]:
"""
MORNING TASK: Test with 5 simple questions
"""

print("Testing 5 simple questions...\n")

# Five single-company questions used as a first sanity check.
simple_questions = [
    "What was JPMorgan's total revenue in 2024?",
    "What are Bank of America's main business segments?",
    "What risk factors did Citigroup identify in 2023?",
    "How did JPMorgan's net income change from 2023 to 2024?",
    "What is Bank of America's investment banking revenue?",
]

# Ask each question in order (verbose output) and keep every returned result dict.
simple_results = [ask_question(q) for q in simple_questions]

print("\n‚úì Tested {} simple questions!".format(len(simple_questions)))

Testing 5 simple questions...


Q: What was JPMorgan's total revenue in 2024?

A: $224,532 million

Sources:
  [1] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)
  [2] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)
  [3] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)
  [4] JPM_10K_2023.pdf (Ticker: JPM, Year: 2023)

Q: What are Bank of America's main business segments?

A: Bank of America's main business segments are Consumer Banking, Global Wealth & Investment Management (GWIM), Global Banking, and Global Markets.

Sources:
  [1] BAC_10K_2024.pdf (Ticker: BAC, Year: 2024)
  [2] BAC_10K_2022.pdf (Ticker: BAC, Year: 2022)
  [3] BAC_10K_2023.pdf (Ticker: BAC, Year: 2023)
  [4] BAC_10K_2023.pdf (Ticker: BAC, Year: 2023)

Q: What risk factors did Citigroup identify in 2023?

A: Citigroup identified risks related to the amount of leverage in the economy, stress in the property sector, adverse effects on financial markets, negative impacts on global economic growth rates, lower consumer confidence, incr

In [11]:
"""
MORNING TASK: Test with 15-20 diverse questions
Categories: Factual, Comparative, Trend-based, Qualitative
"""

diverse_questions = [
    # Factual (Revenue/Financial metrics)
    "What was Citigroup's total assets in 2023?",
    "What is JPMorgan's return on equity?",
    "What was Bank of America's net interest income in 2024?",

    # Comparative (Cross-company)
    "Compare JPMorgan and Bank of America's revenue growth",
    "Which bank had higher trading revenue in 2024?",

    # Trend (Year-over-year)
    "How has JPMorgan's revenue changed from 2023 to 2024?",
    "What trends appear in Citigroup's loan portfolio?",
    "How has Bank of America's efficiency ratio changed over time?",

    # Qualitative (Strategy, outlook)
    "What is JPMorgan's digital banking strategy?",
    "What are Bank of America's priorities for 2024?",
    "What challenges did Citigroup face in 2023?",

    # Specific segments
    "What is JPMorgan's consumer banking revenue?",
    "How did Bank of America's wealth management perform in 2024?",
    "What are Citigroup's main revenue sources?",

    # Additional
    "What is JPMorgan's employee count?",
    "What is Bank of America's capital ratio?",
    "What is Citigroup's dividend policy?"
]

print(f"Testing {len(diverse_questions)} diverse questions...")
print("(This will take 3-5 minutes)\n")

# Result dicts returned by ask_question, in the same order as diverse_questions.
diverse_results = []

for i, q in enumerate(diverse_questions, start=1):
    # Progress line: show at most 60 characters of the question, and add an
    # ellipsis only when the question was actually cut short.
    preview = q if len(q) <= 60 else q[:60] + "..."
    print(f"[{i}/{len(diverse_questions)}] {preview}")
    # verbose=False keeps the per-question answer/source printout out of the log.
    result = ask_question(q, verbose=False)
    diverse_results.append(result)

print(f"\n‚úì Tested {len(diverse_questions)} diverse questions!")

Testing 17 diverse questions...
(This will take 3-5 minutes)

[1/17] What was Citigroup's total assets in 2023?...
[2/17] What is JPMorgan's return on equity?...
[3/17] What was Bank of America's net interest income in 2024?...
[4/17] Compare JPMorgan and Bank of America's revenue growth...
[5/17] Which bank had higher trading revenue in 2024?...
[6/17] How has JPMorgan's revenue changed from 2023 to 2024?...
[7/17] What trends appear in Citigroup's loan portfolio?...
[8/17] How has Bank of America's efficiency ratio changed over time...
[9/17] What is JPMorgan's digital banking strategy?...
[10/17] What are Bank of America's priorities for 2024?...
[11/17] What challenges did Citigroup face in 2023?...
[12/17] What is JPMorgan's consumer banking revenue?...
[13/17] How did Bank of America's wealth management perform in 2024?...
[14/17] What are Citigroup's main revenue sources?...
[15/17] What is JPMorgan's employee count?...
[16/17] What is Bank of America's capital ratio?...
[17/17]

In [12]:
"""
MORNING TASK: Track results in spreadsheet format
"""

# Morning results: the simple batch followed by the diverse batch.
all_results = simple_results + diverse_results

# Build one summary row per question: a 100-char answer preview, the full
# answer and its length, the number of sources, and the top-ranked source's
# metadata (the string 'None' when there are no sources).
summary_rows = []
for r in all_results:
    full_answer = r['answer']
    if len(full_answer) > 100:
        short_answer = full_answer[:100] + '...'
    else:
        short_answer = full_answer
    top_source = r['sources'][0] if r['sources'] else None
    summary_rows.append({
        'Question': r['question'],
        'Answer': short_answer,
        'Answer_Full': full_answer,
        'Answer_Length': len(full_answer),
        'Num_Sources': len(r['sources']),
        'Primary_Source': top_source['file'] if top_source else 'None',
        'Primary_Ticker': top_source['ticker'] if top_source else 'None',
        'Primary_Year': top_source['year'] if top_source else 'None',
    })

df = pd.DataFrame(summary_rows)

# Headline numbers, then the first ten rows as the cell's displayed value.
print("Results Summary:")
print("  Total questions: {}".format(len(df)))
print("  Avg answer length: {:.0f} chars".format(df['Answer_Length'].mean()))
print("  Avg sources per answer: {:.1f}".format(df['Num_Sources'].mean()))
print("\nFirst 10 results:")
df.head(10)

Results Summary:
  Total questions: 22
  Avg answer length: 192 chars
  Avg sources per answer: 4.0

First 10 results:


Unnamed: 0,Question,Answer,Answer_Full,Answer_Length,Num_Sources,Primary_Source,Primary_Ticker,Primary_Year
0,What was JPMorgan's total revenue in 2024?,"$224,532 million","$224,532 million",16,4,JPM_10K_2024.pdf,JPM,2024
1,What are Bank of America's main business segme...,Bank of America's main business segments are C...,Bank of America's main business segments are C...,144,4,BAC_10K_2024.pdf,BAC,2024
2,What risk factors did Citigroup identify in 2023?,Citigroup identified risks related to the amou...,Citigroup identified risks related to the amou...,389,4,C_10K_2024.pdf,C,2024
3,How did JPMorgan's net income change from 2023...,JPMorgan's net income increased from $49.6 bil...,JPMorgan's net income increased from $49.6 bil...,84,4,JPM_10K_2023.pdf,JPM,2023
4,What is Bank of America's investment banking r...,Bank of America's investment banking revenue i...,Bank of America's investment banking revenue i...,71,4,JPM_10K_2023.pdf,JPM,2023
5,What was Citigroup's total assets in 2023?,"Total assets for Citigroup in 2023 were $2,416...","Total assets for Citigroup in 2023 were $2,416...",59,4,C_10K_2024.pdf,C,2024
6,What is JPMorgan's return on equity?,"The return on equity for JPMorgan is 32%, 32%,...","The return on equity for JPMorgan is 32%, 32%,...",85,4,JPM_10K_2024.pdf,JPM,2024
7,What was Bank of America's net interest income...,"$56,060 million","$56,060 million",15,4,BAC_10K_2024.pdf,BAC,2024
8,Compare JPMorgan and Bank of America's revenue...,JPMorgan's revenue growth from 2024 to 2025 wa...,JPMorgan's revenue growth from 2024 to 2025 wa...,141,4,JPM_10K_2024.pdf,JPM,2024
9,Which bank had higher trading revenue in 2024?,Bank of America had higher trading revenue in ...,Bank of America had higher trading revenue in ...,51,4,BAC_10K_2023.pdf,BAC,2023


In [13]:
"""
AFTERNOON: Optimization Session
First, let's establish our baseline (what we've been using)
"""

# Settings in effect for every run so far in this notebook.
baseline_config = dict(
    chunk_size=800,  # From yesterday's ingestion
    chunk_overlap=150,
    k=4,
)

print("Current Configuration (Baseline):")
print("  Chunk size: {chunk_size} chars".format(**baseline_config))
print("  Chunk overlap: {chunk_overlap} chars".format(**baseline_config))
print("  Retrieval k: {k} chunks".format(**baseline_config))
# Recap of the morning run: how many questions were asked and the mean answer length.
print("\nWe've tested {} questions with this config".format(len(all_results)))
print("Average answer length: {:.0f} chars".format(df['Answer_Length'].mean()))

Current Configuration (Baseline):
  Chunk size: 800 chars
  Chunk overlap: 150 chars
  Retrieval k: 4 chunks

We've tested 22 questions with this config
Average answer length: 192 chars


In [14]:
"""
Experiment 1: Try k=3, 4, 5 (keeping chunk_size=800)
Question: Does more context (higher k) improve answers?
"""

# Fixed question set reused for every k so the runs are comparable.
test_questions = [
    "What was JPMorgan's total revenue in 2024?",
    "Compare JPMorgan and Bank of America's revenue growth",
    "What is Bank of America's investment banking revenue?"
]

# Retrieval depths to compare. Kept in one place so the loop and the summary
# line below cannot drift apart.
k_values = [3, 4, 5]

# One record per (k, question) pair.
k_experiments = []

for k_val in k_values:
    print(f"\n{'='*60}")
    print(f"Testing k={k_val}")
    print(f"{'='*60}")

    for q in test_questions:
        result = ask_question(q, k=k_val, verbose=False)
        k_experiments.append({
            'k': k_val,
            'question': q,
            'answer': result['answer'],
            'answer_length': len(result['answer']),
            'num_sources': len(result['sources'])
        })
        # Show at most 50 characters of the question; add an ellipsis only
        # when the question was actually truncated.
        preview = q if len(q) <= 50 else q[:50] + "..."
        print(f"‚úì k={k_val}: {preview} ‚Üí {len(result['answer'])} chars")

print(f"\n‚úì Tested {len(test_questions)} questions √ó {len(k_values)} k values = {len(k_experiments)} total tests")


Testing k=3
‚úì k=3: What was JPMorgan's total revenue in 2024?... ‚Üí 16 chars
‚úì k=3: Compare JPMorgan and Bank of America's revenue gro... ‚Üí 226 chars
‚úì k=3: What is Bank of America's investment banking reven... ‚Üí 14 chars

Testing k=4
‚úì k=4: What was JPMorgan's total revenue in 2024?... ‚Üí 16 chars
‚úì k=4: Compare JPMorgan and Bank of America's revenue gro... ‚Üí 141 chars
‚úì k=4: What is Bank of America's investment banking reven... ‚Üí 14 chars

Testing k=5
‚úì k=5: What was JPMorgan's total revenue in 2024?... ‚Üí 16 chars
‚úì k=5: Compare JPMorgan and Bank of America's revenue gro... ‚Üí 116 chars
‚úì k=5: What is Bank of America's investment banking reven... ‚Üí 71 chars

‚úì Tested 3 questions √ó 3 k values = 9 total tests


In [15]:
"""
Compare k values - which performs best?
"""

# One row per (k, question) experiment record.
k_df = pd.DataFrame(k_experiments)

print("K Value Comparison:\n")
# Per-k mean and standard deviation of answer length, plus the source count
# of the first record in each group.
print(k_df.groupby('k').agg({
    'answer_length': ['mean', 'std'],
    'num_sources': 'first'
}).round(1))

print("\n" + "="*60)
print("Analysis:")
print("="*60)

# Iterate over the k values actually present in the experiment data instead of
# a hard-coded list, so this summary stays correct if the experiment cell is
# changed to test different k values (groupby returns keys in sorted order).
for k_val, avg_len in k_df.groupby('k')['answer_length'].mean().items():
    print(f"k={k_val}: Avg answer length = {avg_len:.0f} chars")

# NOTE(review): the recommendation below is fixed text, not derived from the
# numbers above; answer length by itself does not measure answer quality.
print("\nRecommendation:")
print("- k=3: Fastest, but might miss context")
print("- k=4: Good balance (current baseline)")
print("- k=5: Most context, but potentially more noise")
print("\n‚Üí We'll stick with k=4 unless you see issues")

K Value Comparison:

  answer_length        num_sources
           mean    std       first
k                                 
3          85.3  121.8           3
4          57.0   72.8           4
5          67.7   50.1           5

Analysis:
k=3: Avg answer length = 85 chars
k=4: Avg answer length = 57 chars
k=5: Avg answer length = 68 chars

Recommendation:
- k=3: Fastest, but might miss context
- k=4: Good balance (current baseline)
- k=5: Most context, but potentially more noise

‚Üí We'll stick with k=4 unless you see issues


In [16]:
"""
Understanding Chunk Sizes:
We created chunks yesterday with 800 chars. 
To test other sizes, we'd need to re-run ingestion.

But let's understand the tradeoffs:
"""

print("Chunk Size Tradeoffs:\n")

# Qualitative pros/cons per candidate chunk size (characters); only 800 was
# actually ingested, the other sizes are listed for discussion.
tradeoffs = {
    600: dict(
        pros=['More precise retrieval', 'Less noise per chunk'],
        cons=['Might split important context', 'Need more chunks (higher k)'],
    ),
    800: dict(
        pros=['Good balance', 'Captures full paragraphs', 'Current baseline'],
        cons=['Moderate - balanced tradeoffs'],
    ),
    1000: dict(
        pros=['More context per chunk', 'Better for complex queries'],
        cons=['Harder to find exact match', 'More noise'],
    ),
}

for size in tradeoffs:
    info = tradeoffs[size]
    pros_text = ', '.join(info['pros'])
    cons_text = ', '.join(info['cons'])
    print(f"\n{size} characters:")
    print(f"  ‚úÖ Pros: {pros_text}")
    print(f"  ‚ùå Cons: {cons_text}")

# Closing decision banner.
divider = "=" * 60
print("\n" + divider)
print("DECISION: Keep 800 chars (our current setting)")
print(divider)
print("Why? It's been working well (good answers, correct sources)")
print("Re-ingesting just to test would take 10+ minutes")
print("Our baseline is solid!")

Chunk Size Tradeoffs:


600 characters:
  ‚úÖ Pros: More precise retrieval, Less noise per chunk
  ‚ùå Cons: Might split important context, Need more chunks (higher k)

800 characters:
  ‚úÖ Pros: Good balance, Captures full paragraphs, Current baseline
  ‚ùå Cons: Moderate - balanced tradeoffs

1000 characters:
  ‚úÖ Pros: More context per chunk, Better for complex queries
  ‚ùå Cons: Harder to find exact match, More noise

DECISION: Keep 800 chars (our current setting)
Why? It's been working well (good answers, correct sources)
Re-ingesting just to test would take 10+ minutes
Our baseline is solid!


‚úì Metadata filtering function created!


In [19]:
"""
Experiment 2: Metadata Filtering (FIXED)
Force retrieval to only search specific companies/years
"""

def ask_with_filter(question, ticker=None, year=None, k=4, verbose=True):
    """Answer a question using only chunks whose metadata matches the given filters.

    Retrieval runs exactly once per call; the chunks used to build the prompt
    context are the same chunks reported as sources.

    Args:
        question: the question text.
        ticker: optional value to match against the chunk's 'ticker' metadata.
        year: optional year; compared as a string (`str(year)`) against the
            chunk's 'year' metadata.
        k: number of chunks to retrieve.
        verbose: when True, print the question (with a filter label), the
            answer and the source list.

    Returns:
        dict with keys 'question', 'answer', 'filters' (the filter dict passed
        to the retriever, or None), 'sources' (one dict per chunk with
        'file', 'ticker', 'year') and 'k'.
    """
    # One condition per supplied filter value.
    conditions = []
    if ticker:
        conditions.append({"ticker": ticker})
    if year:
        conditions.append({"year": str(year)})

    # A single condition is passed as-is; several are combined with $and.
    if not conditions:
        filter_dict = None
    elif len(conditions) == 1:
        filter_dict = conditions[0]
    else:
        filter_dict = {"$and": conditions}

    # Only include the "filter" key when there is something to filter on.
    search_kwargs = {"k": k}
    if filter_dict:
        search_kwargs["filter"] = filter_dict
    retriever_filtered = vectorstore.as_retriever(search_kwargs=search_kwargs)

    # Retrieve once, then generate the answer from exactly those chunks.
    docs = retriever_filtered.invoke(question)
    answer_chain = prompt | llm | StrOutputParser()
    answer = answer_chain.invoke({
        "context": format_docs(docs),
        "question": question,
    })

    sources = [{'file': doc.metadata.get('source_file'),
                'ticker': doc.metadata.get('ticker'),
                'year': doc.metadata.get('year')} for doc in docs]

    if verbose:
        # Join the active filters with single spaces so a ticker-only filter
        # renders as "[Filter: ticker=BAC]" with no stray space before "]".
        filter_parts = []
        if ticker:
            filter_parts.append(f"ticker={ticker}")
        if year:
            filter_parts.append(f"year={year}")
        filter_str = f" [Filter: {' '.join(filter_parts)}]" if filter_parts else ""

        print(f"\n{'='*60}")
        print(f"Q: {question}{filter_str}")
        print(f"{'='*60}")
        print(f"\nA: {answer}\n")
        print("Sources:")
        for i, src in enumerate(sources):
            print(f"  [{i+1}] {src['file']} "
                  f"(Ticker: {src['ticker']}, Year: {src['year']})")

    return {
        'question': question,
        'answer': answer,
        'filters': filter_dict,
        'sources': sources,
        'k': k
    }

print("‚úì Metadata filtering function created (FIXED)!")

‚úì Metadata filtering function created (FIXED)!


In [20]:
"""
Analysis: When should we use filtering?
"""

# Example query quoted in the summary below. No visible cell in this notebook
# defines `test_q`, so it is set here to keep the cell runnable on a fresh
# kernel (Restart & Run All) instead of depending on leftover kernel state.
test_q = "What was the total revenue in 2024?"

print("="*60)
print("FILTERING ANALYSIS")
print("="*60)

print("\n1Ô∏è‚É£ WITHOUT FILTERING:")
print("   ‚úÖ Good for: Generic questions, comparisons across banks")
print("   ‚ùå Bad for: Specific company queries (might return wrong bank)")
print(f"   Example: '{test_q}' returned data from multiple banks")

print("\n2Ô∏è‚É£ WITH FILTERING:")
print("   ‚úÖ Good for: Specific company/year questions")
print("   ‚úÖ Guarantees: Sources match your filter")
print("   ‚ùå Trade-off: Less flexible, can't compare across banks")
print(f"   Example: Same question + filter ‚Üí precise results per bank")

print("\n" + "="*60)
print("RECOMMENDATION:")
print("="*60)
print("- Use filtering when question mentions specific company/year")
print("- Use unfiltered for comparisons or general exploration")
print("- Could auto-detect: 'JPMorgan' in question ‚Üí add ticker=JPM filter")

FILTERING ANALYSIS

1Ô∏è‚É£ WITHOUT FILTERING:
   ‚úÖ Good for: Generic questions, comparisons across banks
   ‚ùå Bad for: Specific company queries (might return wrong bank)
   Example: 'What was the total revenue in 2024?' returned data from multiple banks

2Ô∏è‚É£ WITH FILTERING:
   ‚úÖ Good for: Specific company/year questions
   ‚úÖ Guarantees: Sources match your filter
   ‚ùå Trade-off: Less flexible, can't compare across banks
   Example: Same question + filter ‚Üí precise results per bank

RECOMMENDATION:
- Use filtering when question mentions specific company/year
- Use unfiltered for comparisons or general exploration
- Could auto-detect: 'JPMorgan' in question ‚Üí add ticker=JPM filter


In [21]:
"""
Smart filtering: Auto-detect company in question
"""

def smart_ask(question, k=4, verbose=True):
    """Ask a question, auto-detecting a company and/or year filter from its text.

    A filter is applied only when the question names exactly ONE company
    (respectively exactly ONE year in 2022-2025). Questions that mention
    several companies or several years (comparisons, year-over-year changes)
    are left unfiltered on that dimension, since restricting retrieval to the
    first match would hide the data needed to answer them.

    Company names are matched as whole words, case-insensitively, so short
    aliases such as 'bac' or 'citi' do not fire inside unrelated words
    (e.g. "back", "cities").

    Args:
        question: the question text.
        k: number of chunks to retrieve.
        verbose: when True, print the detected filters and the full answer.

    Returns:
        The dict returned by `ask_with_filter` for the detected filters.
    """
    import re  # local import: keeps this cell independent of the import cell

    # Company name mappings (lower-case alias -> ticker)
    company_map = {
        'jpmorgan': 'JPM',
        'jpmorganchase': 'JPM',
        'jpm': 'JPM',
        'jp morgan': 'JPM',
        'bank of america': 'BAC',
        'bofa': 'BAC',
        'bac': 'BAC',
        'citigroup': 'C',
        'citibank': 'C',
        'citi': 'C'
    }

    # Year detection (2022-2025): collect every distinct year mentioned. The
    # lookarounds stop a match inside a longer digit run (e.g. "120245") while
    # still accepting forms like "FY2024".
    years_found = {int(y) for y in re.findall(r"(?<!\d)(202[2-5])(?!\d)", question)}
    year_detected = years_found.pop() if len(years_found) == 1 else None

    # Ticker detection: collect every distinct ticker whose alias appears as a
    # whole word in the question.
    question_lower = question.lower()
    tickers_found = {
        ticker
        for name, ticker in company_map.items()
        if re.search(r"\b" + re.escape(name) + r"\b", question_lower)
    }
    ticker_detected = tickers_found.pop() if len(tickers_found) == 1 else None

    # Report what was detected before delegating
    if verbose and (ticker_detected or year_detected):
        print(f"üîç Auto-detected filters: ", end="")
        if ticker_detected:
            print(f"Ticker={ticker_detected} ", end="")
        if year_detected:
            print(f"Year={year_detected}", end="")
        print()

    return ask_with_filter(question, ticker=ticker_detected, year=year_detected, k=k, verbose=verbose)

# Try the auto-detection on three representative questions.
print("Testing smart filtering:\n")

test_questions_smart = [
    # company and year both named -> expect ticker=JPM, year=2024
    "What was JPMorgan's revenue in 2024?",
    # company only -> expect ticker=BAC
    "How did Bank of America perform?",
    # no specific company or year -> expect no filter
    "Compare all three banks",
]

for q in test_questions_smart:
    smart_ask(q, verbose=True)
    # Two newlines between questions (same as printing "\n" with the default end).
    print(end="\n\n")

Testing smart filtering:

üîç Auto-detected filters: Ticker=JPM Year=2024

Q: What was JPMorgan's revenue in 2024? [Filter: ticker=JPM year=2024]

A: JPMorgan's revenue in 2024 was $177,556 million.

Sources:
  [1] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)
  [2] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)
  [3] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)
  [4] JPM_10K_2024.pdf (Ticker: JPM, Year: 2024)


üîç Auto-detected filters: Ticker=BAC 

Q: How did Bank of America perform? [Filter: ticker=BAC ]

A: Bank of America's performance was positive, with revenue increasing by nine percent primarily driven by higher interest rates, as well as higher deposit and loan balances.

Sources:
  [1] BAC_10K_2023.pdf (Ticker: BAC, Year: 2023)
  [2] BAC_10K_2022.pdf (Ticker: BAC, Year: 2022)
  [3] BAC_10K_2022.pdf (Ticker: BAC, Year: 2022)
  [4] BAC_10K_2024.pdf (Ticker: BAC, Year: 2024)



Q: Compare all three banks

A: Based on the provided information, Bank of America, JPMorgan Chase, and the S

In [22]:
"""
Summary: Our optimal configuration
"""

# Configuration chosen after the afternoon experiments.
final_config = dict(
    chunk_size=800,
    chunk_overlap=150,
    k=4,
    filtering='smart (auto-detect company/year from question)',
)

banner = "=" * 60
print(banner)
print("üèÜ FINAL OPTIMIZED CONFIGURATION")
print(banner)

print("\nüì¶ Chunking:")
print("  ‚Ä¢ Size: {} chars".format(final_config['chunk_size']))
print("  ‚Ä¢ Overlap: {} chars".format(final_config['chunk_overlap']))
print("  ‚Ä¢ Rationale: Good balance, captures full context")

print("\nüîç Retrieval:")
print("  ‚Ä¢ k value: {} chunks".format(final_config['k']))
print("  ‚Ä¢ Rationale: Enough context without noise")

print("\nüéØ Filtering:")
print("  ‚Ä¢ Strategy: {}".format(final_config['filtering']))
print("  ‚Ä¢ Rationale: Improves precision for company-specific queries")

print("\n‚úÖ This configuration tested on 22+ questions with good results!")

# Persist the configuration as indented JSON.
with open('../data/processed/final_config.json', 'w') as f:
    f.write(json.dumps(final_config, indent=2))

print("\nüíæ Configuration saved to: data/processed/final_config.json")

üèÜ FINAL OPTIMIZED CONFIGURATION

üì¶ Chunking:
  ‚Ä¢ Size: 800 chars
  ‚Ä¢ Overlap: 150 chars
  ‚Ä¢ Rationale: Good balance, captures full context

üîç Retrieval:
  ‚Ä¢ k value: 4 chunks
  ‚Ä¢ Rationale: Enough context without noise

üéØ Filtering:
  ‚Ä¢ Strategy: smart (auto-detect company/year from question)
  ‚Ä¢ Rationale: Improves precision for company-specific queries

‚úÖ This configuration tested on 22+ questions with good results!

üíæ Configuration saved to: data/processed/final_config.json


In [23]:
"""
Save all afternoon experiment results
"""

# Write the k-value experiment table as CSV (without the index column).
k_df.to_csv('../data/processed/k_value_experiments.csv', index=False)

# Write one JSON file bundling the morning results, the k experiment records,
# the chosen configuration and the time of the save.
with open('../data/processed/all_test_results.json', 'w') as f:
    results_payload = {
        'baseline_results': all_results,
        'k_experiments': k_experiments,
        'config': final_config,
        'timestamp': datetime.now().isoformat(),
    }
    json.dump(results_payload, f, indent=2)

for saved_line in (
    "‚úÖ Results saved!",
    "  ‚Ä¢ k_value_experiments.csv",
    "  ‚Ä¢ all_test_results.json",
    "  ‚Ä¢ final_config.json",
):
    print(saved_line)

‚úÖ Results saved!
  ‚Ä¢ k_value_experiments.csv
  ‚Ä¢ all_test_results.json
  ‚Ä¢ final_config.json
