## Setup & Load Data

In [None]:
import json
from pathlib import Path

import pandas as pd

from scientific_rag.application.rag.pipeline import RAGPipeline
from scientific_rag.domain.documents import PaperChunk

In [None]:
# Relative path: it resolves against the notebook's current working directory.
chunks_file = Path("../data/processed/chunks_arxiv.json")

print(f"Loading chunks from {chunks_file}...")
with chunks_file.open(encoding="utf-8") as f:
    chunks_data = json.load(f)

# Every JSON record is expanded into PaperChunk keyword arguments.
chunks = [PaperChunk(**record) for record in chunks_data]
print(f"Loaded {len(chunks)} chunks")

In [None]:
# Build a single RAGPipeline over the loaded chunks; the test cells below
# all call `pipeline.run(...)` on this one instance.
pipeline = RAGPipeline(chunks=chunks)
print("Pipeline initialized!")

## Test Queries

In [None]:
# Sample questions shared by the test cells below; each cell picks one by index.
test_queries = [
    # [0] used by Tests 1, 2 and 6 and by the configuration comparison.
    "What methods are used for axoplasmic reticula annotation in neural EM data?",
    # [1] used by Tests 3 and 4.
    "Explain neural network architectures and training approaches",
    # [2] used by Test 5.
    "What are the applications of magnetoresistance in nanotechnology?",
]

## Test 1: Full Pipeline (All Components Enabled Except Self-Query)

In [None]:
# Baseline run: query expansion, BM25 + dense retrieval and reranking are all
# switched on; self-query filter extraction is left off in this cell.
query = test_queries[0]
print(f"Query: {query}\n")

full_run_options = {
    "use_self_query": False,
    "use_query_expansion": True,
    "use_bm25": True,
    "use_dense": True,
    "use_reranking": True,
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=query, **full_run_options)

# Answer first, then the run statistics.
print("ANSWER:")
print(response.answer)
print(f"Execution time: {response.execution_time:.2f}s")
print(f"Retrieved chunks: {len(response.retrieved_chunks)}")
print(f"Generated variations: {len(response.generated_query_variations)}")
print(f"Applied filters: {response.used_filters}")

## Test 2: Without Query Expansion

In [None]:
# Same query as Test 1, with query expansion switched off and every other
# option unchanged.
query = test_queries[0]
print(f"Query: {query}\n")

no_expansion_options = {
    "use_self_query": False,
    "use_query_expansion": False,  # Disabled
    "use_bm25": True,
    "use_dense": True,
    "use_reranking": True,
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=query, **no_expansion_options)

# Answer first, then the run statistics.
print("ANSWER:")
print(response.answer)
print(f"Execution time: {response.execution_time:.2f}s")
print(f"Retrieved chunks: {len(response.retrieved_chunks)}")
print(f"Generated variations: {len(response.generated_query_variations)}")

## Test 3: BM25 Only (No Dense Retrieval)

In [None]:
# Retrieval restricted to BM25: the dense retriever is switched off.
query = test_queries[1]
print(f"Query: {query}\n")

bm25_only_options = {
    "use_self_query": False,
    "use_query_expansion": True,
    "use_bm25": True,
    "use_dense": False,  # Disabled
    "use_reranking": True,
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=query, **bm25_only_options)

# Answer first, then the run statistics.
print("ANSWER:")
print(response.answer)
print(f"Execution time: {response.execution_time:.2f}s")
print(f"Retrieved chunks: {len(response.retrieved_chunks)}")
print(f"Applied filters: {response.used_filters}")

## Test 4: Dense Only (No BM25)

In [None]:
# Retrieval restricted to the dense retriever: BM25 is switched off.
query = test_queries[1]
print(f"Query: {query}\n")

dense_only_options = {
    "use_self_query": False,
    "use_query_expansion": True,
    "use_bm25": False,  # Disabled
    "use_dense": True,
    "use_reranking": True,
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=query, **dense_only_options)

# Answer first, then the run statistics.
print("ANSWER:")
print(response.answer)
print(f"Execution time: {response.execution_time:.2f}s")
print(f"Retrieved chunks: {len(response.retrieved_chunks)}")
print(f"Applied filters: {response.used_filters}")

## Test 5: Without Reranking

In [None]:
# Both retrievers stay on, but the reranking step is switched off.
query = test_queries[2]
print(f"Query: {query}\n")

no_rerank_options = {
    "use_self_query": False,
    "use_query_expansion": True,
    "use_bm25": True,
    "use_dense": True,
    "use_reranking": False,  # Disabled
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=query, **no_rerank_options)

# Answer first, then the run statistics.
print("ANSWER:")
print(response.answer)
print(f"Execution time: {response.execution_time:.2f}s")
print(f"Retrieved chunks: {len(response.retrieved_chunks)}")
print(f"Applied filters: {response.used_filters}")

## Test 6: Query with Section Filter

In [None]:
# Self-query is switched on for this run; the query text mentions "methods".
query = test_queries[0]  # Has "methods"
print(f"Query: {query}\n")

self_query_options = {
    "use_self_query": True,  # Should extract 'methods' filter
    "use_query_expansion": True,
    "use_bm25": True,
    "use_dense": True,
    "use_reranking": True,
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=query, **self_query_options)

# Only the run statistics are shown here; the answer text is not printed.
print(f"Execution time: {response.execution_time:.2f}s")
print(f"Retrieved chunks: {len(response.retrieved_chunks)}")
print(f"Applied filters: {response.used_filters}")

# List the query variations the pipeline reported, numbered from 1.
print("\nGenerated query variations:")
for position, variation in enumerate(response.generated_query_variations, start=1):
    print(f"  {position}. {variation}")

## Inspect Retrieved Chunks

In [None]:
# Show the chunks attached to the most recent `response`: one header line with
# score, section and paper id, then the first 150 characters of the text.
top_chunks = response.retrieved_chunks
print(f"\nTop {len(top_chunks)} retrieved chunks:\n")

for rank, hit in enumerate(top_chunks, start=1):
    print(f"[{rank}] Score: {hit.score:.4f} | Section: {hit.section.value} | Paper: {hit.paper_id}")
    print(f"Text: {hit.text[:150]}...")

## Compare Different Configurations

In [None]:
def _summarize_run(label, run_response):
    """Return one comparison-table row holding the headline numbers of a run."""
    return {
        "Configuration": label,
        "Execution Time (s)": round(run_response.execution_time, 2),
        "Chunks Retrieved": len(run_response.retrieved_chunks),
        "Query Variations": len(run_response.generated_query_variations),
        "Answer Length": len(run_response.answer),
    }


# Each entry toggles one part of the pipeline relative to "Full Pipeline".
configs = [
    {"name": "Full Pipeline", "use_bm25": True, "use_dense": True, "use_reranking": True, "use_expansion": True},
    {"name": "No Reranking", "use_bm25": True, "use_dense": True, "use_reranking": False, "use_expansion": True},
    {"name": "BM25 Only", "use_bm25": True, "use_dense": False, "use_reranking": True, "use_expansion": True},
    {"name": "Dense Only", "use_bm25": False, "use_dense": True, "use_reranking": True, "use_expansion": True},
    {"name": "No Expansion", "use_bm25": True, "use_dense": True, "use_reranking": True, "use_expansion": False},
]

test_query = test_queries[0]
results = []

print(f"Testing query: {test_query}\n")

for config in configs:
    print(f"Running: {config['name']}...")

    # Translate the config entry into pipeline.run keyword arguments; note
    # that the "use_expansion" key feeds the `use_query_expansion` parameter.
    toggles = {
        "use_query_expansion": config["use_expansion"],
        "use_bm25": config["use_bm25"],
        "use_dense": config["use_dense"],
        "use_reranking": config["use_reranking"],
    }
    response = pipeline.run(
        query=test_query,
        use_self_query=True,
        **toggles,
        retrieval_top_k=20,
        rerank_top_k=5,
    )

    results.append(_summarize_run(config["name"], response))

# One row per configuration, printed without the index column.
df = pd.DataFrame(results)
print("COMPARISON RESULTS")
print(df.to_string(index=False))

## Custom Query Test

In [None]:
# Free-form query run with every pipeline option switched on.
custom_query = "What are neural networks?"

custom_run_options = {
    "use_self_query": True,
    "use_query_expansion": True,
    "use_bm25": True,
    "use_dense": True,
    "use_reranking": True,
    "retrieval_top_k": 20,
    "rerank_top_k": 5,
}
response = pipeline.run(query=custom_query, **custom_run_options)

print(f"Query: {custom_query}\n")
print(response.answer)

# Single summary line: elapsed time, chunk count and applied filters.
stats_fields = [
    f"Stats: {response.execution_time:.2f}s",
    f"{len(response.retrieved_chunks)} chunks",
    f"Filters: {response.used_filters}",
]
print(" | ".join(stats_fields))