# Papers QA: Production Pipeline

This notebook walks through the complete Papers QA pipeline end to end, from loading raw documents to evaluating generated answers, using the `papers_qa` package's public API.

## Sections:
1. Setup & Configuration
2. Data Loading & Processing
3. Embedding & Indexing
4. QA Generation
5. Retrieval & Inference
6. Evaluation

## 1. Setup & Configuration

In [None]:
import warnings
# Ignore every warning category for the rest of the session, presumably to keep
# notebook output uncluttered. NOTE(review): this also hides deprecation
# warnings raised by dependencies — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Import from papers_qa
from papers_qa import (
    DataLoader,
    DataProcessor,
    RetrieverPipeline,
    QAGenerator,
    QAEvaluator,
    configure_logging,
    get_logger,
    get_settings,
)
from pathlib import Path
import pandas as pd
import json

# Initialise logging and grab a logger named after this module
configure_logging()
logger = get_logger(__name__)

# Load the settings object that the later cells read their paths and options from
settings = get_settings()

# Echo the active configuration as a quick sanity check
print(f"Environment: {settings.environment}")
print(f"Log Level: {settings.log_level}")
model_settings = settings.model
print(f"Embedding Model: {model_settings.embedding_model}")
print(f"Generation Model: {model_settings.generation_model}")

## 2. Data Loading & Processing

In [None]:
# Build the loader and the text processor reused by the following cells
loader = DataLoader()
processor = DataProcessor()

# Read the documents found under settings.data.input_dir
documents = loader.load_documents(settings.data.input_dir)
print(f"Loaded {len(documents)} documents")

# Preview the first document: pretty-printed JSON capped at 500 characters.
# default=str lets json.dumps stringify values it cannot serialise natively.
if documents:
    print("\nFirst document structure:")
    first_doc_json = json.dumps(documents[0], indent=2, default=str)
    print(first_doc_json[:500])

In [None]:
# Extract plain text from every document. Dict documents go through the
# processor's extractor; anything else is coerced with str().
texts = [
    processor.extract_text_from_doc(doc) if isinstance(doc, dict) else str(doc)
    for doc in documents
]

# Whitespace-delimited word counts, computed once and reused for every
# statistic below instead of re-splitting each text three times.
word_counts = [len(text.split()) for text in texts]

# Show statistics
print(f"Total documents: {len(texts)}")
if word_counts:
    print(f"Average text length: {sum(word_counts) / len(word_counts):.0f} words")
    print(f"Min/Max length: {min(word_counts)}/{max(word_counts)} words")
else:
    # Without this guard an empty corpus raises ZeroDivisionError on the
    # average and ValueError on min()/max().
    print("No documents loaded; skipping length statistics")

In [None]:
# Break every text into chunks using the configured size and overlap,
# flattening the per-text results into a single list
chunks = [
    piece
    for text in texts
    for piece in processor.split_text(
        text,
        chunk_size=settings.data.chunk_size,
        overlap=settings.data.chunk_overlap,
    )
]

print(f"Total chunks after splitting: {len(chunks)}")
# Report the word count of the first chunk when there is one
if chunks:
    print(f"Sample chunk length: {len(chunks[0].split())} words")
else:
    print("No chunks")

## 3. Embedding & Indexing

In [None]:
# Create the retrieval pipeline that will hold the index
retriever = RetrieverPipeline()

# Feed every chunk (not the whole documents) into the index
chunk_total = len(chunks)
print(f"Indexing {chunk_total} chunks...")
retriever.index_documents(chunks)
print("Indexing complete!")

# Persist the index. save() is called without a path, so the message below
# assumes it writes under settings.data.cache_dir — TODO confirm.
print("Saving index...")
retriever.save()
print(f"Index saved to {settings.data.cache_dir}")

## 4. QA Generation

In [None]:
# Initialize QA generator
generator = QAGenerator()

# Generate QA pairs from the first 3 chunks as an example.
# Invariant: qa_pairs_list[i] always holds the pairs for chunks[i]. The dataset
# cell relies on that position to look up the context and chunk_id, so a failed
# chunk must still occupy its slot (as an empty list) rather than be skipped.
qa_pairs_list = []
for i, chunk in enumerate(chunks[:3]):
    print(f"Generating QA pairs for chunk {i+1}...")
    try:
        qa_pairs = generator.generate_qa_pairs(chunk)
        print(f"Generated {len(qa_pairs)} QA pairs")
    except Exception as e:
        # Best-effort demo: log the failure and continue with the next chunk.
        logger.error(f"Error generating QA pairs: {e}")
        # Empty placeholder keeps list positions aligned with chunk indices.
        qa_pairs = []
    qa_pairs_list.append(qa_pairs)

In [None]:
# Create QA dataset: one row per generated pair, tagged with the chunk it
# came from. Missing 'question'/'answer' keys fall back to an empty string.
qa_columns = ['question', 'answer', 'context', 'chunk_id']
qa_records = [
    {
        'question': qa_pair.get('question', ''),
        'answer': qa_pair.get('answer', ''),
        'context': chunks[chunk_idx],
        'chunk_id': chunk_idx,
    }
    for chunk_idx, qa_pairs in enumerate(qa_pairs_list)
    for qa_pair in qa_pairs
]

# Passing the columns explicitly keeps the schema stable even when no pairs
# were generated; otherwise the empty frame has no columns and the CSV saved
# from it has no header row.
qa_df = pd.DataFrame(qa_records, columns=qa_columns)

print(f"\nGenerated QA Dataset:")
print(f"Total QA pairs: {len(qa_df)}")
print(f"\nDataset preview:")
print(qa_df.head())

In [None]:
# Write the QA pairs to a CSV file inside the configured output directory
output_path = settings.data.output_dir / 'generated_qa_pairs.csv'

# Make sure the destination folder exists before writing
destination_dir = output_path.parent
destination_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file
qa_df.to_csv(output_path, index=False)
print(f"QA dataset saved to {output_path}")

## 5. Retrieval & Inference

In [None]:
# Start from a fresh pipeline object and restore the previously saved index
# into it, replacing the retriever built in the indexing cell
retriever = RetrieverPipeline()
retriever.load()
print("Index loaded successfully")

In [None]:
# Sample questions used to exercise the retriever
test_queries = [
    "What is the main topic of this paper?",
    "What are the key findings?",
    "What methods were used?",
]

separator = "=" * 80

for query in test_queries:
    print(f"\nQuery: {query}")
    print(separator)

    # Ask for the top 3 matches; each result unpacks to a (document, score) pair
    results = retriever.retrieve(query, k=3)

    for rank, (passage, similarity) in enumerate(results, start=1):
        print(f"\n[{rank}] Similarity Score: {similarity:.4f}")
        # Show only the first 200 characters of the matched text
        snippet = passage[:200]
        print(f"Document: {snippet}...")

## 6. Evaluation

In [None]:
# Initialize evaluator
evaluator = QAEvaluator()

# Example evaluation on the first generated QA pair (skipped when there are none)
if len(qa_df) > 0:
    reference_answer = qa_df.iloc[0]['answer']

    # For the demo the hypothesis is the reference cut to 100 characters plus
    # an ellipsis. Answers of 100 characters or fewer are used unchanged, so in
    # that case the hypothesis is identical to the reference.
    is_truncated = len(reference_answer) > 100
    hypothesis_answer = reference_answer[:100] + "..." if is_truncated else reference_answer

    # Only show the ellipsis when the preview really was shortened; the
    # unconditional "..." previously implied truncation for short answers too.
    reference_preview = reference_answer[:100] + ("..." if is_truncated else "")

    print(f"Reference: {reference_preview}")
    print(f"\nHypothesis: {hypothesis_answer}")
    print("\n" + "="*80)

    metrics = evaluator.evaluate_answer(reference_answer, hypothesis_answer)

    print("\nEvaluation Metrics:")
    for key, value in metrics.items():
        # Non-numeric entries are skipped because they cannot take the .4f format
        if isinstance(value, (int, float)):
            print(f"  {key}: {value:.4f}")

In [None]:
# Batch evaluation: only runs when the dataset holds at least two QA pairs
if len(qa_df) > 1:
    from papers_qa import BatchEvaluator

    batch_eval = BatchEvaluator()

    # Take up to the first five reference answers
    references = qa_df['answer'].head(5).tolist()
    # Demo predictions: each reference cut to at most 80 characters
    # (slicing leaves shorter answers untouched)
    predictions = [answer[:80] for answer in references]

    batch_metrics = batch_eval.evaluate_qa_pairs(references, predictions)

    print("\nBatch Evaluation Results:")
    for metric_name, metric_value in batch_metrics.items():
        # Skip anything that is not numeric; it cannot be formatted with .4f
        if not isinstance(metric_value, (int, float)):
            continue
        print(f"  {metric_name}: {metric_value:.4f}")

## Summary

This notebook demonstrates the complete Papers QA pipeline:

1. **Setup**: Configuration management with Pydantic
2. **Data Loading**: Load and process documents
3. **Preprocessing**: Text cleaning and chunking
4. **Embeddings**: Create vector index with FAISS
5. **QA Generation**: Generate question-answer pairs
6. **Retrieval**: Retrieve relevant documents
7. **Evaluation**: Comprehensive metrics (BLEU, ROUGE, semantic similarity)

### Next Steps:
- Use CLI for production workflows: `papers-qa generate --input data/raw`
- Deploy with Docker: `docker-compose up`
- Run tests: `pytest tests/`
- See SETUP_GUIDE.md for more details