# 🎯 Strategy 3: Hybrid Approach (LlamaParse + AI Enhancement)

**Philosophy**: Combine the best of both worlds - use LlamaParse's sophisticated parsing capabilities, then enhance with custom AI processing for optimal academic structure.

## Optimization Areas:
- LlamaParse parameter optimization
- AI enhancement prompts
- Intelligent result merging
- Quality validation and filtering
- Multi-stage processing pipeline

## Available Papers:
- `30YearsResearchGate.pdf`
- `SchenkBekkerSchmitt2025PrecRes.pdf`

In [22]:
# Install required packages
!pip3 install llama_parse together pydantic

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [23]:
from llama_parse import LlamaParse
from together import Together
from pydantic import BaseModel, Field
from typing import Optional
import json
import re
import os

In [None]:
# API Keys
api_key = "llx-xxx"
tog_api = "xxx"
together = Together(api_key=tog_api)

In [25]:
# ⚡ OPTIMIZATION AREA: Enhanced Data Model
class ResearchChunk(BaseModel):
    section_type: str = Field(description="Type of section: abstract, introduction, methodology, results, discussion, conclusion, references, etc.")
    section_number: Optional[str] = Field(description="Section number if available (e.g., '2.1', '3.2')")
    page_number: int = Field(description="Page number of the chunk")
    content: str = Field(description="Parsed content of the chunk")
    is_figure_caption: bool = Field(default=False, description="Whether this chunk is a figure caption")
    is_table: bool = Field(default=False, description="Whether this chunk contains table data")
    
    # Enhanced fields for hybrid approach
    confidence_score: Optional[float] = Field(default=None, description="AI confidence in parsing accuracy")
    has_citations: bool = Field(default=False, description="Whether chunk contains citations")
    has_equations: bool = Field(default=False, description="Whether chunk contains mathematical equations")
    keywords: Optional[list[str]] = Field(default=None, description="Key academic terms found in chunk")
    subsection_title: Optional[str] = Field(default=None, description="Subsection title if identifiable")
    source_method: Optional[str] = Field(default=None, description="Method used to extract this chunk (llamaparse, ai_enhancement, merged)")

class ResearchPaper(BaseModel):
    title: Optional[str] = Field(description="Title of the research paper if identifiable")
    authors: Optional[str] = Field(description="Authors of the paper if identifiable")
    chunks: list[ResearchChunk] = Field(description="List of chunks that build the research paper")
    
    # Enhanced metadata
    abstract: Optional[str] = Field(default=None, description="Paper abstract if identifiable")
    publication_year: Optional[int] = Field(default=None, description="Publication year if found")
    journal: Optional[str] = Field(default=None, description="Journal or venue if identifiable")
    doi: Optional[str] = Field(default=None, description="DOI if found in paper")
    total_pages: Optional[int] = Field(default=None, description="Total number of pages processed")
    processing_method: Optional[str] = Field(default=None, description="Method used for processing (hybrid)")

In [26]:
# Research paper file paths
research_paper_1 = "/Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/papers/30YearsResearchGate.pdf"
research_paper_2 = "/Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/papers/SchenkBekkerSchmitt2025PrecRes.pdf"

# Output directory
output_dir = "/Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/input_papers/"
os.makedirs(output_dir, exist_ok=True)

print(f"Paper 1: {research_paper_1}")
print(f"Paper 2: {research_paper_2}")

Paper 1: /Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/papers/30YearsResearchGate.pdf
Paper 2: /Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/papers/SchenkBekkerSchmitt2025PrecRes.pdf


In [27]:
# ⚡ OPTIMIZATION AREA 1: Optimized LlamaParse Configuration
llamaparse_academic_prompt = """
The provided document is a research paper. Parse it systematically while preserving academic structure.

ACADEMIC PARSING PRIORITIES:
1. Identify and preserve section hierarchy (abstract, introduction, methodology, results, discussion, conclusion)
2. Maintain figure captions and table data with clear labeling
3. Preserve section numbering and subsection structure
4. Keep mathematical formulas, equations, and citations intact
5. Maintain academic formatting and terminology
6. Separate different content types (text, figures, tables, references)
7. Preserve page information for reference

Output should be well-structured markdown optimized for further AI processing.
"""

def stage1_llamaparse(pdf_path: str, temp_output_path: str):
    """Stage 1: Extract content using optimized LlamaParse"""
    
    # ⚡ OPTIMIZATION AREA: LlamaParse Parameters
    parser = LlamaParse(
        api_key=api_key,
        result_type="markdown",  # 🔧 TRY: "text", "markdown" for different formats
        system_prompt=llamaparse_academic_prompt,
        verbose=True,
        # 🔧 TRY: Experiment with these parameters:
        # language="en",
        # num_workers=4,
        # split_by_page=True,
        # use_vendor_multimodal_model=True,
    )
    
    print(f"Stage 1: LlamaParse processing {pdf_path}")
    parsed_documents = parser.load_data(pdf_path)
    
    # Save intermediate result
    with open(temp_output_path, 'w', encoding='utf-8') as f:
        for doc in parsed_documents:
            f.write(doc.text + '\n')
    
    print(f"Stage 1 complete: Saved to {temp_output_path}")
    
    # Read back the content
    with open(temp_output_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    return content

In [28]:
# ⚡ OPTIMIZATION AREA 2: AI Enhancement Prompts
ai_enhancement_prompt = """
You are processing research paper content that was already parsed by LlamaParse. 
Enhance the structure by breaking it into precise academic chunks.

ENHANCEMENT OBJECTIVES:
1. Create granular chunks for each distinct academic element
2. Identify and classify section types with high accuracy
3. Extract metadata (section numbers, page references)
4. Detect and separate figures, tables, and captions
5. Preserve citations and mathematical content
6. Maintain academic language and terminology

CRITICAL JSON FORMATTING:
1. Escape all quotes in content with \\"
2. Replace newlines in content with \\n
3. Ensure valid JSON structure
4. No trailing commas
5. Keep content strings under 1000 characters each

CHUNKING REQUIREMENTS:
1. Create AT LEAST 20-40 chunks from the provided text
2. Each paragraph should be its own chunk
3. Each section/subsection should be separate chunks
4. Identify section types: abstract, introduction, methodology, results, discussion, conclusion, references
5. Extract section numbers (e.g., "2.1", "3.2") when available
6. Mark figure captions and tables separately
7. Preserve academic language and citations

For each chunk:
- section_type: abstract, introduction, methodology, results, discussion, conclusion, references, figure_caption, table, other
- section_number: extract from text (e.g., "2.1", "3.2") or null
- page_number: extract from text or estimate
- content: the actual text content (PROPERLY ESCAPED and UNDER 1000 chars)
- is_figure_caption: true if this is a figure caption
- is_table: true if this contains table data
- source_method: "ai_enhancement"

CRITICAL: Do NOT merge paragraphs or sections. Each distinct element = one chunk.
"""

def robust_json_parse(ai_response, schema_class):
    """Robust JSON parsing with multiple fallback strategies"""
    
    if not ai_response or not ai_response.choices or len(ai_response.choices) == 0:
        raise Exception("Invalid AI response: no choices available")
    
    raw_content = ai_response.choices[0].message.content
    
    if not raw_content:
        raise Exception("Invalid AI response: no content available")
    
    # Strategy 1: Direct parsing
    try:
        return schema_class.model_validate_json(raw_content)
    except Exception as e1:
        print(f"Direct parsing failed: {e1}")
    
    # Strategy 2: Clean and retry
    try:
        cleaned = re.sub(r'(?<!\\)"(?![,:}\]])', r'\\"', raw_content)
        cleaned = re.sub(r'(?<!\\)\n(?![,:}\]])', r'\\n', cleaned)
        json.loads(cleaned)  # Validate
        return schema_class.model_validate_json(cleaned)
    except Exception as e2:
        print(f"Cleaning strategy failed: {e2}")
    
    # Strategy 3: AI reformat
    try:
        print("🔧 Requesting AI to reformat response...")
        reformat_response = together.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "Fix this JSON by properly escaping quotes, removing trailing commas, ensuring valid format. Keep all data intact.",
                },
                {
                    "role": "user",
                    "content": f"Fix this JSON:\n{raw_content[:8000]}",
                },
            ],
            model="meta-llama/Llama-Vision-Free",
            temperature=0.0,
            stream=False,
        )
        
        if reformat_response and reformat_response.choices and len(reformat_response.choices) > 0:
            reformat_content = reformat_response.choices[0].message.content
            if reformat_content:
                return schema_class.model_validate_json(reformat_content)
    except Exception as e3:
        print(f"AI reformat failed: {e3}")
    
    # Strategy 4: Manual fallback
    print("🔧 Using manual fallback...")
    return ResearchPaper(
        title="Research Paper (Hybrid - Fallback)",
        authors="Unknown (parsing error)",
        chunks=[
            ResearchChunk(
                section_type="other",
                page_number=1,
                content="Failed to parse content. Please check the input format.",
                source_method="fallback"
            )
        ],
        processing_method="hybrid"
    )

def stage2_ai_enhancement(llamaparse_content: str):
    """Stage 2: Enhance LlamaParse output with AI structuring"""
    
    print(f"Stage 2: AI enhancement processing ({len(llamaparse_content)} chars)")
    
    # ⚡ OPTIMIZATION AREA: AI Model and Parameters
    response = together.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": ai_enhancement_prompt,
            },
            {
                "role": "user",
                "content": f"Enhance this LlamaParse content into structured chunks:\n\n{llamaparse_content}",
            },
        ],
        model="meta-llama/Llama-Vision-Free",  # 🔧 TRY: Different models
        response_format={"type": "json_object", "schema": ResearchPaper.model_json_schema()},
        temperature=0.1,  # 🔧 TRY: Adjust for consistency vs creativity
        max_tokens=4000,  # 🔧 TRY: Adjust based on content length
        stream=False,
        # 🔧 TRY: Add other parameters like top_p, frequency_penalty
    )
    
    # Parse response with robust handling
    try:
        enhanced_data = robust_json_parse(response, ResearchPaper)
        print(f"Stage 2 complete: Generated {len(enhanced_data.chunks)} chunks")
        enhanced_data.processing_method = "hybrid"
        return enhanced_data
    except Exception as e:
        print(f"❌ Stage 2 failed: {e}")
        return None

In [29]:
# ⚡ OPTIMIZATION AREA 3: Intelligent Result Processing and Validation
def stage3_quality_enhancement(research_data: ResearchPaper, original_content: str):
    """Stage 3: Quality enhancement and validation"""
    
    if not research_data or not research_data.chunks:
        return research_data
    
    print(f"Stage 3: Quality enhancement for {len(research_data.chunks)} chunks")
    
    enhanced_chunks = []
    
    for chunk in research_data.chunks:
        # ⚡ OPTIMIZATION AREA: Quality Assessment
        # Add confidence scoring, content validation, etc.
        
        # Basic quality checks
        if len(chunk.content.strip()) < 20:  # Skip very short chunks
            continue
            
        # Enhance chunk metadata
        enhanced_chunk = chunk.model_copy()
        
        # Add confidence score based on content quality
        confidence = 0.8  # Base confidence
        if chunk.section_number:
            confidence += 0.1
        if chunk.section_type in ['abstract', 'introduction', 'methodology', 'results', 'discussion', 'conclusion']:
            confidence += 0.1
        enhanced_chunk.confidence_score = min(confidence, 1.0)
        
        # Detect citations
        if '(' in chunk.content and ')' in chunk.content:
            enhanced_chunk.has_citations = True
            
        # Detect equations (basic heuristic)
        if any(symbol in chunk.content for symbol in ['=', '∑', '∫', '∆', 'α', 'β', 'γ']):
            enhanced_chunk.has_equations = True
        
        # Extract keywords (basic implementation)
        academic_keywords = ['research', 'study', 'analysis', 'method', 'result', 'conclusion', 'data', 'model']
        found_keywords = [kw for kw in academic_keywords if kw.lower() in chunk.content.lower()]
        if found_keywords:
            enhanced_chunk.keywords = found_keywords[:5]  # Limit to 5
        
        enhanced_chunks.append(enhanced_chunk)
    
    # Update research paper with enhanced chunks
    research_data.chunks = enhanced_chunks
    
    print(f"Stage 3 complete: {len(enhanced_chunks)} quality-enhanced chunks")
    return research_data

def process_paper_hybrid(pdf_path: str, output_filename: str):
    """Complete hybrid processing pipeline"""
    
    print(f"\n🚀 HYBRID PROCESSING: {pdf_path}")
    print("=" * 60)
    
    # Stage 1: LlamaParse
    temp_file = os.path.join(output_dir, f"temp_{output_filename.replace('.md', '_llamaparse.md')}")
    llamaparse_content = stage1_llamaparse(pdf_path, temp_file)
    
    # Stage 2: AI Enhancement
    enhanced_data = stage2_ai_enhancement(llamaparse_content)
    
    if not enhanced_data:
        print("❌ Hybrid processing failed")
        return None
    
    # Stage 3: Quality Enhancement
    final_data = stage3_quality_enhancement(enhanced_data, llamaparse_content)
    
    # Save results
    output_path = os.path.join(output_dir, output_filename)
    save_hybrid_results(final_data, output_path)
    
    # Cleanup temp file
    if os.path.exists(temp_file):
        os.remove(temp_file)
    
    print(f"\n✅ HYBRID PROCESSING COMPLETE: {output_path}")
    return final_data

def save_hybrid_results(research_data: ResearchPaper, output_path: str):
    """Save hybrid processing results with enhanced formatting"""
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"# {research_data.title or 'Research Paper'} (Hybrid Processing)\n\n")
        f.write(f"**Authors:** {research_data.authors or 'Not identified'}\n\n")
        f.write(f"**Processing Method:** {research_data.processing_method}\n\n")
        f.write(f"**Total Chunks:** {len(research_data.chunks)}\n\n")
        
        if research_data.abstract:
            f.write(f"**Abstract:** {research_data.abstract}\n\n")
        
        f.write("---\n\n")
        
        for i, chunk in enumerate(research_data.chunks):
            f.write(f"## Chunk {i+1}\n\n")
            f.write(f"- **Section Type:** {chunk.section_type}\n")
            f.write(f"- **Section Number:** {chunk.section_number or 'N/A'}\n")
            f.write(f"- **Page:** {chunk.page_number}\n")
            f.write(f"- **Figure Caption:** {chunk.is_figure_caption}\n")
            f.write(f"- **Table:** {chunk.is_table}\n")
            f.write(f"- **Confidence:** {chunk.confidence_score or 'N/A'}\n")
            f.write(f"- **Has Citations:** {chunk.has_citations}\n")
            f.write(f"- **Has Equations:** {chunk.has_equations}\n")
            if chunk.keywords:
                f.write(f"- **Keywords:** {', '.join(chunk.keywords)}\n")
            f.write(f"- **Source Method:** {chunk.source_method or 'hybrid'}\n\n")
            f.write(f"**Content:**\n{chunk.content}\n\n")
            f.write("---\n\n")
    
    print(f"Saved {len(research_data.chunks)} enhanced chunks to: {output_path}")

In [30]:
# Process first research paper with hybrid approach
print("Processing Paper 1 with Hybrid Strategy...")
hybrid_data_1 = process_paper_hybrid(research_paper_1, "strategy3_paper1_hybrid.md")

Processing Paper 1 with Hybrid Strategy...

🚀 HYBRID PROCESSING: /Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/papers/30YearsResearchGate.pdf
Stage 1: LlamaParse processing /Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/papers/30YearsResearchGate.pdf
Started parsing the file under job_id 93129083-8720-4b13-a601-8f0b761fb9ac
Started parsing the file under job_id 93129083-8720-4b13-a601-8f0b761fb9ac
Stage 1 complete: Saved to /Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/input_papers/temp_strategy3_paper1_hybrid_llamaparse.md
Stage 2: AI enhancement processing (53349 chars)
Stage 1 complete: Saved to /Users/fredygerman/Personal/builds/exp/twiga-challenge-1/data/input_papers/temp_strategy3_paper1_hybrid_llamaparse.md
Stage 2: AI enhancement processing (53349 chars)


APIError: Error code: 402 - {"message": "Credit limit exceeded. Please navigate to https://api.together.xyz/settings/billing to add credit or upgrade your plan.", "type_": "credit_limit"}

In [None]:
# Process second research paper with hybrid approach
print("Processing Paper 2 with Hybrid Strategy...")
hybrid_data_2 = process_paper_hybrid(research_paper_2, "strategy3_paper2_hybrid.md")

In [None]:
# ⚡ OPTIMIZATION AREA 4: Advanced Quality Analysis
def analyze_hybrid_results(research_data, paper_name):
    """Comprehensive analysis of hybrid processing results"""
    
    if not research_data or not research_data.chunks:
        print(f"❌ No data for {paper_name}")
        return {}
    
    # Advanced metrics
    section_types = {}
    confidence_scores = []
    figure_count = 0
    table_count = 0
    citation_chunks = 0
    equation_chunks = 0
    total_keywords = 0
    content_lengths = []
    
    for chunk in research_data.chunks:
        # Basic counts
        section_types[chunk.section_type] = section_types.get(chunk.section_type, 0) + 1
        content_lengths.append(len(chunk.content))
        
        # Quality metrics
        if chunk.confidence_score:
            confidence_scores.append(chunk.confidence_score)
        
        if chunk.is_figure_caption:
            figure_count += 1
        if chunk.is_table:
            table_count += 1
        if chunk.has_citations:
            citation_chunks += 1
        if chunk.has_equations:
            equation_chunks += 1
        if chunk.keywords:
            total_keywords += len(chunk.keywords)
    
    # Calculate statistics
    avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0
    avg_content_length = sum(content_lengths) / len(content_lengths) if content_lengths else 0
    
    print(f"\n📊 HYBRID ANALYSIS FOR {paper_name}:")
    print(f"Paper Title: {research_data.title}")
    print(f"Authors: {research_data.authors}")
    print(f"Processing Method: {research_data.processing_method}")
    print(f"Total chunks: {len(research_data.chunks)}")
    print(f"Average confidence: {avg_confidence:.2f}")
    print(f"Section distribution: {section_types}")
    print(f"Figures detected: {figure_count}")
    print(f"Tables detected: {table_count}")
    print(f"Chunks with citations: {citation_chunks}")
    print(f"Chunks with equations: {equation_chunks}")
    print(f"Total keywords found: {total_keywords}")
    print(f"Average chunk length: {avg_content_length:.0f} characters")
    
    return {
        'chunks': len(research_data.chunks),
        'avg_confidence': avg_confidence,
        'sections': section_types,
        'figures': figure_count,
        'tables': table_count,
        'citations': citation_chunks,
        'equations': equation_chunks,
        'keywords': total_keywords,
        'avg_length': avg_content_length
    }

# Analyze both papers
if hybrid_data_1:
    results_1 = analyze_hybrid_results(hybrid_data_1, "Paper 1 (30YearsResearchGate)")
else:
    results_1 = {}
    
if hybrid_data_2:
    results_2 = analyze_hybrid_results(hybrid_data_2, "Paper 2 (SchenkBekkerSchmitt2025)")
else:
    results_2 = {}

In [None]:
# 🎯 YOUR OPTIMIZATION WORKSPACE
# Implement your Strategy 3 optimizations here

print("🚀 Strategy 3 Optimization Workspace")
print("\nOptimization Areas for Hybrid Approach:")
print("☐ LlamaParse parameter optimization")
print("☐ AI enhancement prompt refinement")
print("☐ Multi-stage processing pipeline")
print("☐ Intelligent result merging")
print("☐ Quality validation and filtering")
print("☐ Advanced metadata extraction")

# TODO: Add your optimized Hybrid implementation here

# Example optimization template:
'''
# Your optimized hybrid implementation:

# Stage 1: Optimized LlamaParse
optimized_llamaparse_config = {
    "result_type": "YOUR_OPTIMAL_TYPE",
    "system_prompt": "YOUR_OPTIMIZED_LLAMAPARSE_PROMPT",
    "language": "en",
    "num_workers": 4,
    # Add other optimized parameters
}

# Stage 2: Enhanced AI Processing
def your_enhanced_ai_processing(content):
    # Multiple AI passes for different aspects:
    # 1. Structure identification
    # 2. Content classification
    # 3. Metadata extraction
    # 4. Quality validation
    pass

# Stage 3: Intelligent Merging
def your_intelligent_merger(llamaparse_result, ai_result):
    # Combine results intelligently:
    # - Use confidence scores
    # - Resolve conflicts
    # - Optimize chunk boundaries
    pass

# Stage 4: Quality Enhancement
def your_quality_enhancer(merged_result):
    # Final quality improvements:
    # - Content validation
    # - Metadata enrichment
    # - Consistency checks
    pass
'''

In [None]:
# 📊 STRATEGY 3 RESULTS SUMMARY
print("🏆 STRATEGY 3: HYBRID APPROACH RESULTS")
print("=" * 50)

if results_1:
    print(f"\nPaper 1 Results:")
    print(f"  Total Chunks: {results_1['chunks']}")
    print(f"  Avg Confidence: {results_1['avg_confidence']:.2f}")
    print(f"  Section Types: {len(results_1['sections'])}")
    print(f"  Enhanced Features: {results_1['citations']} citations, {results_1['equations']} equations")
    print(f"  Figures/Tables: {results_1['figures'] + results_1['tables']}")
    print(f"  Keywords Found: {results_1['keywords']}")

if results_2:
    print(f"\nPaper 2 Results:")
    print(f"  Total Chunks: {results_2['chunks']}")
    print(f"  Avg Confidence: {results_2['avg_confidence']:.2f}")
    print(f"  Section Types: {len(results_2['sections'])}")
    print(f"  Enhanced Features: {results_2['citations']} citations, {results_2['equations']} equations")
    print(f"  Figures/Tables: {results_2['figures'] + results_2['tables']}")
    print(f"  Keywords Found: {results_2['keywords']}")

print("\n🎯 Hybrid Approach Advantages:")
print("✅ Combines LlamaParse accuracy with AI flexibility")
print("✅ Multi-stage processing for better quality")
print("✅ Enhanced metadata and confidence scoring")
print("✅ Robust error handling and fallbacks")
print("✅ Quality validation at each stage")

print("\n🎯 Next Steps:")
print("1. Review the enhanced chunk files")
print("2. Implement your multi-stage optimizations")
print("3. Test different model combinations")
print("4. Compare with single-method strategies")
print("5. Document your hybrid improvements")