# Maximum Context Scientific Paper Analyzer

**Strategy**: Preserve as much of the paper's context as possible while staying within the context window configured for the Ollama model

**Approach**: Extract key sections → Detailed summarization → Single comprehensive analysis

## Setup

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from citation_extractor import display_citation_info, get_acs_citation
import textwrap
import re
import datetime

# Configuration
# NOTE(review): this is an absolute path on one user's machine; point it at a
# local copy of the paper before running the notebook elsewhere.
pdf_path = "/Users/aimiegarces/Agents/d4sc03921a.pdf"

print("✅ Setup complete")

## Extract Paper Citation

In [None]:
# Extract and display citation information
# Both helpers come from the project-local `citation_extractor` module; their
# exact return types are not visible here — TODO confirm before relying on them.
# `citation_result` is not referenced again in the cells shown here.
citation_result = display_citation_info(pdf_path, show_metadata=True, show_all_formats=False)
# `acs_citation` is reused by the final report cell (wrapped header and
# quick-copy footer), so it is presumably a plain string — verify.
acs_citation = get_acs_citation(pdf_path)

## Configure Ollama

In [None]:
# Model settings for the Ollama chat model, gathered in one mapping so they
# can be tuned in a single place and then unpacked into the constructor.
ollama_settings = {
    "model": "llama3.1:8b",
    "temperature": 0.1,    # low temperature for consistent analytical output
    "num_ctx": 32768,      # context window requested from Ollama (tokens)
    "num_predict": 4096,   # allow long responses for the detailed analysis
}
llm = ChatOllama(**ollama_settings)

print("🤖 Ollama configured for maximum context (32K tokens)")
print("📊 Response length: up to 4K tokens for detailed analysis")

## Extract Paper Sections

In [None]:
def extract_key_sections(pdf_path):
    """Extract the most information-dense sections of a paper from its PDF.

    The PDF is loaded with ``PyPDFLoader``, the text of all pages is joined
    into a single string, and the abstract plus the major sections are
    located with regular-expression heuristics. A short extraction report is
    printed as a side effect.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``PyPDFLoader``.

    Returns:
        dict: Maps ``'abstract'``, ``'introduction'``, ``'methods'``,
        ``'results'``, ``'discussion'`` and ``'conclusion'`` to the extracted
        text. A section that could not be located maps to ``""``.
    """

    # Load full paper (one document per page, joined with newlines)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    full_text = "\n".join(doc.page_content for doc in documents)

    print(f"📄 Original paper: {len(full_text):,} characters")

    # Extract abstract: text after an "Abstract" header up to the first blank
    # line, "Introduction"/"Keywords" header, or numbered heading.
    abstract_patterns = [
        r'Abstract\s*[:\-]?\s*\n(.*?)(?=\n\s*\n|\nIntroduction|\n1\s+Introduction|\nKeywords|\n\d+\.)',
        r'ABSTRACT\s*[:\-]?\s*\n(.*?)(?=\n\s*\n|\nINTRODUCTION|\n1\s+INTRODUCTION|\nKEYWORDS|\n\d+\.)',
    ]

    abstract = ""
    for pattern in abstract_patterns:
        match = re.search(pattern, full_text, re.DOTALL | re.IGNORECASE)
        if match:
            abstract = match.group(1).strip()
            break

    # If no abstract header (or the match is too short to be a real abstract),
    # fall back to scanning the text that precedes the Introduction.
    if not abstract or len(abstract) < 100:
        intro_pattern = r'(.*?)(?=\n\s*1\s+Introduction|\n\s*Introduction)'
        match = re.search(intro_pattern, full_text, re.DOTALL)
        if match:
            pre_intro = match.group(1)
            lines = pre_intro.split('\n')
            for i, line in enumerate(lines):
                # NOTE(review): this keyword list looks tuned to one specific
                # paper; confirm it is appropriate for other inputs.
                if len(line.strip()) > 50 and any(word in line.lower() for word in ['large language', 'models', 'emerged', 'review']):
                    # Collect consecutive non-empty lines from the first
                    # abstract-looking line; stop at the first blank or
                    # Introduction-like line once something was collected.
                    abstract_lines = []
                    for candidate in lines[i:]:
                        if candidate.strip() and not candidate.startswith('1 ') and 'Introduction' not in candidate:
                            abstract_lines.append(candidate.strip())
                        elif len(abstract_lines) > 0:
                            break
                    abstract = ' '.join(abstract_lines)
                    break

    # Extract major sections
    def extract_section(section_name, end_markers=None):
        """Return the body of ``section_name``, or ``""`` if not found.

        Three header shapes are tried in order (numbered header, bare header
        on its own line, header followed by optional ``:``/``-``). A match is
        accepted only when it yields more than 200 characters.
        """
        if end_markers is None:
            end_markers = [r'\n\d+\s+[A-Z]', r'\n[A-Z][a-z]+\s*\n', r'\nReferences', r'\nConclusion']

        # Create patterns using string concatenation to avoid f-string issues
        # NOTE(review): re.IGNORECASE below applies to the end markers too, so
        # `[A-Z]` also matches lowercase letters and a section can end earlier
        # than the markers suggest; a section with no end marker after it
        # never matches. Behaviour intentionally left unchanged here.
        end_pattern = "|".join(end_markers)
        pattern1 = r'\n\d+\s+' + section_name + r'\s*\n(.*?)(?=' + end_pattern + ')'
        pattern2 = r'\n' + section_name + r'\s*\n(.*?)(?=' + end_pattern + ')'
        pattern3 = section_name + r'\s*[:\-]?\s*\n(.*?)(?=' + end_pattern + ')'
        patterns = [pattern1, pattern2, pattern3]

        for pattern in patterns:
            match = re.search(pattern, full_text, re.DOTALL | re.IGNORECASE)
            if match:
                content = match.group(1).strip()
                if len(content) > 200:  # Meaningful content
                    return content
        return ""

    # Extract key sections; alternative header names are tried left to right
    sections = {
        'abstract': abstract,
        'introduction': extract_section("Introduction"),
        'methods': extract_section("Methods") or extract_section("Methodology") or extract_section("Approach"),
        'results': extract_section("Results") or extract_section("Findings"),
        'discussion': extract_section("Discussion") or extract_section("Analysis"),
        'conclusion': extract_section("Conclusion") or extract_section("Conclusions")
    }

    # Report extraction results
    print("\n📊 SECTION EXTRACTION RESULTS:")
    total_extracted = 0
    for name, content in sections.items():
        length = len(content)
        total_extracted += length
        status = "✅" if length > 100 else "⚠️" if length > 0 else "❌"
        print(f"{status} {name.capitalize()}: {length:,} characters")

    # A PDF without extractable text gives an empty string; report 0% coverage
    # instead of raising ZeroDivisionError.
    coverage = total_extracted / len(full_text) * 100 if full_text else 0.0
    print(f"\n📈 Total extracted: {total_extracted:,} characters ({coverage:.1f}% of original)")
    print(f"🧠 Estimated tokens: {total_extracted//4:,} (target: <30K for Ollama)")

    return sections

# Extract sections from paper
# The resulting dict (section name -> extracted text, "" when not found) is
# the input of the summarization cell that follows.
paper_sections = extract_key_sections(pdf_path)

## Create Context-Preserving Summaries

In [None]:
# Per-section summarization pass: long sections are condensed by the LLM,
# short ones are passed through untouched, and near-empty ones are dropped.
summary_prompt = ChatPromptTemplate.from_template("""
Create a detailed, comprehensive summary of this research paper section. 
PRESERVE ALL IMPORTANT INFORMATION including:
- Technical details and specifications
- Numerical results and performance metrics
- Methodology and experimental setup
- Key findings and insights
- Comparisons with other approaches
- Limitations and challenges mentioned

Maintain technical accuracy while condensing length. This summary will be used for downstream scientific analysis.

SECTION: {section_name}

CONTENT:
{content}

DETAILED SUMMARY:
""")

# prompt -> model -> plain string
summary_chain = summary_prompt | llm | StrOutputParser()

section_summaries = {}
print("🔄 Creating context-preserving summaries...\n")

for section_name, content in paper_sections.items():
    n_chars = len(content)

    # 100 characters or fewer: treated as not extracted, left out entirely.
    if n_chars <= 100:
        continue

    # Up to 500 characters: already compact, keep the original text.
    if n_chars <= 500:
        section_summaries[section_name] = content
        print(f"📋 Keeping {section_name} as-is ({n_chars:,} chars)")
        continue

    # Anything longer goes through the summarization chain.
    print(f"📝 Summarizing {section_name}... ({n_chars:,} chars)")
    try:
        summary = summary_chain.invoke(
            {"section_name": section_name.upper(), "content": content}
        )
        section_summaries[section_name] = summary
        print(f"✅ {section_name} summarized to {len(summary):,} characters")
    except Exception as e:
        # Best-effort fallback: keep the head of the raw section so the final
        # analysis still has something to work with.
        print(f"❌ Error summarizing {section_name}: {str(e)}")
        section_summaries[section_name] = content[:2000] + "...[truncated]"

# Size of the context that will be handed to the final analysis
# (rough estimate: 4 characters per token).
total_summary_length = sum(map(len, section_summaries.values()))
estimated_tokens = total_summary_length // 4
fits_in_context = estimated_tokens < 30000

print("\n🎯 CONTEXT PREPARATION COMPLETE")
print(f"📊 Total summarized content: {total_summary_length:,} characters")
print(f"🧠 Estimated tokens: {estimated_tokens:,}")
print(f"✅ Fits in Ollama context: {fits_in_context}")

## Create Analysis Prompt

In [None]:
# Template for the single, final analysis call. The six placeholders
# ({abstract}, {introduction}, {methods}, {results}, {discussion},
# {conclusion}) are filled from the per-section summaries.
analysis_template_text = """
You are an expert in AI applications for scientific research. Analyze this complete research paper using the provided comprehensive content. Generate a HIGHLY DETAILED and thorough analysis focused on scientific applications and R&D implications.

INSTRUCTION: Provide comprehensive, detailed responses for each section. Include specific examples, quantitative details, technical specifications, and actionable insights.

COMPLETE PAPER CONTENT:

ABSTRACT:
{abstract}

INTRODUCTION:
{introduction}

METHODS/APPROACH:
{methods}

RESULTS/FINDINGS:
{results}

DISCUSSION/ANALYSIS:
{discussion}

CONCLUSION:
{conclusion}

Using this complete context, provide a comprehensive analysis with the following structure:

**Executive Summary** (4-5 detailed sentences)
- Core LLM capability/advancement and its specific scientific impact
- Primary scientific domain(s) addressed with specific applications
- Key quantitative improvements or breakthroughs demonstrated

**Technical Architecture & Training**
- Model architecture details (transformer variants, layer count, attention mechanisms)
- Model sizes, parameter counts, and computational requirements
- Training datasets used (PubMed, arXiv, domain-specific corpora)
- Fine-tuning approaches, RLHF implementation, or domain adaptation methods
- Benchmark performance on scientific tasks with specific metrics

**Scientific Applications Demonstrated**
- Use cases: literature review, hypothesis generation, experimental design, data analysis
- Performance metrics on scientific reasoning and predictions
- Comparison with domain-specific tools (ChemBERTa, BioGPT, etc.)

**Research Acceleration Potential**
- Time savings demonstrated for research workflows
- Novel scientific insights generated by the model
- Integration capabilities with existing scientific software/databases
- Productivity improvements measured

**Implementation & Deployment Considerations**
- Computational requirements (GPU/CPU needs, memory, storage)
- Scalability considerations and deployment architectures
- API availability, pricing models, on-premise deployment options
- Data privacy considerations for proprietary research

**Strategic R&D Implications**
- Impact on research productivity and methodology changes
- Skills/training implications for research teams
- Competitive advantages for R&D organizations
- Regulatory or validation challenges in scientific contexts

**Future Research Directions**
- Limitations in current scientific reasoning capabilities
- Multimodal capabilities needed (chemical structures, spectra, etc.)
- Opportunities for domain-specific fine-tuning
- Emerging research areas where these technologies could be applied

Focus on actionable insights for implementing LLMs in industrial R&D environments.

COMPREHENSIVE SCIENTIFIC ANALYSIS:
"""

comprehensive_analysis_prompt = ChatPromptTemplate.from_template(analysis_template_text)

print("📋 Analysis prompt created")

## Generate Comprehensive Analysis

In [None]:
# Generate the final comprehensive analysis with citation
analysis_chain = comprehensive_analysis_prompt | llm | StrOutputParser()

print("🔬 Generating comprehensive scientific analysis with maximum context...\n")

# Every placeholder of the analysis prompt must be filled; sections that were
# not extracted/summarized are passed as the literal 'Not available'.
analysis_section_keys = (
    'abstract', 'introduction', 'methods', 'results', 'discussion', 'conclusion',
)
analysis_inputs = {
    key: section_summaries.get(key, 'Not available') for key in analysis_section_keys
}

# Only the model call is guarded: the context-size hint in the handler is
# meaningful for a failed invocation, not for a failure while printing the
# report (which should surface with its own traceback).
try:
    # Invoke analysis with all preserved context
    final_analysis = analysis_chain.invoke(analysis_inputs)
except Exception as e:
    print(f"❌ Error generating analysis: {str(e)}")
    print(f"📊 Context size might be too large: {estimated_tokens:,} tokens")
    print("💡 Try reducing section content or increasing model context window")
else:
    # Display the comprehensive analysis with citation header
    print("🔬 COMPREHENSIVE SCIENTIFIC ANALYSIS REPORT")
    print("=" * 80)

    # Citation header
    print("\n📖 PAPER CITATION (ACS Style):")
    print("-" * 50)
    wrapped_citation = textwrap.fill(acs_citation, width=75)
    print(wrapped_citation)

    # Analysis metadata
    sections_analyzed = sum(1 for s in section_summaries.values() if s)
    print("\n📊 ANALYSIS METADATA:")
    print(f"🧠 Context used: {total_summary_length:,} characters (~{estimated_tokens:,} tokens)")
    print(f"📄 Sections analyzed: {sections_analyzed}")
    print(f"⏱️  Analysis date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}")

    print("\n" + "=" * 80)
    print()

    # Format and display analysis
    print(final_analysis)
    print()
    print("✅ COMPREHENSIVE ANALYSIS COMPLETE")
    print("=" * 80)

    # Quick copy citation (unwrapped, for pasting into a reference list)
    print("\n📋 QUICK COPY - ACS CITATION:")
    print("-" * 40)
    print(acs_citation)
    print("-" * 40)

## Context Quality Assessment

In [None]:
# Assess how much context we preserved
print("📊 CONTEXT PRESERVATION ASSESSMENT")
print("=" * 60)

# Same value as the num_ctx passed to ChatOllama, so the percentage below is
# relative to the window that was actually configured.
context_window_tokens = 32768

# `section_summaries` only holds sections that were kept, so the denominator
# and the per-section breakdown are taken from `paper_sections`, which lists
# every section the extractor looked for.
sections_found = sum(1 for content in section_summaries.values() if len(content) > 100)
total_sections = len(paper_sections)

print(f"📚 Sections successfully extracted: {sections_found}/{total_sections}")
print(f"📄 Total content for analysis: {total_summary_length:,} characters")
print(f"🧠 Token efficiency: {estimated_tokens:,}/32K tokens ({estimated_tokens/context_window_tokens*100:.1f}% of max)")

print("\n📋 Section breakdown:")
for name in paper_sections:
    content = section_summaries.get(name, "")
    if content:
        print(f"  • {name.capitalize()}: {len(content):,} chars")
    else:
        print(f"  • {name.capitalize()}: ❌ Not found")

# Thresholds: 4 or more usable sections -> Excellent, 3 -> Good, fewer -> Limited
print(f"\n💡 Context quality: {'Excellent' if sections_found >= 4 else 'Good' if sections_found >= 3 else 'Limited'}")
print("=" * 60)