# Scientific Research Paper Analyzer

**Goal**: Extract full research papers and generate comprehensive scientific analysis

**Output**: Structured analysis focused on R&D applications and scientific impact

## Setup

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
import textwrap
import re

# Reaching this line means none of the imports above raised an exception.
print("✅ All imports successful")

## Load Complete Research Paper

In [None]:
def load_full_paper(pdf_path, chunk_size=4000, chunk_overlap=500):
    """Read a PDF and return its text both as one string and as chunks.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PyPDFLoader``.
        chunk_size: Target size handed to the text splitter for each chunk.
        chunk_overlap: Overlap handed to the text splitter between
            neighbouring chunks.

    Returns:
        A ``(full_text, chunks)`` tuple: the newline-joined text of every
        loaded document, and the list produced by splitting that text.
    """
    docs = PyPDFLoader(pdf_path).load()

    # One string for the whole paper; loaded documents are separated by "\n".
    full_text = "\n".join(doc.page_content for doc in docs)

    # Separators are tried in order: paragraph, line, sentence, word, character.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )

    return full_text, splitter.split_text(full_text)

# Load your research paper
pdf_path = "/Users/aimiegarces/Agents/d4sc03921a.pdf"
full_text, text_chunks = load_full_paper(pdf_path)

print("📄 Paper loaded successfully!")
print(f"📊 Total length: {len(full_text):,} characters")
print(f"📚 Split into: {len(text_chunks)} chunks")
if text_chunks:
    print(f"📝 Average chunk size: {len(full_text)//len(text_chunks):,} characters")
else:
    # A PDF with no extractable text (for example a scanned image) produces
    # zero chunks; report that instead of dividing by zero.
    print("⚠️ No text chunks were produced - the PDF may contain no extractable text")

## Setup Ollama Model

In [None]:
# Shared chat model used by both the per-chunk analysis and the synthesis step.
# NOTE(review): the synthesis prompt concatenates several chunk analyses;
# confirm that the combined input still fits inside num_ctx.
llm = ChatOllama(
    num_ctx=4096,         # context window requested for comprehensive analysis
    temperature=0.1,      # kept low for analytical consistency
    model="llama3.1:8b",
)

print("🤖 Ollama model configured for scientific analysis")

## Create Scientific Analysis Prompt

In [None]:
# Per-chunk prompt. The only template variable is {text_chunk}; the rest is a
# fixed eight-heading framework that the model fills in for each section.
_ANALYSIS_TEMPLATE = """
You are an expert in AI applications for scientific research. Analyze this research paper section and extract information relevant to the following analysis structure. Focus on scientific applications and R&D implications.

PAPER SECTION:
{text_chunk}

ANALYSIS FRAMEWORK:
**Executive Summary** (2-3 sentences)
- Core LLM capability/advancement and its scientific impact
- Primary scientific domain(s) addressed

**Technical Architecture & Training**
- Model architecture, size, and training specifics
- Scientific datasets used (PubMed, arXiv, domain-specific corpora)
- Fine-tuning approaches, RLHF, or domain adaptation methods
- Benchmark performance on scientific tasks

**Scientific Applications Demonstrated**
- Specific use cases: literature review, hypothesis generation, experimental design, data analysis
- Performance on scientific reasoning, chemical/biological predictions, or research workflows
- Comparison with domain-specific tools (ChemBERTa, BioGPT, etc.)

**Experimental Validation**
- How scientific accuracy was evaluated
- Expert validation studies or blind comparisons
- Error analysis and failure modes in scientific contexts

**Research Acceleration Potential**
- Time savings demonstrated for research workflows
- Novel scientific insights generated by the model
- Integration capabilities with existing scientific software/databases

**Implementation & Deployment Considerations**
- Computational requirements and scalability
- API availability, on-premise deployment options
- Integration with lab automation, LIMS, or research platforms
- Data privacy considerations for proprietary research

**Strategic R&D Implications**
- Impact on research productivity and methodology
- Skills/training implications for research teams
- Competitive advantages for R&D organizations
- Regulatory or validation challenges in scientific contexts

**Future Research Directions**
- Identified limitations in current scientific reasoning
- Multimodal capabilities needed (chemical structures, spectra, etc.)
- Opportunities for domain-specific fine-tuning

Extract and organize relevant information from this section. If a section doesn't contain information for a particular category, write "Not covered in this section" for that category. Focus on actionable insights for implementing LLMs in industrial R&D environments.

ANALYSIS:
"""

analysis_prompt = ChatPromptTemplate.from_template(_ANALYSIS_TEMPLATE)

print("📋 Scientific analysis prompt template created")

## Process Paper in Chunks

In [None]:
# Pipeline for one chunk: fill the prompt, call the model, return plain text.
analysis_chain = analysis_prompt | llm | StrOutputParser()

# One dict per successfully analyzed chunk; read by the synthesis and review cells.
chunk_analyses = []
print("🔄 Processing paper chunks for comprehensive analysis...\n")

# Demo run: only the first three chunks (fewer if the paper is shorter).
demo_chunks = text_chunks[:3]

for chunk_id, chunk_text in enumerate(demo_chunks, start=1):
    print(f"📖 Analyzing chunk {chunk_id}/{len(demo_chunks)}...")

    try:
        result = analysis_chain.invoke({"text_chunk": chunk_text})
        record = {
            "chunk_id": chunk_id,
            "analysis": result,
            "chunk_preview": chunk_text[:200] + "...",
        }
        chunk_analyses.append(record)
        print(f"✅ Chunk {chunk_id} analyzed successfully")

    except Exception as err:
        # Best effort: report the failure and carry on with the next chunk.
        print(f"❌ Error analyzing chunk {chunk_id}: {err!s}")

print(f"\n🎯 Completed analysis of {len(chunk_analyses)} chunks")

## Create Synthesis Prompt

In [None]:
# Report-level prompt. The only template variable is {combined_analyses},
# which receives the concatenated per-chunk analyses.
_SYNTHESIS_TEMPLATE = """
You are an expert in AI applications for scientific research. Synthesize the following chunk analyses into a comprehensive, coherent report about this LLM research paper. Focus on scientific applications and R&D implications.

CHUNK ANALYSES:
{combined_analyses}

Create a final comprehensive report using this structure:

**Executive Summary** (2-3 sentences)
**Technical Architecture & Training**
**Scientific Applications Demonstrated**
**Experimental Validation**
**Research Acceleration Potential**
**Implementation & Deployment Considerations**
**Strategic R&D Implications**
**Future Research Directions**

Synthesize information across all chunks, eliminating redundancy and creating a coherent narrative. Prioritize actionable insights for implementing LLMs in industrial R&D environments. Assess both opportunities and risks for scientific research acceleration.

COMPREHENSIVE SCIENTIFIC ANALYSIS:
"""

synthesis_prompt = ChatPromptTemplate.from_template(_SYNTHESIS_TEMPLATE)

# Same prompt -> model -> plain-text pipeline shape as the per-chunk chain.
synthesis_chain = synthesis_prompt | llm | StrOutputParser()
print("🔗 Synthesis prompt created for final report generation")

## Generate Final Scientific Analysis Report

In [None]:
if chunk_analyses:
    # Join the per-chunk analyses under labelled headers so the synthesis
    # prompt can tell where each one begins.
    combined_analyses = "\n\n".join(
        f"=== CHUNK {item['chunk_id']} ANALYSIS ===\n{item['analysis']}"
        for item in chunk_analyses
    )

    print("🔄 Synthesizing comprehensive scientific analysis...\n")

    try:
        # Only the model call is guarded, so a display problem below is not
        # misreported as a synthesis failure.
        final_analysis = synthesis_chain.invoke({"combined_analyses": combined_analyses})
    except Exception as e:
        print(f"❌ Error generating final synthesis: {str(e)}")
    else:
        # Format and display the final report
        print("🔬 COMPREHENSIVE SCIENTIFIC RESEARCH ANALYSIS")
        print("=" * 80)
        print("📄 Paper: LLMs and Autonomous Agents in Chemistry")
        print(f"📊 Analysis based on {len(chunk_analyses)} text chunks")
        # NOTE: this figure sums the lengths of the per-chunk analyses.
        print(f"📈 Total content analyzed: {sum(len(item['analysis']) for item in chunk_analyses):,} characters")
        print("=" * 80)
        print()

        # Wrap each line on its own. textwrap.fill() on the whole report
        # treats it as a single paragraph and replaces every newline with a
        # space, which would merge all headings and bullets together.
        formatted_analysis = "\n".join(
            textwrap.fill(line, width=75) for line in final_analysis.splitlines()
        )
        print(formatted_analysis)
        print()
        print("✅ SCIENTIFIC ANALYSIS COMPLETE")
        print("=" * 80)

else:
    print("❌ No chunk analyses available for synthesis")

## Optional: View Individual Chunk Analyses

In [None]:
# Optional: Display individual chunk analyses for detailed review
print("📚 INDIVIDUAL CHUNK ANALYSES")
print("=" * 60)

for item in chunk_analyses:
    print(f"\n📖 CHUNK {item['chunk_id']} ANALYSIS")
    print("-" * 40)
    print(f"Content preview: {item['chunk_preview']}")
    print("\nAnalysis:")
    # Wrap line by line: textwrap.fill() on the full text would replace every
    # newline with a space and merge headings and bullets into one paragraph.
    formatted_chunk_analysis = "\n".join(
        textwrap.fill(line, width=70) for line in item['analysis'].splitlines()
    )
    print(formatted_chunk_analysis)
    print("-" * 40)

## Extension: Process Different Papers

In [None]:
# Function to analyze any scientific paper with the same framework
def analyze_scientific_paper(pdf_path, max_chunks=5):
    """Run the load -> per-chunk analysis -> synthesis pipeline on one PDF.

    Relies on the module-level ``load_full_paper``, ``analysis_chain`` and
    ``synthesis_chain`` defined in earlier cells.

    Args:
        pdf_path: Path to the PDF, as a string or an ``os.PathLike`` object.
        max_chunks: Upper bound on how many leading chunks are analyzed.
            ``None`` analyzes every chunk, because it is used as a slice bound.

    Returns:
        The synthesized report text, or the string
        ``"Analysis failed - no chunks processed successfully"`` when no
        chunk could be analyzed.

    Raises:
        Any exception raised by ``load_full_paper`` or by the synthesis call
        propagates to the caller; only per-chunk analysis errors are skipped.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import os

    # basename() accepts path-like objects and the platform's separators,
    # unlike a manual split on "/".
    print(f"🔬 Starting scientific analysis of: {os.path.basename(pdf_path)}")

    # Load and process; only the chunks are needed here.
    _, chunks = load_full_paper(pdf_path)

    # Analyze chunks, skipping any that fail so one bad chunk does not
    # abort the whole run.
    analyses = []
    for i, chunk in enumerate(chunks[:max_chunks]):
        try:
            analysis = analysis_chain.invoke({"text_chunk": chunk})
            analyses.append({"chunk_id": i+1, "analysis": analysis})
        except Exception as e:
            print(f"⚠️ Skipped chunk {i+1}: {str(e)}")

    if not analyses:
        return "Analysis failed - no chunks processed successfully"

    # Synthesize
    combined = "\n\n".join(
        f"=== CHUNK {item['chunk_id']} ===\n{item['analysis']}" for item in analyses
    )
    return synthesis_chain.invoke({"combined_analyses": combined})

# Example usage (left commented out so running this cell does not start an
# analysis; uncomment and point it at a real PDF to use it):
# new_analysis = analyze_scientific_paper("/path/to/another/paper.pdf")
# print(new_analysis)

# Printed once this cell has run, i.e. after analyze_scientific_paper is defined.
print("🛠️ Scientific paper analysis function ready for use!")