# Research Paper Summarizer

**Goal**: Extract abstracts from PDFs and summarize them with local AI

**Flow**: PDF → Extract Abstract → Summarize → Done

## Setup

In [12]:
import re
import textwrap

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama

print ("Imported Successfully")

Imported Successfully


## Load PDF and Extract Abstract

In [13]:
# Default hints for spotting the first abstract line when a paper has no
# "Abstract" heading. They were tuned on one LLM review paper; pass
# `start_keywords` to extract_abstract() for papers on other topics.
_DEFAULT_START_KEYWORDS = ('large language', 'models', 'emerged', 'review')

# Candidate abstracts at or below this length are treated as false hits.
_MIN_ABSTRACT_CHARS = 100

# An "Abstract" heading (any capitalisation) that starts its own line, followed
# by the body up to the first blank line / Introduction / Keywords / numbered
# section. Anchoring with ^ (MULTILINE) keeps a prose line that merely ends in
# the word "abstract" from being mistaken for the heading.
_HEADED_ABSTRACT_RE = re.compile(
    r'^[ \t]*Abstract\s*[:\-]?\s*\n(.*?)'
    r'(?=\n\s*\n|\nIntroduction|\n1\s+Introduction|\nKeywords|\n\d+\.)',
    re.DOTALL | re.IGNORECASE | re.MULTILINE,
)

# Everything that precedes the first "Introduction" / "1 Introduction" line.
_PRE_INTRO_RE = re.compile(
    r'(.*?)(?=\n\s*1\s+Introduction|\n\s*Introduction)',
    re.DOTALL,
)


def _find_headed_abstract(full_text):
    """Return the text under an explicit "Abstract" heading, or None."""
    # Try every heading hit, not just the first, so that one short false hit
    # does not hide a real abstract further down.
    for match in _HEADED_ABSTRACT_RE.finditer(full_text):
        candidate = match.group(1).strip()
        if len(candidate) > _MIN_ABSTRACT_CHARS:
            return candidate
    return None


def _find_unheaded_abstract(full_text, start_keywords):
    """Return the keyword-located paragraph before "Introduction", or None."""
    match = _PRE_INTRO_RE.search(full_text)
    if not match:
        return None

    lines = match.group(1).split('\n')
    keywords = [keyword.lower() for keyword in start_keywords]

    # The abstract is assumed to start at the first long line (> 50 chars)
    # containing one of the keywords; title and author lines are skipped
    # because they are short or carry no keyword.
    start = next(
        (
            i for i, line in enumerate(lines)
            if len(line.strip()) > 50
            and any(keyword in line.lower() for keyword in keywords)
        ),
        None,
    )
    if start is None:
        return None

    # Collect lines until a blank line or an "Introduction" mention ends the
    # paragraph.
    collected = []
    for line in lines[start:]:
        if line.strip() and not line.startswith('1 ') and 'Introduction' not in line:
            collected.append(line.strip())
        elif collected and (line.strip() == '' or 'Introduction' in line):
            break

    return ' '.join(collected) if collected else None


def extract_abstract(pdf_path, start_keywords=None):
    """Extract the abstract of a paper from a PDF file.

    Strategies are tried in order:
      1. Text under an explicit "Abstract" heading that starts its own line.
      2. For papers without a heading: the paragraph before "Introduction"
         that starts at the first long line containing a start keyword.
      3. Fallback: the first 500 whitespace-separated words of the document.

    Args:
        pdf_path: Path of the PDF, passed to PyPDFLoader.
        start_keywords: Optional iterable of case-insensitive substrings used
            by strategy 2 to recognise the first abstract line. Defaults to
            the keywords tuned for the original LLM review paper.

    Returns:
        The extracted abstract as a string (empty if the PDF yields no text).
    """
    # Load PDF and merge all pages into one string
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)

    if start_keywords is None:
        start_keywords = _DEFAULT_START_KEYWORDS
    elif isinstance(start_keywords, str):
        # A bare string would otherwise be iterated character by character.
        start_keywords = (start_keywords,)

    abstract = _find_headed_abstract(full_text)
    if abstract is None:
        abstract = _find_unheaded_abstract(full_text, start_keywords)
    if abstract is None:
        # Final fallback
        abstract = " ".join(full_text.split()[:500])
    return abstract

# Load your paper
# NOTE(review): absolute, machine-specific path — change it to point at your
# own PDF before running.
pdf_path = "/Users/aimiegarces/Agents/d4sc03921a.pdf"
abstract = extract_abstract(pdf_path)

# Sanity check: total length plus the first 200 characters of the extracted text.
print(f"Abstract extracted: {len(abstract)} characters")
print(f"First 200 chars: {abstract[:200]}...")

Abstract extracted: 1269 characters
First 200 chars: Large language models (LLMs) have emerged as powerful tools in chemistry, signiﬁcantly impacting molecule design, property prediction, and synthesis optimization. This review highlights LLM capabiliti...


## Setup Local AI Model

In [14]:
# Initialize Ollama model
# NOTE(review): presumably requires a local Ollama server with the
# "llama3.1:8b" model already pulled — confirm before running.
llm = ChatOllama(
    model="llama3.1:8b",
    temperature=0.3  # presumably kept low for focused summaries — confirm
)

# NOTE(review): this only confirms the client object was constructed; whether
# the model is actually contacted at this point is not visible from this cell.
print("Model loaded successfully")

Model loaded successfully


## Create Summarization Chain

In [15]:
# Create prompt template
# `{abstract}` is the single template variable; it is filled from the dict
# passed to chain.invoke().
prompt = ChatPromptTemplate.from_template("""
Summarize this research abstract in 2-3 clear sentences:

{abstract}

Summary:
""")

# Build the chain: prompt → model → parser
# StrOutputParser is the last stage, so the chain's result is used as a plain
# string downstream.
chain = prompt | llm | StrOutputParser()

print("Chain created successfully")

Chain created successfully


## Generate Summary

In [None]:
# Run the chain: fills the prompt's {abstract} slot and returns the summary text
summary = chain.invoke({"abstract": abstract})

# Clean and professional output formatting
print("📄 RESEARCH PAPER ANALYSIS")
print("=" * 80)

# Report the size of the input and of the generated summary
print(f"📊 Abstract Length: {len(abstract)} characters")
print(f"📝 Summary Length: {len(summary)} characters")
print()

print("🔍 ORIGINAL ABSTRACT")
print("-" * 50)
# Re-wrap to 75 columns; textwrap.fill collapses existing line breaks, so the
# text is printed as a single paragraph. (`textwrap` is imported in the
# notebook's import cell.)
formatted_abstract = textwrap.fill(abstract, width=75)
print(formatted_abstract)
print()

print("🤖 AI SUMMARY")
print("-" * 50)
formatted_summary = textwrap.fill(summary, width=75)
print(formatted_summary)
print()

print("✅ Analysis Complete!")
print("=" * 80)

In [17]:
# DEBUG: Show what's actually in the PDF
# Reuses `pdf_path` from the extraction cell instead of repeating the literal
# path, so the debug view always inspects the same file that was summarised.
loader = PyPDFLoader(pdf_path)
docs = loader.load()
full_text = "\n".join([doc.page_content for doc in docs])

print("=== FIRST 1500 CHARACTERS OF YOUR PDF ===")
print(repr(full_text[:1500]))  # Using repr to see whitespace/newlines
print("=== END ===")

# Look for "Abstract" anywhere in the text (`re` comes from the import cell).
# Each entry is (match offset, context from 50 chars before to 200 chars after).
abstract_positions = []
for match in re.finditer(r'abstract', full_text, re.IGNORECASE):
    start = max(0, match.start() - 50)
    end = min(len(full_text), match.end() + 200)
    abstract_positions.append((match.start(), full_text[start:end]))

print(f"\nFound {len(abstract_positions)} instances of 'abstract':")
for i, (pos, context) in enumerate(abstract_positions):
    print(f"\n--- Instance {i+1} at position {pos} ---")
    print(repr(context))

=== FIRST 1500 CHARACTERS OF YOUR PDF ===
'A review of large language models and\nautonomous agents in chemistry\nMayk Caldas Ramos, ab Christopher J. Collison c and Andrew D. White *ab\nLarge language models (LLMs) have emerged as powerful tools in chemistry, signiﬁcantly impacting\nmolecule design, property prediction, and synthesis optimization. This review highlights LLM capabilities\nin these domains and their potential to accelerate scientiﬁc discovery through automation. We also\nreview LLM-based autonomous agents: LLMs with a broader set of tools to interact with their\nsurrounding environment. These agents perform diverse tasks such as paper scraping, interfacing with\nautomated laboratories, and synthesis planning. As agents are an emerging topic, we extend the scope\nof our review of agents beyond chemistry and discuss across any scientiﬁc domains. This review covers\nthe recent history, current capabilities, and design of LLMs and autonomous agents, addressing speciﬁc\nchal