# ScholarGenie: Paper Discovery Tutorial

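This notebook demonstrates how to use the PaperFinder agent to discover scientific papers from multiple sources: searching arXiv and Semantic Scholar, running a combined search, fetching a paper by DOI, and locating open-access PDFs.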

## Setup

In [None]:
import sys
# '..' is a relative entry, so it is resolved against the current working
# directory; this makes the `backend` package importable when the notebook
# is run from a folder directly below the project root (assumed layout —
# TODO confirm).
sys.path.append('..')

# PaperFinderAgent is used by the search cells that follow. PDFParserAgent is
# imported here but not called in any cell of this notebook.
from backend.agents.paper_finder import PaperFinderAgent
from backend.agents.pdf_parser import PDFParserAgent
import pandas as pd

## Initialize Agent

In [None]:
# Build the agent with no constructor arguments; later cells reuse this one
# `finder` instance for all lookups.
finder = PaperFinderAgent()
print("✅ PaperFinder agent initialized")

## Search arXiv

In [None]:
# Ask arXiv for transformer-attention papers, passing max_results=5.
# `query` is kept as a notebook-level name because later cells reuse it.
query = "attention mechanism in transformers"
results = finder.search_arxiv(query, max_results=5)

print(f"Found {len(results)} papers on arXiv:\n")

# Number the hits from 1 and print the three fields shown in this tutorial.
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit['title']}")
    print(f"   Year: {hit['year']}")
    print(f"   PDF: {hit['pdf_url']}")
    print()

## Search Semantic Scholar

In [None]:
# Run the same `query` against Semantic Scholar, again passing max_results=5.
results_ss = finder.search_semantic_scholar(query, max_results=5)

print(f"Found {len(results_ss)} papers on Semantic Scholar:\n")

# Optional fields are read with .get() so a record lacking them falls back to
# a default instead of raising.
for rank, hit in enumerate(results_ss, start=1):
    print(f"{rank}. {hit['title']}")
    print(f"   Citations: {hit.get('citation_count', 0)}")
    print(f"   Venue: {hit.get('venue', 'N/A')}")
    oa_mark = '✅' if hit.get('is_open_access') else '❌'
    print(f"   OA: {oa_mark}")
    print()

## Combined Search

In [None]:
# Search across all sources
all_results = finder.search(query, max_results=10)

# Convert to DataFrame for analysis (assumes each result is a dict-like
# record, as the earlier cells index them by key — TODO confirm).
df = pd.DataFrame(all_results)

print(f"Total unique papers: {len(df)}")

# An empty result list produces a DataFrame with no columns at all, so a bare
# df['is_open_access'] / df['source'] would raise KeyError. Guard the lookups
# the same way the year plot guards on 'year'.
if 'is_open_access' in df.columns:
    open_access_count = df['is_open_access'].sum()
else:
    # No record carried the flag, so nothing was reported as open access.
    open_access_count = 0
print(f"Open Access: {open_access_count}")

print("\nSources:")
if 'source' in df.columns:
    print(df['source'].value_counts())
else:
    print("(no source information available)")

In [None]:
# Display as table: the first ten rows, limited to the summary columns.
# reindex(columns=...) returns the same selection as df[[...]] when every
# column exists, but yields NaN-filled columns instead of raising KeyError
# when the search came back empty or a field is missing from the results.
df.reindex(columns=['title', 'year', 'source', 'is_open_access']).head(10)

## Get Paper by DOI

In [None]:
# Look up one specific paper through its DOI
doi = "10.48550/arXiv.1706.03762"  # "Attention Is All You Need"

paper = finder.get_paper_by_doi(doi)

# A falsy result is treated as "no such paper"; handle that case first.
if not paper:
    print("Paper not found")
else:
    print(f"Title: {paper['title']}")
    # Each author entry is indexed by 'name'; join them into one line.
    author_names = ', '.join(author['name'] for author in paper['authors'])
    print(f"Authors: {author_names}")
    print(f"Year: {paper['year']}")
    print(f"PDF URL: {paper.get('pdf_url', 'N/A')}")

## Find Open Access PDFs with Unpaywall

In [None]:
# Ask the finder for an open-access copy of a paper identified by DOI.
# Note: this rebinds the notebook-level `doi` set in the previous cell.
doi = "10.1038/nature14539"  # Example Nature paper

oa_url = finder.find_open_access_pdf(doi)

# A truthy return value is the PDF location; anything falsy means none found.
print(
    f"✅ Found Open Access PDF: {oa_url}"
    if oa_url
    else "❌ No Open Access version available"
)

## Visualize Results

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many papers in `df` fall in each year.
# The whole plot is skipped when the results carry no 'year' column.
if 'year' in df.columns:
    # Count rows per year, then order the bars chronologically.
    papers_per_year = df['year'].value_counts().sort_index()

    plt.figure(figsize=(10, 5))
    plt.bar(papers_per_year.index, papers_per_year.values)

    # Labelling calls are independent of one another.
    plt.title(f'Papers on "{query}" by Year')
    plt.xlabel('Year')
    plt.ylabel('Number of Papers')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

## Next Steps

Now that you've found papers, you can:

1. Parse PDFs with `PDFParserAgent`
2. Generate summaries with `SummarizerAgent`
3. Extract data with `ExtractorAgent`
4. Create presentations with `PresenterAgent`

See notebook `02_summarization_demo.ipynb` for the next steps!