# ScholarGenie: Summarization & Analysis Demo

This notebook demonstrates the complete pipeline: finding papers, parsing PDFs, generating summaries, and creating presentations.

## Setup

In [None]:
import sys

# Put the parent directory on the import path so the `backend` package
# imports below can resolve (assumes the notebook is launched from a
# subdirectory of the project root). The membership check keeps re-running
# this cell from appending the same entry again.
if '..' not in sys.path:
    sys.path.append('..')

from backend.agents.paper_finder import PaperFinderAgent
from backend.agents.pdf_parser import PDFParserAgent
from backend.agents.summarizer import SummarizerAgent
from backend.agents.extractor import ExtractorAgent
from backend.agents.presenter import PresenterAgent
from backend.agents.evaluator import EvaluatorAgent

# Reaching this line means every agent import above succeeded.
print("✅ All agents imported successfully")

## Step 1: Find a Paper

In [None]:
# arXiv identifier of the famous "Attention Is All You Need" paper.
arxiv_id = "1706.03762"

# Look the paper up by its arXiv identifier; the returned mapping is reused
# by the parsing step.
finder = PaperFinderAgent()
paper_data = finder.get_paper_by_arxiv_id(arxiv_id)

# Echo the title and PDF location to confirm the lookup.
title, pdf_url = paper_data['title'], paper_data['pdf_url']
print(f"Found: {title}")
print(f"PDF URL: {pdf_url}")

## Step 2: Parse PDF

In [None]:
parser = PDFParserAgent()

# Download the PDF found in Step 1 and parse it; `paper` is the parsed
# result used by every later step of the notebook.
print("📄 Downloading and parsing PDF...")
paper = parser.download_and_parse(
    pdf_url=paper_data['pdf_url'],
    paper_id=paper_data['paper_id'],
)

# Report what the parser produced. Only the first three authors are listed
# to keep the line short.
print("\n✅ Parsed successfully!")
print(f"Title: {paper.title}")
print(f"Authors: {', '.join(a.name for a in paper.authors[:3])}")
print(f"Sections: {len(paper.sections)}")
print(f"References: {len(paper.references)}")
print(f"Figures: {len(paper.figures)}")

## Step 3: View Content

In [None]:
# Print the parsed abstract beneath its heading (one line each).
print("Abstract:", paper.abstract, sep="\n")

In [None]:
# List the title of every parsed section, numbered from 1.
print("Sections:")
for number, sec in enumerate(paper.sections, start=1):
    print(f"{number}. {sec.title}")

## Step 4: Generate Summary

In [None]:
summarizer = SummarizerAgent()

print("📝 Generating multi-granularity summary...")
print("(This may take a few minutes depending on your hardware)\n")

# `summary` is the object whose tldr, short_summary, full_summary and
# keypoints attributes are displayed in the following cells.
summary = summarizer.summarize_paper(paper)

print("✅ Summary generated!")

In [None]:
# Show the TL;DR, short and full summaries in turn, each under its own
# heading. `end="\n\n"` leaves one blank line before the next heading.
print("TL;DR:", summary.tldr, sep="\n", end="\n\n")
print("Short Summary:", summary.short_summary, sep="\n", end="\n\n")
print("Full Summary:", summary.full_summary, sep="\n")

In [None]:
# Print the summary's key points as a list numbered from 1.
print("Key Points:")
for number, keypoint in enumerate(summary.keypoints, start=1):
    print(f"{number}. {keypoint}")

## Step 5: Extract Structured Data

In [None]:
extractor = ExtractorAgent()

# `extracted` holds the methods, datasets, models and key findings that are
# displayed in the next cell and passed to the presenter later on.
print("🔍 Extracting structured data...")
extracted = extractor.extract(paper)

print("✅ Extraction complete!")

In [None]:
# Take a short preview of each extracted field: at most five methods,
# datasets and models, and at most three key findings.
methods_preview = extracted.methods[:5]
datasets_preview = extracted.datasets[:5]
models_preview = extracted.models[:5]
findings_preview = extracted.key_findings[:3]

print("Methods:", methods_preview)
print("\nDatasets:", datasets_preview)
print("\nModels:", models_preview)
print("\nKey Findings:")
for item in findings_preview:
    print(f"- {item}")

## Step 6: Evaluate Summary Quality

In [None]:
evaluator = EvaluatorAgent()

# `evaluation` is indexed by 'metrics', 'quality_checks' and 'warnings' in
# the cells that follow.
print("📊 Evaluating summary quality...")
evaluation = evaluator.evaluate_summary(paper, summary)

print("\n✅ Evaluation complete!")

In [None]:
# Print F1 / precision / recall for each ROUGE variant, to three decimals.
# Nothing is printed when the evaluation carries no 'rouge' entry.
metrics = evaluation['metrics']
if 'rouge' in metrics:
    print("ROUGE Scores:")
    for name, values in metrics['rouge'].items():
        f1 = values['fmeasure']
        precision = values['precision']
        recall = values['recall']
        print(f"{name}: F1={f1:.3f}, P={precision:.3f}, R={recall:.3f}")

In [None]:
def _mark(passed):
    """Return a check mark when *passed* is truthy, otherwise a cross."""
    return '✅' if passed else '❌'


# Show each quality check with its measured value (where one exists) and a
# pass/fail icon.
print("\nQuality Checks:")
qc = evaluation['quality_checks']

print(f"TL;DR length: {qc['tldr_length']['word_count']} words - {_mark(qc['tldr_length']['pass'])}")
print(f"Full summary length: {qc['full_summary_length']['word_count']} words - {_mark(qc['full_summary_length']['pass'])}")
print(f"Keypoints count: {qc['keypoints_count']['count']} - {_mark(qc['keypoints_count']['pass'])}")
print(f"Has methods: {_mark(qc['has_methods'])}")
print(f"Has results: {_mark(qc['has_results'])}")

In [None]:
# List every warning raised by the evaluation, or confirm that there were
# none.
if evaluation['warnings']:
    print("\nWarnings:")
    for warning in evaluation['warnings']:
        print(f"⚠️  {warning}")
else:
    print("\n✅ No warnings!")

## Step 7: Generate Presentation

In [None]:
presenter = PresenterAgent()

print("🎨 Generating PowerPoint presentation...")

# Build the slide deck from the parsed paper, its summary and the extracted
# data; the path returned by the presenter is echoed below.
pptx_path = presenter.generate_pptx(
    paper=paper,
    summary=summary,
    extracted_data=extracted,
    output_path="./attention_presentation.pptx",
)

print(f"\n✅ Presentation saved to: {pptx_path}")

## Step 8: Generate Report

In [None]:
print("📄 Generating Markdown report...")

# Reuses the `presenter` created in Step 7. `report_path` is read back by
# the preview cell that follows.
report_path = presenter.generate_markdown_report(
    paper=paper,
    summary=summary,
    extracted_data=extracted,
    output_path="./attention_report.md",
)

print(f"\n✅ Report saved to: {report_path}")

In [None]:
# Preview the beginning of the generated Markdown report.
from IPython.display import Markdown

# Maximum number of characters of the report shown in the preview.
PREVIEW_CHARS = 2000

with open(report_path, 'r', encoding='utf-8') as f:
    report_content = f.read()

# Only label the preview as truncated when text was actually cut off; a
# report shorter than the limit is shown in full with no marker.
preview = report_content[:PREVIEW_CHARS]
if len(report_content) > PREVIEW_CHARS:
    preview += "\n\n... (truncated)"

# Keep this as the cell's final expression so the notebook renders it.
Markdown(preview)

## Visualize Summary Statistics

In [None]:
import matplotlib.pyplot as plt


def _word_count(text):
    """Return the number of whitespace-separated words in *text*."""
    return len(text.split())


# Word count of each summary granularity; the key-point count is the total
# over all key points.
summary_lengths = {
    'TL;DR': _word_count(summary.tldr),
    'Short': _word_count(summary.short_summary),
    'Full': _word_count(summary.full_summary),
    'Keypoints': sum(map(_word_count, summary.keypoints)),
}

# Bar chart with one bar per summary type.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(summary_lengths.keys(), summary_lengths.values(), color='skyblue')
ax.set_xlabel('Summary Type')
ax.set_ylabel('Word Count')
ax.set_title('Summary Length Comparison')
fig.tight_layout()
plt.show()

## Conclusion

You've completed the full ScholarGenie pipeline! You can now:

1. ✅ Find papers from arXiv and Semantic Scholar
2. ✅ Parse PDFs with GROBID
3. ✅ Generate multi-level summaries
4. ✅ Extract structured data
5. ✅ Create presentations and reports
6. ✅ Evaluate summary quality

### Next Steps:

- Try with your own papers
- Experiment with different models in `config.yaml`
- Use the vector store for semantic search
- Batch process multiple papers
- Deploy with Docker for production use