# Paper Collection Notebook

This notebook collects academic papers from Semantic Scholar (and, optionally, arXiv — the arXiv search is commented out by default) based on research areas defined in `config.yaml`.

## Workflow
1. Load configuration
2. Query APIs for each research area
3. Deduplicate and merge results
4. Save results to `data/raw/` as JSON and CSV

In [None]:
# Setup
import sys
import os
from pathlib import Path

# Add parent directory to path
sys.path.append('..')

import yaml
import json
import logging
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import pandas as pd

from utils.paper_collector import PaperCollector

# Populate the process environment via python-dotenv (presumably from a local
# .env file) before any cell reads settings with os.getenv.
load_dotenv()

# Configure root logging once: INFO and above, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Notebook-wide logger used by the cells below for error reporting.
logger = logging.getLogger(__name__)

print("‚úÖ Setup complete!")

In [None]:
# Load configuration
# Read the YAML explicitly as UTF-8: the default text encoding of open() is
# locale-dependent, so non-ASCII keywords could otherwise decode differently
# (or fail) depending on the machine running the notebook.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty document; normalise that to an empty
    # mapping so the check below reports the real problem.
    config = yaml.safe_load(f) or {}

if 'research_areas' not in config:
    raise KeyError("'research_areas' section is missing from ../config.yaml")

# Mapping of area name -> settings; each entry is expected to provide
# 'keywords' (a sized collection) and 'max_papers', both read below.
research_areas = config['research_areas']

print(f"üìö Research Areas: {len(research_areas)}")
for area_name, area_config in research_areas.items():
    print(f"  - {area_name}: {len(area_config['keywords'])} keywords, max {area_config['max_papers']} papers")

In [None]:
# Initialize collector
# The API key is optional: os.getenv yields None when the variable is unset,
# and that value is handed to PaperCollector unchanged.
api_key = os.getenv('SEMANTIC_SCHOLAR_API_KEY')
collector = PaperCollector(api_key=api_key, cache_dir='../cache')

print("‚úÖ PaperCollector initialized")
# Report which mode we are in so the user knows what rate limits to expect.
print(
    "   Using Semantic Scholar API key (higher rate limits)"
    if api_key
    else "   ‚ö†Ô∏è  No API key - using default rate limits"
)

In [None]:
# Collect papers for each research area
# Maps area name -> de-duplicated list of paper dicts returned by the collector.
all_papers_by_area = {}

for area_name, area_config in tqdm(research_areas.items(), desc="Research Areas"):
    print(f"\n{'='*60}")
    print(f"üìñ Collecting papers for: {area_name}")
    print(f"{'='*60}")

    area_papers = []
    keywords = area_config['keywords']
    max_papers = area_config['max_papers']

    if not keywords or max_papers <= 0:
        # Nothing to query. This also prevents the ZeroDivisionError that an
        # empty keyword list would trigger in the budget split below.
        logger.warning(
            "Skipping area %r: no keywords or non-positive max_papers", area_name
        )
        all_papers_by_area[area_name] = area_papers
        continue

    # Split the budget evenly across keywords, but always ask for at least one
    # paper: plain integer division gives a limit of 0 whenever
    # max_papers < len(keywords).
    papers_per_keyword = max(1, max_papers // len(keywords))

    for keyword in keywords:
        print(f"\nüîç Searching: '{keyword}'")

        # Search Semantic Scholar
        # Best-effort: a failure for one keyword is logged and the loop moves
        # on to the next keyword instead of aborting the whole collection.
        try:
            papers = collector.search_semantic_scholar(keyword, limit=papers_per_keyword)
            area_papers.extend(papers)
            print(f"   Found {len(papers)} papers from Semantic Scholar")
        except Exception as e:
            logger.error("Error searching Semantic Scholar: %s", e)

        # Optional: Also search arXiv
        # try:
        #     arxiv_papers = collector.search_arxiv(keyword, max_results=papers_per_keyword//2)
        #     area_papers.extend(arxiv_papers)
        #     print(f"   Found {len(arxiv_papers)} papers from arXiv")
        # except Exception as e:
        #     logger.error(f"Error searching arXiv: {e}")

    # Remove duplicates (first occurrence of each identifier wins)
    unique_papers = {}
    missing_id_count = 0
    for paper in area_papers:
        # Use `or` rather than a .get() default so we also fall back to 'id'
        # when 'paperId' is present but None/empty.
        paper_id = paper.get('paperId') or paper.get('id')
        if not paper_id:
            missing_id_count += 1
            continue
        unique_papers.setdefault(paper_id, paper)

    if missing_id_count:
        # Papers without any identifier cannot be de-duplicated and are
        # dropped; surface that instead of losing them silently.
        logger.warning(
            "Dropped %d paper(s) without an identifier in area %r",
            missing_id_count,
            area_name,
        )

    # Enforce the configured cap; it can only be exceeded when the per-keyword
    # limit was raised to 1 above.
    area_papers = list(unique_papers.values())[:max_papers]
    all_papers_by_area[area_name] = area_papers

    print(f"\n‚úÖ Total unique papers for {area_name}: {len(area_papers)}")

print(f"\n{'='*60}")
print("üéâ Collection complete!")
print(f"{'='*60}")

In [None]:
# Summary statistics
# Overall count is the sum of the per-area list lengths.
total_papers = sum(map(len, all_papers_by_area.values()))

print("üìä Collection Summary:")
print(f"\nTotal papers collected: {total_papers}")
print("\nBy research area:")

for area_name, papers in all_papers_by_area.items():
    print(f"  {area_name:20s}: {len(papers):4d} papers")

# Create DataFrame for analysis
# Flatten to one record per paper; each record is a shallow copy tagged with
# the area it was collected under, so the per-area dicts stay untouched.
all_papers_flat = [
    {**paper, 'research_area': area_name}
    for area_name, papers in all_papers_by_area.items()
    for paper in papers
]

df = pd.DataFrame(all_papers_flat)

print(f"\nüìà DataFrame shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

# Both columns are optional, so each report is guarded separately.
if 'year' in df.columns:
    years = df['year']
    print(f"\nYear range: {years.min()} - {years.max()}")

if 'citationCount' in df.columns:
    citations = df['citationCount']
    print("\nCitation statistics:")
    print(f"  Mean: {citations.mean():.1f}")
    print(f"  Median: {citations.median():.1f}")
    print(f"  Max: {citations.max():.0f}")

In [None]:
# Save results
# All artefacts go under ../data/raw, created on demand.
output_dir = Path('../data/raw')
output_dir.mkdir(parents=True, exist_ok=True)

# JSON exports: (file name, object to serialise, confirmation prefix).
# First the per-area mapping, then the flat list of tagged records.
json_exports = (
    ('papers_by_area.json', all_papers_by_area, "‚úÖ Saved papers by area to: "),
    ('papers_metadata.json', all_papers_flat, "‚úÖ Saved all papers metadata to: "),
)
for file_name, payload, saved_prefix in json_exports:
    output_file = output_dir / file_name
    with output_file.open('w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)

    print(f"{saved_prefix}{output_file}")

# Save DataFrame as CSV (index column omitted)
output_file = output_dir / 'papers_metadata.csv'
df.to_csv(output_file, index=False, encoding='utf-8')

print(f"‚úÖ Saved DataFrame to: {output_file}")

print("\nüéâ All data saved successfully!")

In [None]:
# Preview top papers by citations
# Only runs when the two columns the preview cannot do without are present.
if 'citationCount' in df.columns and 'title' in df.columns:
    # Select just the preview columns that actually exist: indexing with a
    # missing label (e.g. no 'year' in the collected metadata) raises KeyError.
    preview_columns = [
        column
        for column in ('title', 'year', 'citationCount', 'research_area')
        if column in df.columns
    ]
    top_papers = df.nlargest(10, 'citationCount')[preview_columns]
    print("\nüèÜ Top 10 Most Cited Papers:")
    print("\n" + top_papers.to_string(index=False))