# Agent Colab: Research Paper Entity Extraction Benchmark

This notebook sets up **Gemini 3 Pro Preview** as an autonomous agent to solve the Research Paper Entity Extraction and Citation Analysis benchmark.

**Requirements:**
- Google Colab Pro (for native Gemini access via `google.colab.ai`)

**Model Used:**
- `google/gemini-3-pro-preview` - Gemini 3 Pro Preview model

**Implementation:**
- Uses `google.colab.ai` module for native Colab Pro AI integration
- No external API keys required - uses Colab Pro's built-in AI capabilities
- Self-contained dataset generation (no file uploads needed)

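**Note:** This notebook runs end-to-end without manual intervention, but it must be executed from the Colab UI: `google.colab.ai` fetches its credentials as a Colab secret, and secrets cannot be fetched from headless runs.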

## Setup and Dependencies

In [7]:
# Install required packages
%pip install -q pandas networkx

In [8]:
from google.colab import ai
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from datetime import datetime
from typing import Dict, List, Any, Tuple
import re
import networkx as nx
import warnings
import unittest
warnings.filterwarnings('ignore')

# List available models in Colab Pro.
# `available_models` is kept at notebook level because the model-selection cell
# below checks membership against it.
# NOTE: the recorded output of this cell shows ai.list_models() failing with a
# TimeoutException ("Secrets can only be fetched when running from the Colab UI").
print("Available AI models in Colab Pro:")
available_models = ai.list_models()
for model in available_models:
    print(f"  - {model}")

Available AI models in Colab Pro:


TimeoutException: Requesting secret MODEL_PROXY_API_KEY timed out. Secrets can only be fetched when running from the Colab UI.

## Agent Configuration

Select `google/gemini-3-pro-preview` from the models available in Colab Pro, falling back to another Gemini model if it is not offered.

In [None]:
# Select the model for agentic tasks
# Using Gemini 3 Pro Preview - the most advanced reasoning model available
MODEL_NAME = "google/gemini-3-pro-preview"

# Verify the model is available
if MODEL_NAME in available_models:
    print(f"Model '{MODEL_NAME}' is available - SELECTED")
else:
    print(f"Warning: '{MODEL_NAME}' not found. Available models: {available_models}")
    # Fallback to other Pro/capable models, tried in order of preference
    fallback_order = ["google/gemini-2.5-pro", "google/gemini-2.0-flash", "google/gemini-2.5-flash"]
    for fallback in fallback_order:
        if fallback in available_models:
            MODEL_NAME = fallback
            print(f"Using fallback model: {MODEL_NAME}")
            break
    else:
        # No candidate matched: say so explicitly instead of silently reporting
        # an unavailable model as "selected". MODEL_NAME is left unchanged so the
        # generation call can still be attempted.
        print(
            f"Warning: none of the fallback models {fallback_order} are available either; "
            f"keeping '{MODEL_NAME}', which may fail at generation time."
        )

print(f"\nAgent model selected: {MODEL_NAME}")

Agent model initialized: gemini-3-pro


## Generate Dataset

Generate the synthetic benchmark dataset. This ensures the notebook is fully self-contained and reproducible.

In [None]:
# ============================================================================
# DATASET GENERATION - Self-contained synthetic data generation
# ============================================================================

import random
import csv
from datetime import timedelta

# Set seed for reproducibility
random.seed(42)

# Canonical authors with their name variations.
# Ground truth for the synthetic data, keyed by author id:
#   - "canonical_name": the resolved name
#   - "variations": alternative spellings that generate_papers() may emit instead
#     of the canonical name
#   - "institution": id of the author's entry in CANONICAL_INSTITUTIONS
# generate_author_affiliations() exposes only the first two variations of each
# author as "known_variations"; the remaining ones are not part of the
# reference data given to the agent.
CANONICAL_AUTHORS = {
    "auth_001": {
        "canonical_name": "John Smith",
        "variations": ["J. Smith", "John A. Smith", "J. A. Smith", "Smith, John"],
        "institution": "inst_001"
    },
    "auth_002": {
        "canonical_name": "Maria Garcia",
        "variations": ["M. Garcia", "Maria L. Garcia", "Garcia, Maria", "M. L. Garcia"],
        "institution": "inst_002"
    },
    "auth_003": {
        "canonical_name": "Wei Zhang",
        "variations": ["W. Zhang", "Wei W. Zhang", "Zhang, Wei", "Zhang Wei"],
        "institution": "inst_003"
    },
    "auth_004": {
        "canonical_name": "Emily Johnson",
        "variations": ["E. Johnson", "Emily R. Johnson", "Johnson, Emily", "E. R. Johnson"],
        "institution": "inst_001"
    },
    "auth_005": {
        "canonical_name": "Ahmed Hassan",
        "variations": ["A. Hassan", "Ahmed M. Hassan", "Hassan, Ahmed", "A. M. Hassan"],
        "institution": "inst_004"
    },
    "auth_006": {
        "canonical_name": "Sarah Williams",
        "variations": ["S. Williams", "Sarah K. Williams", "Williams, Sarah", "S. K. Williams"],
        "institution": "inst_002"
    },
    "auth_007": {
        "canonical_name": "Yuki Tanaka",
        "variations": ["Y. Tanaka", "Yuki S. Tanaka", "Tanaka, Yuki", "Tanaka Yuki"],
        "institution": "inst_005"
    },
    "auth_008": {
        "canonical_name": "Michael Brown",
        "variations": ["M. Brown", "Michael J. Brown", "Brown, Michael", "M. J. Brown"],
        "institution": "inst_003"
    },
    "auth_009": {
        "canonical_name": "Lisa Chen",
        "variations": ["L. Chen", "Lisa Y. Chen", "Chen, Lisa", "Chen Lisa"],
        "institution": "inst_004"
    },
    "auth_010": {
        "canonical_name": "David Miller",
        "variations": ["D. Miller", "David A. Miller", "Miller, David", "D. A. Miller"],
        "institution": "inst_005"
    }
}

# Canonical institutions with their variations.
# Keyed by institution id (the ids referenced from CANONICAL_AUTHORS[...]["institution"]).
# generate_papers() emits either the canonical name or one of "variations";
# generate_author_affiliations() exposes only the first two variations as
# "known_variations".
CANONICAL_INSTITUTIONS = {
    "inst_001": {
        "canonical_name": "Massachusetts Institute of Technology",
        "variations": ["MIT", "M.I.T.", "Massachusetts Inst. of Technology", "Mass. Institute of Technology"],
        "country": "USA"
    },
    "inst_002": {
        "canonical_name": "Stanford University",
        "variations": ["Stanford", "Stanford Univ.", "Stanford U.", "Leland Stanford Junior University"],
        "country": "USA"
    },
    "inst_003": {
        "canonical_name": "Tsinghua University",
        "variations": ["Tsinghua", "Tsinghua Univ.", "Qinghua University", "THU"],
        "country": "China"
    },
    "inst_004": {
        "canonical_name": "University of Oxford",
        "variations": ["Oxford", "Oxford Univ.", "Oxford University", "Univ. of Oxford"],
        "country": "UK"
    },
    "inst_005": {
        "canonical_name": "University of Tokyo",
        "variations": ["Tokyo Univ.", "UTokyo", "Tokyo University", "Univ. of Tokyo"],
        "country": "Japan"
    }
}

# Research topics and methods.
# RESEARCH_TOPICS: pool from which generate_papers() samples 2-4 entries per
# paper; the sample becomes the paper's "keywords" and its first entry is used
# in the title.
RESEARCH_TOPICS = [
    "machine learning", "deep learning", "neural networks", "natural language processing",
    "computer vision", "reinforcement learning", "transformer models", "attention mechanisms",
    "graph neural networks", "federated learning", "transfer learning", "meta-learning",
    "generative models", "adversarial learning", "explainable AI", "optimization",
    "representation learning", "self-supervised learning", "multi-task learning", "few-shot learning"
]

# RESEARCH_METHODS: pool from which generate_papers() samples 1-3 entries per
# paper; generate_abstract() mentions one of them in roughly half the abstracts.
RESEARCH_METHODS = [
    "gradient descent", "backpropagation", "stochastic optimization", "cross-validation",
    "ablation study", "hyperparameter tuning", "ensemble methods", "regularization",
    "dropout", "batch normalization", "attention mechanism", "skip connections",
    "data augmentation", "pre-training", "fine-tuning", "knowledge distillation"
]

# VENUES: one entry is chosen uniformly at random per paper.
VENUES = [
    "NeurIPS", "ICML", "ICLR", "AAAI", "CVPR", "ACL", "EMNLP", "NAACL",
    "ECCV", "ICCV", "KDD", "WWW", "SIGIR", "IJCAI", "UAI", "AISTATS"
]

# Abstract skeletons consumed by generate_abstract() via str.format().
# Adjacent string literals are implicitly concatenated, so this list holds
# exactly three templates. Placeholders are filled from TEMPLATE_FILLS, except
# {topic}, which generate_abstract() takes from the paper's own topics.
ABSTRACT_TEMPLATES = [
    "We propose {method}, a novel approach to {topic} that achieves state-of-the-art results on {benchmark}. "
    "Our method leverages {technique} to address the challenge of {challenge}. "
    "Experiments demonstrate {improvement}% improvement over previous baselines.",
    
    "This paper introduces {method} for {topic}. Unlike prior work that relies on {old_approach}, "
    "we utilize {technique} to capture {aspect}. Our approach shows significant improvements in {metric}.",
    
    "Recent advances in {topic} have shown promising results using {technique}. "
    "We address limitations by proposing {method}, which combines {component1} with {component2}. "
    "Comprehensive experiments demonstrate the effectiveness of our approach.",
]

# Candidate values for each template placeholder.
# generate_abstract() draws one value for EVERY key on each call, including keys
# the chosen template does not use, so adding, removing or reordering keys
# changes the random stream and therefore the generated dataset.
# The "method" list is also used by generate_papers() to build titles.
TEMPLATE_FILLS = {
    "method": ["DeepNet", "TransNet", "GraphFormer", "AttnNet", "MultiScale", "HierNet", "AdaptNet"],
    "benchmark": ["ImageNet", "COCO", "GLUE", "SQuAD", "WMT", "Citeseer", "PubMed"],
    "challenge": ["scalability", "generalization", "data efficiency", "computational cost"],
    "technique": ["self-attention", "graph convolution", "contrastive learning", "knowledge distillation"],
    "improvement": ["15", "23", "8", "31", "12", "19", "27"],
    "old_approach": ["hand-crafted features", "fixed architectures", "single-scale processing"],
    "aspect": ["semantic relationships", "hierarchical structure", "temporal dynamics"],
    "metric": ["accuracy", "F1 score", "BLEU score", "perplexity"],
    "component1": ["local attention", "global context", "residual connections"],
    "component2": ["positional encoding", "gating mechanisms", "skip connections"],
}


def generate_abstract(topics, methods):
    """Build one synthetic abstract from a random template.

    Args:
        topics: non-empty sequence of topic strings; one is picked for ``{topic}``.
        methods: sequence of method names; one is picked only when the optional
            closing sentence is emitted.

    Returns:
        The formatted abstract, followed about half the time by a sentence
        naming one of ``methods``.
    """
    chosen_template = random.choice(ABSTRACT_TEMPLATES)

    # Draw a value for every placeholder, used or not, in TEMPLATE_FILLS order.
    substitutions = {}
    for placeholder, candidates in TEMPLATE_FILLS.items():
        substitutions[placeholder] = random.choice(candidates)
    substitutions["topic"] = random.choice(topics)

    text = chosen_template.format(**substitutions)

    if random.random() <= 0.5:
        return text
    method = random.choice(methods)
    return f"{text} We employ {method} in our implementation."


def generate_papers(num_papers=100):
    """Generate synthetic paper metadata.

    Args:
        num_papers: number of paper records to create.

    Returns:
        A list of dicts, one per paper, with the keys "paper_id", "title",
        "authors", "institution", "abstract", "keywords", "venue", "year" and
        "publication_date".

    Notes:
        All randomness comes from the module-level ``random`` generator, so the
        result depends on its current state and on the exact order of the draws
        below.
        Records at indexes 5, 12, 45 and 67 are overwritten with deliberate
        data-quality edge cases; with fewer papers the corresponding cases are
        simply absent.
    """
    papers = []
    author_ids = list(CANONICAL_AUTHORS.keys())
    base_date = datetime(2020, 1, 1)
    
    for i in range(num_papers):
        paper_id = f"paper_{i:04d}"
        num_authors = random.randint(1, 4)
        # Distinct authors per paper (sampling without replacement)
        selected_author_ids = random.sample(author_ids, num_authors)
        
        # Use name variations for authors
        # (a variation when the draw exceeds 0.4, otherwise the canonical name)
        authors = []
        for aid in selected_author_ids:
            auth = CANONICAL_AUTHORS[aid]
            if random.random() > 0.4:
                authors.append(random.choice(auth["variations"]))
            else:
                authors.append(auth["canonical_name"])
        
        # Get institution with variations
        # The paper's institution is that of the FIRST sampled author only.
        primary_inst_id = CANONICAL_AUTHORS[selected_author_ids[0]]["institution"]
        inst = CANONICAL_INSTITUTIONS[primary_inst_id]
        institution = random.choice(inst["variations"]) if random.random() > 0.5 else inst["canonical_name"]
        
        paper_topics = random.sample(RESEARCH_TOPICS, random.randint(2, 4))
        paper_methods = random.sample(RESEARCH_METHODS, random.randint(1, 3))
        abstract = generate_abstract(paper_topics, paper_methods)
        
        # The title's method name is drawn independently of the one that
        # generate_abstract() picked, so title and abstract may name different methods.
        method_name = random.choice(TEMPLATE_FILLS["method"])
        main_topic = paper_topics[0].title()
        title_templates = [
            f"{method_name}: A Novel Approach to {main_topic}",
            f"Improving {main_topic} with {method_name}",
            f"{method_name} for Efficient {main_topic}",
        ]
        title = random.choice(title_templates)
        
        venue = random.choice(VENUES)
        # Publication dates fall within 1500 days after base_date.
        pub_date = base_date + timedelta(days=random.randint(0, 1500))
        
        paper = {
            "paper_id": paper_id,
            "title": title,
            "authors": authors,
            "institution": institution,
            "abstract": abstract,
            "keywords": paper_topics,
            "venue": venue,
            "year": pub_date.year,
            "publication_date": pub_date.strftime("%Y-%m-%d"),
        }
        
        # Edge cases
        # Injected after the random draws above, so they do not shift the random stream.
        if i == 5: paper["abstract"] = ""
        if i == 12: paper["keywords"] = []
        if i == 45: paper["institution"] = None
        if i == 67:
            # Duplicate author: append a variation of the first author a second time.
            dup_author = selected_author_ids[0]
            paper["authors"].append(CANONICAL_AUTHORS[dup_author]["variations"][0])
        
        papers.append(paper)
    
    return papers


def generate_citations(papers, density=0.05):
    """Create random citation edges between the given papers.

    Args:
        papers: iterable of dicts carrying at least "paper_id" and "year".
        density: approximate fraction of the eligible papers that each paper
            cites, before random jitter and clamping.

    Returns:
        A list of ``{"citing_paper": ..., "cited_paper": ...}`` dicts. A paper
        may only cite other papers whose year is not later than its own, and
        cites between 1 and 10 of them when any are eligible. Two fixed edge
        cases are always appended last: an orphan citation to "paper_9999" and
        a self-citation of "paper_0015".
    """
    ids = [record["paper_id"] for record in papers]
    year_of = {record["paper_id"]: record["year"] for record in papers}

    edges = []
    for source in ids:
        cutoff = year_of[source]
        candidates = [other for other in ids if year_of[other] <= cutoff and other != source]
        if not candidates:
            continue

        # Jitter the target count by +/-50%, then clamp to [1, min(len(candidates), 10)].
        jitter = random.uniform(0.5, 1.5)
        wanted = max(1, int(len(candidates) * density * jitter))
        wanted = min(wanted, len(candidates), 10)

        edges.extend(
            {"citing_paper": source, "cited_paper": target}
            for target in random.sample(candidates, wanted)
        )

    # Edge cases: orphan and self-citation
    edges += [
        {"citing_paper": "paper_0010", "cited_paper": "paper_9999"},
        {"citing_paper": "paper_0015", "cited_paper": "paper_0015"},
    ]
    return edges


def generate_author_affiliations():
    """Build the reference data handed to the agent for entity resolution.

    Returns:
        ``{"authors": {...}, "institutions": {...}}``, each keyed by id. Every
        entry carries the canonical name and only the first two name variations
        (as "known_variations"); authors also carry "primary_institution" and
        institutions carry "country".
    """
    author_entries = {
        author_id: {
            "canonical_name": record["canonical_name"],
            "known_variations": record["variations"][:2],
            "primary_institution": record["institution"],
        }
        for author_id, record in CANONICAL_AUTHORS.items()
    }

    institution_entries = {
        institution_id: {
            "canonical_name": record["canonical_name"],
            "known_variations": record["variations"][:2],
            "country": record["country"],
        }
        for institution_id, record in CANONICAL_INSTITUTIONS.items()
    }

    return {"authors": author_entries, "institutions": institution_entries}


# Generate the dataset: papers first, then citations derived from them, then
# the reference affiliations.
print("Generating synthetic dataset...")
papers_list = generate_papers(100)
citations_list = generate_citations(papers_list)
affiliations_data_gen = generate_author_affiliations()

# Expose the data under the names and types the benchmark prompt describes:
# a list of dicts, a DataFrame, and a dict-of-dicts.
papers_raw = papers_list
citations_raw = pd.DataFrame(citations_list)
affiliations_raw = affiliations_data_gen

# Report what was generated.
author_total = len(affiliations_raw.get('authors', {}))
institution_total = len(affiliations_raw.get('institutions', {}))
print("\n✓ Dataset generated successfully:")
print(f"  - Papers: {len(papers_raw)} records")
print(f"  - Citations: {len(citations_raw)} relationships")
print(f"  - Affiliations: {author_total} authors, {institution_total} institutions")

print("\nEdge cases included:")
for edge_case_line in (
    "  - Paper with missing abstract (paper_0005)",
    "  - Paper with missing keywords (paper_0012)",
    "  - Paper with missing institution (paper_0045)",
    "  - Paper with duplicate author entry (paper_0067)",
    "  - Orphan citation (paper_9999 doesn't exist)",
    "  - Self-citation (paper_0015 cites itself)",
):
    print(edge_case_line)

FileNotFoundError: [Errno 2] No such file or directory: 'papers_metadata.json'

## Benchmark Prompt

The task specification for the agent.

In [None]:
BENCHMARK_PROMPT = """
# Research Paper Entity Extraction and Citation Analysis Benchmark

## Scenario

You are a data scientist tasked with building an automated pipeline for analyzing research paper metadata. 
Your goal is to extract structured information from a collection of research papers, resolve entity ambiguities, 
construct a citation network, and produce a comprehensive analytical report.

You must decide for yourself how to decompose the task, which intermediate computations to perform, and in what order.
Do not simply follow a fixed step-by-step structure.

## Context

You have access to three data sources (already loaded in memory):

### Input Data Structures

**papers_raw** (list[dict]): List of ~100 paper records. Each paper dict has this schema:
- "paper_id": str (e.g., "paper_0001")
- "title": str
- "authors": list[str] (e.g., ["J. Smith", "Maria Garcia"])
- "institution": str or None (e.g., "MIT" or "Stanford University")
- "abstract": str (may be empty string "")
- "keywords": list[str] (e.g., ["machine learning", "neural networks"], may be empty [])
- "venue": str (e.g., "NeurIPS", "ICML")
- "year": int
- "publication_date": str (ISO format "YYYY-MM-DD")

**citations_raw** (pd.DataFrame): Citation relationships with columns:
- citing_paper: str (paper_id of the paper doing the citing)
- cited_paper: str (paper_id of the paper being cited)

**affiliations_raw** (dict): Reference data for entity resolution. 
IMPORTANT: Structure is a dict-of-dicts keyed by ID, NOT a list:
{
    "authors": {
        "auth_001": {"canonical_name": str, "known_variations": list[str], "primary_institution": str},
        "auth_002": {...},
        ...
    },
    "institutions": {
        "inst_001": {"canonical_name": str, "known_variations": list[str], "country": str},
        "inst_002": {...},
        ...
    }
}

To iterate over authors: for auth_id, auth_info in affiliations_raw["authors"].items()
To iterate over institutions: for inst_id, inst_info in affiliations_raw["institutions"].items()

### Data Challenges (Intentional)

The data contains edge cases you must handle:
- Author name variations: Same person appears as "John Smith", "J. Smith", "Smith, John"
- Institution name variations: Same institution appears as "MIT", "Massachusetts Institute of Technology"
- Missing fields: Some papers have empty abstract ("") or empty keywords ([])
- Orphan citations: Some citations reference paper_ids that don't exist in papers_raw
- Self-citations: Some papers cite themselves

## Required Output Variables

You must produce these variables:

### Core Data Variables
- papers_df: pd.DataFrame with columns: paper_id, title, authors, institution, abstract, keywords, venue, year, publication_date
- citations_df: pd.DataFrame with columns: citing_paper, cited_paper
- affiliations_data: dict with 'authors' and 'institutions' keys

### Entity Extraction Variables
- extracted_authors: list[dict] with keys: name, paper_ids, name_variations
- extracted_institutions: list[dict] with keys: name, paper_ids, name_variations
- extracted_topics: dict[str, int] mapping topics to frequency counts
- methods_from_abstracts: list[str] of research methods found

### Entity Resolution Variables
- author_resolution_map: dict[str, str] mapping variations to canonical names
- institution_resolution_map: dict[str, str] mapping variations to canonical names
- resolved_author_count: int
- resolved_institution_count: int

### Citation Network Variables
- citation_graph: dict[str, list[str]] adjacency list
- in_degree: dict[str, int] incoming citations per paper
- out_degree: dict[str, int] outgoing citations per paper
- pagerank_scores: dict[str, float] PageRank scores
- top_cited_papers: list[str] top 10 most cited paper IDs
- orphan_citations: list[dict] citations to non-existent papers
- self_citations: list[str] papers that cite themselves

### Validation Dictionary
- validation_results: dict[str, bool] with keys:
  - papers_loaded_ok, citations_loaded_ok, affiliations_loaded_ok
  - no_duplicate_paper_ids, authors_extracted, institutions_extracted
  - resolution_maps_valid, citation_graph_built, pagerank_computed
  - orphans_identified, self_citations_identified, all_pagerank_finite

### Summary Statistics
- summary_stats: dict with keys:
  - total_papers, total_citations, unique_authors_raw, unique_authors_resolved
  - unique_institutions_raw, unique_institutions_resolved
  - papers_with_missing_abstract, papers_with_missing_keywords
  - orphan_citation_count, self_citation_count, avg_citations_per_paper
  - most_common_venue, year_range

### Final Report
- final_report: dict with this EXACT structure:
{
    "metadata": {
        "task": "Research Paper Entity Extraction and Citation Analysis",
        "papers_analyzed": int,
        "execution_timestamp": str  # ISO format datetime
    },
    "entity_extraction": {
        "authors": {
            "total_unique": int,  # Number of unique resolved authors
            "top_5_by_paper_count": [{"name": str, "paper_count": int}, ...]  # Top 5 authors by paper count
        },
        "institutions": {
            "total_unique": int,
            "top_5_by_paper_count": [{"name": str, "paper_count": int}, ...]
        },
        "topics": {
            "total_unique": int,
            "top_10_by_frequency": [{"topic": str, "count": int}, ...]  # Top 10 topics
        }
    },
    "citation_analysis": {
        "total_citations": int,
        "top_10_cited_papers": [{"paper_id": str, "citation_count": int, "title": str}, ...],
        "orphan_citations": [{"citing_paper": str, "cited_paper": str}, ...],
        "self_citations": [str, ...],  # List of paper_ids
        "network_statistics": {
            "avg_in_degree": float,
            "avg_out_degree": float,
            "max_in_degree": int,
            "max_out_degree": int
        }
    },
    "data_quality": {
        "missing_abstracts": int,
        "missing_keywords": int,
        "missing_institutions": int,
        "duplicate_author_entries": int
    },
    "validation_summary": {
        "all_checks_passed": bool,
        "failed_checks": [str, ...]  # List of failed validation key names
    }
}

## Constraints
1. Do not hardcode specific paper IDs, author names, or institution names
2. Entity resolution must use fuzzy matching or reference data
3. PageRank must use damping factor 0.85
4. Handle edge cases gracefully

## Success Criteria
1. All validation checks pass
2. Entity resolution reduces author count
3. Orphan citations are identified (at least one exists)
4. Self-citations are identified (at least one exists)
5. PageRank scores sum to approximately 1.0
6. Final report follows exact schema
7. All numeric values are finite

Write complete Python code to solve this task. Store all results in the specified variable names.
"""

print("Benchmark prompt loaded")

## Agent Execution

In [None]:
def run_agent_task(prompt, data_context):
    """Run the agent using google.colab.ai to generate code for the task.

    Args:
        prompt: task description; it is appended after the data preview.
        data_context: dict with the keys 'papers' (indexable sequence of
            JSON-serialisable dicts), 'citations' (pandas DataFrame) and
            'affiliations' (dict with 'authors' and 'institutions' sub-dicts).
            The first element of each collection is embedded as a sample, so
            none of them may be empty.

    Returns:
        Whatever ``ai.generate_text`` returns for the assembled prompt; the
        calling cell treats it as a string.
    """
    
    # Prepare context with data samples
    # (sizes plus one sample record per source, followed by the task prompt)
    context = f"""
You have access to the following data (already loaded in Python):

papers_raw: A list of {len(data_context['papers'])} paper dictionaries
Sample: {json.dumps(data_context['papers'][0], indent=2)}

citations_raw: A pandas DataFrame with {len(data_context['citations'])} rows
Columns: {data_context['citations'].columns.tolist()}
Sample:
{data_context['citations'].head(3).to_string()}

affiliations_raw: A dictionary with author and institution reference data
Keys: {list(data_context['affiliations'].keys())}
Sample author: {json.dumps(list(data_context['affiliations']['authors'].values())[0], indent=2)}
Sample institution: {json.dumps(list(data_context['affiliations']['institutions'].values())[0], indent=2)}

{prompt}
"""
    
    print("Sending task to agent...")
    print("="*50)
    
    # Use google.colab.ai to generate response
    # The ai.generate_text function uses the Colab Pro's native AI capabilities
    # MODEL_NAME is the notebook-level name set by the model-selection cell.
    response = ai.generate_text(
        prompt=context,
        model_name=MODEL_NAME,
    )
    
    return response


# Bundle the in-memory inputs under the keys run_agent_task() reads.
data_context = dict(
    papers=papers_raw,
    citations=citations_raw,
    affiliations=affiliations_raw,
)

# Run the agent and show at most the first 2000 characters of its reply.
agent_response = run_agent_task(BENCHMARK_PROMPT, data_context)
print("Agent response received")
print("=" * 50)
if len(agent_response) > 2000:
    print(agent_response[:2000] + "...")
else:
    print(agent_response)

In [None]:
# Extract Python code from agent response and execute it
def _extract_code_blocks(response_text):
    """Return the bodies of the fenced code blocks in ``response_text``.

    Blocks tagged as Python are preferred. The tag is matched case-insensitively
    and may be ``python``, ``python3`` or ``py``, optionally surrounded by spaces
    and followed by either LF or CRLF. Untagged fences are used only when no
    Python-tagged fence exists.
    """
    tagged = re.findall(
        r'```[ \t]*(?:python3?|py)[ \t]*\r?\n(.*?)```',
        response_text,
        re.DOTALL | re.IGNORECASE,
    )
    if tagged:
        return tagged
    # Try without language specifier
    return re.findall(r'```[ \t]*\r?\n(.*?)```', response_text, re.DOTALL)


def extract_and_execute_code(response_text):
    """Extract Python code blocks from the response and execute them.

    Args:
        response_text: the agent's reply. ``None``, non-string and blank values
            are treated as "no code" instead of raising.

    Returns:
        The namespace dict the code ran in (inputs, helper modules and every
        name the code defined), or ``None`` when no code block was found or the
        code raised. Failures are reported on stdout/stderr, never re-raised.
    """
    if not isinstance(response_text, str) or not response_text.strip():
        print("No code blocks found in response")
        return None

    # Find all code blocks
    code_blocks = _extract_code_blocks(response_text)
    if not code_blocks:
        print("No code blocks found in response")
        return None

    # Combine all code blocks
    full_code = "\n\n".join(code_blocks)

    print(f"Extracted {len(code_blocks)} code block(s)")
    print("Executing agent code...")
    print("="*50)

    # Namespace for the generated code: the benchmark inputs plus the modules
    # and helpers the notebook already imported.
    exec_globals = {
        # Without an explicit __name__ the lookup falls through to the builtins
        # module ("builtins"), so `if __name__ == "__main__":` guards in the
        # generated code would silently skip their body.
        '__name__': '__main__',
        'papers_raw': papers_raw,
        'citations_raw': citations_raw,
        'affiliations_raw': affiliations_raw,
        'pd': pd,
        'np': np,
        'json': json,
        're': re,
        'nx': nx,
        'defaultdict': defaultdict,
        'Counter': Counter,
        'datetime': datetime,
        'Dict': Dict,
        'List': List,
        'Any': Any,
        'Tuple': Tuple,
    }

    try:
        # NOTE(security): this runs model-generated code with the kernel's full
        # privileges. Only use it in a disposable environment.
        exec(full_code, exec_globals)
        print("Code executed successfully!")
        return exec_globals
    except Exception as e:
        # Deliberately best-effort: report the failure and let the notebook
        # continue so later cells can show what is missing.
        print(f"Error executing code: {e}")
        import traceback
        traceback.print_exc()
        return None

# Names the benchmark prompt requires the agent's code to define.
required_vars = [
    'papers_df', 'citations_df', 'affiliations_data',
    'extracted_authors', 'extracted_institutions', 'extracted_topics', 'methods_from_abstracts',
    'author_resolution_map', 'institution_resolution_map', 'resolved_author_count', 'resolved_institution_count',
    'citation_graph', 'in_degree', 'out_degree', 'pagerank_scores', 'top_cited_papers',
    'orphan_citations', 'self_citations',
    'validation_results', 'summary_stats', 'final_report'
]

# Drop results left over from an earlier execution of this cell. Otherwise a
# failed or partial run would leave stale values in place, and the display,
# test and summary cells below would validate the previous run's output.
for var in required_vars:
    globals().pop(var, None)

# Execute the agent's code
exec_result = extract_and_execute_code(agent_response)

# If successful, extract variables to global scope
if exec_result:
    print("\nVariable extraction:")
    for var in required_vars:
        if var in exec_result:
            globals()[var] = exec_result[var]
            print(f"  ✓ {var}")
        else:
            print(f"  ✗ {var} (missing)")

## Agent Output

Display the results produced by the agent.

In [None]:
# Display the agent's outputs
# default=str is applied to both dumps: agent code often produces values that
# json cannot serialise natively (for example numpy booleans from pandas/numpy
# comparisons), and a TypeError here would not be caught by the NameError handler.
try:
    print("=== VALIDATION RESULTS ===")
    print(json.dumps(validation_results, indent=2, default=str))
    print("\n=== FINAL REPORT ===")
    print(json.dumps(final_report, indent=2, default=str))
except NameError as e:
    # Raised when the agent's code failed or did not define the variable.
    print(f"Variable not defined: {e}")
    print("Agent may not have completed the task successfully.")

---

# Unit Tests

Unit tests that check the presence, types, and basic invariants of the variables the agent was asked to produce.

In [None]:
class TestDataLoading(unittest.TestCase):
    """Tests for data loading functionality."""
    
    def test_papers_df_exists_and_not_empty(self):
        self.assertIsInstance(papers_df, pd.DataFrame)
        self.assertGreater(len(papers_df), 0)
    
    def test_papers_df_has_required_columns(self):
        required = {'paper_id', 'title', 'authors', 'institution', 
                   'abstract', 'keywords', 'venue', 'year', 'publication_date'}
        self.assertTrue(required.issubset(set(papers_df.columns)))
    
    def test_citations_df_exists_and_not_empty(self):
        self.assertIsInstance(citations_df, pd.DataFrame)
        self.assertGreater(len(citations_df), 0)
    
    def test_citations_df_has_required_columns(self):
        required = {'citing_paper', 'cited_paper'}
        self.assertTrue(required.issubset(set(citations_df.columns)))
    
    def test_affiliations_data_structure(self):
        self.assertIsInstance(affiliations_data, dict)
        self.assertIn('authors', affiliations_data)
        self.assertIn('institutions', affiliations_data)
    
    def test_no_duplicate_paper_ids(self):
        self.assertEqual(papers_df['paper_id'].nunique(), len(papers_df))


class TestEntityExtraction(unittest.TestCase):
    """Tests for entity extraction functionality.

    Reads the notebook-level variables ``extracted_authors``,
    ``extracted_institutions``, ``extracted_topics`` and
    ``methods_from_abstracts``. The checks cover presence and shape only, not
    the correctness of the extracted values.
    """
    
    def test_extracted_authors_not_empty(self):
        self.assertGreater(len(extracted_authors), 0)
    
    def test_extracted_authors_structure(self):
        # Every author record must expose the three keys named in the prompt.
        for author in extracted_authors:
            self.assertIn('name', author)
            self.assertIn('paper_ids', author)
            self.assertIn('name_variations', author)
    
    def test_extracted_institutions_not_empty(self):
        self.assertGreater(len(extracted_institutions), 0)
    
    def test_extracted_topics_is_dict(self):
        self.assertIsInstance(extracted_topics, dict)
    
    def test_methods_from_abstracts_is_list(self):
        self.assertIsInstance(methods_from_abstracts, list)


class TestEntityResolution(unittest.TestCase):
    """Tests for entity resolution functionality.

    Reads the notebook-level variables ``author_resolution_map``,
    ``institution_resolution_map``, ``resolved_author_count`` and
    ``resolved_institution_count``. Only non-emptiness and positivity are
    checked; whether resolution actually reduced the entity count is not
    asserted here.
    """
    
    def test_author_resolution_map_not_empty(self):
        self.assertGreater(len(author_resolution_map), 0)
    
    def test_institution_resolution_map_not_empty(self):
        self.assertGreater(len(institution_resolution_map), 0)
    
    def test_resolved_counts_are_positive(self):
        self.assertGreater(resolved_author_count, 0)
        self.assertGreater(resolved_institution_count, 0)


class TestCitationNetwork(unittest.TestCase):
    """Tests for citation network functionality.

    Reads the notebook-level variables ``citation_graph``, ``pagerank_scores``,
    ``orphan_citations`` and ``self_citations``.
    """
    
    def test_citation_graph_not_empty(self):
        self.assertGreater(len(citation_graph), 0)
    
    def test_pagerank_scores_not_empty(self):
        self.assertGreater(len(pagerank_scores), 0)
    
    def test_pagerank_scores_sum_to_one(self):
        # The scores must sum to 1.0 within an absolute tolerance of 0.01.
        total = sum(pagerank_scores.values())
        self.assertAlmostEqual(total, 1.0, delta=0.01)
    
    def test_pagerank_scores_are_finite(self):
        # Rejects NaN and +/-inf scores.
        for score in pagerank_scores.values():
            self.assertTrue(np.isfinite(score))
    
    def test_orphan_citations_identified(self):
        # The generated dataset always contains one citation to a missing paper.
        self.assertIsInstance(orphan_citations, list)
        self.assertGreater(len(orphan_citations), 0)
    
    def test_self_citations_identified(self):
        # The generated dataset always contains one self-citation.
        self.assertIsInstance(self_citations, list)
        self.assertGreater(len(self_citations), 0)


class TestValidationResults(unittest.TestCase):
    """Tests for validation results."""
    
    def test_validation_results_is_dict(self):
        self.assertIsInstance(validation_results, dict)
    
    def test_validation_results_has_required_keys(self):
        required_keys = {
            "papers_loaded_ok", "citations_loaded_ok", "affiliations_loaded_ok",
            "no_duplicate_paper_ids", "authors_extracted", "institutions_extracted",
            "resolution_maps_valid", "citation_graph_built", "pagerank_computed",
            "orphans_identified", "self_citations_identified", "all_pagerank_finite"
        }
        self.assertTrue(required_keys.issubset(set(validation_results.keys())))
    
    def test_all_validations_pass(self):
        failed = [k for k, v in validation_results.items() if not v]
        self.assertEqual(len(failed), 0, f"Failed validations: {failed}")


class TestSummaryStats(unittest.TestCase):
    """Tests for summary statistics."""
    
    def test_summary_stats_is_dict(self):
        self.assertIsInstance(summary_stats, dict)
    
    def test_summary_stats_has_required_keys(self):
        required_keys = {
            "total_papers", "total_citations", "unique_authors_raw",
            "unique_authors_resolved", "unique_institutions_raw",
            "unique_institutions_resolved", "papers_with_missing_abstract",
            "papers_with_missing_keywords", "orphan_citation_count",
            "self_citation_count", "avg_citations_per_paper",
            "most_common_venue", "year_range"
        }
        self.assertTrue(required_keys.issubset(set(summary_stats.keys())))


class TestFinalReport(unittest.TestCase):
    """Tests for final report structure.

    Reads the notebook-level ``final_report`` dict produced by the agent. Only
    the five top-level sections and the overall pass flag are checked; nested
    fields are not validated here.
    """
    
    def test_final_report_is_dict(self):
        self.assertIsInstance(final_report, dict)
    
    def test_final_report_has_metadata(self):
        self.assertIn('metadata', final_report)
    
    def test_final_report_has_entity_extraction(self):
        self.assertIn('entity_extraction', final_report)
    
    def test_final_report_has_citation_analysis(self):
        self.assertIn('citation_analysis', final_report)
    
    def test_final_report_has_data_quality(self):
        self.assertIn('data_quality', final_report)
    
    def test_final_report_has_validation_summary(self):
        self.assertIn('validation_summary', final_report)
    
    def test_all_checks_passed(self):
        # A missing section or key surfaces as an error (KeyError) rather than a failure.
        self.assertTrue(final_report['validation_summary']['all_checks_passed'])

In [None]:
# Run all unit tests
def run_tests():
    """Run every benchmark test case and print a short tally.

    Returns:
        The ``unittest`` result object from the run, so callers can inspect
        ``testsRun``, ``failures``, ``errors`` and ``wasSuccessful()``.
    """
    test_cases = (
        TestDataLoading,
        TestEntityExtraction,
        TestEntityResolution,
        TestCitationNetwork,
        TestValidationResults,
        TestSummaryStats,
        TestFinalReport,
    )

    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for case in test_cases:
        suite.addTests(loader.loadTestsFromTestCase(case))

    result = unittest.TextTestRunner(verbosity=2).run(suite)

    print("\n" + "="*50)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Success: {result.wasSuccessful()}")

    return result

# Execute tests
# Reset first so that, if run_tests() raises, the summary cell cannot report a
# result object left over from an earlier execution of this cell.
test_result = None
try:
    test_result = run_tests()
except Exception as e:
    print(f"Error running tests: {e}")
    print("Some required variables may not be defined.")

## Final Summary

In [None]:
# Final summary
# Prints a human-readable recap of the run. Everything after the banner sits in
# one try block: the first missing variable (for example because the agent's
# code failed) stops the recap and is reported by the handler at the bottom,
# after whatever lines were already printed.
print("="*60)
print("BENCHMARK EXECUTION SUMMARY")
print("="*60)

try:
    print(f"\nAgent Model: {MODEL_NAME}")
    print(f"Papers Analyzed: {len(papers_df)}")
    print(f"Citations Processed: {len(citations_df)}")
    print(f"\nEntity Resolution:")
    print(f"  Authors: {summary_stats.get('unique_authors_raw', 'N/A')} raw -> {resolved_author_count} resolved")
    print(f"  Institutions: {summary_stats.get('unique_institutions_raw', 'N/A')} raw -> {resolved_institution_count} resolved")
    print(f"\nCitation Network:")
    print(f"  Orphan citations found: {len(orphan_citations)}")
    print(f"  Self-citations found: {len(self_citations)}")
    print(f"  PageRank sum: {sum(pagerank_scores.values()):.4f}")
    print(f"\nValidation Summary:")
    # Any falsy entry in the agent's own validation dict counts as failed.
    failed = [k for k, v in validation_results.items() if not v]
    if failed:
        print(f"  FAILED checks: {failed}")
    else:
        print("  ALL CHECKS PASSED ✓")
    print(f"\nTest Results:")
    print(f"  Tests run: {test_result.testsRun}")
    print(f"  Failures: {len(test_result.failures)}")
    print(f"  Errors: {len(test_result.errors)}")
    
    # Success requires both a clean unit-test run and no failed agent-side checks.
    if test_result.wasSuccessful() and not failed:
        print("\n" + "="*60)
        print("✓ BENCHMARK COMPLETED SUCCESSFULLY!")
        print("="*60)
    else:
        print("\n" + "="*60)
        print("✗ BENCHMARK COMPLETED WITH ISSUES")
        print("="*60)
except Exception as e:
    print(f"\nError generating summary: {e}")