# Agent Colab: Research Paper Entity Extraction Benchmark

This notebook sets up **Gemini 3 Pro Preview** as an autonomous agent to solve the Research Paper Entity Extraction and Citation Analysis benchmark.

**Requirements:**
- Google Colab Pro (for native Gemini access via `google.colab.ai`)

**Model Used:**
- `google/gemini-3-pro-preview` - Gemini 3 Pro Preview model

**Implementation:**
- Uses `google.colab.ai` module for native Colab Pro AI integration
- No external API keys required - uses Colab Pro's built-in AI capabilities
- Self-contained dataset generation (no file uploads needed)

**Note:** This notebook runs end-to-end without manual intervention, but it must be executed from the Colab UI so that `google.colab.ai` can fetch its credentials.

## Setup and Dependencies

In [7]:
# Install required packages
%pip install -q pandas networkx

In [8]:
from google.colab import ai
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from datetime import datetime
from typing import Dict, List, Any, Tuple
import re
import networkx as nx
import warnings
import unittest
warnings.filterwarnings('ignore')

# Query Colab Pro for the AI models this runtime can use and show one per line
print("Available AI models in Colab Pro:")
available_models = ai.list_models()
for model in available_models:
    print("  -", model)

Available AI models in Colab Pro:


TimeoutException: Requesting secret MODEL_PROXY_API_KEY timed out. Secrets can only be fetched when running from the Colab UI.

## Agent Configuration

Select Gemini 3 Pro Preview (`google/gemini-3-pro-preview`) from the available Colab Pro models, falling back to another Gemini model if it is not available.

In [None]:
# Select the model for agentic tasks.
# Preferred: Gemini 3 Pro Preview; otherwise the first available entry of
# `fallback_order` is used instead.
MODEL_NAME = "google/gemini-3-pro-preview"

# Verify the model is available
if MODEL_NAME in available_models:
    print(f"Model '{MODEL_NAME}' is available - SELECTED")
else:
    print(f"Warning: '{MODEL_NAME}' not found. Available models: {available_models}")
    # Fallback to other Pro/capable models, tried in this order
    fallback_order = ["google/gemini-2.5-pro", "google/gemini-2.0-flash", "google/gemini-2.5-flash"]
    for fallback in fallback_order:
        if fallback in available_models:
            MODEL_NAME = fallback
            print(f"Using fallback model: {MODEL_NAME}")
            break
    else:
        # Nothing matched: MODEL_NAME still names a model that was not listed,
        # so say so instead of silently reporting it as selected.
        print(f"Warning: no fallback model is available either; "
              f"keeping '{MODEL_NAME}', which may fail at generation time.")

print(f"\nAgent model selected: {MODEL_NAME}")

Agent model initialized: gemini-3-pro


## Generate Dataset

Generate the synthetic benchmark dataset. This ensures the notebook is fully self-contained and reproducible.

In [None]:
# ============================================================================
# ENHANCED DATASET GENERATION - With Headroom Challenges
# ============================================================================
import random
import csv
from datetime import timedelta

# Fixed seed so every run of this cell produces the same dataset
random.seed(42)

# Authors - including ambiguous ones (auth_011, auth_012)
CANONICAL_AUTHORS = {
    "auth_001": {"canonical_name": "John Smith", "variations": ["J. Smith", "John A. Smith"], "typos": ["Jonh Smith"], "institution": "inst_001"},
    "auth_002": {"canonical_name": "Maria Garcia", "variations": ["M. Garcia", "Maria L. Garcia"], "typos": ["Maria Gracia"], "institution": "inst_002"},
    "auth_003": {"canonical_name": "Wei Zhang", "variations": ["W. Zhang", "Zhang, Wei"], "typos": [], "institution": "inst_003"},
    "auth_004": {"canonical_name": "Emily Johnson", "variations": ["E. Johnson"], "typos": [], "institution": "inst_001"},
    "auth_005": {"canonical_name": "Ahmed Hassan", "variations": ["A. Hassan"], "typos": [], "institution": "inst_004"},
    "auth_006": {"canonical_name": "Sarah Williams", "variations": ["S. Williams"], "typos": [], "institution": "inst_002"},
    "auth_007": {"canonical_name": "Yuki Tanaka", "variations": ["Y. Tanaka"], "typos": [], "institution": "inst_005"},
    "auth_008": {"canonical_name": "Michael Brown", "variations": ["M. Brown"], "typos": [], "institution": "inst_003"},
    "auth_009": {"canonical_name": "Lisa Chen", "variations": ["L. Chen"], "typos": [], "institution": "inst_004"},
    "auth_010": {"canonical_name": "David Miller", "variations": ["D. Miller"], "typos": [], "institution": "inst_005"},
    # TRAP: Different person with same initials!
    "auth_011": {"canonical_name": "James Smith", "variations": ["J. Smith", "J. B. Smith"], "typos": [], "institution": "inst_004"},
    "auth_012": {"canonical_name": "Wei Zhang", "variations": ["W. Zhang", "W. X. Zhang"], "typos": [], "institution": "inst_002"},
}

CANONICAL_INSTITUTIONS = {
    "inst_001": {"canonical_name": "Massachusetts Institute of Technology", "variations": ["MIT", "M.I.T."], "typos": ["Massachusets Institute of Technology"], "country": "USA"},
    "inst_002": {"canonical_name": "Stanford University", "variations": ["Stanford", "Stanford Univ."], "typos": ["Standford University"], "country": "USA"},
    "inst_003": {"canonical_name": "Tsinghua University", "variations": ["Tsinghua", "THU"], "typos": [], "country": "China"},
    "inst_004": {"canonical_name": "University of Oxford", "variations": ["Oxford", "Oxford Univ."], "typos": [], "country": "UK"},
    "inst_005": {"canonical_name": "University of Tokyo", "variations": ["Tokyo Univ.", "UTokyo"], "typos": [], "country": "Japan"},
}

VENUES = {
    "neurips": {"canonical": "NeurIPS", "variations": ["NeurIPS", "NIPS", "Neural Information Processing Systems"]},
    "icml": {"canonical": "ICML", "variations": ["ICML", "International Conference on Machine Learning"]},
    "cvpr": {"canonical": "CVPR", "variations": ["CVPR", "IEEE/CVF CVPR"]},
    "acl": {"canonical": "ACL", "variations": ["ACL", "Annual Meeting of the ACL"]},
}

RESEARCH_TOPICS = ["machine learning", "deep learning", "neural networks", "natural language processing",
    "computer vision", "reinforcement learning", "transformer models", "attention mechanisms"]

# Papers wired into a circular citation pattern by generate_citations()
CITATION_RING_PAPERS = ["paper_0030", "paper_0031", "paper_0032", "paper_0033", "paper_0034"]
# 2023 papers that the 2021 paper_0040 is made to cite
TEMPORAL_ANOMALY_PAPERS = ["paper_0050", "paper_0051"]


def generate_papers(num_papers=100):
    """Build `num_papers` synthetic paper records with planted edge cases.

    Each record is a dict with keys paper_id, title, authors, institution,
    abstract, keywords, venue, year and publication_date. Specific indices are
    then overwritten to plant missing fields, ambiguous authors, typos, fixed
    publication dates, venue variants and a conflicting affiliation.

    The order of `random` calls determines the generated data for a given
    seed, so statements that draw random values must not be reordered.

    Returns:
        list[dict]: the generated paper records, in paper_id order.
    """
    papers = []
    author_ids = list(CANONICAL_AUTHORS.keys())
    base_date = datetime(2020, 1, 1)

    for i in range(num_papers):
        paper_id = f"paper_{i:04d}"
        num_authors = random.randint(1, 3)
        selected_ids = random.sample(author_ids, num_authors)

        # Each author is shown under a known variation about half the time
        authors = []
        for aid in selected_ids:
            auth = CANONICAL_AUTHORS[aid]
            if random.random() > 0.5 and auth["variations"]:
                authors.append(random.choice(auth["variations"]))
            else:
                authors.append(auth["canonical_name"])

        # The paper's institution follows its first author
        inst_id = CANONICAL_AUTHORS[selected_ids[0]]["institution"]
        inst = CANONICAL_INSTITUTIONS[inst_id]
        institution = random.choice(inst["variations"]) if random.random() > 0.5 else inst["canonical_name"]

        venue_key = random.choice(list(VENUES.keys()))
        venue = random.choice(VENUES[venue_key]["variations"])

        pub_date = base_date + timedelta(days=random.randint(0, 1500))

        paper = {
            "paper_id": paper_id, "title": f"Research on {random.choice(RESEARCH_TOPICS).title()}",
            "authors": authors, "institution": institution,
            "abstract": f"This paper presents research on {random.choice(RESEARCH_TOPICS)}.",
            "keywords": random.sample(RESEARCH_TOPICS, 2),
            "venue": venue, "year": pub_date.year,
            "publication_date": pub_date.strftime("%Y-%m-%d"),
        }

        # Basic edge cases
        if i == 5: paper["abstract"] = ""
        if i == 12: paper["keywords"] = []
        if i == 45: paper["institution"] = None

        # HEADROOM: Ambiguous J. Smith
        if i == 8:
            paper["authors"] = ["J. Smith", "Maria Garcia"]
            paper["institution"] = "MIT"
        if i == 9:
            paper["authors"] = ["J. Smith", "Ahmed Hassan"]
            paper["institution"] = "Oxford"

        # HEADROOM: Typos
        if i == 35:
            paper["authors"] = ["Jonh Smith", "Maria Gracia"]
            paper["institution"] = "Massachusets Institute of Technology"

        # Citation ring papers
        if paper_id in CITATION_RING_PAPERS:
            paper["year"] = 2022
            paper["publication_date"] = "2022-06-15"

        # Temporal anomaly targets
        if paper_id in TEMPORAL_ANOMALY_PAPERS:
            paper["year"] = 2023
            paper["publication_date"] = "2023-01-15"

        # paper_0040 is the 2021 paper that cites the 2023 targets
        if i == 40:
            paper["year"] = 2021
            paper["publication_date"] = "2021-03-01"

        # Venue disambiguation
        if i == 55: paper["venue"] = "NIPS"
        if i == 56: paper["venue"] = "NeurIPS"

        # Conflicting affiliation
        if i == 25:
            paper["authors"] = ["Maria Garcia"]
            paper["institution"] = "MIT"

        papers.append(paper)
    return papers


def generate_citations(papers):
    """Build citation edges for `papers`, then append the planted anomalies.

    Every paper cites up to three randomly chosen other papers whose year is
    not later than its own. One orphan citation, one self-citation, the
    citation ring edges and the temporally impossible citations are appended
    afterwards.

    Returns:
        list[dict]: dicts with keys "citing_paper" and "cited_paper".
    """
    citations = []
    paper_years = {p["paper_id"]: p["year"] for p in papers}
    paper_ids = [p["paper_id"] for p in papers]

    for citing in paper_ids:
        citable = [p for p in paper_ids if paper_years[p] <= paper_years[citing] and p != citing]
        if citable:
            for cited in random.sample(citable, min(3, len(citable))):
                citations.append({"citing_paper": citing, "cited_paper": cited})

    # Orphan and self-citation
    citations.append({"citing_paper": "paper_0010", "cited_paper": "paper_9999"})
    citations.append({"citing_paper": "paper_0015", "cited_paper": "paper_0015"})

    # Citation ring: a closed cycle plus two cross-citations inside it
    ring = CITATION_RING_PAPERS
    for i in range(len(ring)):
        citations.append({"citing_paper": ring[i], "cited_paper": ring[(i+1) % len(ring)]})
    citations.append({"citing_paper": ring[0], "cited_paper": ring[2]})
    citations.append({"citing_paper": ring[1], "cited_paper": ring[3]})

    # Temporal anomalies
    for future in TEMPORAL_ANOMALY_PAPERS:
        citations.append({"citing_paper": "paper_0040", "cited_paper": future})

    return citations


def generate_affiliations():
    """Build the reference data handed to the agent for entity resolution.

    The "typos" lists of the canonical tables are deliberately not exported.

    Returns:
        dict: keys "authors", "institutions", "disambiguation_notes" and
        "venue_notes".
    """
    affiliations = {"authors": {}, "institutions": {}, "disambiguation_notes": [], "venue_notes": []}
    for aid, auth in CANONICAL_AUTHORS.items():
        affiliations["authors"][aid] = {
            "canonical_name": auth["canonical_name"],
            "known_variations": auth["variations"],
            "primary_institution": auth["institution"]
        }
    for iid, inst in CANONICAL_INSTITUTIONS.items():
        affiliations["institutions"][iid] = {
            "canonical_name": inst["canonical_name"],
            "known_variations": inst["variations"],
            "country": inst["country"]
        }
    affiliations["disambiguation_notes"].append({"warning": "J. Smith at MIT (auth_001) is DIFFERENT from J. Smith at Oxford (auth_011)"})
    affiliations["venue_notes"].append("NIPS was renamed to NeurIPS in 2018")
    return affiliations


# Generate
print("Generating ENHANCED dataset with headroom challenges...")
papers_list = generate_papers(100)
citations_list = generate_citations(papers_list)
affiliations_data_gen = generate_affiliations()

# Names the benchmark prompt and the agent code expect
papers_raw = papers_list
citations_raw = pd.DataFrame(citations_list)
affiliations_raw = affiliations_data_gen

print(f"\n✓ Dataset generated:")
print(f"  Papers: {len(papers_raw)}, Citations: {len(citations_raw)}")
print(f"\n⚠️ HEADROOM CHALLENGES INCLUDED:")
print(f"  - Ambiguous authors (J. Smith at MIT vs J. Smith at Oxford)")
print(f"  - Typos (Jonh Smith, Maria Gracia)")
print(f"  - Citation ring ({len(CITATION_RING_PAPERS)} papers)")
print(f"  - Temporal anomalies (2021 paper cites 2023 papers)")
print(f"  - Venue variations (NIPS vs NeurIPS)")
print(f"  - Conflicting affiliations")

FileNotFoundError: [Errno 2] No such file or directory: 'papers_metadata.json'

## Benchmark Prompt

The task specification for the agent.

In [None]:
# Task specification sent to the agent. The text is markdown; `\\n` in the
# output-format example is escaped so the agent sees a valid print() call.
BENCHMARK_PROMPT = """# Research Paper Entity Extraction and Citation Analysis Benchmark
## (ENHANCED VERSION - Headroom Testing for Gemini 3 Pro)

## Scenario

You are a data scientist tasked with building an **advanced automated pipeline** for analyzing research paper metadata. Your goal is to extract structured information from a collection of research papers, resolve entity ambiguities (including challenging edge cases), detect anomalies in the citation network, and produce a comprehensive analytical report.

**You must decide for yourself how to decompose the task**, which intermediate computations to perform, and in what order. **Do not simply follow a fixed step-by-step structure.**

**This task contains deliberately challenging edge cases** that require careful reasoning to solve correctly.

---

## Context

You have access to three data sources (already loaded in memory):

### Input Data Structures

**`papers_raw`** (`list[dict]`): List of ~100 paper records. Each paper dict has this schema:
```python
{
    "paper_id": str,           # e.g., "paper_0001"
    "title": str,
    "authors": list[str],      # e.g., ["J. Smith", "Maria Garcia"]
    "institution": str | None, # e.g., "MIT" or "Stanford University"
    "abstract": str,           # May be empty string ""
    "keywords": list[str],     # e.g., ["machine learning", "neural networks"], may be []
    "venue": str,              # e.g., "NeurIPS", "ICML", "NIPS" (venue names may vary!)
    "year": int,
    "publication_date": str    # ISO format "YYYY-MM-DD"
}
```

**`citations_raw`** (`pd.DataFrame`): Citation relationships with columns:
- `citing_paper`: str (paper_id of the paper doing the citing)
- `cited_paper`: str (paper_id of the paper being cited)

**`affiliations_raw`** (`dict`): Reference data for entity resolution. **Structure is a dict-of-dicts keyed by ID**:
```python
{
    "authors": {
        "auth_001": {
            "canonical_name": str,       # e.g., "John Smith"
            "known_variations": list[str], # e.g., ["J. Smith", "John A. Smith"]
            "primary_institution": str   # Institution ID, e.g., "inst_001"
        },
        # ... more authors (NOTE: Some authors share initials but are DIFFERENT people!)
    },
    "institutions": {
        "inst_001": {
            "canonical_name": str,       # e.g., "Massachusetts Institute of Technology"
            "known_variations": list[str], # e.g., ["MIT", "M.I.T."]
            "country": str
        },
        # ... more institutions
    },
    "disambiguation_notes": list[dict],  # Hints about ambiguous entities
    "venue_notes": list[str]             # Notes about venue name changes
}
```

### Data Challenges (ENHANCED - Requires Advanced Reasoning)

The data contains **challenging edge cases** you must handle:

#### Basic Challenges (Standard)
- **Author name variations**: Same person appears as "John Smith", "J. Smith", "Smith, John"
- **Institution name variations**: Same institution appears as "MIT", "Massachusetts Institute of Technology"
- **Missing fields**: Some papers have empty abstract (`""`) or empty keywords (`[]`)
- **Orphan citations**: Some citations reference paper_ids that don't exist in papers_raw
- **Self-citations**: Some papers cite themselves

#### Advanced Challenges (Headroom Testing)

1. **⚠️ AMBIGUOUS AUTHORS**: Some authors share the same initials but are **DIFFERENT PEOPLE** at different institutions.
   - Example: "J. Smith" at MIT is **NOT** the same person as "J. Smith" at Oxford
   - You must use **institution context** to disambiguate
   - Naive merging will produce INCORRECT results

2. **⚠️ TYPOS/OCR ERRORS**: Some author and institution names contain typos.
   - Example: "Jonh Smith" should map to "John Smith"
   - Example: "Massachusets Institute of Technology" should map to "MIT"
   - You must use fuzzy matching or edit distance

3. **⚠️ CITATION RING DETECTION**: A subset of papers cite each other in a suspicious circular pattern.
   - Identify groups where: A→B→C→D→E→A (and cross-citations within)
   - These should be flagged as anomalous

4. **⚠️ TEMPORAL ANOMALIES**: Some citations violate temporal logic.
   - A paper from 2021 cannot cite a paper from 2023
   - Identify and flag these impossible citations

5. **⚠️ VENUE DISAMBIGUATION**: Venues may appear with different names.
   - "NIPS" → "NeurIPS" (renamed in 2018)
   - "CVPR" → "Conference on Computer Vision and Pattern Recognition"
   - These should be normalized to canonical forms

6. **⚠️ CONFLICTING AFFILIATIONS**: Some papers list an author with an incorrect institution.
   - Cross-reference with affiliations_raw to detect mismatches

---

## Requirements

You are free to choose the order and decomposition of the task, but your final implementation must produce all of the following variables.

### Required Variables

#### Data Variables
| Variable | Type | Description |
|----------|------|-------------|
| `papers_df` | `pd.DataFrame` | Papers data with columns: paper_id, title, authors, institution, abstract, keywords, venue, year, publication_date |
| `citations_df` | `pd.DataFrame` | Citation relationships with columns: citing_paper, cited_paper |
| `affiliations_data` | `dict` | Author affiliations reference data |

#### Entity Variables
| Variable | Type | Description |
|----------|------|-------------|
| `extracted_authors` | `list[dict]` | Each dict: `name`, `paper_ids`, `name_variations` |
| `extracted_institutions` | `list[dict]` | Each dict: `name`, `paper_ids`, `name_variations` |
| `extracted_topics` | `dict[str, int]` | Topic → frequency count |
| `methods_from_abstracts` | `list[str]` | Research methods found in abstracts |

#### Resolution Variables
| Variable | Type | Description |
|----------|------|-------------|
| `author_resolution_map` | `dict[str, str]` | Variation → canonical name |
| `institution_resolution_map` | `dict[str, str]` | Variation → canonical name |
| `resolved_author_count` | `int` | Unique authors after resolution |
| `resolved_institution_count` | `int` | Unique institutions after resolution |

#### Citation Network Variables
| Variable | Type | Description |
|----------|------|-------------|
| `citation_graph` | `dict[str, list[str]]` | Adjacency list |
| `in_degree` | `dict[str, int]` | Incoming citations per paper |
| `out_degree` | `dict[str, int]` | Outgoing citations per paper |
| `pagerank_scores` | `dict[str, float]` | PageRank centrality scores |
| `top_cited_papers` | `list[str]` | Top 10 most cited paper_ids |
| `orphan_citations` | `list[dict]` | Citations to non-existent papers |
| `self_citations` | `list[str]` | Paper_ids that cite themselves |

#### ⭐ NEW: Anomaly Detection Variables (Headroom)
| Variable | Type | Description |
|----------|------|-------------|
| `citation_ring_papers` | `list[str]` | Paper_ids involved in suspicious citation rings |
| `temporal_anomalies` | `list[dict]` | Citations where citing_paper.year < cited_paper.year. Each dict: `citing_paper`, `cited_paper`, `citing_year`, `cited_year` |
| `ambiguous_author_resolutions` | `list[dict]` | Cases where "J. Smith" was disambiguated. Each dict: `name_variation`, `resolved_to`, `institution_used`, `reasoning` |
| `typo_corrections` | `list[dict]` | Typos that were corrected. Each dict: `original`, `corrected`, `confidence` |
| `venue_normalizations` | `dict[str, str]` | Map of venue variations to canonical names |
| `affiliation_conflicts` | `list[dict]` | Papers where listed institution doesn't match author's known institution. Each dict: `paper_id`, `author`, `listed_institution`, `expected_institution` |

#### Validation Variables
```python
validation_results: dict[str, bool]
```

Required keys (ENHANCED):
| Key | What to Check |
|-----|---------------|
| `"papers_loaded_ok"` | papers_df has expected columns and >0 rows |
| `"citations_loaded_ok"` | citations_df has expected columns and >0 rows |
| `"affiliations_loaded_ok"` | affiliations_data is valid dict |
| `"no_duplicate_paper_ids"` | All paper_ids unique |
| `"authors_extracted"` | extracted_authors has >0 entries |
| `"institutions_extracted"` | extracted_institutions has >0 entries |
| `"resolution_maps_valid"` | Resolution maps are non-empty |
| `"citation_graph_built"` | citation_graph is non-empty |
| `"pagerank_computed"` | pagerank_scores is non-empty with floats |
| `"orphans_identified"` | Checked for orphan citations |
| `"self_citations_identified"` | Checked for self-citations |
| `"all_pagerank_finite"` | All PageRank values are finite |
| `"citation_rings_checked"` | ⭐ Checked for citation rings |
| `"temporal_anomalies_checked"` | ⭐ Checked for temporal violations |
| `"ambiguous_authors_handled"` | ⭐ Used institution context for disambiguation |
| `"typos_handled"` | ⭐ Applied fuzzy matching for typos |
| `"venues_normalized"` | ⭐ Normalized venue name variations |

#### Summary Statistics
`summary_stats: dict[str, Any]` with required keys:
| Key | Type | Description |
|-----|------|-------------|
| `"total_papers"` | `int` | Total papers |
| `"total_citations"` | `int` | Total citation relationships |
| `"unique_authors_raw"` | `int` | Before resolution |
| `"unique_authors_resolved"` | `int` | After resolution |
| `"unique_institutions_raw"` | `int` | Before resolution |
| `"unique_institutions_resolved"` | `int` | After resolution |
| `"papers_with_missing_abstract"` | `int` | Empty/null abstract |
| `"papers_with_missing_keywords"` | `int` | Empty/null keywords |
| `"orphan_citation_count"` | `int` | Orphan citations |
| `"self_citation_count"` | `int` | Self-citations |
| `"avg_citations_per_paper"` | `float` | Average outgoing citations |
| `"most_common_venue"` | `str` | Most frequent venue |
| `"year_range"` | `tuple[int, int]` | (min_year, max_year) |
| `"citation_ring_count"` | `int` | ⭐ Papers in citation rings |
| `"temporal_anomaly_count"` | `int` | ⭐ Temporal violations |
| `"typo_correction_count"` | `int` | ⭐ Typos corrected |
| `"affiliation_conflict_count"` | `int` | ⭐ Affiliation mismatches |

#### Final Report
```python
final_report: dict
```

Must have this structure:
```python
{
    "metadata": {
        "task": "Research Paper Entity Extraction and Citation Analysis",
        "papers_analyzed": int,
        "execution_timestamp": str
    },
    "entity_extraction": {
        "authors": {
            "total_unique": int,
            "top_5_by_paper_count": [{"name": str, "paper_count": int}, ...]
        },
        "institutions": {
            "total_unique": int,
            "top_5_by_paper_count": [{"name": str, "paper_count": int}, ...]
        },
        "topics": {
            "total_unique": int,
            "top_10_by_frequency": [{"topic": str, "count": int}, ...]
        }
    },
    "citation_analysis": {
        "total_citations": int,
        "top_10_cited_papers": [{"paper_id": str, "citation_count": int, "title": str}, ...],
        "orphan_citations": [{"citing_paper": str, "cited_paper": str}, ...],
        "self_citations": [str, ...],
        "network_statistics": {
            "avg_in_degree": float,
            "avg_out_degree": float,
            "max_in_degree": int,
            "max_out_degree": int
        }
    },
    "anomaly_detection": {  # ⭐ NEW SECTION
        "citation_rings": {
            "detected": bool,
            "papers_involved": [str, ...],
            "description": str
        },
        "temporal_anomalies": {
            "count": int,
            "examples": [{"citing": str, "cited": str, "issue": str}, ...]
        },
        "ambiguous_resolutions": [
            {"variation": str, "resolved_to": str, "method": str}, ...
        ],
        "typo_corrections": [
            {"original": str, "corrected": str}, ...
        ],
        "affiliation_conflicts": [
            {"paper_id": str, "author": str, "conflict": str}, ...
        ]
    },
    "data_quality": {
        "missing_abstracts": int,
        "missing_keywords": int,
        "missing_institutions": int,
        "duplicate_author_entries": int
    },
    "validation_summary": {
        "all_checks_passed": bool,
        "failed_checks": [str, ...]
    }
}
```

---

## Constraints

1. **Do not hardcode specific paper IDs, author names, or institution names**
2. **Entity resolution MUST use institution context for disambiguation** - "J. Smith" at MIT ≠ "J. Smith" at Oxford
3. **Typo handling MUST use fuzzy matching** (e.g., Levenshtein distance)
4. **PageRank with damping factor 0.85**
5. **Citation rings require cycle detection** in the citation graph
6. **Temporal anomalies require comparing publication years**
7. **All intermediate variables must be inspectable**
8. **Handle edge cases gracefully**

---

## Success Criteria

Your solution is successful if:

1. **All validation checks pass** (including new headroom checks)
2. **Entity resolution correctly disambiguates** "J. Smith" at different institutions as different people
3. **Citation rings are detected** (there is at least one ring of 5 papers)
4. **Temporal anomalies are detected** (there is at least one)
5. **Typos are corrected** with fuzzy matching
6. **Venue names are normalized** (NIPS → NeurIPS)
7. **Affiliation conflicts are identified**
8. **PageRank scores sum to ~1.0**
9. **Final report follows the exact schema**
10. **All numeric values are finite**

---

## Output Format

```python
import json

print("=== VALIDATION RESULTS ===")
print(json.dumps(validation_results, indent=2))
print("\\n=== FINAL REPORT ===")
print(json.dumps(final_report, indent=2, default=str))
```
"""

print("Enhanced benchmark prompt loaded")
print("Includes: Ambiguous author disambiguation, citation rings, temporal anomalies, typos, venue normalization")

## Agent Execution

In [None]:
def run_agent_task(prompt, data_context):
    """Ask the configured model for a solution to `prompt`.

    A short description of the in-memory data (sizes, column names and one
    sample record per source) is placed ahead of the task text, and the
    combined message is sent through `ai.generate_text` using `MODEL_NAME`.

    Args:
        prompt: Task specification appended after the data description.
        data_context: Mapping with keys 'papers', 'citations' and
            'affiliations' holding the loaded data.

    Returns:
        Whatever `ai.generate_text` returns for the combined message.
    """
    papers = data_context['papers']
    citations = data_context['citations']
    affiliations = data_context['affiliations']

    # One representative record from each source, pretty-printed
    paper_sample = json.dumps(papers[0], indent=2)
    author_sample = json.dumps(list(affiliations['authors'].values())[0], indent=2)
    institution_sample = json.dumps(list(affiliations['institutions'].values())[0], indent=2)

    context = f"""
You have access to the following data (already loaded in Python):

papers_raw: A list of {len(papers)} paper dictionaries
Sample: {paper_sample}

citations_raw: A pandas DataFrame with {len(citations)} rows
Columns: {citations.columns.tolist()}
Sample:
{citations.head(3).to_string()}

affiliations_raw: A dictionary with author and institution reference data
Keys: {list(affiliations.keys())}
Sample author: {author_sample}
Sample institution: {institution_sample}

{prompt}
"""

    print("Sending task to agent...")
    print("=" * 50)

    # Colab Pro's native AI integration performs the generation
    return ai.generate_text(prompt=context, model_name=MODEL_NAME)


# Bundle the generated dataset under the keys run_agent_task() reads
data_context = dict(
    papers=papers_raw,
    citations=citations_raw,
    affiliations=affiliations_raw,
)

# Send the benchmark to the agent and preview its answer
agent_response = run_agent_task(BENCHMARK_PROMPT, data_context)
print("Agent response received")
print("=" * 50)
# Only the first 2000 characters are shown; longer answers end with "..."
if len(agent_response) > 2000:
    print(agent_response[:2000] + "...")
else:
    print(agent_response)

In [None]:
# Extract Python code from agent response and execute it
def extract_and_execute_code(response_text):
    """Extract fenced Python code from the agent response and execute it.

    Blocks fenced as ```python are preferred; if there are none, blocks with a
    bare ``` fence are used. The fence line may carry trailing spaces or a
    CRLF line ending. All blocks are joined and executed once in a fresh
    namespace pre-populated with the dataset and common helper modules.

    Args:
        response_text: The agent's reply. Anything that is not a string is
            treated as a reply without code.

    Returns:
        The namespace dict after execution, or None when no code block was
        found or the code raised an exception (the traceback is printed).
    """
    # A missing or non-text reply cannot contain code; report it the same way
    # as a reply without fences instead of failing inside re.findall.
    if not isinstance(response_text, str):
        print("No code blocks found in response")
        return None

    # Find all code blocks
    code_blocks = re.findall(r'```(?i:python)[ \t]*\r?\n(.*?)```', response_text, re.DOTALL)

    if not code_blocks:
        # Try without language specifier
        code_blocks = re.findall(r'```[ \t]*\r?\n(.*?)```', response_text, re.DOTALL)

    if not code_blocks:
        print("No code blocks found in response")
        return None

    # Combine all code blocks
    full_code = "\n\n".join(code_blocks)

    print(f"Extracted {len(code_blocks)} code block(s)")
    print("Executing agent code...")
    print("="*50)

    # Names the agent code may rely on without importing them itself
    exec_globals = {
        'papers_raw': papers_raw,
        'citations_raw': citations_raw,
        'affiliations_raw': affiliations_raw,
        'pd': pd,
        'np': np,
        'json': json,
        're': re,
        'nx': nx,
        'defaultdict': defaultdict,
        'Counter': Counter,
        'datetime': datetime,
        'Dict': Dict,
        'List': List,
        'Any': Any,
        'Tuple': Tuple,
    }

    try:
        # SECURITY NOTE: this runs model-generated code with full access to the
        # runtime. Only use it in a disposable environment.
        exec(full_code, exec_globals)
        print("Code executed successfully!")
        return exec_globals
    except Exception as e:
        print(f"Error executing code: {e}")
        import traceback
        traceback.print_exc()
        return None

# Execute the agent's code
exec_result = extract_and_execute_code(agent_response)

# If successful, copy the variables the benchmark requires into the notebook
# namespace so the display and unit-test cells can read them.
if exec_result:
    required_vars = [
        'papers_df', 'citations_df', 'affiliations_data',
        'extracted_authors', 'extracted_institutions', 'extracted_topics', 'methods_from_abstracts',
        'author_resolution_map', 'institution_resolution_map', 'resolved_author_count', 'resolved_institution_count',
        'citation_graph', 'in_degree', 'out_degree', 'pagerank_scores', 'top_cited_papers',
        'orphan_citations', 'self_citations',
        # Anomaly-detection (headroom) variables required by the prompt and
        # read by the headroom unit tests
        'citation_ring_papers', 'temporal_anomalies', 'ambiguous_author_resolutions',
        'typo_corrections', 'venue_normalizations', 'affiliation_conflicts',
        'validation_results', 'summary_stats', 'final_report'
    ]

    print("\nVariable extraction:")
    for var in required_vars:
        if var in exec_result:
            globals()[var] = exec_result[var]
            print(f"  ✓ {var}")
        else:
            print(f"  ✗ {var} (missing)")

## Agent Output

Display the results produced by the agent.

In [None]:
# Display the agent's outputs
try:
    print("=== VALIDATION RESULTS ===")
    print(json.dumps(validation_results, indent=2))
    print("\n=== FINAL REPORT ===")
    print(json.dumps(final_report, indent=2, default=str))
except NameError as e:
    print(f"Variable not defined: {e}")
    print("Agent may not have completed the task successfully.")

---

# Unit Tests

Comprehensive tests to validate the agent's solution.

In [None]:
class TestDataLoading(unittest.TestCase):    def test_papers_df_exists_and_not_empty(self):        self.assertIsInstance(papers_df, pd.DataFrame)        self.assertGreater(len(papers_df), 0)        def test_papers_df_has_required_columns(self):        required = {'paper_id', 'title', 'authors', 'institution', 'abstract', 'keywords', 'venue', 'year'}        self.assertTrue(required.issubset(set(papers_df.columns)))        def test_citations_df_exists(self):        self.assertIsInstance(citations_df, pd.DataFrame)        self.assertGreater(len(citations_df), 0)        def test_affiliations_data_structure(self):        self.assertIsInstance(affiliations_data, dict)        self.assertIn('authors', affiliations_data)class TestEntityExtraction(unittest.TestCase):    def test_extracted_authors_not_empty(self):        self.assertGreater(len(extracted_authors), 0)        def test_extracted_institutions_not_empty(self):        self.assertGreater(len(extracted_institutions), 0)class TestEntityResolution(unittest.TestCase):    def test_author_resolution_map_not_empty(self):        self.assertGreater(len(author_resolution_map), 0)        def test_resolved_counts_positive(self):        self.assertGreater(resolved_author_count, 0)class TestCitationNetwork(unittest.TestCase):    def test_citation_graph_not_empty(self):        self.assertGreater(len(citation_graph), 0)        def test_pagerank_scores_sum_to_one(self):        self.assertAlmostEqual(sum(pagerank_scores.values()), 1.0, delta=0.01)        def test_orphan_citations_identified(self):        self.assertGreater(len(orphan_citations), 0)        def test_self_citations_identified(self):        self.assertGreater(len(self_citations), 0)class TestHeadroomChallenges(unittest.TestCase):    """Tests for ENHANCED headroom challenges - weaker models should fail these."""        def test_citation_rings_detected(self):        self.assertTrue(hasattr(globals().get('citation_ring_papers', None), '__len__') or                        
'citation_ring_papers' in dir(),                       "Must detect citation rings")        self.assertGreater(len(citation_ring_papers), 0, "Should find citation ring papers")        def test_temporal_anomalies_detected(self):        self.assertTrue('temporal_anomalies' in dir() or hasattr(globals().get('temporal_anomalies', None), '__len__'),                       "Must detect temporal anomalies")        self.assertGreater(len(temporal_anomalies), 0, "Should find temporal anomalies")        def test_typo_corrections_made(self):        self.assertTrue('typo_corrections' in dir() or hasattr(globals().get('typo_corrections', None), '__len__'),                       "Must correct typos")        def test_ambiguous_authors_disambiguated(self):        self.assertTrue('ambiguous_author_resolutions' in dir() or                        hasattr(globals().get('ambiguous_author_resolutions', None), '__len__'),                       "Must disambiguate authors like J. Smith")class TestValidationResults(unittest.TestCase):    def test_validation_results_is_dict(self):        self.assertIsInstance(validation_results, dict)        def test_headroom_validations_present(self):        self.assertIn("citation_rings_checked", validation_results)        self.assertIn("temporal_anomalies_checked", validation_results)class TestFinalReport(unittest.TestCase):    def test_final_report_has_anomaly_detection(self):        self.assertIn('anomaly_detection', final_report, "Report must include anomaly_detection section")        def test_final_report_has_citation_rings(self):        self.assertIn('citation_rings', final_report.get('anomaly_detection', {}))        def test_all_checks_passed(self):        self.assertTrue(final_report['validation_summary']['all_checks_passed'])

In [None]:
# Run all unit tests
def run_tests():
    """Run all unit tests and report results."""
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    
    suite.addTests(loader.loadTestsFromTestCase(TestDataLoading))
    suite.addTests(loader.loadTestsFromTestCase(TestEntityExtraction))
    suite.addTests(loader.loadTestsFromTestCase(TestEntityResolution))
    suite.addTests(loader.loadTestsFromTestCase(TestCitationNetwork))
    suite.addTests(loader.loadTestsFromTestCase(TestValidationResults))
    suite.addTests(loader.loadTestsFromTestCase(TestSummaryStats))
    suite.addTests(loader.loadTestsFromTestCase(TestFinalReport))
    
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)
    
    print("\n" + "="*50)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Success: {result.wasSuccessful()}")
    
    return result

# Execute tests
# Reset first so that a failed run cannot leave a stale result from an
# earlier execution of this cell behind for the summary cell to report.
test_result = None
try:
    test_result = run_tests()
except Exception as e:
    print(f"Error running tests: {e}")
    print("Some required variables may not be defined.")

## Final Summary

In [None]:
# Final summary
# Prints a banner, then a line-by-line recap of the run. Each line is printed
# as soon as its values are available, so a missing variable stops the recap
# at that point and the handler at the bottom reports the reason.
_rule = "=" * 60
print(_rule, "BENCHMARK EXECUTION SUMMARY", _rule, sep="\n")

try:
    print("\nAgent Model: {}".format(MODEL_NAME))
    print("Papers Analyzed: {}".format(len(papers_df)))
    print("Citations Processed: {}".format(len(citations_df)))

    print("\nEntity Resolution:")
    print("  Authors: {} raw -> {} resolved".format(
        summary_stats.get('unique_authors_raw', 'N/A'), resolved_author_count))
    print("  Institutions: {} raw -> {} resolved".format(
        summary_stats.get('unique_institutions_raw', 'N/A'), resolved_institution_count))

    print("\nCitation Network:")
    print("  Orphan citations found: {}".format(len(orphan_citations)))
    print("  Self-citations found: {}".format(len(self_citations)))
    print("  PageRank sum: {:.4f}".format(sum(pagerank_scores.values())))

    print("\nValidation Summary:")
    # Names of every validation entry whose value is falsy.
    failed = [check for check, outcome in validation_results.items() if not outcome]
    if not failed:
        print("  ALL CHECKS PASSED ✓")
    else:
        print(f"  FAILED checks: {failed}")

    print("\nTest Results:")
    print("  Tests run: {}".format(test_result.testsRun))
    print("  Failures: {}".format(len(test_result.failures)))
    print("  Errors: {}".format(len(test_result.errors)))

    # Overall verdict: the unit tests and the agent's own validation must both be clean.
    if test_result.wasSuccessful() and not failed:
        _verdict = "✓ BENCHMARK COMPLETED SUCCESSFULLY!"
    else:
        _verdict = "✗ BENCHMARK COMPLETED WITH ISSUES"
    print("\n" + _rule)
    print(_verdict)
    print(_rule)
except Exception as e:
    print(f"\nError generating summary: {e}")