# Golden Solution: Research Paper Entity Extraction and Citation Analysis

This notebook contains the **reference implementation** for the benchmark task.
It demonstrates that the task is solvable and provides the expected outputs.

**Features:**
- Self-contained dataset generation (no file uploads needed)
- Runs end-to-end without manual intervention
- Passes all unit tests

**Note:** This is the golden solution - it should pass all unit tests.

## Setup and Imports

In [None]:
# Install required packages
%pip install -q pandas networkx

In [None]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from datetime import datetime, timedelta
from typing import Dict, List, Any, Tuple
import re
import networkx as nx
import warnings
import unittest
import random

warnings.filterwarnings('ignore')

## Generate Dataset

Generate the synthetic benchmark dataset. This ensures the notebook is fully self-contained.

In [None]:
# ============================================================================
# DATASET GENERATION - Self-contained synthetic data generation
# ============================================================================

# Seed the module-level RNG so the generators below draw the same sequence
# on every run and therefore produce the same dataset.
random.seed(42)

# Ground-truth authors keyed by author ID. Each entry holds the canonical
# name, four alternative spellings of it, and the ID of the author's
# institution (a key of CANONICAL_INSTITUTIONS).
CANONICAL_AUTHORS = {
    "auth_001": {"canonical_name": "John Smith", "variations": ["J. Smith", "John A. Smith", "J. A. Smith", "Smith, John"], "institution": "inst_001"},
    "auth_002": {"canonical_name": "Maria Garcia", "variations": ["M. Garcia", "Maria L. Garcia", "Garcia, Maria", "M. L. Garcia"], "institution": "inst_002"},
    "auth_003": {"canonical_name": "Wei Zhang", "variations": ["W. Zhang", "Wei W. Zhang", "Zhang, Wei", "Zhang Wei"], "institution": "inst_003"},
    "auth_004": {"canonical_name": "Emily Johnson", "variations": ["E. Johnson", "Emily R. Johnson", "Johnson, Emily", "E. R. Johnson"], "institution": "inst_001"},
    "auth_005": {"canonical_name": "Ahmed Hassan", "variations": ["A. Hassan", "Ahmed M. Hassan", "Hassan, Ahmed", "A. M. Hassan"], "institution": "inst_004"},
    "auth_006": {"canonical_name": "Sarah Williams", "variations": ["S. Williams", "Sarah K. Williams", "Williams, Sarah", "S. K. Williams"], "institution": "inst_002"},
    "auth_007": {"canonical_name": "Yuki Tanaka", "variations": ["Y. Tanaka", "Yuki S. Tanaka", "Tanaka, Yuki", "Tanaka Yuki"], "institution": "inst_005"},
    "auth_008": {"canonical_name": "Michael Brown", "variations": ["M. Brown", "Michael J. Brown", "Brown, Michael", "M. J. Brown"], "institution": "inst_003"},
    "auth_009": {"canonical_name": "Lisa Chen", "variations": ["L. Chen", "Lisa Y. Chen", "Chen, Lisa", "Chen Lisa"], "institution": "inst_004"},
    "auth_010": {"canonical_name": "David Miller", "variations": ["D. Miller", "David A. Miller", "Miller, David", "D. A. Miller"], "institution": "inst_005"}
}

# Ground-truth institutions keyed by institution ID. Each entry holds the
# canonical name, three alternative spellings, and a country.
CANONICAL_INSTITUTIONS = {
    "inst_001": {"canonical_name": "Massachusetts Institute of Technology", "variations": ["MIT", "M.I.T.", "Massachusetts Inst. of Technology"], "country": "USA"},
    "inst_002": {"canonical_name": "Stanford University", "variations": ["Stanford", "Stanford Univ.", "Stanford U."], "country": "USA"},
    "inst_003": {"canonical_name": "Tsinghua University", "variations": ["Tsinghua", "Tsinghua Univ.", "THU"], "country": "China"},
    "inst_004": {"canonical_name": "University of Oxford", "variations": ["Oxford", "Oxford Univ.", "Univ. of Oxford"], "country": "UK"},
    "inst_005": {"canonical_name": "University of Tokyo", "variations": ["Tokyo Univ.", "UTokyo", "Univ. of Tokyo"], "country": "Japan"}
}

# Topic phrases; generate_papers samples 2-4 of them as a paper's keywords
# and uses the first one in the title and abstract.
RESEARCH_TOPICS = ["machine learning", "deep learning", "neural networks", "natural language processing",
    "computer vision", "reinforcement learning", "transformer models", "attention mechanisms",
    "graph neural networks", "federated learning", "transfer learning", "meta-learning",
    "generative models", "adversarial learning", "explainable AI", "optimization"]

# Method phrases; generate_papers embeds one randomly chosen phrase in each
# generated abstract.
RESEARCH_METHODS = ["gradient descent", "backpropagation", "stochastic optimization", "cross-validation",
    "ablation study", "hyperparameter tuning", "ensemble methods", "regularization",
    "dropout", "batch normalization", "attention mechanism", "skip connections"]

# Venue names; generate_papers assigns one at random to each paper.
VENUES = ["NeurIPS", "ICML", "ICLR", "AAAI", "CVPR", "ACL", "EMNLP", "KDD", "WWW", "IJCAI"]

def generate_papers(num_papers=100):
    """Generate synthetic paper records with noisy author and institution names.

    Each paper gets 1-4 distinct authors drawn from CANONICAL_AUTHORS. Every
    author is written either under the canonical name or under one of the
    listed variations. The paper's institution is that of the first selected
    author, again written canonically or as a variation.

    Four fixed indices carry deliberate data-quality edge cases:
    index 5 has an empty abstract, index 12 an empty keyword list, index 45
    a ``None`` institution, and index 67 lists its first author a second
    time (under that author's first variation).

    Draws from the module-level ``random`` generator, so results depend on
    its current state.

    Args:
        num_papers: Number of paper records to generate.

    Returns:
        A list of dicts with keys ``paper_id``, ``title``, ``authors``,
        ``institution``, ``abstract``, ``keywords``, ``venue``, ``year`` and
        ``publication_date`` (``YYYY-MM-DD`` string).
    """
    papers = []
    author_ids = list(CANONICAL_AUTHORS.keys())
    base_date = datetime(2020, 1, 1)

    for i in range(num_papers):
        paper_id = f"paper_{i:04d}"
        num_authors = random.randint(1, 4)
        selected_author_ids = random.sample(author_ids, num_authors)

        authors = []
        for aid in selected_author_ids:
            auth = CANONICAL_AUTHORS[aid]
            # Roughly 60% of author mentions use a non-canonical spelling.
            if random.random() > 0.4:
                authors.append(random.choice(auth["variations"]))
            else:
                authors.append(auth["canonical_name"])

        # The paper is attributed to the first selected author's institution.
        primary_inst_id = CANONICAL_AUTHORS[selected_author_ids[0]]["institution"]
        inst = CANONICAL_INSTITUTIONS[primary_inst_id]
        institution = random.choice(inst["variations"]) if random.random() > 0.5 else inst["canonical_name"]

        paper_topics = random.sample(RESEARCH_TOPICS, random.randint(2, 4))
        abstract = f"This paper presents research on {paper_topics[0]} using {random.choice(RESEARCH_METHODS)}."
        venue = random.choice(VENUES)

        # Draw the publication date once and derive the year from it, so the
        # "year" field can never disagree with "publication_date".
        published_on = base_date + timedelta(days=random.randint(0, 1500))

        paper = {
            "paper_id": paper_id, "title": f"Research on {paper_topics[0].title()}",
            "authors": authors, "institution": institution, "abstract": abstract,
            "keywords": paper_topics, "venue": venue,
            "year": published_on.year,
            "publication_date": published_on.strftime("%Y-%m-%d"),
        }

        # Deliberate edge cases at fixed indices.
        if i == 5:
            paper["abstract"] = ""
        if i == 12:
            paper["keywords"] = []
        if i == 45:
            paper["institution"] = None
        if i == 67:
            # Repeat the first author under their first variation.
            paper["authors"].append(CANONICAL_AUTHORS[selected_author_ids[0]]["variations"][0])

        papers.append(paper)
    return papers

def generate_citations(papers, density=0.05):
    """Build a synthetic list of citation edges between the given papers.

    A paper may cite any other paper whose ``year`` is not later than its
    own. The number of citations per paper is the candidate count scaled by
    ``density`` and a random factor in [0.5, 1.5], clamped to at least 1 and
    at most 10 (and never more than the number of candidates). Papers with
    no candidates cite nothing.

    Two fixed edges are always appended last: one pointing at the
    non-existent ``paper_9999`` and one where ``paper_0015`` cites itself.

    Draws from the module-level ``random`` generator.

    Args:
        papers: Sequence of dicts providing ``paper_id`` and ``year``.
        density: Fraction of candidate papers to cite before clamping.

    Returns:
        A list of ``{"citing_paper": ..., "cited_paper": ...}`` dicts.
    """
    ordered_ids = [paper["paper_id"] for paper in papers]
    year_of = {paper["paper_id"]: paper["year"] for paper in papers}
    edges = []

    for source in ordered_ids:
        source_year = year_of[source]
        candidates = [
            other
            for other in ordered_ids
            if other != source and year_of[other] <= source_year
        ]
        if not candidates:
            continue

        scaled = int(len(candidates) * density * random.uniform(0.5, 1.5))
        how_many = min(max(1, scaled), len(candidates), 10)
        edges.extend(
            {"citing_paper": source, "cited_paper": target}
            for target in random.sample(candidates, how_many)
        )

    # Fixed edge cases: a citation to an unknown paper and a self-citation.
    edges.append({"citing_paper": "paper_0010", "cited_paper": "paper_9999"})  # Orphan
    edges.append({"citing_paper": "paper_0015", "cited_paper": "paper_0015"})  # Self-citation
    return edges

def generate_author_affiliations():
    """Build the reference affiliation table from the canonical constants.

    Only the first two variations of each author and institution are
    exposed as ``known_variations``; the remaining spellings are withheld
    from the reference data.

    Returns:
        A dict with two keys: ``"authors"`` maps author ID to
        ``canonical_name``, ``known_variations`` and ``primary_institution``;
        ``"institutions"`` maps institution ID to ``canonical_name``,
        ``known_variations`` and ``country``.
    """
    author_entries = {
        author_id: {
            "canonical_name": record["canonical_name"],
            "known_variations": record["variations"][:2],
            "primary_institution": record["institution"],
        }
        for author_id, record in CANONICAL_AUTHORS.items()
    }
    institution_entries = {
        institution_id: {
            "canonical_name": record["canonical_name"],
            "known_variations": record["variations"][:2],
            "country": record["country"],
        }
        for institution_id, record in CANONICAL_INSTITUTIONS.items()
    }
    return {"authors": author_entries, "institutions": institution_entries}

# Build the three raw inputs consumed by the rest of the notebook.
print("Generating synthetic dataset...")

# The affiliation table draws no random numbers, so building it first does
# not disturb the seeded sequence used for papers and citations.
affiliations_raw = generate_author_affiliations()

# Papers must be generated before citations: citations reference paper IDs
# and years, and both generators consume the shared RNG in this order.
papers_raw = generate_papers(num_papers=100)
citations_raw = pd.DataFrame(data=generate_citations(papers_raw))

print(f"✓ Papers: {len(papers_raw)}, Citations: {len(citations_raw)}, Authors: {len(affiliations_raw['authors'])}")

## Golden Solution Implementation

The reference implementation for the benchmark task.

In [None]:
# ============================================================================
# CORE DATA VARIABLES
# ============================================================================

# Working copies of the generated inputs. The affiliations copy is shallow:
# the nested author/institution dicts are shared with affiliations_raw.
papers_df = pd.DataFrame(papers_raw)
citations_df = citations_raw.copy()
affiliations_data = affiliations_raw.copy()

print(f"papers_df: {len(papers_df)} rows")
print(f"citations_df: {len(citations_df)} rows")
print(f"affiliations_data keys: {list(affiliations_data.keys())}")

# ============================================================================
# ENTITY RESOLUTION MAPS
# ============================================================================

# Each map sends a known spelling (the canonical name itself or one of the
# reference "known_variations") to the canonical name. Lookups are exact
# string matches.
author_resolution_map = {}
institution_resolution_map = {}

for section, lookup in (
    ("authors", author_resolution_map),
    ("institutions", institution_resolution_map),
):
    for entry in affiliations_data[section].values():
        canonical_name = entry["canonical_name"]
        lookup[canonical_name] = canonical_name
        lookup.update(
            {alias: canonical_name for alias in entry.get("known_variations", [])}
        )

print(f"Author resolution map: {len(author_resolution_map)} entries")
print(f"Institution resolution map: {len(institution_resolution_map)} entries")

# ============================================================================
# ENTITY EXTRACTION
# ============================================================================

# Accumulators keyed by resolved (canonical where known) entity name. Each
# value collects the papers the entity appears on and every raw spelling seen.
extracted_authors_dict = defaultdict(lambda: {"name": "", "paper_ids": [], "name_variations": set()})
extracted_institutions_dict = defaultdict(lambda: {"name": "", "paper_ids": [], "name_variations": set()})
all_keywords = []
methods_from_abstracts = []

# Method phrases searched for (case-insensitively) inside abstracts.
KNOWN_METHODS = ["gradient descent", "backpropagation", "cross-validation", "ablation study",
                 "attention mechanism", "dropout", "batch normalization", "pre-training", "fine-tuning"]

for _, row in papers_df.iterrows():
    pid = row["paper_id"]

    # Extract authors; a non-list cell is treated as "no authors".
    authors = row["authors"] if isinstance(row["authors"], list) else []
    for auth in authors:
        # Names missing from the map are kept as-is and become their own entity.
        resolved = author_resolution_map.get(auth, auth)
        author_entry = extracted_authors_dict[resolved]
        author_entry["name"] = resolved
        # A paper may list the same person twice (e.g. under two spellings);
        # record the paper once so per-author paper counts are not inflated.
        if pid not in author_entry["paper_ids"]:
            author_entry["paper_ids"].append(pid)
        author_entry["name_variations"].add(auth)

    # Extract institutions. Require a non-empty string: a missing value can
    # arrive as None or as a float NaN, and NaN is truthy.
    inst = row.get("institution")
    if isinstance(inst, str) and inst:
        resolved_inst = institution_resolution_map.get(inst, inst)
        inst_entry = extracted_institutions_dict[resolved_inst]
        inst_entry["name"] = resolved_inst
        inst_entry["paper_ids"].append(pid)
        inst_entry["name_variations"].add(inst)

    # Extract topics from the keyword list.
    kws = row.get("keywords", [])
    if isinstance(kws, list):
        all_keywords.extend(kws)

    # Extract methods by substring search in the abstract.
    abstract = row.get("abstract", "")
    if isinstance(abstract, str):
        abstract_lower = abstract.lower()
        for method in KNOWN_METHODS:
            if method.lower() in abstract_lower:
                methods_from_abstracts.append(method)

# Convert to required format. Sets are emitted sorted because set iteration
# order for strings varies between interpreter runs; sorting keeps the
# reference output reproducible.
extracted_authors = [{"name": v["name"], "paper_ids": v["paper_ids"], "name_variations": sorted(v["name_variations"])}
                     for v in extracted_authors_dict.values()]
extracted_institutions = [{"name": v["name"], "paper_ids": v["paper_ids"], "name_variations": sorted(v["name_variations"])}
                          for v in extracted_institutions_dict.values()]
extracted_topics = dict(Counter(all_keywords))
methods_from_abstracts = sorted(set(methods_from_abstracts))

resolved_author_count = len(extracted_authors)
resolved_institution_count = len(extracted_institutions)

print(f"Extracted authors: {len(extracted_authors)}")
print(f"Extracted institutions: {len(extracted_institutions)}")
print(f"Extracted topics: {len(extracted_topics)}")
print(f"Methods from abstracts: {len(methods_from_abstracts)}")

# ============================================================================
# CITATION NETWORK
# ============================================================================

valid_paper_ids = set(papers_df["paper_id"].unique())
# Set iteration order for strings varies between interpreter runs, so every
# ordered structure below is built from this sorted list instead.
ordered_paper_ids = sorted(valid_paper_ids)

# citation_graph maps a citing paper to everything it cites, including
# targets that are not known papers; degrees are tracked for known papers only.
citation_graph = defaultdict(list)
in_degree = dict.fromkeys(ordered_paper_ids, 0)
out_degree = dict.fromkeys(ordered_paper_ids, 0)
orphan_citations = []
self_citations = []

for src, dst in zip(citations_df["citing_paper"], citations_df["cited_paper"]):
    if src in valid_paper_ids:
        citation_graph[src].append(dst)
        out_degree[src] += 1

    if dst in valid_paper_ids:
        in_degree[dst] += 1
    else:
        # The cited paper is not in the dataset.
        orphan_citations.append({"citing_paper": src, "cited_paper": dst})

    if src == dst:
        self_citations.append(src)

# Report each self-citing paper once, in a reproducible order.
self_citations = sorted(set(self_citations))
citation_graph = dict(citation_graph)

# PageRank computation over known papers only; edges to unknown targets are
# left out, and every known paper is a node even if it has no edges.
G = nx.DiGraph()
G.add_nodes_from(ordered_paper_ids)
for src, targets in citation_graph.items():
    for dst in targets:
        if dst in valid_paper_ids:
            G.add_edge(src, dst)

pagerank_scores = nx.pagerank(G, alpha=0.85, max_iter=100, tol=1e-6)

# Top cited papers: highest in-degree first, ties broken by paper ID so the
# ranking is the same on every run.
sorted_by_citations = sorted(in_degree.items(), key=lambda item: (-item[1], item[0]))
top_cited_papers = [paper_id for paper_id, _ in sorted_by_citations[:10]]

print(f"Citation graph nodes: {len(citation_graph)}")
print(f"Orphan citations: {len(orphan_citations)}")
print(f"Self citations: {len(self_citations)}")
print(f"PageRank sum: {sum(pagerank_scores.values()):.4f}")

In [None]:
# ============================================================================
# VALIDATION AND SUMMARY STATISTICS
# ============================================================================

# Distinct author strings exactly as written on the papers, before any
# resolution to canonical names. Non-list author cells are skipped.
unique_authors_raw = {
    raw_name
    for author_list in papers_df["authors"]
    if isinstance(author_list, list)
    for raw_name in author_list
}

# Distinct institution strings as written; missing values are dropped.
unique_institutions_raw = set(papers_df["institution"].dropna().unique())

abstract_col = papers_df["abstract"]
venue_col = papers_df["venue"]
paper_total = len(papers_df)
citation_total = len(citations_df)

# An abstract is missing when it is the empty string or null.
missing_abstract_total = int((abstract_col == "").sum() + abstract_col.isna().sum())
# Keywords are missing when the cell is not a list or is an empty list.
missing_keyword_total = int(sum(
    not (isinstance(kw, list) and len(kw) > 0) for kw in papers_df["keywords"]
))

# Summary statistics
summary_stats = {
    "total_papers": paper_total,
    "total_citations": citation_total,
    "unique_authors_raw": len(unique_authors_raw),
    "unique_authors_resolved": resolved_author_count,
    "unique_institutions_raw": len(unique_institutions_raw),
    "unique_institutions_resolved": resolved_institution_count,
    "papers_with_missing_abstract": missing_abstract_total,
    "papers_with_missing_keywords": missing_keyword_total,
    "orphan_citation_count": len(orphan_citations),
    "self_citation_count": len(self_citations),
    "avg_citations_per_paper": citation_total / paper_total if paper_total > 0 else 0,
    "most_common_venue": None if venue_col.empty else venue_col.mode()[0],
    "year_range": (int(papers_df["year"].min()), int(papers_df["year"].max())),
}

# Validation results: one boolean per pipeline stage.
validation_results = {
    "papers_loaded_ok": paper_total > 0 and {"paper_id", "title", "authors"}.issubset(papers_df.columns),
    "citations_loaded_ok": citation_total > 0 and {"citing_paper", "cited_paper"}.issubset(citations_df.columns),
    "affiliations_loaded_ok": isinstance(affiliations_data, dict)
    and all(section in affiliations_data for section in ("authors", "institutions")),
    "no_duplicate_paper_ids": papers_df["paper_id"].is_unique,
    "authors_extracted": bool(extracted_authors),
    "institutions_extracted": bool(extracted_institutions),
    "resolution_maps_valid": bool(author_resolution_map) and bool(institution_resolution_map),
    "citation_graph_built": bool(citation_graph),
    # PageRank must have produced a score for every paper.
    "pagerank_computed": 0 < len(pagerank_scores) == paper_total,
    "orphans_identified": True,  # We checked for orphans
    "self_citations_identified": True,  # We checked for self-citations
    "all_pagerank_finite": all(map(np.isfinite, pagerank_scores.values())),
}

print("Summary Statistics:")
for stat_name, stat_value in summary_stats.items():
    print(f"  {stat_name}: {stat_value}")
print("\nValidation Results:")
for check_name, passed in validation_results.items():
    print(f"  {check_name}: {'✓' if passed else '✗'}")

# ============================================================================
# FINAL REPORT
# ============================================================================

# Rankings below sort by descending count; the sort is stable, so entries
# with equal counts keep their original relative order.

# Top authors by paper count
author_paper_counts = [(entry["name"], len(entry["paper_ids"])) for entry in extracted_authors]
top_5_authors = sorted(author_paper_counts, key=lambda pair: -pair[1])[:5]

# Top institutions by paper count
inst_paper_counts = [(entry["name"], len(entry["paper_ids"])) for entry in extracted_institutions]
top_5_institutions = sorted(inst_paper_counts, key=lambda pair: -pair[1])[:5]

# Top topics
top_10_topics = sorted(extracted_topics.items(), key=lambda pair: -pair[1])[:10]

# Top cited papers with titles; IDs with no matching paper row are skipped.
top_10_cited_papers = []
for pid in top_cited_papers:
    matching_titles = papers_df.loc[papers_df["paper_id"] == pid, "title"]
    if matching_titles.empty:
        continue
    top_10_cited_papers.append({
        "paper_id": pid,
        "citation_count": in_degree[pid],
        "title": matching_titles.values[0],
    })

metadata_section = {
    "task": "Research Paper Entity Extraction and Citation Analysis",
    "papers_analyzed": len(papers_df),
    "execution_timestamp": datetime.now().isoformat(),
}

entity_section = {
    "authors": {
        "total_unique": resolved_author_count,
        "top_5_by_paper_count": [
            {"name": name, "paper_count": count} for name, count in top_5_authors
        ],
    },
    "institutions": {
        "total_unique": resolved_institution_count,
        "top_5_by_paper_count": [
            {"name": name, "paper_count": count} for name, count in top_5_institutions
        ],
    },
    "topics": {
        "total_unique": len(extracted_topics),
        "top_10_by_frequency": [
            {"topic": topic, "count": count} for topic, count in top_10_topics
        ],
    },
}

in_degree_values = list(in_degree.values())
out_degree_values = list(out_degree.values())
citation_section = {
    "total_citations": len(citations_df),
    "top_10_cited_papers": top_10_cited_papers,
    "orphan_citations": orphan_citations,
    "self_citations": self_citations,
    "network_statistics": {
        "avg_in_degree": np.mean(in_degree_values),
        "avg_out_degree": np.mean(out_degree_values),
        "max_in_degree": max(in_degree_values),
        "max_out_degree": max(out_degree_values),
    },
}

quality_section = {
    "missing_abstracts": summary_stats["papers_with_missing_abstract"],
    "missing_keywords": summary_stats["papers_with_missing_keywords"],
    "missing_institutions": int(papers_df["institution"].isna().sum()),
    # Number of raw author spellings that were merged away by resolution.
    "duplicate_author_entries": len(unique_authors_raw) - resolved_author_count,
}

failed_check_names = [name for name, passed in validation_results.items() if not passed]
validation_section = {
    "all_checks_passed": all(validation_results.values()),
    "failed_checks": failed_check_names,
}

final_report = {
    "metadata": metadata_section,
    "entity_extraction": entity_section,
    "citation_analysis": citation_section,
    "data_quality": quality_section,
    "validation_summary": validation_section,
}

print("=== VALIDATION RESULTS ===")
print(json.dumps(validation_results, indent=2))
print("\n=== FINAL REPORT ===")
# default=str stringifies any value json cannot encode natively.
print(json.dumps(final_report, indent=2, default=str))

---

# Unit Tests

Comprehensive tests to validate the golden solution.

In [None]:
class TestDataLoading(unittest.TestCase):
    """Checks that the three core inputs exist with the expected container types.

    Reads the notebook-level variables ``papers_df``, ``citations_df`` and
    ``affiliations_data``.
    """

    # papers_df must be a non-empty DataFrame.
    def test_papers_df_exists(self):
        self.assertIsInstance(papers_df, pd.DataFrame)
        self.assertGreater(len(papers_df), 0)

    # Every required column must be present; extra columns are allowed.
    def test_papers_df_columns(self):
        required = {'paper_id', 'title', 'authors', 'institution', 'abstract', 'keywords', 'venue', 'year'}
        self.assertTrue(required.issubset(set(papers_df.columns)))

    # citations_df must be a non-empty DataFrame.
    def test_citations_df_exists(self):
        self.assertIsInstance(citations_df, pd.DataFrame)
        self.assertGreater(len(citations_df), 0)

    # affiliations_data must be a dict with both top-level sections.
    def test_affiliations_structure(self):
        self.assertIsInstance(affiliations_data, dict)
        self.assertIn('authors', affiliations_data)
        self.assertIn('institutions', affiliations_data)

class TestEntityExtraction(unittest.TestCase):
    """Checks the shape of the extracted author, institution and topic outputs.

    Reads the notebook-level variables ``extracted_authors``,
    ``extracted_institutions`` and ``extracted_topics``.
    """

    # At least one author, and each record carries 'name' and 'paper_ids'.
    def test_extracted_authors(self):
        self.assertGreater(len(extracted_authors), 0)
        for author in extracted_authors:
            self.assertIn('name', author)
            self.assertIn('paper_ids', author)

    # At least one institution must have been extracted.
    def test_extracted_institutions(self):
        self.assertGreater(len(extracted_institutions), 0)

    # Topics are exposed as a dict; its contents are not checked here.
    def test_extracted_topics(self):
        self.assertIsInstance(extracted_topics, dict)

class TestCitationNetwork(unittest.TestCase):
    """Checks the citation graph, PageRank scores and detected edge cases.

    Reads the notebook-level variables ``citation_graph``,
    ``pagerank_scores``, ``orphan_citations`` and ``self_citations``.
    """

    # The graph must contain at least one citing paper.
    def test_citation_graph(self):
        self.assertGreater(len(citation_graph), 0)

    # PageRank scores must sum to 1.0 within a tolerance of 0.01.
    def test_pagerank_sum(self):
        self.assertAlmostEqual(sum(pagerank_scores.values()), 1.0, delta=0.01)

    # At least one orphan citation must have been detected.
    def test_orphan_citations(self):
        self.assertGreater(len(orphan_citations), 0)

    # At least one self-citation must have been detected.
    def test_self_citations(self):
        self.assertGreater(len(self_citations), 0)

class TestValidation(unittest.TestCase):
    """Checks that every entry of the notebook-level ``validation_results`` is truthy."""

    # On failure the assertion message lists the names of the failing checks.
    def test_all_validations_pass(self):
        failed = [k for k, v in validation_results.items() if not v]
        self.assertEqual(len(failed), 0, f"Failed: {failed}")

class TestFinalReport(unittest.TestCase):
    """Checks the top-level layout of the notebook-level ``final_report`` dict."""

    # Four top-level sections are required; 'data_quality' is not checked here.
    def test_report_structure(self):
        self.assertIn('metadata', final_report)
        self.assertIn('entity_extraction', final_report)
        self.assertIn('citation_analysis', final_report)
        self.assertIn('validation_summary', final_report)

    # The report's own summary flag must say that all validations passed.
    def test_all_checks_passed(self):
        self.assertTrue(final_report['validation_summary']['all_checks_passed'])

In [None]:
# Run all unit tests
def run_tests():
    """Run every test case class in a fixed order and print a short tally.

    Returns:
        The ``unittest`` result object produced by the text runner.
    """
    case_classes = (
        TestDataLoading,
        TestEntityExtraction,
        TestCitationNetwork,
        TestValidation,
        TestFinalReport,
    )

    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for case_class in case_classes:
        suite.addTests(loader.loadTestsFromTestCase(case_class))

    # verbosity=2 prints one line per test method.
    result = unittest.TextTestRunner(verbosity=2).run(suite)

    separator = "=" * 50
    print("\n" + separator)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Success: {result.wasSuccessful()}")
    return result


test_result = run_tests()

## Final Summary

In [None]:
# Human-readable recap of the run: dataset size, entity-resolution effect,
# citation edge cases, and the unit-test tally.
banner = "=" * 60

summary_lines = [
    banner,
    "GOLDEN SOLUTION SUMMARY",
    banner,
    f"\nPapers Analyzed: {len(papers_df)}",
    f"Citations Processed: {len(citations_df)}",
    "\nEntity Resolution:",
    f"  Authors: {summary_stats['unique_authors_raw']} raw -> {resolved_author_count} resolved",
    f"  Institutions: {summary_stats['unique_institutions_raw']} raw -> {resolved_institution_count} resolved",
    "\nCitation Network:",
    f"  Orphan citations: {len(orphan_citations)}",
    f"  Self citations: {len(self_citations)}",
    f"  PageRank sum: {sum(pagerank_scores.values()):.4f}",
    f"\nTests: {test_result.testsRun} run, {len(test_result.failures)} failures, {len(test_result.errors)} errors",
]
print("\n".join(summary_lines))

# Success requires both a clean unit-test run and all validation checks true.
everything_passed = test_result.wasSuccessful() and all(validation_results.values())
verdict = "✓ GOLDEN SOLUTION PASSED ALL TESTS!" if everything_passed else "✗ SOME TESTS FAILED"

print("\n" + banner)
print(verdict)
print(banner)