# Golden Solution: Research Paper Entity Extraction and Citation Analysis

This notebook contains the reference implementation for the benchmark task.
It demonstrates that the task is solvable and provides the expected outputs.

**Note:** This is the golden solution - it should pass all unit tests.

## Setup and Imports

In [None]:
# Install required packages (if not already installed)
%pip install -q pandas networkx

In [None]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from datetime import datetime
from typing import Dict, List, Any, Tuple
import re
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

## Data Loading

Load the three data files and validate their structure.

In [None]:
# For Google Colab, upload the three input files or mount Google Drive first.
# The paths below are resolved relative to the notebook's working directory.

# Input file locations (adjust as needed for your environment)
PAPERS_FILE = "papers_metadata.json"
CITATIONS_FILE = "citations.csv"
AFFILIATIONS_FILE = "author_affiliations.json"

# Load papers metadata. The encoding is pinned to UTF-8 so non-ASCII text
# (e.g. accented author names) decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open(PAPERS_FILE, 'r', encoding='utf-8') as f:
    papers_raw = json.load(f)
papers_df = pd.DataFrame(papers_raw)

# Load citations (pandas reads CSV as UTF-8 by default)
citations_df = pd.read_csv(CITATIONS_FILE)

# Load author affiliations reference data
with open(AFFILIATIONS_FILE, 'r', encoding='utf-8') as f:
    affiliations_data = json.load(f)

print(f"Loaded {len(papers_df)} papers")
print(f"Loaded {len(citations_df)} citations")
print(f"Affiliations data keys: {list(affiliations_data.keys())}")

In [None]:
# Inspect data structure: column names plus a small sample of each dataset.
print("Papers columns:", papers_df.columns.tolist())
print("\nSample paper:")
# First row as a plain dict; raises IndexError if papers_df has no rows.
print(papers_df.iloc[0].to_dict())
print("\nCitations columns:", citations_df.columns.tolist())
print("\nSample citations:")
# head() limits the output to the first five rows.
print(citations_df.head())

## Entity Extraction

Extract authors, institutions, topics, and methods from the papers.

In [None]:
def extract_all_authors(papers_df: pd.DataFrame) -> List[Dict]:
    """Collect every distinct author name together with the papers it appears on.

    Rows whose ``authors`` value is not a list are ignored, as are list items
    that are empty or not strings. Names are whitespace-stripped before grouping.

    Returns:
        One dict per distinct name, in first-seen order, with keys ``name``,
        ``paper_ids`` (de-duplicated, order unspecified) and ``name_variations``
        (initially just the name itself).
    """
    papers_by_author = defaultdict(list)

    for _, record in papers_df.iterrows():
        pid = record['paper_id']
        author_list = record['authors']
        if not isinstance(author_list, list):
            continue
        for raw_name in author_list:
            if raw_name and isinstance(raw_name, str):
                papers_by_author[raw_name.strip()].append(pid)

    return [
        {
            "name": author_name,
            "paper_ids": list(set(pids)),
            "name_variations": [author_name]  # Will be expanded during resolution
        }
        for author_name, pids in papers_by_author.items()
    ]


def extract_all_institutions(papers_df: pd.DataFrame) -> List[Dict]:
    """Collect every distinct institution name together with its papers.

    Only string values that are non-blank after stripping are considered;
    anything else in the ``institution`` field is skipped.

    Returns:
        One dict per distinct stripped name, in first-seen order, with keys
        ``name``, ``paper_ids`` (de-duplicated, order unspecified) and
        ``name_variations`` (just the name itself).
    """
    papers_by_institution = defaultdict(list)

    for _, record in papers_df.iterrows():
        pid = record['paper_id']
        raw_institution = record.get('institution')
        if not isinstance(raw_institution, str):
            continue
        cleaned = raw_institution.strip()
        if cleaned:
            papers_by_institution[cleaned].append(pid)

    return [
        {
            "name": inst_name,
            "paper_ids": list(set(pids)),
            "name_variations": [inst_name]
        }
        for inst_name, pids in papers_by_institution.items()
    ]


def extract_topics(papers_df: pd.DataFrame) -> Dict[str, int]:
    """Count how often each keyword occurs across all papers.

    Keywords are stripped and lower-cased before counting. Rows whose
    ``keywords`` value is not a list, and list items that are empty or not
    strings, are ignored.
    """
    frequencies = Counter()

    for _, record in papers_df.iterrows():
        keyword_list = record.get('keywords', [])
        if not isinstance(keyword_list, list):
            continue
        frequencies.update(
            keyword.strip().lower()
            for keyword in keyword_list
            if keyword and isinstance(keyword, str)
        )

    return dict(frequencies)


def extract_methods_from_abstracts(papers_df: pd.DataFrame) -> List[str]:
    """Extract research methods mentioned in abstracts.

    Each abstract is lower-cased and scanned with a fixed list of method
    patterns; the matched text (e.g. "ablation studies") is collected.
    Rows whose ``abstract`` is empty or not a string are skipped.

    Returns:
        The distinct matched phrases, sorted alphabetically so that the
        result is identical from run to run.
    """
    # Common research method patterns to look for
    method_patterns = [
        r'gradient descent', r'backpropagation', r'stochastic optimization',
        r'cross-validation', r'ablation stud(?:y|ies)', r'hyperparameter tuning',
        r'ensemble method', r'regularization', r'dropout', r'batch normalization',
        r'attention mechanism', r'skip connection', r'data augmentation',
        r'pre-training', r'fine-tuning', r'knowledge distillation',
        r'self-attention', r'graph convolution', r'contrastive learning',
        r'adversarial training', r'curriculum learning', r'multi-head attention'
    ]
    # Compile once instead of re-parsing every pattern for every abstract.
    compiled_patterns = [re.compile(pattern) for pattern in method_patterns]

    found_methods = set()

    for _, row in papers_df.iterrows():
        abstract = row.get('abstract', '')
        if not (abstract and isinstance(abstract, str)):
            continue
        abstract_lower = abstract.lower()
        for pattern in compiled_patterns:
            # A single search is enough: the match object carries the text.
            match = pattern.search(abstract_lower)
            if match:
                found_methods.add(match.group())

    # Set iteration order for strings changes between interpreter runs, so
    # sort to keep the output reproducible.
    return sorted(found_methods)


# Execute extraction: run each extractor once over the full papers table.
# The results are stored as notebook globals that later cells read.
extracted_authors = extract_all_authors(papers_df)
extracted_institutions = extract_all_institutions(papers_df)
extracted_topics = extract_topics(papers_df)
methods_from_abstracts = extract_methods_from_abstracts(papers_df)

# These counts are for raw (unresolved) names; resolution happens later.
print(f"Extracted {len(extracted_authors)} unique author names")
print(f"Extracted {len(extracted_institutions)} unique institution names")
print(f"Extracted {len(extracted_topics)} unique topics")
print(f"Found {len(methods_from_abstracts)} methods in abstracts")
# Show at most ten of the detected method phrases.
print(f"\nMethods found: {methods_from_abstracts[:10]}")

## Entity Resolution

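Resolve author and institution name variations to canonical forms using the reference data: an exact match on the normalized name is tried first, then first-initial + last-name matching for authors and substring matching for institutions.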

In [None]:
def normalize_name(name: str) -> str:
    """Normalize a name for comparison.

    Lower-cases the name, removes periods and commas, and collapses every
    run of whitespace to a single space (leading/trailing whitespace is
    dropped).
    """
    name = re.sub(r'[.,]', '', name.lower())
    name = ' '.join(name.split())
    return name


def get_name_parts(name: str) -> Tuple[str, str]:
    """Extract first name initial and last name from various formats.

    Supported formats:
      * "First [Middle ...] Last" -> initial of the first token, last token
      * "Last, First [...]"       -> initial of the text after the comma,
                                     final token of the text before it
      * a single token            -> empty initial, the token as last name

    Returns:
        ``(first_initial, last_name)``, both lower-cased; either may be ``''``.
    """
    # The comma must be inspected on the RAW name: normalize_name() removes
    # commas, so checking after normalization can never see "Last, First".
    if ',' in name:
        last_raw, _, first_raw = name.partition(',')
        last_tokens = normalize_name(last_raw).split()
        first_tokens = normalize_name(first_raw).split()
        # Use the final surname token so "van der Berg, Jan" yields the same
        # key as "Jan van der Berg" does in the space-separated branch.
        last = last_tokens[-1] if last_tokens else ''
        first = first_tokens[0] if first_tokens else ''
    else:
        parts = normalize_name(name).split()
        if len(parts) >= 2:
            first = parts[0]
            last = parts[-1]
        elif len(parts) == 1:
            first = ''
            last = parts[0]
        else:
            first = ''
            last = ''

    first_initial = first[0] if first else ''
    return first_initial, last


def build_author_resolution_map(extracted_authors: List[Dict],
                                 affiliations_data: Dict) -> Dict[str, str]:
    """Build a mapping from author name variations to canonical forms.

    Resolution order for each extracted name:
      1. exact match of the normalized name against the reference data's
         canonical names and known variations;
      2. match on (first initial, last name) against the canonical names,
         but only when that key identifies exactly one canonical author;
      3. otherwise the name maps to itself.

    Args:
        extracted_authors: entries with a ``name`` key.
        affiliations_data: reference data; ``affiliations_data['authors']`` is
            read as a mapping whose values have ``canonical_name`` and,
            optionally, ``known_variations``.

    Returns:
        ``{extracted name: canonical name}`` with one entry per extracted name.
    """
    resolution_map = {}

    canonical_by_parts = {}
    ambiguous_parts = set()
    known_variations = {}

    for auth_info in affiliations_data.get('authors', {}).values():
        canonical = auth_info['canonical_name']
        parts_key = get_name_parts(canonical)

        # Two different canonical authors can share an initial and surname
        # (e.g. "John Smith" / "Jane Smith"). Such a key cannot identify an
        # author, so it is excluded from step 2 rather than letting
        # whichever entry was seen last win.
        previous = canonical_by_parts.get(parts_key)
        if previous is not None and previous != canonical:
            ambiguous_parts.add(parts_key)
        else:
            canonical_by_parts[parts_key] = canonical

        for var in auth_info.get('known_variations', []):
            known_variations[normalize_name(var)] = canonical
        known_variations[normalize_name(canonical)] = canonical

    for author_entry in extracted_authors:
        name = author_entry['name']
        normalized = normalize_name(name)

        if normalized in known_variations:
            resolution_map[name] = known_variations[normalized]
            continue

        parts_key = get_name_parts(name)
        if parts_key not in ambiguous_parts and parts_key in canonical_by_parts:
            resolution_map[name] = canonical_by_parts[parts_key]
            continue

        # Unknown author: keep the name as its own canonical form.
        resolution_map[name] = name

    return resolution_map


def build_institution_resolution_map(extracted_institutions: List[Dict],
                                      affiliations_data: Dict) -> Dict[str, str]:
    """Build a mapping from institution name variations to canonical forms.

    Resolution order for each extracted name:
      1. exact match of the normalized name against the reference data's
         canonical names and known variations;
      2. substring match in either direction against those same strings,
         choosing the candidate with the longest overlap (the first one
         encountered wins ties);
      3. otherwise the name maps to itself.

    Args:
        extracted_institutions: entries with a ``name`` key.
        affiliations_data: reference data; ``affiliations_data['institutions']``
            is read as a mapping whose values have ``canonical_name`` and,
            optionally, ``known_variations``.

    Returns:
        ``{extracted name: canonical name}`` with one entry per extracted name.
    """
    resolution_map = {}
    known_variations = {}

    for inst_info in affiliations_data.get('institutions', {}).values():
        canonical = inst_info['canonical_name']

        for var in [*inst_info.get('known_variations', []), canonical]:
            var_norm = normalize_name(var)
            # An empty key would be a substring of every name below and
            # would therefore match everything; never register it.
            if var_norm:
                known_variations[var_norm] = canonical

    for inst_entry in extracted_institutions:
        name = inst_entry['name']
        normalized = normalize_name(name)

        # A name that normalizes to nothing (e.g. only punctuation) is a
        # substring of every variation; leave it unresolved instead.
        if not normalized:
            resolution_map[name] = name
            continue

        if normalized in known_variations:
            resolution_map[name] = known_variations[normalized]
            continue

        # Prefer the most specific candidate: with first-hit-wins, a short
        # variation could shadow a longer, more precise one.
        best_canonical = None
        best_overlap = 0
        for var_norm, canonical in known_variations.items():
            if var_norm in normalized or normalized in var_norm:
                overlap = min(len(var_norm), len(normalized))
                if overlap > best_overlap:
                    best_canonical = canonical
                    best_overlap = overlap

        resolution_map[name] = best_canonical if best_canonical is not None else name

    return resolution_map


# Execute resolution: map every extracted raw name to its canonical form.
author_resolution_map = build_author_resolution_map(extracted_authors, affiliations_data)
institution_resolution_map = build_institution_resolution_map(extracted_institutions, affiliations_data)

# Count unique resolved entities (distinct canonical names among the map values)
resolved_author_count = len(set(author_resolution_map.values()))
resolved_institution_count = len(set(institution_resolution_map.values()))

print(f"Author resolution: {len(extracted_authors)} names -> {resolved_author_count} unique authors")
print(f"Institution resolution: {len(extracted_institutions)} names -> {resolved_institution_count} unique institutions")
print(f"\nSample author resolutions:")
# Only the first five map entries are examined and unchanged names are
# skipped, so fewer than five lines (possibly none) may be printed.
for name, canonical in list(author_resolution_map.items())[:5]:
    if name != canonical:
        print(f"  '{name}' -> '{canonical}'")

## Citation Network Analysis

Build the citation graph and compute network metrics.

In [None]:
def build_citation_graph(citations_df: pd.DataFrame,
                         valid_paper_ids: set) -> Dict[str, List[str]]:
    """Return the citation network as ``{citing paper: [cited papers, ...]}``.

    Every row of ``citations_df`` contributes one edge, including edges whose
    endpoints are not in ``valid_paper_ids``. Papers from ``valid_paper_ids``
    that cite nothing are still present, mapped to an empty list.
    """
    adjacency = {}

    for _, edge in citations_df.iterrows():
        source = edge['citing_paper']
        target = edge['cited_paper']
        adjacency.setdefault(source, []).append(target)

    # Ensure each known paper is a node even when it has no outgoing edges.
    for pid in valid_paper_ids:
        adjacency.setdefault(pid, [])

    return adjacency


def compute_degrees(citation_graph: Dict[str, List[str]], 
                    valid_paper_ids: set) -> Tuple[Dict[str, int], Dict[str, int]]:
    """Compute in-degree and out-degree for each paper."""
    in_degree = defaultdict(int)
    out_degree = {}
    
    for paper_id in valid_paper_ids:
        in_degree[paper_id] = 0
    
    for citing, cited_list in citation_graph.items():
        out_degree[citing] = len(cited_list)
        for cited in cited_list:
            if cited in valid_paper_ids:
                in_degree[cited] += 1
    
    for paper_id in valid_paper_ids:
        if paper_id not in out_degree:
            out_degree[paper_id] = 0
    
    return dict(in_degree), out_degree


def compute_pagerank(citation_graph: Dict[str, List[str]],
                     valid_paper_ids: set,
                     damping: float = 0.85) -> Dict[str, float]:
    """Run networkx PageRank over the citation edges between known papers.

    Every id in ``valid_paper_ids`` becomes a node. An edge is added only when
    both the citing and the cited paper are in ``valid_paper_ids``.
    ``damping`` is passed to networkx as ``alpha``.
    """
    digraph = nx.DiGraph()
    digraph.add_nodes_from(valid_paper_ids)

    # Edges are generated in the same order as the adjacency lists.
    digraph.add_edges_from(
        (source, target)
        for source, targets in citation_graph.items()
        if source in valid_paper_ids
        for target in targets
        if target in valid_paper_ids
    )

    return nx.pagerank(digraph, alpha=damping)


def find_orphan_citations(citations_df: pd.DataFrame,
                          valid_paper_ids: set) -> List[Dict]:
    """List the citation rows whose cited paper is not a known paper.

    Returns:
        One ``{"citing_paper": ..., "cited_paper": ...}`` dict per offending
        row, in row order (repeated rows are reported repeatedly).
    """
    return [
        {
            "citing_paper": edge['citing_paper'],
            "cited_paper": edge['cited_paper']
        }
        for _, edge in citations_df.iterrows()
        if edge['cited_paper'] not in valid_paper_ids
    ]


def find_self_citations(citations_df: pd.DataFrame) -> List[str]:
    """Return the distinct ids of papers that appear as their own citation.

    The order of the returned list is unspecified.
    """
    self_loops = [
        edge['citing_paper']
        for _, edge in citations_df.iterrows()
        if edge['citing_paper'] == edge['cited_paper']
    ]
    return list(set(self_loops))


# Execute citation analysis
valid_paper_ids = set(papers_df['paper_id'].tolist())

citation_graph = build_citation_graph(citations_df, valid_paper_ids)
in_degree, out_degree = compute_degrees(citation_graph, valid_paper_ids)
pagerank_scores = compute_pagerank(citation_graph, valid_paper_ids)

# Get top cited papers. Ties are broken by paper id: in_degree's key order
# comes from iterating a set, which is not stable between interpreter runs
# for string ids, so sorting on the count alone would make the top 10 (and
# the report built from it) vary from run to run.
top_cited_papers = sorted(
    in_degree,
    key=lambda pid: (-in_degree[pid], str(pid))
)[:10]

# Find anomalies
orphan_citations = find_orphan_citations(citations_df, valid_paper_ids)
self_citations = find_self_citations(citations_df)

print(f"Citation graph: {len(citation_graph)} nodes")
print(f"PageRank scores computed: {len(pagerank_scores)} papers")
print(f"PageRank sum: {sum(pagerank_scores.values()):.4f}")
print(f"\nTop 5 cited papers:")
for paper_id in top_cited_papers[:5]:
    print(f"  {paper_id}: {in_degree[paper_id]} citations")
print(f"\nOrphan citations found: {len(orphan_citations)}")
print(f"Self-citations found: {len(self_citations)}")

## Validation Checks

Perform all required validation checks and store results.

In [None]:
def run_all_validations() -> Dict[str, bool]:
    """Evaluate every validation check against the notebook's global results.

    Reads the globals produced by the earlier cells (loaded data, extracted
    entities, resolution maps, citation graph, PageRank scores and anomaly
    lists) and returns ``{check name: passed}``.
    """
    expected_paper_cols = {'paper_id', 'title', 'authors', 'institution',
                           'abstract', 'keywords', 'venue', 'year', 'publication_date'}
    expected_citation_cols = {'citing_paper', 'cited_paper'}

    return {
        # Data loading checks
        "papers_loaded_ok": (
            len(papers_df) > 0
            and expected_paper_cols <= set(papers_df.columns)
        ),
        "citations_loaded_ok": (
            len(citations_df) > 0
            and expected_citation_cols <= set(citations_df.columns)
        ),
        "affiliations_loaded_ok": (
            isinstance(affiliations_data, dict)
            and 'authors' in affiliations_data
            and 'institutions' in affiliations_data
        ),
        "no_duplicate_paper_ids": papers_df['paper_id'].nunique() == len(papers_df),

        # Entity extraction and resolution checks
        "authors_extracted": len(extracted_authors) > 0,
        "institutions_extracted": len(extracted_institutions) > 0,
        "resolution_maps_valid": (
            len(author_resolution_map) > 0
            and len(institution_resolution_map) > 0
            and isinstance(author_resolution_map, dict)
            and isinstance(institution_resolution_map, dict)
        ),

        # Citation network checks
        "citation_graph_built": (
            len(citation_graph) > 0
            and isinstance(citation_graph, dict)
        ),
        "pagerank_computed": (
            len(pagerank_scores) > 0
            and isinstance(pagerank_scores, dict)
            and all(isinstance(score, float) for score in pagerank_scores.values())
        ),
        "orphans_identified": isinstance(orphan_citations, list),
        "self_citations_identified": isinstance(self_citations, list),
        "all_pagerank_finite": all(
            np.isfinite(score) for score in pagerank_scores.values()
        ),
    }


# Run the checks once; the report cell and the unit tests read this global.
validation_results = run_all_validations()

print("=== Validation Results ===")
for check, passed in validation_results.items():
    # Each line shows a PASS/FAIL tag followed by the raw boolean.
    status = "PASS" if passed else "FAIL"
    print(f"  [{status}] {check}: {passed}")

## Summary Statistics

Compute all required summary statistics.

In [None]:
def compute_summary_statistics() -> Dict[str, Any]:
    """Compute all required summary statistics.

    Reads the notebook globals produced by the earlier cells and returns a
    dict of headline numbers. All counts are plain Python ``int`` values so
    the dict can be serialized with ``json`` without further conversion.

    A field counts as missing when it is ``None``, a float NaN (how pandas
    represents an absent value), or otherwise falsy; abstracts additionally
    count as missing when they contain only whitespace.
    """
    def _is_null(value: Any) -> bool:
        # NaN is truthy, so a plain `not value` test would miss it.
        return value is None or (isinstance(value, float) and value != value)

    def _abstract_missing(value: Any) -> bool:
        if _is_null(value) or not value:
            return True
        return isinstance(value, str) and len(value.strip()) == 0

    def _keywords_missing(value: Any) -> bool:
        return _is_null(value) or not value

    stats = {}

    stats["total_papers"] = len(papers_df)
    stats["total_citations"] = len(citations_df)

    stats["unique_authors_raw"] = len(extracted_authors)
    stats["unique_authors_resolved"] = resolved_author_count

    stats["unique_institutions_raw"] = len(extracted_institutions)
    stats["unique_institutions_resolved"] = resolved_institution_count

    # .sum() on a boolean Series yields a numpy integer; convert to int.
    stats["papers_with_missing_abstract"] = int(
        papers_df['abstract'].apply(_abstract_missing).sum()
    )
    stats["papers_with_missing_keywords"] = int(
        papers_df['keywords'].apply(_keywords_missing).sum()
    )

    stats["orphan_citation_count"] = len(orphan_citations)
    stats["self_citation_count"] = len(self_citations)

    # Mean number of outgoing citations per entry of out_degree.
    stats["avg_citations_per_paper"] = round(
        sum(out_degree.values()) / len(out_degree) if out_degree else 0, 2
    )

    venue_counts = papers_df['venue'].value_counts()
    stats["most_common_venue"] = venue_counts.index[0] if len(venue_counts) > 0 else ""

    stats["year_range"] = (int(papers_df['year'].min()), int(papers_df['year'].max()))

    return stats


# Compute the statistics once; the final report and the unit tests read this global.
summary_stats = compute_summary_statistics()

print("=== Summary Statistics ===")
for key, value in summary_stats.items():
    print(f"  {key}: {value}")

## Final Report Generation

Compile all results into the required final report structure.

In [None]:
def generate_final_report() -> Dict:
    """Generate the complete final report.

    Reads the notebook globals produced by the earlier cells and assembles a
    dict with five sections: ``metadata``, ``entity_extraction``,
    ``citation_analysis``, ``data_quality`` and ``validation_summary``.
    ``metadata.execution_timestamp`` is the current local time, so two calls
    never return identical reports.
    """
    def _papers_by_canonical(entries: List[Dict], resolution_map: Dict[str, str]) -> Dict[str, set]:
        # Merge the paper ids of every raw name that resolves to one canonical name.
        grouped = defaultdict(set)
        for entry in entries:
            canonical = resolution_map.get(entry['name'], entry['name'])
            grouped[canonical].update(entry['paper_ids'])
        return grouped

    def _top_by_paper_count(grouped: Dict[str, set], limit: int) -> List[Dict]:
        # sorted() is stable, so equal counts keep their first-seen order.
        ranked = sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)
        return [
            {"name": name, "paper_count": len(papers)}
            for name, papers in ranked[:limit]
        ]

    # Author and institution paper counts (using resolved names)
    top_authors = _top_by_paper_count(
        _papers_by_canonical(extracted_authors, author_resolution_map), 5
    )
    top_institutions = _top_by_paper_count(
        _papers_by_canonical(extracted_institutions, institution_resolution_map), 5
    )

    # Top topics
    top_topics = sorted(
        extracted_topics.items(),
        key=lambda x: x[1],
        reverse=True
    )[:10]

    # Top cited papers with titles
    paper_titles = dict(zip(papers_df['paper_id'], papers_df['title']))
    top_cited_with_info = [
        {
            "paper_id": pid,
            "citation_count": in_degree[pid],
            "title": paper_titles.get(pid, "Unknown")
        }
        for pid in top_cited_papers
    ]

    # Network statistics
    in_degrees = list(in_degree.values())
    out_degrees = list(out_degree.values())

    # Count papers that list the same (resolved) author more than once.
    # Names are stripped before the lookup because the resolution map is
    # keyed by stripped names; non-string entries are skipped, matching
    # what author extraction does.
    duplicate_author_count = 0
    for _, row in papers_df.iterrows():
        authors = row.get('authors', [])
        if not isinstance(authors, list):
            continue
        resolved = []
        for author in authors:
            if not (author and isinstance(author, str)):
                continue
            key = author.strip()
            resolved.append(author_resolution_map.get(key, key))
        if len(resolved) != len(set(resolved)):
            duplicate_author_count += 1

    # Count missing institutions (None, NaN, empty or whitespace-only)
    missing_inst_count = papers_df['institution'].apply(
        lambda x: not x or (isinstance(x, str) and len(x.strip()) == 0) or pd.isna(x)
    ).sum()

    failed_checks = [k for k, v in validation_results.items() if not v]

    report = {
        "metadata": {
            "task": "Research Paper Entity Extraction and Citation Analysis",
            "papers_analyzed": len(papers_df),
            "execution_timestamp": datetime.now().isoformat()
        },
        "entity_extraction": {
            "authors": {
                "total_unique": resolved_author_count,
                "top_5_by_paper_count": top_authors
            },
            "institutions": {
                "total_unique": resolved_institution_count,
                "top_5_by_paper_count": top_institutions
            },
            "topics": {
                "total_unique": len(extracted_topics),
                "top_10_by_frequency": [
                    {"topic": topic, "count": count}
                    for topic, count in top_topics
                ]
            }
        },
        "citation_analysis": {
            "total_citations": len(citations_df),
            "top_10_cited_papers": top_cited_with_info,
            "orphan_citations": orphan_citations,
            "self_citations": self_citations,
            "network_statistics": {
                "avg_in_degree": round(sum(in_degrees) / len(in_degrees), 2) if in_degrees else 0,
                "avg_out_degree": round(sum(out_degrees) / len(out_degrees), 2) if out_degrees else 0,
                "max_in_degree": max(in_degrees) if in_degrees else 0,
                "max_out_degree": max(out_degrees) if out_degrees else 0
            }
        },
        "data_quality": {
            # int() converts numpy integers so json.dumps accepts them.
            "missing_abstracts": int(summary_stats["papers_with_missing_abstract"]),
            "missing_keywords": int(summary_stats["papers_with_missing_keywords"]),
            "missing_institutions": int(missing_inst_count),
            "duplicate_author_entries": duplicate_author_count
        },
        "validation_summary": {
            "all_checks_passed": len(failed_checks) == 0,
            "failed_checks": failed_checks
        }
    }

    return report


# Build the report once; the output cell and the unit tests read this global.
final_report = generate_final_report()

## Output Results

In [None]:
# Emit both result objects as indented JSON.
# json.dumps raises TypeError if either object holds a value it cannot encode.
print("=== VALIDATION RESULTS ===")
print(json.dumps(validation_results, indent=2))
print("\n=== FINAL REPORT ===")
print(json.dumps(final_report, indent=2))

---

# Unit Tests

The following section contains comprehensive unit tests to validate the solution.

In [None]:
import unittest

class TestDataLoading(unittest.TestCase):
    """Checks that the three input datasets were loaded with the expected structure."""

    PAPER_COLUMNS = {'paper_id', 'title', 'authors', 'institution',
                     'abstract', 'keywords', 'venue', 'year', 'publication_date'}
    CITATION_COLUMNS = {'citing_paper', 'cited_paper'}

    def test_papers_df_exists_and_not_empty(self):
        self.assertIsInstance(papers_df, pd.DataFrame)
        row_count = len(papers_df)
        self.assertGreater(row_count, 0)

    def test_papers_df_has_required_columns(self):
        actual_columns = set(papers_df.columns)
        self.assertTrue(self.PAPER_COLUMNS.issubset(actual_columns))

    def test_citations_df_exists_and_not_empty(self):
        self.assertIsInstance(citations_df, pd.DataFrame)
        row_count = len(citations_df)
        self.assertGreater(row_count, 0)

    def test_citations_df_has_required_columns(self):
        actual_columns = set(citations_df.columns)
        self.assertTrue(self.CITATION_COLUMNS.issubset(actual_columns))

    def test_affiliations_data_structure(self):
        self.assertIsInstance(affiliations_data, dict)
        for section in ('authors', 'institutions'):
            self.assertIn(section, affiliations_data)

    def test_no_duplicate_paper_ids(self):
        distinct_ids = papers_df['paper_id'].nunique()
        self.assertEqual(distinct_ids, len(papers_df))


class TestEntityExtraction(unittest.TestCase):
    """Checks the shape of the extracted authors, institutions, topics and methods."""

    ENTITY_FIELDS = ('name', 'paper_ids', 'name_variations')

    def test_extracted_authors_not_empty(self):
        author_total = len(extracted_authors)
        self.assertGreater(author_total, 0)

    def test_extracted_authors_structure(self):
        for entry in extracted_authors:
            for field in self.ENTITY_FIELDS:
                self.assertIn(field, entry)
            self.assertIsInstance(entry['name'], str)
            self.assertIsInstance(entry['paper_ids'], list)

    def test_extracted_institutions_not_empty(self):
        institution_total = len(extracted_institutions)
        self.assertGreater(institution_total, 0)

    def test_extracted_institutions_structure(self):
        for entry in extracted_institutions:
            for field in self.ENTITY_FIELDS:
                self.assertIn(field, entry)

    def test_extracted_topics_is_dict(self):
        self.assertIsInstance(extracted_topics, dict)
        for topic, frequency in extracted_topics.items():
            self.assertIsInstance(topic, str)
            self.assertIsInstance(frequency, int)

    def test_methods_from_abstracts_is_list(self):
        self.assertIsInstance(methods_from_abstracts, list)
        for phrase in methods_from_abstracts:
            self.assertIsInstance(phrase, str)

class TestEntityResolution(unittest.TestCase):
    """Checks the author and institution resolution maps and resolved counts."""

    def test_author_resolution_map_not_empty(self):
        entry_total = len(author_resolution_map)
        self.assertGreater(entry_total, 0)

    def test_author_resolution_map_structure(self):
        for raw_name, canonical_name in author_resolution_map.items():
            self.assertIsInstance(raw_name, str)
            self.assertIsInstance(canonical_name, str)

    def test_institution_resolution_map_not_empty(self):
        entry_total = len(institution_resolution_map)
        self.assertGreater(entry_total, 0)

    def test_resolution_reduces_author_count(self):
        # Resolution may merge names but can never create new ones.
        self.assertLessEqual(resolved_author_count, len(extracted_authors))

    def test_resolved_counts_are_positive(self):
        for resolved_total in (resolved_author_count, resolved_institution_count):
            self.assertGreater(resolved_total, 0)

class TestCitationNetwork(unittest.TestCase):
    """Checks the citation graph, degree tables, PageRank scores and anomaly lists."""

    def test_citation_graph_not_empty(self):
        node_total = len(citation_graph)
        self.assertGreater(node_total, 0)

    def test_citation_graph_structure(self):
        for citing_id, cited_ids in citation_graph.items():
            self.assertIsInstance(citing_id, str)
            self.assertIsInstance(cited_ids, list)

    def test_in_degree_covers_all_papers(self):
        paper_total = len(papers_df)
        self.assertEqual(len(in_degree), paper_total)

    def test_out_degree_covers_all_papers(self):
        paper_total = len(papers_df)
        self.assertEqual(len(out_degree), paper_total)

    def test_pagerank_scores_not_empty(self):
        score_total = len(pagerank_scores)
        self.assertGreater(score_total, 0)

    def test_pagerank_scores_are_floats(self):
        for score in pagerank_scores.values():
            self.assertIsInstance(score, float)

    def test_pagerank_scores_sum_to_one(self):
        score_sum = sum(pagerank_scores.values())
        self.assertAlmostEqual(score_sum, 1.0, delta=0.01)

    def test_pagerank_scores_are_finite(self):
        for paper_id in pagerank_scores:
            self.assertTrue(np.isfinite(pagerank_scores[paper_id]))

    def test_top_cited_papers_length(self):
        listed = len(top_cited_papers)
        self.assertLessEqual(listed, 10)
        self.assertGreater(listed, 0)

    def test_orphan_citations_identified(self):
        self.assertIsInstance(orphan_citations, list)
        orphan_total = len(orphan_citations)
        self.assertGreater(orphan_total, 0)

    def test_orphan_citations_structure(self):
        for record in orphan_citations:
            for field in ('citing_paper', 'cited_paper'):
                self.assertIn(field, record)

    def test_self_citations_identified(self):
        self.assertIsInstance(self_citations, list)
        self_citing_total = len(self_citations)
        self.assertGreater(self_citing_total, 0)


class TestValidationResults(unittest.TestCase):
    """Checks that every expected validation entry is present and passing."""

    REQUIRED_KEYS = {
        "papers_loaded_ok", "citations_loaded_ok", "affiliations_loaded_ok",
        "no_duplicate_paper_ids", "authors_extracted", "institutions_extracted",
        "resolution_maps_valid", "citation_graph_built", "pagerank_computed",
        "orphans_identified", "self_citations_identified", "all_pagerank_finite"
    }

    def test_validation_results_is_dict(self):
        self.assertIsInstance(validation_results, dict)

    def test_validation_results_has_required_keys(self):
        present_keys = set(validation_results.keys())
        self.assertTrue(self.REQUIRED_KEYS.issubset(present_keys))

    def test_all_validations_pass(self):
        failed = [name for name, ok in validation_results.items() if not ok]
        self.assertEqual(len(failed), 0, f"Failed validations: {failed}")


class TestSummaryStats(unittest.TestCase):
    """Checks the keys and basic consistency of the summary statistics."""

    REQUIRED_KEYS = {
        "total_papers", "total_citations", "unique_authors_raw",
        "unique_authors_resolved", "unique_institutions_raw",
        "unique_institutions_resolved", "papers_with_missing_abstract",
        "papers_with_missing_keywords", "orphan_citation_count",
        "self_citation_count", "avg_citations_per_paper",
        "most_common_venue", "year_range"
    }

    def test_summary_stats_is_dict(self):
        self.assertIsInstance(summary_stats, dict)

    def test_summary_stats_has_required_keys(self):
        present_keys = set(summary_stats.keys())
        self.assertTrue(self.REQUIRED_KEYS.issubset(present_keys))

    def test_total_papers_matches_dataframe(self):
        reported_total = summary_stats["total_papers"]
        self.assertEqual(reported_total, len(papers_df))

    def test_year_range_is_valid(self):
        span = summary_stats["year_range"]
        self.assertIsInstance(span, tuple)
        self.assertEqual(len(span), 2)
        earliest, latest = span[0], span[1]
        self.assertLessEqual(earliest, latest)


class TestFinalReport(unittest.TestCase):
    """Checks the section layout and nested keys of the final report."""

    def _assert_has_keys(self, mapping, keys):
        """Assert, in order, that each of ``keys`` is present in ``mapping``."""
        for key in keys:
            self.assertIn(key, mapping)

    def test_final_report_is_dict(self):
        self.assertIsInstance(final_report, dict)

    def test_final_report_has_metadata(self):
        self.assertIn('metadata', final_report)
        self._assert_has_keys(
            final_report['metadata'],
            ('task', 'papers_analyzed', 'execution_timestamp'),
        )

    def test_final_report_has_entity_extraction(self):
        self.assertIn('entity_extraction', final_report)
        self._assert_has_keys(
            final_report['entity_extraction'],
            ('authors', 'institutions', 'topics'),
        )

    def test_final_report_has_citation_analysis(self):
        self.assertIn('citation_analysis', final_report)
        self._assert_has_keys(
            final_report['citation_analysis'],
            ('total_citations', 'top_10_cited_papers', 'orphan_citations',
             'self_citations', 'network_statistics'),
        )

    def test_final_report_has_data_quality(self):
        self.assertIn('data_quality', final_report)
        self._assert_has_keys(
            final_report['data_quality'],
            ('missing_abstracts', 'missing_keywords',
             'missing_institutions', 'duplicate_author_entries'),
        )

    def test_final_report_has_validation_summary(self):
        self.assertIn('validation_summary', final_report)
        self._assert_has_keys(
            final_report['validation_summary'],
            ('all_checks_passed', 'failed_checks'),
        )

    def test_top_authors_structure(self):
        ranked_authors = final_report['entity_extraction']['authors']['top_5_by_paper_count']
        self.assertLessEqual(len(ranked_authors), 5)
        for entry in ranked_authors:
            self._assert_has_keys(entry, ('name', 'paper_count'))

    def test_top_cited_papers_structure(self):
        ranked_papers = final_report['citation_analysis']['top_10_cited_papers']
        self.assertLessEqual(len(ranked_papers), 10)
        for entry in ranked_papers:
            self._assert_has_keys(entry, ('paper_id', 'citation_count', 'title'))

    def test_network_statistics_structure(self):
        self._assert_has_keys(
            final_report['citation_analysis']['network_statistics'],
            ('avg_in_degree', 'avg_out_degree', 'max_in_degree', 'max_out_degree'),
        )

    def test_all_checks_passed(self):
        summary = final_report['validation_summary']
        self.assertTrue(summary['all_checks_passed'])
        self.assertEqual(len(summary['failed_checks']), 0)

In [None]:
# Run all unit tests
def run_tests():
    """Execute every test case class verbosely and print a short summary.

    Returns:
        The ``unittest`` result object produced by the text runner.
    """
    test_case_classes = (
        TestDataLoading,
        TestEntityExtraction,
        TestEntityResolution,
        TestCitationNetwork,
        TestValidationResults,
        TestSummaryStats,
        TestFinalReport,
    )

    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for case_class in test_case_classes:
        suite.addTests(loader.loadTestsFromTestCase(case_class))

    result = unittest.TextTestRunner(verbosity=2).run(suite)

    print("\n" + "="*50)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Success: {result.wasSuccessful()}")

    return result

# Run the suite now and keep the unittest result object for inspection.
test_result = run_tests()

## Success Verification

Final check that all success criteria are met.

In [None]:
def verify_success_criteria():
    """Evaluate the benchmark's success criteria and print a PASS/FAIL line for each.

    Reads the notebook globals produced by the earlier cells.

    Returns:
        ``True`` when every criterion holds, ``False`` otherwise.
    """
    pr_sum = sum(pagerank_scores.values())
    required_keys = {'metadata', 'entity_extraction', 'citation_analysis',
                     'data_quality', 'validation_summary'}

    criteria = {
        # 1. All validation checks pass
        "all_validation_checks_pass": all(validation_results.values()),
        # 2. Entity resolution reduces author count
        "resolution_reduces_authors": resolved_author_count < len(extracted_authors),
        # 3. Orphan citations identified
        "orphan_citations_found": len(orphan_citations) >= 1,
        # 4. Self-citations identified
        "self_citations_found": len(self_citations) >= 1,
        # 5. PageRank sums to ~1.0
        "pagerank_sums_to_one": abs(pr_sum - 1.0) < 0.01,
        # 6. Final report has correct schema
        "final_report_schema_valid": required_keys.issubset(set(final_report.keys())),
        # 7. All numeric values are finite
        "all_numerics_finite": all(np.isfinite(score) for score in pagerank_scores.values()),
    }

    print("=== SUCCESS CRITERIA VERIFICATION ===")
    for criterion, passed in criteria.items():
        status = "PASS" if passed else "FAIL"
        print(f"  [{status}] {criterion}: {passed}")

    all_passed = all(criteria.values())

    print(f"\n{'='*40}")
    if all_passed:
        print("ALL SUCCESS CRITERIA MET!")
    else:
        print("SOME CRITERIA FAILED")

    return all_passed

# Final gate: True only if every success criterion above passed.
success = verify_success_criteria()