In [1]:
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict

sys.path.insert(0, str(Path.cwd().parent))
from matching.common import (
    ReferenceMatchingModel, FeatureExtractor, TextCleaner,
    load_ground_truth, calculate_mrr, calculate_precision_at_k,
    DATA_DIR, OUTPUT_DIR
)

## 1. Load Model and Data

In [2]:
# OUTPUT_DIR comes from matching.common (see the import cell).
# Load the saved reference-matching model from its JSON file.
model_path = OUTPUT_DIR / 'reference_matching_model.json'

model = ReferenceMatchingModel()
model.load(model_path)
print("Model loaded")

Model loaded


In [3]:
# Read the per-publication records; each entry is later accessed through
# its 'pub_id', 'bibs' and 'refs' keys.
extracted_path = OUTPUT_DIR / 'extracted_data.json'
with extracted_path.open('r', encoding='utf-8') as handle:
    all_data = json.load(handle)

n_publications = len(all_data)
print(f"Loaded {n_publications} publications")

Loaded 2546 publications


## 2. Evaluation Functions

In [4]:
# No metric code is defined in this notebook: calculate_mrr and
# calculate_precision_at_k are imported from matching.common in the import cell.
# Call shapes, as used in the "Compute Metrics" section:
#   calculate_mrr(predictions, ground_truth, top_k)
#   calculate_precision_at_k(predictions, ground_truth, k)
print("Metrics: MRR@5, P@1, P@3, P@5")

Metrics: MRR@5, P@1, P@3, P@5


## 3. Feature Extraction (from notebook 2)

In [5]:
# TextCleaner and FeatureExtractor are imported from matching.common;
# this cell only adds a short module-level alias for the vector extractor.
def extract_features(bib: Dict, ref: Dict) -> np.ndarray:
    """Return the feature vector for one (bib, ref) pair.

    Thin wrapper: both arguments are forwarded unchanged to
    ``FeatureExtractor.extract_features_vector`` and its result is returned
    as-is (annotated here as ``np.ndarray``).
    """
    feature_vector = FeatureExtractor.extract_features_vector(bib, ref)
    return feature_vector

## 4. Generate Predictions

In [6]:
def rank_candidates(bib: Dict, refs: Dict[str, Dict], model, top_k: int = 5) -> List[Tuple[str, float]]:
    """Rank candidate references for one bib entry.

    Delegates to ``model.rank_candidates(bib, refs, top_k)`` and returns
    whatever the model returns; callers in this notebook unpack each item
    as an ``(arxiv_id, score)`` pair.
    """
    ranking = model.rank_candidates(bib, refs, top_k)
    return ranking

In [7]:
# load_ground_truth (from matching.common) returns two lookups: labels per
# publication and the partition name per publication.
ground_truth_lookup, partition_lookup = load_ground_truth(DATA_DIR)

partition_labels = list(partition_lookup.values())
n_test = partition_labels.count('test')
n_valid = partition_labels.count('valid')

print(f"Ground truth: {len(ground_truth_lookup)} pubs, "
      f"Test: {n_test}, "
      f"Valid: {n_valid}")

Ground truth: 1854 pubs, Test: 2, Valid: 2


In [8]:
# Build three parallel lists for the TEST partition: the ranked candidate ids,
# the labelled id, and the owning publication of every labelled bib entry.
test_predictions = []
test_ground_truth = []
test_pub_ids = []

for pub_data in all_data:
    pub_id = pub_data['pub_id']
    in_test_split = partition_lookup.get(pub_id) == 'test'
    if not (in_test_split and pub_id in ground_truth_lookup):
        continue

    pub_labels = ground_truth_lookup[pub_id]
    for bib in pub_data['bibs']:
        bib_key = bib['key']
        if bib_key in pub_labels:
            ranked = rank_candidates(bib, pub_data['refs'], model, top_k=5)
            # Keep only the candidate ids; scores are not needed for the metrics.
            test_predictions.append([candidate_id for candidate_id, _score in ranked])
            test_ground_truth.append(pub_labels[bib_key])
            test_pub_ids.append(pub_id)

n_test_pubs = len(set(test_pub_ids))
print(f"Test: {n_test_pubs} pubs, {len(test_predictions)} entries")

Test: 2 pubs, 16 entries


## 5. Compute Metrics

In [9]:
# Aggregate MRR@5 and Precision@k over every labelled test entry.
mrr_5 = calculate_mrr(test_predictions, test_ground_truth, top_k=5)
p_at_1 = calculate_precision_at_k(test_predictions, test_ground_truth, k=1)
p_at_3 = calculate_precision_at_k(test_predictions, test_ground_truth, k=3)
p_at_5 = calculate_precision_at_k(test_predictions, test_ground_truth, k=5)

print("=" * 50)
print("TEST SET METRICS")
print("=" * 50)
print(f"MRR@5: {mrr_5:.4f}, P@1: {p_at_1:.4f}, P@3: {p_at_3:.4f}, P@5: {p_at_5:.4f}")

# Per-publication breakdown. Entries are grouped in a single pass instead of
# re-scanning the three parallel lists once per publication.
entries_by_pub = defaultdict(lambda: ([], []))
for entry_preds, entry_gt, entry_pub in zip(test_predictions, test_ground_truth, test_pub_ids):
    grouped_preds, grouped_gt = entries_by_pub[entry_pub]
    grouped_preds.append(entry_preds)
    grouped_gt.append(entry_gt)

for pub in sorted(entries_by_pub):
    pub_preds, pub_gt = entries_by_pub[pub]
    # Pass the cut-offs explicitly so the printed labels (MRR@5, P@1) are
    # guaranteed to match what is computed, independent of the helpers' defaults.
    pub_mrr_5 = calculate_mrr(pub_preds, pub_gt, top_k=5)
    pub_p_at_1 = calculate_precision_at_k(pub_preds, pub_gt, k=1)
    print(f"  {pub}: MRR@5={pub_mrr_5:.4f}, P@1={pub_p_at_1:.4f}")

TEST SET METRICS
MRR@5: 0.9375, P@1: 0.8750, P@3: 1.0000, P@5: 1.0000
  2411-00222: MRR@5=0.9333, P@1=0.8667
  2411-00232: MRR@5=1.0000, P@1=1.0000


## 6. Generate pred.json Files

In [10]:
def generate_pred_json(pub_path: Path, pub_data: Dict, gt_dict: Dict, model,
                       partition: str = "test", top_k: int = 5):
    """Write ``pred.json`` for one publication and return its path.

    Args:
        pub_path: Directory that receives ``pred.json`` (must already exist).
        pub_data: Publication record; its ``'bibs'`` list and ``'refs'``
            mapping are read.
        gt_dict: Mapping of bib key -> labelled id, copied into the
            ``"groundtruth"`` section.
        model: Passed through to ``rank_candidates``.
        partition: Value stored under ``"partition"``.
        top_k: Number of ranked candidates requested per bib key (default 5,
            the value previously hard-coded).

    Returns:
        ``pub_path / 'pred.json'``.

    Every key in ``gt_dict`` gets a ``"prediction"`` entry; keys with no
    matching bib in ``pub_data['bibs']`` get an empty list.
    """
    # Index bibs by key once instead of scanning the list for every label.
    # setdefault keeps the FIRST bib for a duplicated key, which is what a
    # first-match linear scan would have picked. Bibs without a 'key' can
    # never match a label, so they are skipped.
    bibs_by_key = {}
    for candidate_bib in pub_data['bibs']:
        if 'key' in candidate_bib:
            bibs_by_key.setdefault(candidate_bib['key'], candidate_bib)

    pred_structure = {"partition": partition, "groundtruth": {}, "prediction": {}}

    for bib_key, arxiv_id in gt_dict.items():
        pred_structure["groundtruth"][bib_key] = arxiv_id
        bib = bibs_by_key.get(bib_key)
        if bib is not None:
            ranked = rank_candidates(bib, pub_data['refs'], model, top_k=top_k)
            pred_structure["prediction"][bib_key] = [aid for aid, _ in ranked]
        else:
            pred_structure["prediction"][bib_key] = []

    out_path = pub_path / 'pred.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(pred_structure, f, indent=2)
    return out_path

In [11]:
# Write one pred.json per publication that has ground-truth labels.
# Publications missing from partition_lookup are counted as 'train'.
partition_counts = {'test': 0, 'valid': 0, 'train': 0}
generated = 0

labelled_pubs = (entry for entry in all_data if entry['pub_id'] in ground_truth_lookup)
for pub_data in labelled_pubs:
    pub_id = pub_data['pub_id']
    partition = partition_lookup.get(pub_id, 'train')
    pub_labels = ground_truth_lookup[pub_id]
    generate_pred_json(DATA_DIR / pub_id, pub_data, pub_labels, model, partition)
    generated += 1
    partition_counts[partition] += 1

print(f"Generated {generated} pred.json files: Test={partition_counts['test']}, Valid={partition_counts['valid']}, Train={partition_counts['train']}")

Generated 1846 pred.json files: Test=2, Valid=2, Train=1842


## 7. Summary

In [12]:
# Final recap of the numbers computed in the cells above.
banner = "=" * 50
summary_lines = [
    banner,
    "EVALUATION SUMMARY",
    banner,
    f"Publications: {len(ground_truth_lookup)} with labels",
    f"Test entries: {len(test_predictions)}",
    f"MRR@5: {mrr_5:.4f}, P@1: {p_at_1:.4f}",
    f"Output: {generated} pred.json files",
]
for summary_line in summary_lines:
    print(summary_line)

EVALUATION SUMMARY
Publications: 1854 with labels
Test entries: 16
MRR@5: 0.9375, P@1: 0.8750
Output: 1846 pred.json files


---
## Pipeline Complete!

| Requirement | Implementation |
|-------------|----------------|
| 2.2.1 Data Cleaning | Lowercasing, tokenization, LaTeX cleanup |
| 2.2.2 Labels | 5 manual + auto-labeled (regex/similarity) |
| 2.2.3 Features | 12 features: title, author, year, arXiv |
| 2.2.4 Data Split | Publication-level: test/valid/train |
| 2.2.5 Evaluation | MRR@5 and P@1/P@3/P@5 on test set |

**pred.json format:** `{partition, groundtruth, prediction}`