In [1]:
import sys
import json
import re
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Set
from collections import defaultdict

sys.path.insert(0, str(Path.cwd().parent))

## 1. Load Model and Data

In [2]:
DATA_DIR = Path("../../23120260")
OUTPUT_DIR = Path("../../output")

class ReferenceMatchingModel:
    def __init__(self):
        self.weights, self.bias = None, 0.0
    
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
    
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        return self._sigmoid(np.dot(X, self.weights) + self.bias)
    
    def load(self, path: Path):
        with open(path, 'r') as f:
            data = json.load(f)
        self.weights = np.array(data['weights']) if data['weights'] else None
        self.bias = data['bias']

model = ReferenceMatchingModel()
model.load(OUTPUT_DIR / 'reference_matching_model.json')
print("Model loaded")

Model loaded


In [3]:
with open(OUTPUT_DIR / 'extracted_data.json', 'r', encoding='utf-8') as f:
    all_data = json.load(f)
print(f"Loaded {len(all_data)} publications")

Loaded 893 publications


## 2. Evaluation Functions

In [4]:
def calculate_mrr(predictions: List[List[str]], ground_truth: List[str], top_k: int = 5) -> float:
    """Mean Reciprocal Rank"""
    rr = []
    for pred_list, true_id in zip(predictions, ground_truth):
        rank = next((i + 1 for i, p in enumerate(pred_list[:top_k]) if p == true_id), 0)
        rr.append(1.0 / rank if rank > 0 else 0.0)
    return np.mean(rr) if rr else 0.0

def calculate_precision_at_k(predictions: List[List[str]], ground_truth: List[str], k: int = 1) -> float:
    """Precision@K"""
    return sum(1 for pred, true in zip(predictions, ground_truth) if true in pred[:k]) / len(predictions) if predictions else 0.0

## 3. Feature Extraction (from notebook 2)

In [5]:
class TextCleaner:
    STOP_WORDS = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    
    @staticmethod
    def clean_text(text: str) -> str:
        if not text: return ""
        text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text.lower())
        text = re.sub(r'\\[a-zA-Z]+', '', text)
        return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', text)).strip()
    
    @staticmethod
    def clean_title(title: str) -> str:
        return re.sub(r'^(on|the|a|an)\s+', '', TextCleaner.clean_text(title))
    
    @staticmethod
    def tokenize(text: str) -> List[str]:
        return [t for t in TextCleaner.clean_text(text).split() if t not in TextCleaner.STOP_WORDS]
    
    @staticmethod
    def extract_author_last_names(authors: List[str]) -> List[str]:
        return [p[-1] for a in authors if a for p in [TextCleaner.clean_text(a).split()] if p]

def extract_features(bib: Dict, ref: Dict) -> np.ndarray:
    bib_title, ref_title = TextCleaner.clean_title(bib.get('title', '')), TextCleaner.clean_title(ref.get('title', ''))
    bib_tokens, ref_tokens = set(TextCleaner.tokenize(bib_title)), set(TextCleaner.tokenize(ref_title))
    
    jaccard = len(bib_tokens & ref_tokens) / len(bib_tokens | ref_tokens) if bib_tokens and ref_tokens else 0.0
    overlap = len(bib_tokens & ref_tokens) / min(len(bib_tokens), len(ref_tokens)) if bib_tokens and ref_tokens else 0.0
    edit_sim = 1.0 - abs(len(bib_title) - len(ref_title)) / max(len(bib_title), len(ref_title), 1)
    
    bib_authors = set(TextCleaner.extract_author_last_names(bib.get('authors', [])))
    ref_authors = set(TextCleaner.extract_author_last_names(ref.get('authors', [])))
    author_overlap = len(bib_authors & ref_authors) / min(len(bib_authors), len(ref_authors)) if bib_authors and ref_authors else 0.0
    first_match = 1.0 if list(bib_authors)[:1] == list(ref_authors)[:1] and bib_authors else 0.0
    
    bib_year, ref_year = bib.get('year', ''), ref.get('year', '')
    year_match = 1.0 if bib_year == ref_year else 0.0
    try: year_diff = abs(int(bib_year) - int(ref_year)) if bib_year and ref_year else 10
    except: year_diff = 10
    
    bib_arxiv = (bib.get('arxiv_id') or '').replace('.', '-')
    ref_arxiv = (ref.get('arxiv_id') or '').replace('.', '-')
    arxiv_match = 1.0 if bib_arxiv and ref_arxiv and bib_arxiv == ref_arxiv else 0.0
    arxiv_in_content = 1.0 if ref_arxiv.replace('-', '.') in bib.get('raw_content', '') else 0.0
    
    len_ratio = len(bib_title) / len(ref_title) if ref_title else 0
    title_len_ratio = min(len_ratio, 1/len_ratio) if len_ratio > 0 else 0
    combined = 0.4 * jaccard + 0.3 * author_overlap + 0.2 * year_match + 0.1 * first_match
    
    return np.array([jaccard, overlap, edit_sim, author_overlap, first_match, year_match, year_diff,
                     arxiv_match, arxiv_in_content, len(bib_authors & ref_authors), title_len_ratio, combined])

## 4. Generate Predictions

In [6]:
def rank_candidates(bib: Dict, refs: Dict[str, Dict], model, top_k: int = 5) -> List[Tuple[str, float]]:
    scores = [(arxiv_id, model.predict_proba(extract_features(bib, ref).reshape(1, -1))[0]) 
              for arxiv_id, ref in refs.items()]
    return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

In [7]:
# Load ground truth from pred.json files
ground_truth_lookup, partition_lookup = defaultdict(dict), {}

for pub_data in all_data:
    pred_file = DATA_DIR / pub_data['pub_id'] / "pred.json"
    if pred_file.exists():
        with open(pred_file, 'r') as f:
            pred_data = json.load(f)
        partition_lookup[pub_data['pub_id']] = pred_data.get('partition', 'train')
        for bib_key, arxiv_id in pred_data.get('groundtruth', {}).items():
            ground_truth_lookup[pub_data['pub_id']][bib_key] = arxiv_id

print(f"Ground truth: {len(ground_truth_lookup)} pubs, Test: {sum(1 for p in partition_lookup.values() if p == 'test')}")

Ground truth: 677 pubs, Test: 4


In [8]:
# Generate predictions for TEST SET
test_predictions, test_ground_truth, test_pub_ids = [], [], []

for pub_data in all_data:
    pub_id = pub_data['pub_id']
    if partition_lookup.get(pub_id) != 'test' or pub_id not in ground_truth_lookup:
        continue
    
    for bib in pub_data['bibs']:
        if bib['key'] not in ground_truth_lookup[pub_id]:
            continue
        ranked = rank_candidates(bib, pub_data['refs'], model, top_k=5)
        test_predictions.append([arxiv_id for arxiv_id, _ in ranked])
        test_ground_truth.append(ground_truth_lookup[pub_id][bib['key']])
        test_pub_ids.append(pub_id)

print(f"Test: {len(set(test_pub_ids))} pubs, {len(test_predictions)} entries")

Test: 4 pubs, 24 entries


## 5. Compute Metrics

In [9]:
# MRR@5 and Precision metrics
mrr_5 = calculate_mrr(test_predictions, test_ground_truth, top_k=5)
p_at_1 = calculate_precision_at_k(test_predictions, test_ground_truth, k=1)
p_at_3 = calculate_precision_at_k(test_predictions, test_ground_truth, k=3)
p_at_5 = calculate_precision_at_k(test_predictions, test_ground_truth, k=5)

print("=" * 50)
print("TEST SET METRICS")
print("=" * 50)
print(f"MRR@5: {mrr_5:.4f}, P@1: {p_at_1:.4f}, P@3: {p_at_3:.4f}, P@5: {p_at_5:.4f}")

for pub in sorted(set(test_pub_ids)):
    pub_preds = [p for p, pid in zip(test_predictions, test_pub_ids) if pid == pub]
    pub_gt = [g for g, pid in zip(test_ground_truth, test_pub_ids) if pid == pub]
    print(f"  {pub}: MRR@5={calculate_mrr(pub_preds, pub_gt):.4f}, P@1={calculate_precision_at_k(pub_preds, pub_gt):.4f}")

TEST SET METRICS
MRR@5: 0.6660, P@1: 0.5833, P@3: 0.6667, P@5: 0.8750
  2411-00222: MRR@5=0.5156, P@1=0.4000
  2411-00236: MRR@5=1.0000, P@1=1.0000
  2411-00252: MRR@5=0.8500, P@1=0.8000
  2411-00260: MRR@5=1.0000, P@1=1.0000


## 6. Generate pred.json Files

In [10]:
def generate_pred_json(pub_path: Path, pub_data: Dict, gt_dict: Dict, model, partition: str = "test"):
    """Generate pred.json with top-5 predictions"""
    pred_structure = {"partition": partition, "groundtruth": {}, "prediction": {}}
    
    for bib_key, arxiv_id in gt_dict.items():
        pred_structure["groundtruth"][bib_key] = arxiv_id
        bib = next((b for b in pub_data['bibs'] if b['key'] == bib_key), None)
        if bib:
            ranked = rank_candidates(bib, pub_data['refs'], model, top_k=5)
            pred_structure["prediction"][bib_key] = [aid for aid, _ in ranked]
        else:
            pred_structure["prediction"][bib_key] = []
    
    with open(pub_path / 'pred.json', 'w', encoding='utf-8') as f:
        json.dump(pred_structure, f, indent=2)
    return pub_path / 'pred.json'

In [11]:
# Generate pred.json for all publications with ground truth
generated, partition_counts = 0, {'test': 0, 'valid': 0, 'train': 0}

for pub_data in all_data:
    pub_id = pub_data['pub_id']
    if pub_id not in ground_truth_lookup:
        continue
    partition = partition_lookup.get(pub_id, 'train')
    generate_pred_json(DATA_DIR / pub_id, pub_data, ground_truth_lookup[pub_id], model, partition)
    generated += 1
    partition_counts[partition] += 1

print(f"Generated {generated} pred.json files: Test={partition_counts['test']}, Valid={partition_counts['valid']}, Train={partition_counts['train']}")

Generated 677 pred.json files: Test=4, Valid=2, Train=671


## 7. Summary

In [12]:
print("=" * 50)
print("EVALUATION SUMMARY")
print("=" * 50)
print(f"Publications: {len(ground_truth_lookup)} with labels")
print(f"Test entries: {len(test_predictions)}")
print(f"MRR@5: {mrr_5:.4f}, P@1: {p_at_1:.4f}")
print(f"Output: {generated} pred.json files")

EVALUATION SUMMARY
Publications: 677 with labels
Test entries: 24
MRR@5: 0.6660, P@1: 0.5833
Output: 677 pred.json files


---
## Pipeline Complete!

| Requirement | Implementation |
|-------------|----------------|
| 2.2.1 Data Cleaning | Lowercasing, tokenization, LaTeX cleanup |
| 2.2.2 Labels | 5 manual + auto-labeled (regex/similarity) |
| 2.2.3 Features | 12 features: title, author, year, arXiv |
| 2.2.4 Data Split | Publication-level: test/valid/train |
| 2.2.5 Evaluation | MRR@5 on test set |

**pred.json format:** `{partition, groundtruth, prediction}`