In [None]:
# Imports
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict

sys.path.insert(0, str(Path.cwd().parent))

## 1. Load Model and Data

In [None]:
# Configuration
# Both directories are relative paths, so they resolve against the kernel's
# current working directory.
_BASE_DIR = Path("../..")
DATA_DIR = _BASE_DIR / "23120260"
OUTPUT_DIR = _BASE_DIR / "output"

# Load model
class ReferenceMatchingModel:
    """Linear scorer with a sigmoid output whose parameters come from a JSON file.

    Attributes:
        weights: ``None`` until ``load`` reads a non-empty weight list,
            afterwards a NumPy array built from that list.
        bias: Scalar added to the linear term (``0.0`` before loading).
    """

    def __init__(self):
        # Parameters are unset until load() is called.
        self.weights = None
        self.bias = 0.0

    def _sigmoid(self, x):
        """Return 1 / (1 + exp(-x)) with x clipped to [-500, 500] first."""
        clipped = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-clipped))

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Score ``X`` as sigmoid(X . weights + bias).

        Raises:
            ValueError: If no weights have been loaded.
        """
        if self.weights is None:
            raise ValueError("Model not loaded")
        return self._sigmoid(np.dot(X, self.weights) + self.bias)

    def load(self, path: Path):
        """Read ``weights`` and ``bias`` from the JSON file at ``path``.

        An empty or null ``weights`` entry leaves ``self.weights`` as ``None``.
        """
        with open(path, 'r') as handle:
            payload = json.load(handle)
        stored_weights = payload['weights']
        self.weights = np.array(stored_weights) if stored_weights else None
        self.bias = payload['bias']

# Create the scorer and fill in its parameters from the saved JSON file.
model_path = OUTPUT_DIR / 'reference_matching_model.json'
model = ReferenceMatchingModel()
model.load(model_path)
print("Model loaded")

In [None]:
# Load extracted data
extracted_path = OUTPUT_DIR / 'extracted_data.json'
all_data = json.loads(extracted_path.read_text(encoding='utf-8'))

# Load manual labels (optional: an absent file yields an empty list)
manual_labels_path = OUTPUT_DIR / 'manual_labels.json'
if manual_labels_path.exists():
    manual_labels = json.loads(manual_labels_path.read_text(encoding='utf-8'))
else:
    manual_labels = []

print(f"Loaded {len(all_data)} publications")
print(f"Loaded {len(manual_labels)} manual labels")

## 2. Evaluation Functions

In [None]:
def calculate_mrr(predictions: List[List[str]], ground_truth: List[str], top_k: int = 5) -> float:
    """
    Mean Reciprocal Rank over paired (ranked list, correct id) queries.

    Each query contributes 1/rank of the correct id (1-based) when it appears
    within the first ``top_k`` predictions, and 0.0 otherwise.

    Args:
        predictions: One ranked list of candidate ids per query
        ground_truth: The correct id for each query, paired by position
        top_k: Number of leading predictions inspected per query

    Returns:
        Mean of the per-query reciprocal ranks, or 0.0 when there are no queries
    """
    def _reciprocal_rank(candidates, target):
        # Position is 1-based; only the first top_k candidates are inspected.
        for position, candidate in enumerate(candidates[:top_k], start=1):
            if candidate == target:
                return 1.0 / position
        return 0.0

    per_query = [
        _reciprocal_rank(candidates, target)
        for candidates, target in zip(predictions, ground_truth)
    ]

    if not per_query:
        return 0.0
    return np.mean(per_query)


def calculate_precision_at_k(predictions: List[List[str]], ground_truth: List[str], k: int = 1) -> float:
    """
    Fraction of queries whose correct id is among the first ``k`` predictions.

    Args:
        predictions: One ranked list of candidate ids per query
        ground_truth: The correct id for each query, paired by position
        k: Number of leading predictions inspected per query

    Returns:
        Number of hits divided by ``len(predictions)``; 0.0 when
        ``predictions`` is empty
    """
    hits = sum(
        1
        for ranked, target in zip(predictions, ground_truth)
        if target in ranked[:k]
    )

    if predictions:
        return hits / len(predictions)
    return 0.0

## 3. Feature Extraction (from notebook 2)

In [None]:
import re
from typing import Set

class TextCleaner:
    """Stateless text-normalisation helpers used when building features."""

    # Words dropped by tokenize().
    STOP_WORDS = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

    @staticmethod
    def clean_text(text: str) -> str:
        """Lowercase ``text``, strip LaTeX commands and punctuation, collapse spaces.

        Falsy input (``None`` or ``""``) yields ``""``.
        """
        if not text:
            return ""

        # Applied in order: unwrap \cmd{arg} to arg, drop remaining \cmd,
        # turn punctuation into spaces, then squeeze runs of whitespace.
        substitutions = (
            (r'\\[a-zA-Z]+\{([^}]*)\}', r'\1'),
            (r'\\[a-zA-Z]+', ''),
            (r'[^\w\s]', ' '),
            (r'\s+', ' '),
        )
        cleaned = text.lower()
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()

    @staticmethod
    def clean_title(title: str) -> str:
        """Clean ``title`` and remove one leading "on", "the", "a" or "an"."""
        return re.sub(r'^(on|the|a|an)\s+', '', TextCleaner.clean_text(title))

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Split the cleaned text on whitespace and drop stop words."""
        kept = []
        for word in TextCleaner.clean_text(text).split():
            if word not in TextCleaner.STOP_WORDS:
                kept.append(word)
        return kept

    @staticmethod
    def extract_author_last_names(authors: List[str]) -> List[str]:
        """Return the last whitespace-separated token of each cleaned author string.

        Falsy entries, and entries that are empty after cleaning, are skipped.
        Input order is preserved.
        """
        token_lists = (TextCleaner.clean_text(name).split() for name in authors if name)
        return [tokens[-1] for tokens in token_lists if tokens]


def extract_features(bib: Dict, ref: Dict) -> np.ndarray:
    """Build the 12-element feature vector for one (BibTeX entry, reference) pair.

    Args:
        bib: BibTeX entry; the keys read are 'title', 'authors', 'year',
            'arxiv_id' and 'raw_content' (all optional).
        ref: Candidate reference; the keys read are 'title', 'authors',
            'year' and 'arxiv_id' (all optional).

    Returns:
        Array with, in order: jaccard, overlap, edit_sim, author_overlap,
        first_author_match, year_match, year_diff, arxiv_match,
        arxiv_in_content, num_matching_authors, title_len_ratio,
        combined_score.
    """
    # Title features
    bib_title = TextCleaner.clean_title(bib.get('title', ''))
    ref_title = TextCleaner.clean_title(ref.get('title', ''))
    bib_tokens = set(TextCleaner.tokenize(bib_title))
    ref_tokens = set(TextCleaner.tokenize(ref_title))

    # Jaccard and overlap coefficient share the intersection size; both are 0
    # when either title has no tokens.
    if bib_tokens and ref_tokens:
        shared_tokens = len(bib_tokens & ref_tokens)
        jaccard = shared_tokens / len(bib_tokens | ref_tokens)
        overlap = shared_tokens / min(len(bib_tokens), len(ref_tokens))
    else:
        jaccard = 0.0
        overlap = 0.0

    # "Edit distance (simplified)": only the two title lengths are compared,
    # not their characters.
    edit_sim = 1.0 - (abs(len(bib_title) - len(ref_title)) / max(len(bib_title), len(ref_title), 1))

    # Author features. The ordered lists are kept because sets have no
    # defined order, so "first author" must come from the list. `or []`
    # tolerates an explicit None.
    bib_author_names = TextCleaner.extract_author_last_names(bib.get('authors') or [])
    ref_author_names = TextCleaner.extract_author_last_names(ref.get('authors') or [])
    bib_authors = set(bib_author_names)
    ref_authors = set(ref_author_names)

    num_matching_authors = len(bib_authors & ref_authors)
    if bib_authors and ref_authors:
        author_overlap = num_matching_authors / min(len(bib_authors), len(ref_authors))
    else:
        author_overlap = 0.0

    first_author_match = 1.0 if (bib_author_names and ref_author_names and
                                  bib_author_names[0] == ref_author_names[0]) else 0.0

    # Year features
    bib_year = bib.get('year', '')
    ref_year = ref.get('year', '')
    # NOTE: raw equality, so two missing years compare equal, and a str/int
    # pair such as "2020" vs 2020 does not.
    year_match = 1.0 if bib_year == ref_year else 0.0
    try:
        year_diff = abs(int(bib_year) - int(ref_year)) if bib_year and ref_year else 10
    except (TypeError, ValueError):
        # Non-numeric year: use the same value as a missing year.
        year_diff = 10

    # ArXiv features: ids are compared with '.' normalised to '-'.
    bib_arxiv = (bib.get('arxiv_id') or '').replace('.', '-')
    ref_arxiv = (ref.get('arxiv_id') or '').replace('.', '-')
    arxiv_match = 1.0 if (bib_arxiv and ref_arxiv and bib_arxiv == ref_arxiv) else 0.0

    # The raw entry text is searched for the dotted form of the id. An empty
    # id must not count: '' is a substring of every string.
    raw_content = bib.get('raw_content') or ''
    dotted_ref_arxiv = ref_arxiv.replace('-', '.')
    arxiv_in_content = 1.0 if (dotted_ref_arxiv and dotted_ref_arxiv in raw_content) else 0.0

    # Ratio of the shorter cleaned title length to the longer (0 if either is empty).
    len_ratio = len(bib_title) / len(ref_title) if ref_title else 0
    title_len_ratio = min(len_ratio, 1/len_ratio) if len_ratio > 0 else 0

    combined_score = 0.4 * jaccard + 0.3 * author_overlap + 0.2 * year_match + 0.1 * first_author_match

    return np.array([
        jaccard, overlap, edit_sim,
        author_overlap, first_author_match,
        year_match, year_diff,
        arxiv_match, arxiv_in_content,
        num_matching_authors, title_len_ratio, combined_score
    ])

## 4. Generate Predictions

In [None]:
def rank_candidates(bib: Dict, refs: Dict[str, Dict], model: ReferenceMatchingModel, top_k: int = 5) -> List[Tuple[str, float]]:
    """Score every reference against ``bib`` and return the ``top_k`` best.

    Returns a list of (reference id, score) pairs sorted by descending score;
    ties keep the iteration order of ``refs``.
    """
    def _score(candidate: Dict):
        # The model is fed a single-row matrix; [0] unwraps the one score.
        vector = extract_features(bib, candidate)
        return model.predict_proba(vector.reshape(1, -1))[0]

    scored = [(ref_id, _score(candidate)) for ref_id, candidate in refs.items()]
    ranking = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranking[:top_k]

In [None]:
# ============================================================================
# Load ground truth from pred.json files (supports both manual and auto labels)
# ============================================================================

# pub_id -> {bib_key: arxiv_id}; an entry appears only once a label is stored.
ground_truth_lookup = defaultdict(dict)
# pub_id -> partition name, for every publication that has a pred.json file.
partition_lookup = {}

for pub_data in all_data:
    pub_id = pub_data['pub_id']
    pub_path = DATA_DIR / pub_id
    pred_file = pub_path / "pred.json"

    if not pred_file.exists():
        continue

    # Explicit UTF-8, matching how pred.json is written and how the other
    # JSON inputs are read, instead of the platform default encoding.
    with open(pred_file, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)

    # A file without a 'partition' field is treated as training data.
    partition = pred_data.get('partition', 'train')
    partition_lookup[pub_id] = partition

    for bib_key, arxiv_id in pred_data.get('groundtruth', {}).items():
        ground_truth_lookup[pub_id][bib_key] = arxiv_id

print(f"Ground truth for {len(ground_truth_lookup)} publications")
for split_name in ('test', 'valid', 'train'):
    split_size = sum(1 for p in partition_lookup.values() if p == split_name)
    print(f"  {split_name.capitalize()}: {split_size}")

In [None]:
# ============================================================================
# Generate predictions for TEST SET ONLY (Requirement 2.2.5)
# Test set = 2 publications (1 manual + 1 auto-labeled)
# ============================================================================

# Three parallel lists: one element per evaluated BibTeX entry.
test_predictions = []
test_ground_truth = []
test_pub_ids = []

for pub_data in all_data:
    pub_id = pub_data['pub_id']

    # Skip publications outside the TEST partition or without any label.
    in_test_split = partition_lookup.get(pub_id) == 'test'
    if not in_test_split or pub_id not in ground_truth_lookup:
        continue

    labels = ground_truth_lookup[pub_id]
    labelled_entries = (entry for entry in pub_data['bibs'] if entry['key'] in labels)

    for entry in labelled_entries:
        # Rank candidates (top 5 as per requirement) and keep only the ids.
        top_five = rank_candidates(entry, pub_data['refs'], model, top_k=5)

        test_predictions.append([candidate_id for candidate_id, _ in top_five])
        test_ground_truth.append(labels[entry['key']])
        test_pub_ids.append(pub_id)

evaluated_pubs = sorted(set(test_pub_ids))

print("Test set evaluation:")
print(f"  Publications: {len(evaluated_pubs)}")
print(f"  Total entries: {len(test_predictions)}")
for pub in evaluated_pubs:
    print(f"    - {pub}: {test_pub_ids.count(pub)} entries")

## 5. Compute Metrics

In [None]:
# ============================================================================
# Compute MRR@5 (Requirement 2.2.5)
# ============================================================================
# MRR = (1/|Q|) * sum(1/rank_i) where rank_i is position of correct match

mrr_5 = calculate_mrr(test_predictions, test_ground_truth, top_k=5)
p_at_1, p_at_3, p_at_5 = (
    calculate_precision_at_k(test_predictions, test_ground_truth, k=cutoff)
    for cutoff in (1, 3, 5)
)

banner = "=" * 60
print(banner)
print("EVALUATION METRICS (Test Set Only)")
print(banner)
print(f"\nMRR@5: {mrr_5:.4f}")
print(f"Precision@1: {p_at_1:.4f}")
print(f"Precision@3: {p_at_3:.4f}")
print(f"Precision@5: {p_at_5:.4f}")

# Detailed breakdown by publication: bucket the parallel lists by pub id in
# a single pass, then report each bucket in sorted order.
per_pub = {}
for pred, truth, pid in zip(test_predictions, test_ground_truth, test_pub_ids):
    bucket_preds, bucket_truth = per_pub.setdefault(pid, ([], []))
    bucket_preds.append(pred)
    bucket_truth.append(truth)

print("\n" + "-" * 40)
print("Breakdown by Test Publication:")
for pub in sorted(per_pub):
    pub_preds, pub_gt = per_pub[pub]
    pub_mrr = calculate_mrr(pub_preds, pub_gt, top_k=5)
    pub_p1 = calculate_precision_at_k(pub_preds, pub_gt, k=1)
    print(f"  {pub}: MRR@5={pub_mrr:.4f}, P@1={pub_p1:.4f} ({len(pub_preds)} entries)")

## 6. Generate pred.json Files

In [None]:
def generate_pred_json(pub_path: Path, pub_data: Dict, ground_truth_dict: Dict, 
                       model: ReferenceMatchingModel, partition: str = "test",
                       top_k: int = 5):
    """
    Generate pred.json for a publication (Requirement 3.1.3).

    Format:
    {
        "partition": "test" | "valid" | "train",
        "groundtruth": {"bibtex_entry_name_1": "arxiv_id_from_references_json", ...},
        "prediction": {"bibtex_entry_name_1": ["candidate_id_1", "candidate_id_2", ...], ...}
    }

    Args:
        pub_path: Directory that receives ``pred.json`` (an existing file is
            overwritten).
        pub_data: Publication record; ``pub_data['bibs']`` and
            ``pub_data['refs']`` are read.
        ground_truth_dict: Mapping of BibTeX key to the correct reference id.
        model: Scorer passed through to ``rank_candidates``.
        partition: Value stored under "partition".
        top_k: Number of ranked candidates stored per BibTeX key.

    Returns:
        Path of the written ``pred.json``.
    """
    pred_structure = {
        "partition": partition,
        "groundtruth": {},
        "prediction": {}
    }

    # Index the BibTeX entries once instead of scanning the list for every
    # key. setdefault keeps the FIRST entry when a key occurs more than once.
    bibs_by_key = {}
    for bib in pub_data['bibs']:
        bibs_by_key.setdefault(bib.get('key'), bib)

    # Add ground truth and predictions
    for bib_key, true_arxiv_id in ground_truth_dict.items():
        pred_structure["groundtruth"][bib_key] = true_arxiv_id

        bib_entry = bibs_by_key.get(bib_key)
        if bib_entry is not None:
            ranked = rank_candidates(bib_entry, pub_data['refs'], model, top_k=top_k)
            pred_structure["prediction"][bib_key] = [candidate_id for candidate_id, _ in ranked]
        else:
            # A labelled key with no matching BibTeX entry gets no candidates.
            pred_structure["prediction"][bib_key] = []

    # Save pred.json
    pred_path = pub_path / 'pred.json'
    with open(pred_path, 'w', encoding='utf-8') as f:
        json.dump(pred_structure, f, indent=2)

    return pred_path

In [None]:
# Generate pred.json for ALL publications with ground truth
# Include partition information from lookup

generated_count = 0
partition_counts = {'test': 0, 'valid': 0, 'train': 0}

for pub_data in all_data:
    pub_id = pub_data['pub_id']

    if pub_id not in ground_truth_lookup:
        continue

    pub_path = DATA_DIR / pub_id
    gt_dict = ground_truth_lookup[pub_id]
    partition = partition_lookup.get(pub_id, 'train')

    generate_pred_json(pub_path, pub_data, gt_dict, model, partition=partition)
    generated_count += 1
    # .get() so that a partition label other than test/valid/train is counted
    # instead of raising KeyError after the file has already been written.
    partition_counts[partition] = partition_counts.get(partition, 0) + 1

print(f"Generated {generated_count} pred.json files")
print(f"  Test: {partition_counts['test']}")
print(f"  Valid: {partition_counts['valid']}")
print(f"  Train: {partition_counts['train']}")
# Report any unexpected partition labels so the counts add up to the total.
for other_label in sorted(str(name) for name in partition_counts
                          if name not in ('test', 'valid', 'train')):
    print(f"  {other_label}: {sum(v for k, v in partition_counts.items() if str(k) == other_label)}")

## 7. Summary

In [None]:
# Final summary
print("=" * 60)
print("EVALUATION SUMMARY (Requirement 2.2.5)")
print("=" * 60)

print("\nDataset:")
print(f"  Total publications with labels: {len(ground_truth_lookup)}")
for split_name in ('test', 'valid', 'train'):
    split_size = sum(1 for p in partition_lookup.values() if p == split_name)
    print(f"  {split_name.capitalize()} publications: {split_size}")

print("\nTest Set Metrics (MRR@5 as per requirement):")
print(f"  Test entries evaluated: {len(test_predictions)}")
print(f"  MRR@5: {mrr_5:.4f}")
print(f"  Precision@1: {p_at_1:.4f}")
print(f"  Precision@3: {p_at_3:.4f}")
print(f"  Precision@5: {p_at_5:.4f}")

print("\nOutput files:")
print(f"  {generated_count} pred.json files generated")
print("    Format: partition, groundtruth, prediction (top 5 candidates)")

print("\n" + "=" * 60)
# \u03a3 is the Greek capital sigma; the escape keeps the summation sign
# intact even if the notebook file is re-encoded.
print("MRR Formula: (1/|Q|) * \u03a3(1/rank_i)")
print("  |Q| = total references to match")
print("  rank_i = position of correct match in top-5 list (0 if not found)")
print("=" * 60)

---
## Pipeline Complete!

The reference matching pipeline implements all requirements from Section 2.2:

| Requirement | Implementation |
|-------------|----------------|
| **2.2.1 Data Cleaning** | Lowercasing, stop-word removal, tokenization, LaTeX cleanup |
| **2.2.2 Manual Labels** | 5 publications, 52+ label pairs (exceeds 20 minimum) |
| **2.2.2 Auto Labels** | 10%+ of non-manual data with regex/similarity heuristics |
| **2.2.3 Features** | 12 features: title similarity, author overlap, year match, arXiv ID |
| **2.2.4 Data Split** | Publication-level: Test (1 manual + 1 auto), Valid (1 manual + 1 auto), Train (rest) |
| **2.2.5 Evaluation** | MRR@5 on Test set (2 publications) |

**Output Format (pred.json):**
```json
{
    "partition": "test",
    "groundtruth": {"bib_key": "arxiv_id"},
    "prediction": {"bib_key": ["candidate_1", "candidate_2", ...]}
}
```