In [3]:
# Imports
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List, Set, Tuple

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

from matching import BibEntry, RefEntry, TextCleaner

## 1. Configuration

In [4]:
# Configuration: directory holding the artefacts shared between notebooks
OUTPUT_DIR = Path("../../output")

# Read the JSON produced by the previous notebook in one go and parse it
extracted_path = OUTPUT_DIR / 'extracted_data.json'
all_data = json.loads(extracted_path.read_text(encoding='utf-8'))

print(f"Loaded {len(all_data)} publications")

Loaded 302 publications


## 2. Feature Extractor

In [5]:
class FeatureExtractor:
    """
    Extract pairwise features for reference matching.

    Problem Framing: This is a RANKING problem where for each BibTeX entry,
    we need to rank all candidate references from references.json.

    Every method is static, so the class is used purely as a namespace:
    ``FeatureExtractor.extract_features(bib, ref)`` returns a feature dict and
    ``FeatureExtractor.features_to_vector(features)`` turns it into an array
    whose column order is ``FEATURE_NAMES``.
    """

    # Column order used by features_to_vector().
    FEATURE_NAMES = [
        'title_jaccard', 'title_overlap', 'title_edit_dist',
        'author_overlap', 'first_author_match',
        'year_match', 'year_diff',
        'arxiv_match', 'arxiv_in_content',
        'num_matching_authors', 'title_len_ratio', 'combined_score'
    ]

    # Value reported for 'year_diff' when a year is missing or not numeric.
    MISSING_YEAR_DIFF = 10

    @staticmethod
    def jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
        """Jaccard similarity |A & B| / |A | B|; 0.0 if either set is empty."""
        if not set1 or not set2:
            return 0.0
        intersection = len(set1 & set2)
        union = len(set1 | set2)
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def token_overlap_ratio(tokens1: List[str], tokens2: List[str]) -> float:
        """
        Share of the smaller token set that also occurs in the other one.

        Duplicates are ignored (both inputs are converted to sets).
        Returns 0.0 if either input is empty.
        """
        if not tokens1 or not tokens2:
            return 0.0
        set1, set2 = set(tokens1), set(tokens2)
        overlap = len(set1 & set2)
        return overlap / min(len(set1), len(set2))

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """Compute the Levenshtein (edit) distance between two strings."""
        # Keep the shorter string on the inner loop so each row is as short
        # as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1
        if len(s2) == 0:
            return len(s1)

        # Classic two-row dynamic programme: prev_row[j] is the distance
        # between the already-processed prefix of s1 and s2[:j].
        prev_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            curr_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = prev_row[j + 1] + 1
                deletions = curr_row[j] + 1
                substitutions = prev_row[j] + (c1 != c2)
                curr_row.append(min(insertions, deletions, substitutions))
            prev_row = curr_row

        return prev_row[-1]

    @staticmethod
    def normalized_edit_distance(s1: str, s2: str) -> float:
        """Normalized edit distance (0 = identical, 1 = completely different)"""
        if not s1 and not s2:
            return 0.0
        if not s1 or not s2:
            return 1.0
        dist = FeatureExtractor.levenshtein_distance(s1, s2)
        return dist / max(len(s1), len(s2))

    @staticmethod
    def _year_features(bib_year, ref_year) -> Tuple[float, float]:
        """
        Compute ``(year_match, year_diff)`` for a pair of publication years.

        Args:
            bib_year: Year of the BibTeX entry (str, number, or empty/None).
            ref_year: Year of the reference (str, number, or empty/None).

        Returns:
            ``year_match`` is 1.0 only when both years are present and equal;
            ``year_diff`` is the absolute difference in years, or
            ``MISSING_YEAR_DIFF`` when a year is missing or not numeric.
        """
        missing = FeatureExtractor.MISSING_YEAR_DIFF

        # Two absent years carry no evidence of a match, so they must not
        # compare equal (a plain ``'' == ''`` check would report a match).
        if not bib_year or not ref_year:
            return 0.0, missing

        try:
            diff = abs(int(bib_year) - int(ref_year))
        except (ValueError, TypeError, OverflowError):
            # Non-numeric years (e.g. "in press"): compare them as text and
            # fall back to the default difference.
            same_text = str(bib_year).strip() == str(ref_year).strip()
            return (1.0 if same_text else 0.0), missing

        # Comparing the parsed integers makes 2020 and "2020" match, keeping
        # year_match consistent with year_diff == 0.
        return (1.0 if diff == 0 else 0.0), diff

    @staticmethod
    def extract_features(bib: Dict, ref: Dict) -> Dict[str, float]:
        """
        Extract features for a (BibEntry, RefEntry) pair.

        Args:
            bib: BibEntry as dict. Keys read here: 'title', 'authors', 'year',
                'arxiv_id', 'raw_content' (all optional).
            ref: RefEntry as dict. Keys read here: 'title', 'authors', 'year',
                'arxiv_id' (all optional).

        Returns:
            Feature dictionary with one entry per name in ``FEATURE_NAMES``.
        """
        features = {}

        # Clean texts (normalisation rules live in TextCleaner, defined
        # outside this notebook).
        bib_title = TextCleaner.clean_title(bib.get('title', ''))
        ref_title = TextCleaner.clean_title(ref.get('title', ''))

        bib_title_tokens = TextCleaner.tokenize(bib_title)
        ref_title_tokens = TextCleaner.tokenize(ref_title)

        # raw_content may be stored as None; normalise once so the substring
        # test and the year fallback below always receive a string.
        raw_content = bib.get('raw_content') or ''

        # Feature 1: Title Jaccard Similarity
        features['title_jaccard'] = FeatureExtractor.jaccard_similarity(
            set(bib_title_tokens), set(ref_title_tokens)
        )

        # Feature 2: Title Token Overlap Ratio
        features['title_overlap'] = FeatureExtractor.token_overlap_ratio(
            bib_title_tokens, ref_title_tokens
        )

        # Feature 3: Title similarity derived from the edit distance.
        # Despite the name this is 1 - normalized distance, so 1.0 means
        # identical titles.
        features['title_edit_dist'] = 1.0 - FeatureExtractor.normalized_edit_distance(
            bib_title, ref_title
        )

        # Feature 4: Author Last Name Overlap
        bib_authors = TextCleaner.extract_author_last_names(bib.get('authors', []))
        ref_authors = TextCleaner.extract_author_last_names(ref.get('authors', []))
        features['author_overlap'] = FeatureExtractor.token_overlap_ratio(
            bib_authors, ref_authors
        )

        # Feature 5: First Author Match
        features['first_author_match'] = 1.0 if (
            bib_authors and ref_authors and bib_authors[0] == ref_authors[0]
        ) else 0.0

        # Feature 6: Year Match / year difference.
        # If the bib entry has no explicit year, try to recover one from the
        # raw entry text.
        bib_year = bib.get('year') or TextCleaner.extract_year(raw_content)
        ref_year = ref.get('year', '')
        features['year_match'], features['year_diff'] = (
            FeatureExtractor._year_features(bib_year, ref_year)
        )

        # Feature 7: ArXiv ID Exact Match (strong signal!)
        # Both IDs are compared in dash form ("2006-04182").
        bib_arxiv = (bib.get('arxiv_id') or '').replace('.', '-')
        ref_arxiv = (ref.get('arxiv_id') or '').replace('.', '-')
        features['arxiv_match'] = 1.0 if (bib_arxiv and ref_arxiv and bib_arxiv == ref_arxiv) else 0.0

        # Feature 8: ArXiv ID in raw content (searched in dot form,
        # "2006.04182").
        # NOTE(review): every '-' becomes '.', so IDs that legitimately
        # contain a dash would be altered here; confirm all IDs are in the
        # new numeric style.
        ref_arxiv_dot = ref_arxiv.replace('-', '.')
        features['arxiv_in_content'] = 1.0 if ref_arxiv_dot and ref_arxiv_dot in raw_content else 0.0

        # Feature 9: Number of matching authors
        features['num_matching_authors'] = len(set(bib_authors) & set(ref_authors))

        # Feature 10: Title length ratio (shorter / longer, in [0, 1];
        # 0 when either cleaned title is empty)
        len_ratio = len(bib_title) / len(ref_title) if ref_title else 0
        features['title_len_ratio'] = min(len_ratio, 1/len_ratio) if len_ratio > 0 else 0

        # Feature 11: Combined score (hand-weighted blend, also used to
        # pre-rank candidates)
        features['combined_score'] = (
            0.4 * features['title_jaccard'] +
            0.3 * features['author_overlap'] +
            0.2 * features['year_match'] +
            0.1 * features['first_author_match']
        )

        return features

    @staticmethod
    def features_to_vector(features: Dict[str, float]) -> np.ndarray:
        """Convert feature dict to numpy array ordered by FEATURE_NAMES (missing -> 0.0)"""
        return np.array([features.get(name, 0.0) for name in FeatureExtractor.FEATURE_NAMES])

## 3. Test Feature Extraction

In [6]:
# Smoke test: run the extractor on the first bib/ref pair of the first publication
sample = all_data[0]
print(f"Publication: {sample['pub_id']}")

# First BibTeX entry, and the first reference in dict order
bib = sample['bibs'][0]
ref_key = list(sample['refs'])[0]
ref = sample['refs'][ref_key]

print(f"\nBib: {bib['key']}")
print(f"  Title: {bib['title'][:60]}...")

print(f"\nRef: {ref_key}")
print(f"  Title: {ref['title'][:60]}...")

# Compute the feature dict for this single pair and list every feature
features = FeatureExtractor.extract_features(bib, ref)
print("\nFeatures:")
for feat_name, feat_value in features.items():
    print(f"  {feat_name}: {feat_value:.4f}")

Publication: 2411-00222

Bib: ganjidoost2024protectingfeedforwardnetworksadversarial
  Title: Protecting Feed-Forward Networks from Adversarial Attacks Us...

Ref: 2006-04182
  Title: Predictive Coding Approximates Backprop along Arbitrary Comp...

Features:
  title_jaccard: 0.1333
  title_overlap: 0.2500
  title_edit_dist: 0.2346
  author_overlap: 0.0000
  first_author_match: 0.0000
  year_match: 0.0000
  year_diff: 4.0000
  arxiv_match: 0.0000
  arxiv_in_content: 0.0000
  num_matching_authors: 0.0000
  title_len_ratio: 0.9136
  combined_score: 0.0533


## 4. Generate Training Data

In [7]:
def generate_candidate_pairs(pub_data: Dict, max_candidates: int = 50,
                             max_len_diff: int = 40) -> List[Dict]:
    """
    Generate candidate (bib, ref) pairs for a publication.

    For each bib entry, every reference whose title length is close enough is
    scored with FeatureExtractor.extract_features, and the best
    ``max_candidates`` by 'combined_score' are kept.

    Args:
        pub_data: Publication dict with 'pub_id', 'bibs' (list of bib dicts)
            and 'refs' (dict mapping arXiv id -> ref dict).
        max_candidates: Maximum number of pairs kept per bib entry.
        max_len_diff: References whose title length differs from the bib
            title by more than this many characters are skipped before any
            feature is computed (cheap pre-filter).

    Returns:
        List of dicts with keys 'pub_id', 'bib_key', 'arxiv_id', 'features',
        grouped by bib entry and sorted by descending combined score within
        each group.
    """
    pairs = []

    for bib in pub_data['bibs']:
        # The bib title does not depend on the reference, so normalise it
        # once per bib entry. A missing/None title is treated as empty, in
        # line with how extract_features reads titles.
        bib_title = (bib.get('title') or '').replace('\n', ' ').strip()

        candidates = []
        for arxiv_id, ref in pub_data['refs'].items():
            ref_title = (ref.get('title') or '').replace('\n', ' ').strip()

            # Cheap length pre-filter: skip references that cannot plausibly
            # be the same title.
            if abs(len(bib_title) - len(ref_title)) > max_len_diff:
                continue

            candidates.append({
                'pub_id': pub_data['pub_id'],
                'bib_key': bib['key'],
                'arxiv_id': arxiv_id,
                'features': FeatureExtractor.extract_features(bib, ref)
            })

        # Sort by combined score and take top candidates
        candidates.sort(key=lambda x: x['features']['combined_score'], reverse=True)
        pairs.extend(candidates[:max_candidates])

    return pairs

In [8]:
# Generate pairs using multiprocessing, one batch of publications at a time.
# Results of all batches are accumulated in memory and written once at the end.
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

# Import the worker function from the separate module
# (the function submitted to the process pool must be importable by the
# worker processes, so it lives in a module rather than in this notebook).
from feature_processor import generate_candidate_pairs_worker, FeatureProcessor

# Configuration
num_workers = min(multiprocessing.cpu_count(), 8)
batch_size = 500  # Process this many publications per batch
feature_names = FeatureProcessor.FEATURE_NAMES  # column order of the feature matrix
num_features = len(feature_names)

print(f"Using {num_workers} workers, batch size: {batch_size}")
print(f"Total publications: {len(all_data)}")

# Initialize output files
features_file = OUTPUT_DIR / 'features.npy'
metadata_file = OUTPUT_DIR / 'pair_metadata.json'

# Process in batches
total_pairs = 0
all_metadata = []
all_features_list = []
failed_pubs = []  # pub_ids whose worker raised, so failures are not silently lost

num_batches = (len(all_data) + batch_size - 1) // batch_size

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(all_data))
    batch_data = all_data[start_idx:end_idx]

    print(f"\n--- Batch {batch_idx + 1}/{num_batches} (publications {start_idx}-{end_idx}) ---")

    batch_pairs = []

    # Process batch with multiprocessing
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Remember which publication each future belongs to so that an error
        # can be attributed to it.
        future_to_pub = {
            executor.submit(generate_candidate_pairs_worker, pub_data): pub_data.get('pub_id', '<unknown>')
            for pub_data in batch_data
        }

        for future in tqdm(as_completed(future_to_pub), total=len(future_to_pub),
                           desc=f"Batch {batch_idx + 1}"):
            try:
                batch_pairs.extend(future.result())
            except Exception as e:
                # Best effort: keep going, but record and report what failed.
                pub_id = future_to_pub[future]
                failed_pubs.append(pub_id)
                print(f"Error processing publication {pub_id}: {e}")

    # Extract features and metadata from batch.
    # reshape(-1, num_features) keeps the array 2-D even when the batch
    # produced no pairs; otherwise an empty batch has shape (0,) and the
    # final np.vstack fails.
    batch_features = np.array([
        [p['features'].get(name, 0.0) for name in feature_names]
        for p in batch_pairs
    ], dtype=np.float32).reshape(-1, num_features)

    batch_metadata = [{
        'pub_id': p['pub_id'],
        'bib_key': p['bib_key'],
        'arxiv_id': p['arxiv_id'],
        'combined_score': p['features']['combined_score']
    } for p in batch_pairs]

    # Accumulate results
    all_features_list.append(batch_features)
    all_metadata.extend(batch_metadata)

    total_pairs += len(batch_pairs)
    print(f"  Batch pairs: {len(batch_pairs)}, Total so far: {total_pairs}")

    # Clear batch from memory
    del batch_pairs, batch_features, batch_metadata

# Concatenate all features and save
print(f"\n--- Saving results ---")
if all_features_list:
    X = np.vstack(all_features_list)
else:
    # No publications at all: np.vstack([]) would raise, so build an empty
    # matrix with the right number of columns instead.
    X = np.empty((0, num_features), dtype=np.float32)
print(f"Feature matrix shape: {X.shape}")

np.save(features_file, X)
print(f"Saved features to: {features_file}")

with open(metadata_file, 'w', encoding='utf-8') as f:
    json.dump(all_metadata, f, indent=2)
print(f"Saved metadata to: {metadata_file}")

print(f"\n=== SUMMARY ===")
print(f"Total pairs generated: {total_pairs}")
print(f"Feature matrix shape: {X.shape}")
if failed_pubs:
    print(f"Publications that failed: {len(failed_pubs)} ({failed_pubs})")

Using 8 workers, batch size: 500
Total publications: 302

--- Batch 1/1 (publications 0-302) ---


Batch 1: 100%|██████████| 302/302 [00:04<00:00, 61.87it/s] 


  Batch pairs: 703946, Total so far: 703946

--- Saving results ---
Feature matrix shape: (703946, 12)
Saved features to: ..\..\output\features.npy
Saved metadata to: ..\..\output\pair_metadata.json

=== SUMMARY ===
Total pairs generated: 703946
Feature matrix shape: (703946, 12)


## 5. Feature Statistics

In [9]:
# Feature statistics
# X's columns were built from FeatureProcessor.FEATURE_NAMES, so label them
# with that same list instead of the notebook-local FeatureExtractor copy.
stat_feature_names = FeatureProcessor.FEATURE_NAMES
assert X.shape[1] == len(stat_feature_names), (
    f"feature matrix has {X.shape[1]} columns but {len(stat_feature_names)} feature names"
)

print("=== Feature Statistics ===")
if X.shape[0] == 0:
    # min()/max() of an empty column raise, so report the empty case explicitly.
    print("No pairs generated - nothing to summarise")
else:
    for i, name in enumerate(stat_feature_names):
        col = X[:, i]
        print(f"{name:25s}: mean={col.mean():.4f}, std={col.std():.4f}, min={col.min():.4f}, max={col.max():.4f}")

=== Feature Statistics ===
title_jaccard            : mean=0.0651, std=0.1113, min=0.0000, max=1.0000
title_overlap            : mean=0.1269, std=0.1634, min=0.0000, max=1.0000
title_edit_dist          : mean=0.2492, std=0.0974, min=0.0000, max=1.0000
author_overlap           : mean=0.0110, std=0.0758, min=0.0000, max=1.0000
first_author_match       : mean=0.0040, std=0.0630, min=0.0000, max=1.0000
year_match               : mean=0.1107, std=0.3138, min=0.0000, max=1.0000
year_diff                : mean=7.7554, std=19.4024, min=0.0000, max=2018.0000
arxiv_match              : mean=0.0014, std=0.0380, min=0.0000, max=1.0000
arxiv_in_content         : mean=0.0028, std=0.0525, min=0.0000, max=1.0000
num_matching_authors     : mean=0.0510, std=0.4380, min=0.0000, max=58.0000
title_len_ratio          : mean=0.7296, std=0.1884, min=0.0000, max=1.0000
combined_score           : mean=0.0519, std=0.0907, min=0.0000, max=1.0000


---
**Next:** Continue to `03_model_training.ipynb` to train the matching model.