In [1]:
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List, Set

sys.path.insert(0, str(Path.cwd().parent))
from matching import BibEntry, RefEntry, TextCleaner

## 1. Configuration

In [2]:
# Directory holding the pipeline artefacts, relative to the notebook's working directory.
OUTPUT_DIR = Path("../../output")

# Read the extracted publications in one go; read_text opens and closes the file itself.
extracted_path = OUTPUT_DIR / 'extracted_data.json'
all_data = json.loads(extracted_path.read_text(encoding='utf-8'))
print(f"Loaded {len(all_data)} publications")

Loaded 893 publications


## 2. Feature Extractor

In [3]:
class FeatureExtractor:
    """Pairwise feature extraction for reference matching (ranking problem).

    Every method is static. ``extract_features`` turns one (bib entry, candidate
    reference) pair into a dict of numeric features, and ``features_to_vector``
    lays that dict out in ``FEATURE_NAMES`` order.
    """

    # Column order used by features_to_vector.
    FEATURE_NAMES = [
        'title_jaccard', 'title_overlap', 'title_edit_dist',
        'author_overlap', 'first_author_match', 'year_match', 'year_diff',
        'arxiv_match', 'arxiv_in_content', 'num_matching_authors',
        'title_len_ratio', 'combined_score'
    ]

    @staticmethod
    def jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
        """Return |set1 & set2| / |set1 | set2|, or 0.0 if either set is empty."""
        if not set1 or not set2:
            return 0.0
        return len(set1 & set2) / len(set1 | set2)

    @staticmethod
    def token_overlap_ratio(tokens1: List[str], tokens2: List[str]) -> float:
        """Return the number of shared unique tokens divided by the smaller unique-token count.

        Returns 0.0 if either token list is empty.
        """
        if not tokens1 or not tokens2:
            return 0.0
        set1, set2 = set(tokens1), set(tokens2)
        return len(set1 & set2) / min(len(set1), len(set2))

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """Return the edit distance (insert / delete / substitute, unit cost) between two strings."""
        # Keep the shorter string on the inner axis so each row is as short as possible.
        if len(s1) < len(s2):
            return FeatureExtractor.levenshtein_distance(s2, s1)
        if len(s2) == 0:
            return len(s1)
        prev_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            curr_row = [i + 1]
            for j, c2 in enumerate(s2):
                curr_row.append(min(
                    prev_row[j + 1] + 1,       # deletion
                    curr_row[j] + 1,           # insertion
                    prev_row[j] + (c1 != c2),  # substitution (free when characters match)
                ))
            prev_row = curr_row
        return prev_row[-1]

    @staticmethod
    def extract_features(bib: Dict, ref: Dict) -> Dict[str, float]:
        """Compute the matching features for one (bib entry, candidate reference) pair.

        Args:
            bib: Bibliography entry. Keys read: 'title', 'authors', 'year',
                'raw_content', 'arxiv_id'. Missing or None values are treated as empty.
            ref: Candidate reference. Keys read: 'title', 'authors', 'year', 'arxiv_id'.

        Returns:
            Dict mapping feature name to its numeric value.
        """
        features = {}
        # Normalise once: an explicit None would otherwise break the `in` test below.
        raw_content = bib.get('raw_content') or ''

        # Title features
        bib_title = TextCleaner.clean_title(bib.get('title') or '')
        ref_title = TextCleaner.clean_title(ref.get('title') or '')
        bib_tokens = TextCleaner.tokenize(bib_title)
        ref_tokens = TextCleaner.tokenize(ref_title)

        features['title_jaccard'] = FeatureExtractor.jaccard_similarity(set(bib_tokens), set(ref_tokens))
        features['title_overlap'] = FeatureExtractor.token_overlap_ratio(bib_tokens, ref_tokens)

        if bib_title and ref_title:
            # Despite the name this is a similarity: 1.0 = identical, 0.0 = entirely different.
            dist = FeatureExtractor.levenshtein_distance(bib_title, ref_title)
            features['title_edit_dist'] = 1.0 - dist / max(len(bib_title), len(ref_title))
        else:
            features['title_edit_dist'] = 0.0

        # Author features (only the first 50 authors of each side are considered)
        bib_authors = TextCleaner.extract_author_last_names((bib.get('authors') or [])[:50])
        ref_authors = TextCleaner.extract_author_last_names((ref.get('authors') or [])[:50])
        features['author_overlap'] = FeatureExtractor.token_overlap_ratio(bib_authors, ref_authors)
        features['first_author_match'] = 1.0 if bib_authors and ref_authors and bib_authors[0] == ref_authors[0] else 0.0
        features['num_matching_authors'] = min(len(set(bib_authors) & set(ref_authors)), 20)

        # Year features
        bib_year = bib.get('year') or TextCleaner.extract_year(raw_content)
        ref_year = ref.get('year', '')
        # Compare textual forms so '2015' and 2015 count as the same year, in line
        # with year_diff below which already coerces both sides with int().
        years_equal = bool(bib_year and ref_year) and str(bib_year).strip() == str(ref_year).strip()
        features['year_match'] = 1.0 if years_equal else 0.0
        try:
            # Capped at 50; 10 is the fallback when either year is missing.
            features['year_diff'] = min(abs(int(bib_year) - int(ref_year)), 50) if bib_year and ref_year else 10
        except (ValueError, TypeError):
            # Unparseable year on either side: use the same fallback as "missing".
            features['year_diff'] = 10

        # ArXiv features (ids are compared in dash form: '2307.04374' -> '2307-04374')
        bib_arxiv = (bib.get('arxiv_id') or '').replace('.', '-')
        ref_arxiv = (ref.get('arxiv_id') or '').replace('.', '-')
        features['arxiv_match'] = 1.0 if bib_arxiv and ref_arxiv and bib_arxiv == ref_arxiv else 0.0
        # Guard on ref_arxiv: '' is a substring of every string, so without the guard
        # every reference lacking an arXiv id would score 1.0 here.
        features['arxiv_in_content'] = 1.0 if ref_arxiv and ref_arxiv.replace('-', '.') in raw_content else 0.0

        # Title length ratio, folded into [0, 1] (shorter length / longer length)
        len_ratio = len(bib_title) / len(ref_title) if ref_title else 0
        features['title_len_ratio'] = min(len_ratio, 1 / len_ratio) if len_ratio > 0 else 0

        # Combined score: fixed-weight blend, used elsewhere in the notebook to rank candidates
        features['combined_score'] = (0.4 * features['title_jaccard'] + 0.3 * features['author_overlap'] +
                                      0.2 * features['year_match'] + 0.1 * features['first_author_match'])
        return features

    @staticmethod
    def features_to_vector(features: Dict[str, float]) -> np.ndarray:
        """Return the features as a 1-D array in FEATURE_NAMES order (missing names become 0.0)."""
        return np.array([features.get(name, 0.0) for name in FeatureExtractor.FEATURE_NAMES])

## 3. Test Feature Extraction

In [4]:
# Smoke test: run the extractor on the first bib entry and the first
# reference of the first publication, then print every feature.
sample = all_data[0]
bib = sample['bibs'][0]
ref_key = list(sample['refs'])[0]
ref = sample['refs'][ref_key]
features = FeatureExtractor.extract_features(bib, ref)

print(f"Pub: {sample['pub_id']}, Bib: {bib['key']}, Ref: {ref_key}")
for name in features:
    print(f"  {name}: {features[name]:.4f}")

Pub: 2411-00223, Bib: NonlinearConsensusDirected, Ref: 2307-04374
  title_jaccard: 0.0000
  title_overlap: 0.0000
  title_edit_dist: 0.2027
  author_overlap: 0.0000
  first_author_match: 0.0000
  num_matching_authors: 0.0000
  year_match: 0.0000
  year_diff: 12.0000
  arxiv_match: 0.0000
  arxiv_in_content: 0.0000
  title_len_ratio: 0.3108
  combined_score: 0.0000


## 4. Generate Training Data

In [5]:
def generate_candidate_pairs(pub_data: Dict, max_candidates: int = 50) -> List[Dict]:
    """Generate the top-scoring candidate (bib, reference) pairs for a publication.

    For every bib entry, each reference whose title length is within 40
    characters of the bib title is scored with FeatureExtractor.extract_features;
    the ``max_candidates`` best by 'combined_score' are kept.

    Args:
        pub_data: Publication record with keys 'pub_id', 'bibs' (list of dicts
            with at least 'key') and 'refs' (dict of arxiv_id -> reference dict).
        max_candidates: Maximum number of candidates retained per bib entry.

    Returns:
        List of dicts with keys 'pub_id', 'bib_key', 'arxiv_id', 'features'.
    """
    def _flat_title(entry: Dict) -> str:
        # Missing or None titles are treated as empty, as extract_features does.
        return (entry.get('title') or '').replace('\n', ' ').strip()

    # Reference title lengths do not depend on the bib entry, so compute them once.
    ref_title_lens = {arxiv_id: len(_flat_title(ref)) for arxiv_id, ref in pub_data['refs'].items()}

    pairs = []
    for bib in pub_data['bibs']:
        bib_title_len = len(_flat_title(bib))
        candidates = []
        for arxiv_id, ref in pub_data['refs'].items():
            # Cheap pre-filter: skip the feature computation when the title lengths are far apart.
            if abs(bib_title_len - ref_title_lens[arxiv_id]) > 40:
                continue
            features = FeatureExtractor.extract_features(bib, ref)
            candidates.append({'pub_id': pub_data['pub_id'], 'bib_key': bib['key'],
                               'arxiv_id': arxiv_id, 'features': features})
        candidates.sort(key=lambda x: x['features']['combined_score'], reverse=True)
        pairs.extend(candidates[:max_candidates])
    return pairs

In [6]:
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
from feature_processor import generate_candidate_pairs_worker, FeatureProcessor

# Parallel candidate generation, processed in batches of publications.
num_workers = min(multiprocessing.cpu_count(), 8)
batch_size = 500
# Width of the feature matrix; also used to keep empty batches two-dimensional.
num_features = len(FeatureProcessor.FEATURE_NAMES)

print(f"Processing {len(all_data)} publications with {num_workers} workers, batch size {batch_size}")

all_metadata, all_features_list, total_pairs = [], [], 0

for batch_idx in range((len(all_data) + batch_size - 1) // batch_size):
    batch_data = all_data[batch_idx * batch_size:(batch_idx + 1) * batch_size]
    batch_pairs = []

    # A fresh pool per batch; each publication is handled by one worker call.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(generate_candidate_pairs_worker, pub) for pub in batch_data]
        for future in tqdm(as_completed(futures), total=len(batch_data), desc=f"Batch {batch_idx + 1}"):
            batch_pairs.extend(future.result())

    # Columns follow FeatureProcessor.FEATURE_NAMES order. The reshape is a no-op for
    # non-empty batches and turns an empty batch into shape (0, num_features), so
    # np.vstack below never sees a 1-D (0,) array.
    batch_features = np.array([[p['features'].get(n, 0.0) for n in FeatureProcessor.FEATURE_NAMES]
                               for p in batch_pairs], dtype=np.float32).reshape(-1, num_features)
    all_features_list.append(batch_features)
    all_metadata.extend([{'pub_id': p['pub_id'], 'bib_key': p['bib_key'], 'arxiv_id': p['arxiv_id'],
                          'combined_score': p['features']['combined_score']} for p in batch_pairs])
    total_pairs += len(batch_pairs)
    print(f"  Batch {batch_idx + 1}: {len(batch_pairs)} pairs, total: {total_pairs}")

# Save results (np.vstack rejects an empty list, so fall back to an empty matrix)
if all_features_list:
    X = np.vstack(all_features_list)
else:
    X = np.empty((0, num_features), dtype=np.float32)
np.save(OUTPUT_DIR / 'features.npy', X)
with open(OUTPUT_DIR / 'pair_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(all_metadata, f, indent=2)

print(f"\nSaved {X.shape[0]} pairs, feature shape: {X.shape}")

Processing 893 publications with 8 workers, batch size 500


Batch 1: 100%|██████████| 500/500 [01:01<00:00,  8.13it/s] 


  Batch 1: 3150924 pairs, total: 3150924


Batch 2: 100%|██████████| 393/393 [00:37<00:00, 10.46it/s] 


  Batch 2: 2162392 pairs, total: 5313316

Saved 5313316 pairs, feature shape: (5313316, 12)


## 5. Feature Statistics

In [7]:
# X's columns were laid out in FeatureProcessor.FEATURE_NAMES order when the matrix
# was built, so label them from that same list rather than from the notebook-local
# FeatureExtractor copy; any drift between the two would silently mislabel the stats.
stat_feature_names = FeatureProcessor.FEATURE_NAMES
assert X.shape[1] == len(stat_feature_names), "feature matrix width does not match FEATURE_NAMES"

print("Feature Statistics:")
for i, name in enumerate(stat_feature_names):
    col = X[:, i]
    print(f"  {name:22s}: mean={col.mean():.3f}, std={col.std():.3f}, min={col.min():.3f}, max={col.max():.3f}")

Feature Statistics:
  title_jaccard         : mean=0.038, std=0.077, min=0.000, max=1.000
  title_overlap         : mean=0.076, std=0.124, min=0.000, max=1.000
  title_edit_dist       : mean=0.227, std=0.076, min=0.000, max=1.000
  author_overlap        : mean=0.005, std=0.054, min=0.000, max=1.000
  first_author_match    : mean=0.002, std=0.042, min=0.000, max=1.000
  year_match            : mean=0.067, std=0.250, min=0.000, max=1.000
  year_diff             : mean=9.110, std=9.558, min=0.000, max=50.000
  arxiv_match           : mean=0.001, std=0.024, min=0.000, max=1.000
  arxiv_in_content      : mean=0.001, std=0.032, min=0.000, max=1.000
  num_matching_authors  : mean=0.016, std=0.208, min=0.000, max=20.000
  title_len_ratio       : mean=0.710, std=0.207, min=0.000, max=1.000
  combined_score        : mean=0.030, std=0.067, min=0.000, max=1.000


---
**Next:** `03_model_training.ipynb`