In [1]:
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List

# Put the parent of the current working directory at the front of sys.path.
# NOTE(review): presumably so project-local modules can be imported from the
# notebook - no such import is visible in this notebook, confirm it is needed.
sys.path.insert(0, str(Path.cwd().parent))

## 1. Load Data

In [2]:
OUTPUT_DIR = Path("../../output")
DATA_DIR = Path("../../23120260")

# Load features
X = np.load(OUTPUT_DIR / 'features.npy')
print(f"Features: {X.shape}")

# Load metadata, preferring the streaming parser when it is installed.
# Only the import is guarded so that a genuine parsing error is never mistaken
# for "ijson is missing", and both branches close the file deterministically.
pair_metadata_file = OUTPUT_DIR / 'pair_metadata.json'
try:
    import ijson
except ImportError:
    with open(pair_metadata_file, 'r', encoding='utf-8') as f:
        pair_metadata = json.load(f)
else:
    # NOTE: ijson yields decimal.Decimal for non-integer JSON numbers by
    # default, whereas json.load yields float.
    with open(pair_metadata_file, 'rb') as f:
        pair_metadata = list(ijson.items(f, 'item'))
print(f"Pairs: {len(pair_metadata)}")

# Later cells index X and pair_metadata with the same row number, so a length
# mismatch would silently attach labels to the wrong feature rows.
if len(pair_metadata) != X.shape[0]:
    raise ValueError(
        f"features.npy has {X.shape[0]} rows but pair_metadata.json has "
        f"{len(pair_metadata)} entries; the two files are out of sync"
    )

Features: (5313316, 12)
Pairs: 5313316


## 2. Load Manual Labels

In [3]:
# Manual labels are optional; when the file is absent the lookup stays empty.
manual_labels_file = OUTPUT_DIR / 'manual_labels.json'
label_lookup = {}

if not manual_labels_file.exists():
    print("No manual labels - using heuristic labels")
else:
    with open(manual_labels_file, 'r', encoding='utf-8') as f:
        # Key each manual verdict by (publication, bib entry, arXiv id).
        label_lookup.update(
            ((entry['pub_id'], entry['bib_key'], entry['arxiv_id']), entry['is_match'])
            for entry in json.load(f)
        )
    print(f"Loaded {len(label_lookup)} manual labels")

No manual labels - using heuristic labels


In [4]:
# Build the target vector: a manual label wins when present, otherwise a pair
# is marked positive by heuristic.
y = np.zeros(len(pair_metadata))
for i, pair in enumerate(pair_metadata):
    key = (pair['pub_id'], pair['bib_key'], pair['arxiv_id'])
    if key in label_lookup:
        y[i] = float(bool(label_lookup[key]))
    elif (
        X[i, 7] == 1.0                     # arxiv_match
        or X[i, 8] == 1.0                  # arxiv_in_content
        or pair['combined_score'] > 0.8    # high combined score
    ):
        y[i] = 1.0

n_positive = int(y.sum())
n_negative = int(len(y) - y.sum())
print(f"Labels: {n_positive} positive, {n_negative} negative")

Labels: 7382 positive, 5305934 negative


## 3. Reference Matching Model

In [5]:
class ReferenceMatchingModel:
    """Binary logistic-regression classifier for reference matching.

    The model is a weight vector plus a scalar bias, fitted with full-batch
    gradient descent on the cross-entropy loss. Until ``train`` or ``load``
    provides weights, predictions fall back to ``DEFAULT_WEIGHTS``.
    """

    FEATURE_NAMES = [
        'title_jaccard', 'title_overlap', 'title_edit_dist',
        'author_overlap', 'first_author_match', 'year_match', 'year_diff',
        'arxiv_match', 'arxiv_in_content', 'num_matching_authors',
        'title_len_ratio', 'combined_score'
    ]

    # Hand-picked fallback weights, one per entry of FEATURE_NAMES, used by
    # predict_proba only while the model has no trained/loaded weights.
    DEFAULT_WEIGHTS = [0.3, 0.2, 0.15, 0.25, 0.1, 0.15, -0.05, 1.0, 0.8, 0.1, 0.05, 0.0]

    def __init__(self):
        """Create an unfitted model (no weights, zero bias)."""
        self.weights = None
        self.bias = 0.0

    def _sigmoid(self, x):
        """Logistic function; the input is clipped to [-500, 500] so that
        ``np.exp`` cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def train(self, X: np.ndarray, y: np.ndarray, lr: float = 0.1, epochs: int = 1000, verbose: bool = True):
        """Fit weights and bias with full-batch gradient descent.

        Args:
            X: Feature matrix of shape (n_samples, n_features).
            y: Target vector of shape (n_samples,) holding 0/1 labels.
            lr: Learning rate applied to every gradient step.
            epochs: Number of full passes over the data.
            verbose: When true, print the loss every 200 epochs.

        Raises:
            ValueError: If ``X`` is not 2-D, has no rows, or ``y`` is not a
                1-D vector with one entry per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        n_samples, n_features = X.shape
        if n_samples == 0:
            # Dividing the gradient by zero samples would yield NaN weights.
            raise ValueError("Cannot train on an empty dataset")
        if y.shape != (n_samples,):
            # A (n_samples, 1) target would broadcast `pred - y` to an
            # (n_samples, n_samples) matrix instead of failing.
            raise ValueError(f"y must have shape ({n_samples},), got {y.shape}")

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for epoch in range(epochs):
            pred = self._sigmoid(np.dot(X, self.weights) + self.bias)
            error = pred - y
            self.weights -= lr * np.dot(X.T, error) / n_samples
            self.bias -= lr * np.sum(error) / n_samples

            if verbose and epoch % 200 == 0:
                # Loss of the predictions made *before* this epoch's update;
                # the 1e-10 keeps log() finite for saturated predictions.
                loss = -np.mean(y * np.log(pred + 1e-10) + (1 - y) * np.log(1 - pred + 1e-10))
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted match probability for each row of ``X``.

        An unfitted model scores with ``DEFAULT_WEIGHTS``. The fallback is used
        for this call only and is not stored on the instance, so predicting
        never makes an unfitted model look trained to ``save`` or
        ``get_feature_importance``.
        """
        weights = self.weights
        if weights is None:
            weights = np.array(self.DEFAULT_WEIGHTS)
        return self._sigmoid(np.dot(X, weights) + self.bias)

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Return 0/1 predictions: 1 where the probability is >= ``threshold``."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def get_feature_importance(self) -> Dict[str, float]:
        """Map each feature name to the absolute value of its weight.

        Returns an empty dict while the model has no weights.
        """
        if self.weights is None:
            return {}
        return {name: abs(w) for name, w in zip(self.FEATURE_NAMES, self.weights)}

    def save(self, path: Path):
        """Write weights, bias and feature names to ``path`` as JSON."""
        payload = {
            'weights': self.weights.tolist() if self.weights is not None else None,
            'bias': float(self.bias),
            'feature_names': self.FEATURE_NAMES,
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def load(self, path: Path):
        """Restore weights and bias from a JSON file written by ``save``.

        A missing-valued or empty ``weights`` entry leaves the model unfitted.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.weights = np.array(data['weights']) if data['weights'] else None
        self.bias = data['bias']

## 4. Train Model

In [6]:
# Publication-level data split: every pair of a publication lands in the same
# partition. Publications without an assignment default to 'train'.
PARTITION_ASSIGNMENTS = {
    "2411-00222": "test", "2411-00223": "valid",
    "2411-00225": "train", "2411-00226": "train", "2411-00227": "train",
}

# Load auto-labeled partitions from each publication's pred.json. The
# hard-coded assignments above take precedence.
for pub_path in DATA_DIR.iterdir():
    if not pub_path.is_dir() or pub_path.name in PARTITION_ASSIGNMENTS:
        continue
    pred_file = pub_path / "pred.json"
    if not pred_file.exists():
        continue
    try:
        with open(pred_file, 'r', encoding='utf-8') as f:
            pred_data = json.load(f)
    except (OSError, ValueError) as exc:
        # Best effort: an unreadable or malformed file leaves the publication
        # unassigned (so it defaults to 'train'), but say so instead of hiding it.
        print(f"Warning: could not read {pred_file} ({exc}); defaulting to 'train'")
        continue
    PARTITION_ASSIGNMENTS[pub_path.name] = pred_data.get('partition', 'train') if isinstance(pred_data, dict) else 'train'

# Split by partition in a single pass over the pairs.
split_indices = {'train': [], 'valid': [], 'test': []}
for i, p in enumerate(pair_metadata):
    partition = PARTITION_ASSIGNMENTS.get(p['pub_id'], 'train')
    if isinstance(partition, str) and partition in split_indices:
        split_indices[partition].append(i)

# An explicit integer dtype keeps an empty index list usable for indexing
# (np.array([]) alone is float64), so empty splits need no special casing.
train_idx = np.array(split_indices['train'], dtype=np.intp)
val_idx = np.array(split_indices['valid'], dtype=np.intp)
test_idx = np.array(split_indices['test'], dtype=np.intp)

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

print(f"Train: {len(train_idx)} pairs, Valid: {len(val_idx)} pairs, Test: {len(test_idx)} pairs")

# Pairs whose publication carries a partition other than train/valid/test are
# in none of the splits; report them rather than dropping them silently.
n_unassigned = len(pair_metadata) - len(train_idx) - len(val_idx) - len(test_idx)
if n_unassigned:
    print(f"Warning: {n_unassigned} pairs have an unrecognised partition and were left out of all splits")

Train: 5310379 pairs, Valid: 1153 pairs, Test: 1784 pairs


In [7]:
# Fit a fresh model on the training partition only.
model = ReferenceMatchingModel()
model.train(
    X_train,
    y_train,
    lr=0.1,
    epochs=1000,
)

Epoch 0, Loss: 0.6931
Epoch 200, Loss: 0.0259
Epoch 400, Loss: 0.0173
Epoch 600, Loss: 0.0142
Epoch 800, Loss: 0.0127


In [8]:
# List features by absolute weight, largest first.
print("\nFeature Importance:")
importance_by_feature = model.get_feature_importance()
ranked_features = sorted(importance_by_feature.items(), key=lambda item: -item[1])
for name, imp in ranked_features:
    print(f"  {name:22s}: {imp:.4f}")


Feature Importance:
  year_diff             : 1.6357
  title_len_ratio       : 1.0467
  year_match            : 0.6450
  title_edit_dist       : 0.2668
  num_matching_authors  : 0.1536
  combined_score        : 0.0944
  arxiv_in_content      : 0.0914
  arxiv_match           : 0.0556
  first_author_match    : 0.0499
  title_jaccard         : 0.0430
  author_overlap        : 0.0414
  title_overlap         : 0.0277


## 5. Validation

In [9]:
# Accuracy, precision, recall and F1 on the validation partition
# (each ratio falls back to 0 when its denominator is zero).
if len(X_val) == 0:
    print("\nNo validation data")
else:
    y_pred = model.predict(X_val)

    predicted_positive = y_pred == 1
    actual_positive = y_val == 1
    tp = np.sum(predicted_positive & actual_positive)
    fp = np.sum(predicted_positive & (y_val == 0))
    fn = np.sum((y_pred == 0) & actual_positive)

    precision = 0
    if (tp + fp) > 0:
        precision = tp / (tp + fp)

    recall = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn)

    f1 = 0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)

    print(f"\nValidation: Acc={np.mean(y_pred == y_val):.4f}, P={precision:.4f}, R={recall:.4f}, F1={f1:.4f}")


Validation: Acc=0.9887, P=0.0000, R=0.0000, F1=0.0000


## 6. Save Model

In [10]:
# Persist the fitted parameters as JSON in the output directory.
model_path = OUTPUT_DIR / 'reference_matching_model.json'
model.save(model_path)
print(f"Model saved to {model_path}")

Model saved to ..\..\output\reference_matching_model.json


---
**Next:** `04_evaluation.ipynb`