In [1]:
# Imports
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple

# Put the parent of the current working directory at the front of sys.path
# (presumably so project-local modules can be imported from this notebook —
# TODO confirm which modules rely on this).
sys.path.insert(0, str(Path.cwd().parent))

## 1. Load Data

In [2]:
# Configuration
OUTPUT_DIR = Path("../../output")

# Load features
X = np.load(OUTPUT_DIR / 'features.npy')
print(f"Features shape: {X.shape}")

# Load pair metadata using streaming parser (handles large files)
try:
    import ijson

    print("Loading pair metadata with streaming parser...")
    with open(OUTPUT_DIR / 'pair_metadata.json', 'rb') as f:
        # 'item' selects each element of the top-level JSON array.
        # NOTE(review): ijson may yield decimal.Decimal for non-integer
        # numbers while json.load (fallback below) yields float — confirm
        # downstream code is happy with either.
        pair_metadata = list(ijson.items(f, 'item'))

except ImportError:
    print("ijson not installed, trying standard json...")
    # Fallback: free whatever we can before the whole file is parsed at once
    import gc
    gc.collect()

    with open(OUTPUT_DIR / 'pair_metadata.json', 'r', encoding='utf-8') as f:
        pair_metadata = json.load(f)

print(f"Number of pairs: {len(pair_metadata)}")

# Later cells pair X[i] with pair_metadata[i] purely by position, so a length
# mismatch would silently attach labels/partitions to the wrong feature rows.
if len(pair_metadata) != X.shape[0]:
    raise ValueError(
        f"features.npy has {X.shape[0]} rows but pair_metadata.json has "
        f"{len(pair_metadata)} entries; the two files are out of sync"
    )

Features shape: (5313316, 12)
Loading pair metadata with streaming parser...
Number of pairs: 5313316


## 2. Load Manual Labels

In [3]:
# Load manual labels if available
manual_labels_file = OUTPUT_DIR / 'manual_labels.json'

if not manual_labels_file.exists():
    # Nothing hand-labelled: every pair will be labelled heuristically later.
    print("No manual labels found - using heuristic labels")
    label_lookup = {}
else:
    with manual_labels_file.open('r', encoding='utf-8') as f:
        manual_labels = json.load(f)
    print(f"Loaded {len(manual_labels)} manual labels")

    # Lookup keyed by (pub_id, bib_key, arxiv_id) -> is_match.
    # If the same key occurs more than once, the last entry wins.
    label_lookup = {
        (entry['pub_id'], entry['bib_key'], entry['arxiv_id']): entry['is_match']
        for entry in manual_labels
    }

No manual labels found - using heuristic labels


In [4]:
# Create labels for training
# A manual label always wins; pairs without one are auto-labelled from
# high-confidence signals (an arXiv-id hit or a very high combined score).

y = np.zeros(len(pair_metadata))

for i, pair in enumerate(pair_metadata):
    key = (pair['pub_id'], pair['bib_key'], pair['arxiv_id'])

    if key in label_lookup:
        # Use manual label
        is_positive = bool(label_lookup[key])
    else:
        # Short-circuit order matters: combined_score is only read when
        # neither arXiv flag is set.
        is_positive = (
            X[i, 7] == 1.0                      # arxiv_match
            or X[i, 8] == 1.0                   # arxiv_in_content
            or pair['combined_score'] > 0.8     # very high combined score
        )

    y[i] = 1.0 if is_positive else 0.0

n_positive = int(y.sum())
n_negative = int(len(y) - y.sum())
print(f"Labels: {n_positive} positive, {n_negative} negative")

Labels: 7382 positive, 5305934 negative


## 3. Reference Matching Model

In [5]:
class ReferenceMatchingModel:
    """
    Logistic regression model for reference matching.

    The model is a single linear layer followed by a sigmoid, fitted with
    full-batch gradient descent. Columns of the feature matrix are expected
    in ``FEATURE_NAMES`` order.

    Attributes:
        weights: 1-D array of per-feature weights, or ``None`` until the
            model is trained, loaded, or first used for prediction.
        bias: scalar intercept.
    """

    FEATURE_NAMES = [
        'title_jaccard', 'title_overlap', 'title_edit_dist',
        'author_overlap', 'first_author_match',
        'year_match', 'year_diff',
        'arxiv_match', 'arxiv_in_content',
        'num_matching_authors', 'title_len_ratio', 'combined_score'
    ]

    # Hand-picked fallback weights (FEATURE_NAMES order) that predict_proba()
    # installs when the model has been neither trained nor loaded.
    HEURISTIC_WEIGHTS = (
        0.3,    # title_jaccard
        0.2,    # title_overlap
        0.15,   # title_edit_dist
        0.25,   # author_overlap
        0.1,    # first_author_match
        0.15,   # year_match
        -0.05,  # year_diff
        1.0,    # arxiv_match
        0.8,    # arxiv_in_content
        0.1,    # num_matching_authors
        0.05,   # title_len_ratio
        0.0,    # combined_score
    )

    def __init__(self):
        self.weights = None
        self.bias = 0.0

    def _sigmoid(self, x):
        """Element-wise logistic function; input is clipped to [-500, 500]
        before exponentiation to keep np.exp away from overflow."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def train(self, X: np.ndarray, y: np.ndarray,
              learning_rate: float = 0.1, epochs: int = 1000,
              verbose: bool = True):
        """Train the model using gradient descent.

        Weights and bias are reset to zero, then updated ``epochs`` times
        with the gradient averaged over all samples.

        Args:
            X: feature matrix of shape (n_samples, n_features).
            y: binary labels, one per row of X. Any shape with n_samples
                elements is accepted (e.g. (n,) or (n, 1)); it is flattened.
            learning_rate: gradient-descent step size.
            epochs: number of full-batch updates.
            verbose: if True, print the log loss every 200 epochs (computed
                from the predictions made before that epoch's update).

        Raises:
            ValueError: if X is not 2-D, has no rows, or y does not contain
                exactly one label per row of X.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        n_samples, n_features = X.shape
        if n_samples == 0:
            # Averaging over zero samples would divide by zero and leave the
            # model with NaN weights.
            raise ValueError("Cannot train on an empty dataset")

        # Flatten so that an (n, 1) label column cannot broadcast against the
        # (n,) prediction vector into an (n, n) error matrix.
        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} rows but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for epoch in range(epochs):
            # Forward pass
            linear = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(linear)

            # Compute gradients
            error = predictions - y
            dw = np.dot(X.T, error) / n_samples
            db = np.sum(error) / n_samples

            # Update weights
            self.weights -= learning_rate * dw
            self.bias -= learning_rate * db

            # Log progress
            if verbose and epoch % 200 == 0:
                # 1e-10 keeps log() finite when a prediction saturates at 0 or 1
                loss = -np.mean(y * np.log(predictions + 1e-10) +
                               (1 - y) * np.log(1 - predictions + 1e-10))
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict match probabilities.

        If the model has no weights yet, ``HEURISTIC_WEIGHTS`` are installed
        on the instance first (so they persist for later calls and for
        ``save``).

        Args:
            X: feature matrix whose column count matches the weight vector.

        Returns:
            Array with one probability in [0, 1] per row of X.
        """
        if self.weights is None:
            # Use heuristic weights if not trained
            self.weights = np.array(self.HEURISTIC_WEIGHTS)

        linear = np.dot(X, self.weights) + self.bias
        return self._sigmoid(linear)

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Predict binary labels: 1 where the probability is >= threshold."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def get_feature_importance(self) -> Dict[str, float]:
        """Get feature importance (absolute weights).

        Returns:
            Mapping of feature name to |weight|; empty if the model has no
            weights yet. Names and weights are paired positionally.
        """
        if self.weights is None:
            return {}
        return {name: float(abs(w)) for name, w in zip(self.FEATURE_NAMES, self.weights)}

    def save(self, path: Path):
        """Save model to a JSON file (weights, bias, feature names)."""
        data = {
            'weights': np.asarray(self.weights).tolist() if self.weights is not None else None,
            # Plain float: NumPy scalars such as np.float32 are not JSON
            # serializable.
            'bias': float(self.bias),
            'feature_names': self.FEATURE_NAMES
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def load(self, path: Path):
        """Load weights and bias from a JSON file written by ``save``.

        A missing (null) or empty weight list leaves ``weights`` as None.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.weights = np.array(data['weights']) if data['weights'] else None
        self.bias = float(data['bias'])

## 4. Train Model

In [6]:
# ============================================================================
# PUBLICATION-LEVEL DATA SPLIT (Requirement 2.2.4)
# ============================================================================
# Test: 1 manual + 1 auto-labeled publication
# Valid: 1 manual + 1 auto-labeled publication
# Train: All remaining publications
#
# The split is made per publication: every pair inherits the partition of its
# pub_id, so no publication contributes pairs to more than one partition.

DATA_DIR = Path("../../23120260")

# Define partition assignments
PARTITION_ASSIGNMENTS = {
    # Manual labels (from 00_manual_labeling.ipynb)
    "2411-00222": "test",    # Manual test
    "2411-00223": "valid",   # Manual valid
    "2411-00225": "train",
    "2411-00226": "train",
    "2411-00227": "train",
}

KNOWN_PARTITIONS = ('train', 'valid', 'test')

# Load auto-labeled partitions from pred.json files
skipped_pred_files = []      # (path, error) for files that could not be read
unexpected_partitions = []   # (pub_id, value) for unrecognised partition names

for pub_path in DATA_DIR.iterdir():
    if not pub_path.is_dir():
        continue
    pub_id = pub_path.name
    if pub_id in PARTITION_ASSIGNMENTS:
        # Manual assignment takes precedence; no need to read the file.
        continue
    pred_file = pub_path / "pred.json"
    if not pred_file.exists():
        continue

    try:
        # Explicit UTF-8: the platform default encoding may fail to decode
        # the file and silently drop the publication's partition.
        with open(pred_file, 'r', encoding='utf-8') as f:
            pred_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        # Keep going (best effort) but remember the failure so it is reported.
        skipped_pred_files.append((pred_file, e))
        continue

    # Handle both dict and list formats
    if isinstance(pred_data, dict):
        partition = pred_data.get('partition', 'train')
    else:
        # If it's a list or other format, default to train
        partition = 'train'

    if partition not in KNOWN_PARTITIONS:
        unexpected_partitions.append((pub_id, partition))
    PARTITION_ASSIGNMENTS[pub_id] = partition

if skipped_pred_files:
    print(f"WARNING: {len(skipped_pred_files)} pred.json file(s) could not be read; "
          "their publications default to 'train':")
    for skipped_path, skipped_error in skipped_pred_files:
        print(f"  - {skipped_path}: {skipped_error}")

if unexpected_partitions:
    print(f"WARNING: {len(unexpected_partitions)} publication(s) have an unrecognised "
          "partition value and are treated as 'train':")
    for unexpected_pub, unexpected_value in unexpected_partitions:
        print(f"  - {unexpected_pub}: {unexpected_value!r}")

# Create publication-to-partition lookup
pub_to_partition = PARTITION_ASSIGNMENTS.copy()

# Split indices by partition
train_indices = []
valid_indices = []
test_indices = []

for i, pair in enumerate(pair_metadata):
    pub_id = pair['pub_id']
    # Publications without any assignment go to train.
    partition = pub_to_partition.get(pub_id, 'train')

    if partition == 'test':
        test_indices.append(i)
    elif partition == 'valid':
        valid_indices.append(i)
    else:
        train_indices.append(i)

# Convert to arrays. dtype=int keeps empty index lists usable for indexing,
# and X[empty] / y[empty] already have shapes (0, n_features) / (0,).
train_idx = np.array(train_indices, dtype=int)
val_idx = np.array(valid_indices, dtype=int)
test_idx = np.array(test_indices, dtype=int)

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Summary
test_pubs = set(pair_metadata[i]['pub_id'] for i in test_indices)
valid_pubs = set(pair_metadata[i]['pub_id'] for i in valid_indices)
train_pubs = set(pair_metadata[i]['pub_id'] for i in train_indices)

print("=" * 60)
print("PUBLICATION-LEVEL DATA SPLIT")
print("=" * 60)
print(f"\nTest Set ({len(test_pubs)} publications, {len(test_idx)} pairs):")
for pub in sorted(test_pubs):
    print(f"  - {pub}")

print(f"\nValidation Set ({len(valid_pubs)} publications, {len(val_idx)} pairs):")
for pub in sorted(valid_pubs):
    print(f"  - {pub}")

print(f"\nTraining Set ({len(train_pubs)} publications, {len(train_idx)} pairs)")
print(f"\nTotal: {len(X)} pairs")

PUBLICATION-LEVEL DATA SPLIT

Test Set (4 publications, 1784 pairs):
  - 2411-00222
  - 2411-00236
  - 2411-00252
  - 2411-00260

Validation Set (2 publications, 526 pairs):
  - 2411-00223
  - 2411-00285

Training Set (878 publications, 5311006 pairs)

Total: 5313316 pairs


In [7]:
# Train model
# Hyperparameters for the full-batch gradient descent run.
LEARNING_RATE = 0.1
EPOCHS = 1000

model = ReferenceMatchingModel()
model.train(X_train, y_train, learning_rate=LEARNING_RATE, epochs=EPOCHS)

Epoch 0, Loss: 0.6931
Epoch 200, Loss: 0.0259
Epoch 400, Loss: 0.0173
Epoch 600, Loss: 0.0143
Epoch 800, Loss: 0.0127


In [8]:
# Feature importance
print("\n=== Feature Importance ===")
importance = model.get_feature_importance()

# Largest absolute weight first.
ranked_features = sorted(importance.items(), key=lambda item: -item[1])
for name, imp in ranked_features:
    print(f"  {name:25s}: {imp:.4f}")


=== Feature Importance ===
  year_diff                : 1.6357
  title_len_ratio          : 1.0466
  year_match               : 0.6449
  title_edit_dist          : 0.2666
  num_matching_authors     : 0.1536
  combined_score           : 0.0943
  arxiv_in_content         : 0.0916
  arxiv_match              : 0.0556
  first_author_match       : 0.0499
  title_jaccard            : 0.0432
  author_overlap           : 0.0413
  title_overlap            : 0.0276


## 5. Validation

In [9]:
# Evaluate on validation set
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)

# Boolean masks for predicted / actual classes
predicted_positive = (y_pred == 1)
predicted_negative = (y_pred == 0)
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

# Metrics
accuracy = np.mean(y_pred == y_val)
tp = np.sum(predicted_positive & actual_positive)
fp = np.sum(predicted_positive & actual_negative)
fn = np.sum(predicted_negative & actual_positive)

# Each ratio falls back to 0 when its denominator is zero.
if (tp + fp) > 0:
    precision = tp / (tp + fp)
else:
    precision = 0

if (tp + fn) > 0:
    recall = tp / (tp + fn)
else:
    recall = 0

if (precision + recall) > 0:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0

print("=== Validation Metrics ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

=== Validation Metrics ===
Accuracy: 0.9981
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


## 6. Save Model

In [10]:
# Save model
# Persist the fitted model as JSON in the shared output directory.
model_path = OUTPUT_DIR.joinpath('reference_matching_model.json')
model.save(model_path)
print(f"Model saved to: {model_path}")

Model saved to: ..\..\output\reference_matching_model.json


---
**Next:** Continue to `04_evaluation.ipynb` to evaluate the model and generate predictions.