In [1]:
import sys
import json
import numpy as np
from pathlib import Path
from typing import Dict, List

sys.path.insert(0, str(Path.cwd().parent))
from matching.common import (
    ReferenceMatchingModel, FeatureExtractor,
    get_labeled_publications, load_ground_truth,
    OUTPUT_DIR, DATA_DIR
)

## 1. Load Data

In [2]:
# OUTPUT_DIR and DATA_DIR imported from common.py

# Load the precomputed feature matrix (one row per candidate pair).
X = np.load(OUTPUT_DIR / 'features.npy')
print(f"Features: {X.shape}")

# Load pair metadata. Prefer the streaming parser when ijson is installed and
# fall back to the stdlib json module otherwise. Both branches open the file
# in a `with` block so the handle is always closed, even if parsing fails.
# NOTE(review): ijson returns decimal.Decimal for non-integer numbers by
# default, whereas json.load returns float — confirm downstream code is
# happy with either.
pair_metadata_path = OUTPUT_DIR / 'pair_metadata.json'
try:
    import ijson
    with open(pair_metadata_path, 'rb') as f:
        pair_metadata = list(ijson.items(f, 'item'))
except ImportError:
    with open(pair_metadata_path, 'r', encoding='utf-8') as f:
        pair_metadata = json.load(f)
print(f"Pairs: {len(pair_metadata)}")

Features: (1948340, 12)
Pairs: 1948340


## 2. Load Ground Truth from pred.json

In [3]:
# Ground truth and the publication -> partition map both come from
# load_ground_truth (pred.json files, auto-detected under DATA_DIR).
ground_truth, PARTITION_ASSIGNMENTS = load_ground_truth(DATA_DIR)
print(f"Loaded ground truth for {len(ground_truth)} publications")

# Tally how many publications were assigned to each partition.
assigned_partitions = list(PARTITION_ASSIGNMENTS.values())
n_test = assigned_partitions.count('test')
n_valid = assigned_partitions.count('valid')
n_train = assigned_partitions.count('train')
print(f"Partitions: test={n_test}, valid={n_valid}, train={n_train}")

Loaded ground truth for 1890 publications
Partitions: test=2, valid=2, train=1886


In [4]:
# Create labels from ground truth (pred.json) or auto-heuristic.
#
# Feature columns are looked up by name instead of hard-coded positions, so
# the heuristic keeps reading the right columns if the feature order changes.
_feature_names = list(ReferenceMatchingModel.FEATURE_NAMES)
ARXIV_MATCH_COL = _feature_names.index('arxiv_match')
ARXIV_IN_CONTENT_COL = _feature_names.index('arxiv_in_content')
# Pairs without ground truth are treated as positive above this combined score.
HEURISTIC_SCORE_THRESHOLD = 0.8

# Row i of X must describe pair_metadata[i]; fail fast on a mismatch rather
# than silently labelling the wrong rows.
if X.shape[0] != len(pair_metadata):
    raise ValueError(
        f"Feature rows ({X.shape[0]}) do not match pair metadata entries ({len(pair_metadata)})"
    )

y = np.zeros(len(pair_metadata))

# NOTE(review): the fallback heuristic is driven by arxiv_match,
# arxiv_in_content and combined_score, which are also listed as model
# features — confirm that learning from labels derived from the inputs is
# intended.
for i, pair in enumerate(pair_metadata):
    pub_id, bib_key, arxiv_id = pair['pub_id'], pair['bib_key'], pair['arxiv_id']

    # Check if this pair is in ground truth
    if pub_id in ground_truth and bib_key in ground_truth[pub_id]:
        y[i] = 1.0 if ground_truth[pub_id][bib_key] == arxiv_id else 0.0
    # Fallback to heuristics for unlabeled pairs
    elif X[i, ARXIV_MATCH_COL] == 1.0 or X[i, ARXIV_IN_CONTENT_COL] == 1.0:
        y[i] = 1.0
    elif pair['combined_score'] > HEURISTIC_SCORE_THRESHOLD:
        y[i] = 1.0

print(f"Labels: {int(y.sum())} positive, {int(len(y) - y.sum())} negative")

Labels: 15491 positive, 1932849 negative


## 3. Reference Matching Model

In [5]:
# ReferenceMatchingModel comes from common.py; show which features it uses.
model_feature_names = ReferenceMatchingModel.FEATURE_NAMES
print("Model: Logistic Regression")
print(f"Features ({len(model_feature_names)}): {model_feature_names}")

Model: Logistic Regression
Features (12): ['title_jaccard', 'title_overlap', 'title_edit_dist', 'author_overlap', 'first_author_match', 'year_match', 'year_diff', 'arxiv_match', 'arxiv_in_content', 'num_matching_authors', 'title_len_ratio', 'combined_score']


## 4. Train Model

In [6]:
# Publication-level data split (partitions loaded from pred.json).
# Every pair inherits the partition of its publication; publications missing
# from PARTITION_ASSIGNMENTS default to 'train'. Pairs whose partition is not
# one of train/valid/test are left out of all three splits.
_split_rows = {'train': [], 'valid': [], 'test': []}
for i, p in enumerate(pair_metadata):
    _rows = _split_rows.get(PARTITION_ASSIGNMENTS.get(p['pub_id'], 'train'))
    if _rows is not None:
        _rows.append(i)

# An explicit integer dtype keeps empty index arrays usable for indexing
# (np.array([]) would be float64 and raise IndexError when used as an index).
train_idx = np.array(_split_rows['train'], dtype=np.intp)
val_idx = np.array(_split_rows['valid'], dtype=np.intp)
test_idx = np.array(_split_rows['test'], dtype=np.intp)

# Indexing with an empty integer array yields (0, n_features) / (0,) arrays,
# so no special-casing of empty splits is needed.
X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

print(f"Train: {len(train_idx)} pairs, Valid: {len(val_idx)} pairs, Test: {len(test_idx)} pairs")

Train: 1945804 pairs, Valid: 1341 pairs, Test: 1195 pairs


In [7]:
# Training hyperparameters, gathered in one place so they are easy to tune.
TRAIN_PARAMS = {'lr': 0.1, 'epochs': 1000}

# Fit a fresh model on the training partition only.
model = ReferenceMatchingModel()
model.train(X_train, y_train, **TRAIN_PARAMS)

Epoch 0, Loss: 0.6931
Epoch 200, Loss: 0.0515
Epoch 400, Loss: 0.0420
Epoch 600, Loss: 0.0376
Epoch 800, Loss: 0.0345


In [8]:
# List features from highest to lowest importance as reported by the model.
print("\nFeature Importance:")
ranked_importance = sorted(
    model.get_feature_importance().items(),
    key=lambda entry: -entry[1],
)
for feature_name, importance in ranked_importance:
    print(f"  {feature_name:22s}: {importance:.4f}")


Feature Importance:
  year_diff             : 1.3833
  title_len_ratio       : 1.1295
  year_match            : 0.6246
  arxiv_in_content      : 0.5167
  title_jaccard         : 0.4265
  num_matching_authors  : 0.4152
  title_overlap         : 0.2979
  arxiv_match           : 0.2701
  first_author_match    : 0.2043
  author_overlap        : 0.1662
  combined_score        : 0.1160
  title_edit_dist       : 0.0138


## 5. Validation

In [9]:
# Validation metrics: accuracy, precision, recall and F1 on the validation
# partition, treating label 1 as the positive class.
if len(X_val) == 0:
    print("\nNo validation data")
else:
    y_pred = model.predict(X_val)

    # Confusion-matrix counts for the positive class.
    predicted_pos = (y_pred == 1)
    actual_pos = (y_val == 1)
    tp = np.sum(predicted_pos & actual_pos)
    fp = np.sum(predicted_pos & (y_val == 0))
    fn = np.sum((y_pred == 0) & actual_pos)

    # Each ratio falls back to 0 when its denominator is zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    print(f"\nValidation: Acc={np.mean(y_pred == y_val):.4f}, P={precision:.4f}, R={recall:.4f}, F1={f1:.4f}")


Validation: Acc=0.9925, P=1.0000, R=0.0909, F1=0.1667


## 6. Save Model

In [10]:
# Persist the trained model next to the other pipeline outputs.
model_path = OUTPUT_DIR / 'reference_matching_model.json'
model.save(model_path)
print(f"Model saved to {model_path}")

Model saved to c:\Code\KHDL_Lab02_v2\output\reference_matching_model.json


---
**Next:** `04_evaluation.ipynb`