In [14]:
# Imports
import sys
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, field

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

from matching import BibEntry, RefEntry, TextCleaner

## 1. Configuration

In [15]:
# Configuration
DATA_DIR = Path("../../23120260")  # Adjust to your data directory
OUTPUT_DIR = Path("../../output")
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

Data directory: ..\..\23120260
Output directory: ..\..\output


## 2. BibTeX Extractor

In [16]:
class BibExtractor:
    """Extract BibTeX entries from LaTeX sources"""
    
    @staticmethod
    def extract_bibitems(tex_content: str) -> List[BibEntry]:
        """Extract \\bibitem entries"""
        entries = []
        pattern = r'\\bibitem(?:\[([^\]]*)\])?\{([^}]+)\}(.*?)(?=\\bibitem|\\end\{thebibliography\}|\Z)'
        
        for match in re.finditer(pattern, tex_content, re.DOTALL):
            key = match.group(2).strip()
            content = match.group(3).strip()
            entry = BibExtractor._parse_bibitem_content(key, content)
            entries.append(entry)
        
        return entries
    
    @staticmethod
    def _parse_bibitem_content(key: str, content: str) -> BibEntry:
        """Parse bibitem content into structured entry"""
        # Clean content
        content = re.sub(r'\\newblock\s*', ' ', content)
        content = re.sub(r'\{\\em\s+([^}]*)\}', r'\1', content)
        content = re.sub(r'\\emph\{([^}]*)\}', r'\1', content)
        
        # Extract arXiv ID
        arxiv_id = TextCleaner.extract_arxiv_id(content) or ""
        
        # Extract year
        year = TextCleaner.extract_year(content) or ""
        
        # Try to extract title (often in italics or after authors)
        title = ""
        title_match = re.search(r'\{\\em\s+([^}]+)\}', content)
        if title_match:
            title = title_match.group(1)
        else:
            parts = content.split('.')
            if len(parts) > 1:
                title = parts[1].strip()
        
        # Extract authors (before first period or newblock)
        authors = []
        author_part = content.split('.')[0] if '.' in content else content[:100]
        author_matches = re.split(r'\s+and\s+|,\s*(?=[A-Z])', author_part)
        for a in author_matches:
            a = a.strip()
            if a and len(a) > 2 and not a.startswith('\\'):
                authors.append(a)
        
        return BibEntry(
            key=key,
            title=title,
            authors=authors[:10],
            year=year,
            arxiv_id=arxiv_id,
            raw_content=content[:500]
        )
    
    @staticmethod
    def extract_from_bib_file(bib_content: str) -> List[BibEntry]:
        """Extract entries from BibTeX file"""
        entries = []
        pattern = r'@(\w+)\{([^,]+),\s*(.*?)\n\}'
        
        for match in re.finditer(pattern, bib_content, re.DOTALL):
            entry_type = match.group(1).lower()
            key = match.group(2).strip()
            fields_str = match.group(3)
            
            # Parse fields
            fields = {}
            field_pattern = r'(\w+)\s*=\s*[{"]([^}"]*)["}]'
            for fm in re.finditer(field_pattern, fields_str, re.DOTALL):
                fields[fm.group(1).lower()] = fm.group(2).strip()
            
            # Extract authors
            authors = []
            if 'author' in fields:
                author_list = re.split(r'\s+and\s+', fields['author'])
                authors = [a.strip() for a in author_list]
            
            # Extract arXiv ID
            arxiv_id = fields.get('eprint', '') or fields.get('arxiv', '')
            if arxiv_id:
                arxiv_id = arxiv_id.replace('.', '-')
            
            entries.append(BibEntry(
                key=key,
                title=fields.get('title', ''),
                authors=authors,
                year=fields.get('year', ''),
                venue=fields.get('journal', fields.get('booktitle', '')),
                arxiv_id=arxiv_id,
                raw_content=match.group(0)[:500]
            ))
        
        return entries

## 3. Data Loading Functions

In [17]:
def load_references_json(path: Path) -> Dict[str, RefEntry]:
    """Load references.json and convert to RefEntry objects"""
    if not path.exists():
        return {}
    
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    entries = {}
    for arxiv_id, info in data.items():
        entries[arxiv_id] = RefEntry(
            arxiv_id=arxiv_id,
            title=info.get('paper_title', ''),
            authors=info.get('authors', []),
            submission_date=info.get('submission_date', ''),
            venue=info.get('publication_venue', '')
        )
    
    return entries


def extract_bibs_from_publication(pub_path: Path) -> List[BibEntry]:
    """Extract all BibTeX entries from a publication's LaTeX sources"""
    tex_path = pub_path / 'tex'
    all_entries = []
    
    if not tex_path.exists():
        return all_entries
    
    # Process all version directories
    for version_dir in tex_path.iterdir():
        if not version_dir.is_dir():
            continue
        
        # Look for .bib files
        for bib_file in version_dir.rglob('*.bib'):
            try:
                content = bib_file.read_text(encoding='utf-8', errors='ignore')
                entries = BibExtractor.extract_from_bib_file(content)
                all_entries.extend(entries)
            except Exception:
                pass
        
        # Look for thebibliography in .tex files
        for tex_file in version_dir.rglob('*.tex'):
            try:
                content = tex_file.read_text(encoding='utf-8', errors='ignore')
                if r'\bibitem' in content:
                    entries = BibExtractor.extract_bibitems(content)
                    all_entries.extend(entries)
            except Exception:
                pass
    
    # Deduplicate by key
    seen_keys = set()
    unique_entries = []
    for entry in all_entries:
        if entry.key not in seen_keys:
            seen_keys.add(entry.key)
            unique_entries.append(entry)
    
    return unique_entries

## 4. Process Publications

In [18]:
# Get list of publications
import random

all_publications = sorted([p for p in DATA_DIR.iterdir() if p.is_dir()])
print(f"Found {len(all_publications)} total publications")

# Manual labeled publications (must include)
MANUAL_PUBS = {"2411-00222", "2411-00223", "2411-00225", "2411-00226", "2411-00227"}

# Separate manual and non-manual publications
manual_pubs = [p for p in all_publications if p.name in MANUAL_PUBS]
non_manual_pubs = [p for p in all_publications if p.name not in MANUAL_PUBS]

print(f"  Manual labeled: {len(manual_pubs)}")
print(f"  Non-manual: {len(non_manual_pubs)}")

# Sample 500 random non-manual publications
random.seed(42)  # For reproducibility
sample_size = 1500
sampled_non_manual = random.sample(non_manual_pubs, min(sample_size, len(non_manual_pubs)))

# Combine: all manual + 500 sampled
publications = manual_pubs + sampled_non_manual
publications = sorted(publications, key=lambda p: p.name)

print(f"\nSelected {len(publications)} publications:")
print(f"  - {len(manual_pubs)} manual labeled")
print(f"  - {len(sampled_non_manual)} random sampled")

# Show sample
print("\nFirst 5 publications:")
for pub in publications[:5]:
    marker = " [MANUAL]" if pub.name in MANUAL_PUBS else ""
    print(f"  - {pub.name}{marker}")

Found 5000 total publications
  Manual labeled: 5
  Non-manual: 4995

Selected 1505 publications:
  - 5 manual labeled
  - 1500 random sampled

First 5 publications:
  - 2411-00222 [MANUAL]
  - 2411-00223 [MANUAL]
  - 2411-00225 [MANUAL]
  - 2411-00226 [MANUAL]
  - 2411-00227 [MANUAL]


In [19]:
# Process a sample publication
sample_pub = publications[0]
print(f"Processing: {sample_pub.name}")

# Load references
refs = load_references_json(sample_pub / 'references.json')
print(f"  References: {len(refs)}")

# Extract BibTeX entries
bibs = extract_bibs_from_publication(sample_pub)
print(f"  BibTeX entries: {len(bibs)}")

# Show sample entries
if bibs:
    print("\nSample BibTeX entry:")
    print(f"  Key: {bibs[0].key}")
    print(f"  Title: {bibs[0].title[:80]}..." if len(bibs[0].title) > 80 else f"  Title: {bibs[0].title}")
    print(f"  Authors: {bibs[0].authors[:3]}")
    print(f"  Year: {bibs[0].year}")

Processing: 2411-00222
  References: 16
  BibTeX entries: 31

Sample BibTeX entry:
  Key: ganjidoost2024protectingfeedforwardnetworksadversarial
  Title: Protecting Feed-Forward Networks from Adversarial Attacks Using Predictive Codin...
  Authors: ['Ehsan Ganjidoost', 'Jeff Orchard']
  Year: 2024


## 5. Extract All Data

In [20]:
# Process all publications with detailed validation and candidate filtering
# Using ThreadPoolExecutor for parallel I/O operations
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import multiprocessing

skipped = {
    'no_refs_file': [],
    'empty_refs': [],
    'no_bibs': [],
    'no_tex_folder': []
}

# Stats for filtering
total_refs_before = 0
total_refs_after = 0

def filter_refs_for_bibs(bibs: List[BibEntry], refs: Dict[str, RefEntry], max_title_diff: int = 40) -> Dict[str, RefEntry]:
    """
    Pre-filter refs to only keep candidates that could potentially match any bib.
    A ref is kept if its title length is within max_title_diff of at least one bib title.
    """
    filtered_refs = {}
    
    # Get all bib title lengths
    bib_title_lengths = []
    for bib in bibs:
        title = bib.title.replace('\n', ' ').strip() if bib.title else ''
        bib_title_lengths.append(len(title))
    
    # Keep refs where title length is close to at least one bib
    for arxiv_id, ref in refs.items():
        ref_title = ref.title.replace('\n', ' ').strip() if ref.title else ''
        ref_len = len(ref_title)
        
        # Check if this ref could match any bib based on title length
        for bib_len in bib_title_lengths:
            if abs(ref_len - bib_len) <= max_title_diff:
                filtered_refs[arxiv_id] = ref
                break
    
    return filtered_refs

def process_publication(pub_path: Path) -> dict:
    """Process a single publication - designed for parallel execution"""
    result = {'status': 'skipped', 'reason': None, 'data': None, 'stats': {}}
    
    try:
        # Check references.json
        refs_path = pub_path / 'references.json'
        if not refs_path.exists():
            result['reason'] = 'no_refs_file'
            result['pub_id'] = pub_path.name
            return result
        
        refs = load_references_json(refs_path)
        if not refs:
            result['reason'] = 'empty_refs'
            result['pub_id'] = pub_path.name
            return result
        
        # Check tex folder exists
        tex_path = pub_path / 'tex'
        if not tex_path.exists():
            result['reason'] = 'no_tex_folder'
            result['pub_id'] = pub_path.name
            return result
        
        # Extract bibs
        bibs = extract_bibs_from_publication(pub_path)
        if not bibs:
            result['reason'] = 'no_bibs'
            result['pub_id'] = pub_path.name
            return result
        
        # Track stats
        refs_before = len(refs)
        
        # Filter refs based on title length compatibility with bibs
        filtered_refs = filter_refs_for_bibs(bibs, refs, max_title_diff=40)
        refs_after = len(filtered_refs)
        
        # Valid publication (only save filtered refs)
        result['status'] = 'success'
        result['stats'] = {'refs_before': refs_before, 'refs_after': refs_after}
        result['data'] = {
            'pub_id': pub_path.name,
            'num_refs': refs_after,
            'num_bibs': len(bibs),
            'refs': {k: v.to_dict() for k, v in filtered_refs.items()},
            'bibs': [b.to_dict() for b in bibs]
        }
        
    except Exception as e:
        result['status'] = 'error'
        result['reason'] = 'error'
        result['pub_id'] = pub_path.name
        result['error'] = str(e)
    
    return result

# Process with ThreadPoolExecutor (better for I/O-bound operations)
num_workers = min(multiprocessing.cpu_count() * 2, 16)  # More threads for I/O
print(f"Processing with {num_workers} threads...")

all_data = []

with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [executor.submit(process_publication, pub_path) for pub_path in publications]
    
    for future in tqdm(as_completed(futures), total=len(publications), desc="Processing publications"):
        result = future.result()
        
        if result['status'] == 'success':
            all_data.append(result['data'])
            total_refs_before += result['stats']['refs_before']
            total_refs_after += result['stats']['refs_after']
        elif result['status'] == 'skipped':
            skipped[result['reason']].append(result['pub_id'])
        elif result['status'] == 'error':
            print(f"\nError processing {result['pub_id']}: {result.get('error', 'Unknown')}")

# Print summary
print(f"\n{'='*60}")
print("VALIDATION SUMMARY")
print(f"{'='*60}")
print(f"Total publications scanned: {len(publications)}")
print(f"Valid publications: {len(all_data)}")
print(f"\nSkipped publications:")
print(f"  - No references.json: {len(skipped['no_refs_file'])}")
print(f"  - Empty references.json: {len(skipped['empty_refs'])}")
print(f"  - No tex folder: {len(skipped['no_tex_folder'])}")
print(f"  - No BibTeX entries found: {len(skipped['no_bibs'])}")
print(f"  - Total skipped: {sum(len(v) for v in skipped.values())}")

print(f"\n{'='*60}")
print("CANDIDATE FILTERING SUMMARY")
print(f"{'='*60}")
print(f"Total refs before filtering: {total_refs_before}")
print(f"Total refs after filtering: {total_refs_after}")
if total_refs_before > 0:
    print(f"Refs removed: {total_refs_before - total_refs_after} ({(total_refs_before - total_refs_after) / total_refs_before * 100:.1f}%)")

Processing with 16 threads...


Processing publications: 100%|██████████| 1505/1505 [00:57<00:00, 25.96it/s] 


VALIDATION SUMMARY
Total publications scanned: 1505
Valid publications: 893

Skipped publications:
  - No references.json: 179
  - Empty references.json: 110
  - No tex folder: 136
  - No BibTeX entries found: 187
  - Total skipped: 612

CANDIDATE FILTERING SUMMARY
Total refs before filtering: 20767
Total refs after filtering: 20408
Refs removed: 359 (1.7%)





In [21]:
# Save extracted data
output_file = OUTPUT_DIR / 'extracted_data.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_data, f, indent=2, ensure_ascii=False)

print(f"Saved extracted data to: {output_file}")

Saved extracted data to: ..\..\output\extracted_data.json


## 6. Summary Statistics

In [22]:
# Compute statistics
total_refs = sum(d['num_refs'] for d in all_data)
total_bibs = sum(d['num_bibs'] for d in all_data)

print("=== Data Summary ===")
print(f"Publications processed: {len(all_data)}")
print(f"Total reference candidates: {total_refs}")
print(f"Total BibTeX entries: {total_bibs}")
print(f"Average refs per publication: {total_refs / len(all_data):.1f}")
print(f"Average bibs per publication: {total_bibs / len(all_data):.1f}")

=== Data Summary ===
Publications processed: 893
Total reference candidates: 20408
Total BibTeX entries: 422393
Average refs per publication: 22.9
Average bibs per publication: 473.0


## 7. Automatic Labeling (10% of Non-Manual Data)

Requirement 2.2.2: Implement automatic matching based on classic tools (regex, string similarity) to generate labels for at least 10% of the remaining non-manual data.

In [23]:
class AutoLabeler:
    """
    Automatic labeling using classic matching heuristics.
    
    Strategies:
    1. Exact arXiv ID match in bibitem content
    2. High title similarity (Jaccard > 0.7)
    3. First author + year + partial title match
    """
    
    # Auto-labeled publication partitions (1 for test, 1 for valid, rest train)
    AUTO_PARTITIONS = {}  # Will be filled dynamically
    
    @staticmethod
    def jaccard(set1, set2):
        if not set1 or not set2:
            return 0.0
        return len(set1 & set2) / len(set1 | set2)
    
    @staticmethod
    def tokenize(text):
        if not text:
            return set()
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        return set(text.split())
    
    @staticmethod
    def extract_last_name(author):
        if not author:
            return ""
        parts = author.lower().split()
        return parts[-1] if parts else ""
    
    @staticmethod
    def find_arxiv_in_content(content, refs):
        """Find exact arXiv ID matches in bibitem content."""
        matches = []
        for arxiv_id in refs.keys():
            # Try both formats: 2411-00222 and 2411.00222
            arxiv_dot = arxiv_id.replace('-', '.')
            if arxiv_dot in content or arxiv_id in content:
                matches.append((arxiv_id, 1.0, "arxiv_exact"))
        return matches
    
    @staticmethod
    def find_title_match(bib, refs, threshold=0.7):
        """Find high title similarity matches."""
        bib_title_tokens = AutoLabeler.tokenize(bib.get('title', ''))
        if not bib_title_tokens:
            return []
        
        matches = []
        for arxiv_id, ref in refs.items():
            ref_title_tokens = AutoLabeler.tokenize(ref.get('paper_title', ''))
            sim = AutoLabeler.jaccard(bib_title_tokens, ref_title_tokens)
            if sim >= threshold:
                matches.append((arxiv_id, sim, "title_jaccard"))
        
        return matches
    
    @staticmethod
    def find_author_year_match(bib, refs):
        """Find matches by first author + year + partial title."""
        bib_authors = bib.get('authors', [])
        bib_year = bib.get('year', '') or TextCleaner.extract_year(bib.get('raw_content', ''))
        bib_title_tokens = AutoLabeler.tokenize(bib.get('title', ''))
        
        if not bib_authors or not bib_year:
            return []
        
        bib_first_author = AutoLabeler.extract_last_name(bib_authors[0])
        
        matches = []
        for arxiv_id, ref in refs.items():
            ref_authors = ref.get('authors', [])
            ref_year = ref.get('submission_date', '')[:4]
            ref_title_tokens = AutoLabeler.tokenize(ref.get('paper_title', ''))
            
            if not ref_authors:
                continue
            
            ref_first_author = AutoLabeler.extract_last_name(ref_authors[0])
            
            # Check first author match
            if bib_first_author != ref_first_author:
                continue
            
            # Check year match (within 1 year tolerance)
            try:
                if abs(int(bib_year) - int(ref_year)) > 1:
                    continue
            except ValueError:
                continue
            
            # Check partial title overlap
            title_overlap = AutoLabeler.jaccard(bib_title_tokens, ref_title_tokens)
            if title_overlap >= 0.3:
                confidence = 0.6 + 0.4 * title_overlap
                matches.append((arxiv_id, confidence, "author_year_title"))
        
        return matches
    
    @staticmethod
    def auto_label_publication(pub_data, refs_data):
        """Generate automatic labels for a publication."""
        labels = {}
        
        for bib in pub_data['bibs']:
            bib_key = bib['key']
            raw_content = bib.get('raw_content', '')
            
            # Strategy 1: Exact arXiv ID in content (highest confidence)
            matches = AutoLabeler.find_arxiv_in_content(raw_content, refs_data)
            
            # Strategy 2: High title similarity
            if not matches:
                matches = AutoLabeler.find_title_match(bib, refs_data, threshold=0.7)
            
            # Strategy 3: Author + year + partial title
            if not matches:
                matches = AutoLabeler.find_author_year_match(bib, refs_data)
            
            # Take best match if any
            if matches:
                best_match = max(matches, key=lambda x: x[1])
                labels[bib_key] = {
                    'arxiv_id': best_match[0],
                    'confidence': best_match[1],
                    'method': best_match[2]
                }
        
        return labels

In [24]:
# Run auto-labeling on ALL non-manual publications (all 500 sampled)
# Since we only extracted 500 + 5 manual, we auto-label all 500 non-manual

# Filter out manual publications
non_manual_data = [d for d in all_data if d['pub_id'] not in MANUAL_PUBS]
print(f"Non-manual publications to auto-label: {len(non_manual_data)}")

# Run auto-labeling on ALL of them (not just 10%)
auto_labeled_results = []

for pub_data in tqdm(non_manual_data, desc="Auto-labeling"):
    refs_data = pub_data['refs']
    labels = AutoLabeler.auto_label_publication(pub_data, refs_data)
    
    if labels:  # Only include if we found at least one label
        auto_labeled_results.append({
            'pub_id': pub_data['pub_id'],
            'labels': labels,
            'num_labels': len(labels)
        })

# Use ALL auto-labeled results (no filtering by target_count)
selected_auto = auto_labeled_results

print(f"\nAuto-labeling results:")
print(f"  Publications with auto-labels: {len(auto_labeled_results)}")
print(f"  Publications without labels: {len(non_manual_data) - len(auto_labeled_results)}")
print(f"  Coverage: {len(selected_auto) / len(non_manual_data) * 100:.1f}% of sampled data")

Non-manual publications to auto-label: 888


Auto-labeling: 100%|██████████| 888/888 [00:12<00:00, 73.27it/s] 


Auto-labeling results:
  Publications with auto-labels: 664
  Publications without labels: 224
  Coverage: 74.8% of sampled data





In [25]:
# Assign partitions to auto-labeled publications
# Requirement: 1 for test, 1 for valid, rest for train

AUTO_PARTITIONS = {}
for i, result in enumerate(selected_auto):
    pub_id = result['pub_id']
    if i == 0:
        AUTO_PARTITIONS[pub_id] = "test"   # First auto-labeled -> test
    elif i == 1:
        AUTO_PARTITIONS[pub_id] = "valid"  # Second auto-labeled -> valid
    else:
        AUTO_PARTITIONS[pub_id] = "train"

print("Auto-labeled partition assignments:")
for pub_id, partition in list(AUTO_PARTITIONS.items())[:10]:
    print(f"  {pub_id}: {partition}")
if len(AUTO_PARTITIONS) > 10:
    print(f"  ... and {len(AUTO_PARTITIONS) - 10} more (train)")

# Save auto-labels to pred.json files
saved_count = 0
for result in selected_auto:
    pub_id = result['pub_id']
    pub_path = DATA_DIR / pub_id
    partition = AUTO_PARTITIONS.get(pub_id, "train")
    
    # Convert labels to groundtruth format
    groundtruth = {k: v['arxiv_id'] for k, v in result['labels'].items()}
    prediction = {k: [] for k in groundtruth.keys()}
    
    pred_data = {
        "partition": partition,
        "groundtruth": groundtruth,
        "prediction": prediction,
        "label_source": "auto",
        "label_methods": {k: v['method'] for k, v in result['labels'].items()}
    }
    
    pred_file = pub_path / "pred.json"
    with open(pred_file, 'w', encoding='utf-8') as f:
        json.dump(pred_data, f, indent=2, ensure_ascii=False)
    saved_count += 1

print(f"\nSaved {saved_count} auto-labeled pred.json files")

Auto-labeled partition assignments:
  2411-00252: test
  2411-00285: valid
  2411-00256: train
  2411-00238: train
  2411-00248: train
  2411-00310: train
  2411-00265: train
  2411-00230: train
  2411-00287: train
  2411-00314: train
  ... and 654 more (train)

Saved 664 auto-labeled pred.json files


In [26]:
# Summary: Combined labeling statistics
total_auto_labels = sum(r['num_labels'] for r in selected_auto)

print("=" * 60)
print("LABELING SUMMARY")
print("=" * 60)
print(f"\nManual Labels:")
print(f"  Publications: {len(MANUAL_PUBS)}")
print(f"  Test: 2411-00222")
print(f"  Valid: 2411-00223")
print(f"  Train: 2411-00225, 2411-00226, 2411-00227")

print(f"\nAutomatic Labels (from {len(non_manual_data)} sampled papers):")
print(f"  Publications with labels: {len(selected_auto)}")
print(f"  Total label pairs: {total_auto_labels}")
print(f"  Coverage: {len(selected_auto) / len(non_manual_data) * 100:.1f}%")
print(f"  Test: {[p for p, part in AUTO_PARTITIONS.items() if part == 'test']}")
print(f"  Valid: {[p for p, part in AUTO_PARTITIONS.items() if part == 'valid']}")
print(f"  Train: {sum(1 for part in AUTO_PARTITIONS.values() if part == 'train')} publications")

print(f"\nData Split Summary:")
print(f"  Test set: 1 manual + 1 auto = 2 publications")
print(f"  Valid set: 1 manual + 1 auto = 2 publications") 
print(f"  Train set: 3 manual + {sum(1 for part in AUTO_PARTITIONS.values() if part == 'train')} auto publications")
print(f"\nTotal extracted publications: {len(all_data)} (5 manual + {len(non_manual_data)} sampled)")

LABELING SUMMARY

Manual Labels:
  Publications: 5
  Test: 2411-00222
  Valid: 2411-00223
  Train: 2411-00225, 2411-00226, 2411-00227

Automatic Labels (from 888 sampled papers):
  Publications with labels: 664
  Total label pairs: 5891
  Coverage: 74.8%
  Test: ['2411-00252']
  Valid: ['2411-00285']
  Train: 662 publications

Data Split Summary:
  Test set: 1 manual + 1 auto = 2 publications
  Valid set: 1 manual + 1 auto = 2 publications
  Train set: 3 manual + 662 auto publications

Total extracted publications: 893 (5 manual + 888 sampled)


---
**Next:** Continue to `02_feature_engineering.ipynb` to extract matching features.