
# MDC Dataset Linking Baseline

This notebook implements a two-stage pipeline:

1. **Stage A** – candidate extraction from XML/PDF.
2. **Stage B** – encoder-based classification of the extracted candidates.

The goal is to locate dataset identifiers, such as DOIs and accession numbers, within scientific articles and to classify the role of each one (Primary, Secondary, or Noise).


In [None]:

import json
import os
import random
from pathlib import Path

import numpy as np
import psutil
import torch

SEED = 42

# Feed the same seed to every random number generator used in this notebook:
# the hash seed variable, the stdlib RNG, NumPy, and torch (CPU + all CUDA devices).
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it
# here only reaches processes launched afterwards - confirm that is the intent.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Hardware snapshot: CPU count, total RAM, and one entry per visible GPU.
# Sizes are reported in units of 1024 ** 3 bytes, rounded to two decimals.
env = {
    'gpus': [],
    'cpu_count': os.cpu_count(),
    'ram_gb': round(psutil.virtual_memory().total / 1024 ** 3, 2),
}
if torch.cuda.is_available():
    env['gpus'].extend(
        {
            'name': props.name,
            'total_mem_gb': round(props.total_memory / 1024 ** 3, 2),
        }
        for props in map(
            torch.cuda.get_device_properties,
            range(torch.cuda.device_count()),
        )
    )

print(env)

# Persist the snapshot next to the other notebook outputs.
Path('/kaggle/working').mkdir(exist_ok=True, parents=True)
with open('/kaggle/working/ENV.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(env, indent=2))


In [None]:

# Offline installation of required packages
# `wheels_dir` is the directory pip is pointed at via --find-links; together
# with --no-index this makes pip resolve every package from local files only.
# NOTE(review): presumably a Kaggle dataset of pre-downloaded wheels - confirm.
wheels_dir = '/kaggle/input/wheels/'
# Distribution names handed to pip. 'pymupdf' is the package that provides the
# `fitz` module imported by the PDF parsing cell.
packages = [
    'transformers',
    'tokenizers',
    'lxml',
    'pymupdf',
    'pyarrow',
    'regex',
    'tqdm',
    'accelerate',
]
# IPython shell magic: {wheels_dir} and {' '.join(packages)} are interpolated
# from the Python variables above before the command is run.
!pip install --no-index --find-links={wheels_dir} {' '.join(packages)}

import lxml
import pyarrow
import regex
import tokenizers
import tqdm
import transformers

# accelerate is optional: any failure while importing it or reading its
# version is reported as "not available" instead of stopping the notebook.
try:
    import accelerate  # optional
    print('accelerate', accelerate.__version__)
except Exception:
    print('accelerate not available')

# Report the version of each required package, one line per package.
for label, module in (
    ('transformers', transformers),
    ('tokenizers', tokenizers),
    ('lxml', lxml),
    ('pyarrow', pyarrow),
    ('regex', regex),
    ('tqdm', tqdm),
):
    print(label, module.__version__)


In [None]:

from pathlib import Path

# Central configuration dictionary, filled group by group so that the key
# order stays: input locations, numeric/boolean settings, cache directory.
CFG = {}

# Input locations: article XML/PDF directories plus the two CSV files.
CFG.update({
    'train_xml_dir': Path('/kaggle/input/train_xml'),
    'train_pdf_dir': Path('/kaggle/input/train_pdf'),
    'test_xml_dir': Path('/kaggle/input/test_xml'),
    'test_pdf_dir': Path('/kaggle/input/test_pdf'),
    'labels_path': Path('/kaggle/input/labels.csv'),
    'sample_sub_path': Path('/kaggle/input/sample_submission.csv'),
})

# Run settings.
# NOTE(review): presumably max token length, batch size, loader workers and a
# multi-GPU switch for the Stage B encoder - confirm where they are consumed.
CFG.update({
    'max_len': 384,
    'batch_size': 8,
    'num_workers': 4,
    'use_two_gpus': True,
})

# Directory for cached intermediate files; created up front if missing.
CFG['cache_dir'] = Path('/kaggle/working/cache')
CFG['cache_dir'].mkdir(parents=True, exist_ok=True)


In [None]:

import contextlib
import logging
import time
from typing import Iterator

import pandas as pd


@contextlib.contextmanager
def timer(name: str) -> Iterator[None]:
    """Context manager that prints how long the wrapped block took.

    The line ``"<name>: <seconds>s"`` (two decimals) is printed when the block
    exits, including when it exits by raising; the exception then propagates
    unchanged.

    Args:
        name: Label placed in front of the elapsed time.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or jump
    # when the system clock is adjusted mid-run.
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report in `finally` so a failing step still shows how long it ran.
        elapsed = time.perf_counter() - start
        print(f'{name}: {elapsed:.2f}s')


def get_logger() -> logging.Logger:
    """Configure root logging and return the logger named ``'mdc'``.

    Root logging is set to INFO with a ``timestamp - message`` format on each
    call, and the same named logger object is handed back every time.
    """
    logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
    mdc_logger = logging.getLogger('mdc')
    return mdc_logger


def save_parquet(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to ``path`` as Parquet via a temporary sibling file.

    The frame is first written to ``<path>.tmp`` in the same directory and
    then moved over ``path``, so a reader never sees a half-written file at
    ``path``.

    Args:
        df: Frame to persist; the index is not written.
        path: Destination file.

    Raises:
        Whatever ``DataFrame.to_parquet`` or ``Path.replace`` raises; in that
        case the temporary file is removed and ``path`` is left untouched.
    """
    # Append to the full name instead of swapping the suffix: with_suffix would
    # map 'a.parquet' onto an unrelated 'a.tmp', and a destination that already
    # ends in '.tmp' onto itself.
    tmp = path.with_name(f'{path.name}.tmp')
    try:
        df.to_parquet(tmp, index=False)
        tmp.replace(path)
    except BaseException:
        # Do not leave a partial temp file behind after a failed write.
        tmp.unlink(missing_ok=True)
        raise


def load_parquet(path: Path) -> pd.DataFrame:
    """Read the Parquet file at ``path`` and return it as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame


def report_memory() -> None:
    """Print used and total system RAM, each divided by 1024 ** 3.

    "Used" is total minus available memory as reported by psutil.
    """
    bytes_per_gb = 1024 ** 3
    stats = psutil.virtual_memory()
    total = stats.total / bytes_per_gb
    used = (stats.total - stats.available) / bytes_per_gb
    print(f'RAM used: {used:.2f}GB / {total:.2f}GB')


In [None]:

import re

# Canonical form of every DOI produced by normalize_doi.
DOI_PREFIX = 'https://doi.org/'
# Lower-cased section heading -> canonical section name.
SECTION_MAP = {
    'data availability': 'Data Availability',
    'availability of data': 'Data Availability',
}

# Resolver / scheme spellings that are stripped before DOI_PREFIX is re-applied.
# Longer spellings come first so the bare host forms only match as a fallback.
_DOI_INPUT_PREFIXES = (
    'https://doi.org/',
    'http://doi.org/',
    'https://dx.doi.org/',
    'http://dx.doi.org/',
    'dx.doi.org/',
    'doi.org/',
    'doi:',
)


def normalize_doi(text: str) -> str:
    """Return ``text`` as a lower-case ``https://doi.org/10.…`` URL.

    Surrounding whitespace and trailing ``. , ; : /`` are removed, the text is
    lower-cased, and one leading resolver prefix (``http(s)://doi.org/``,
    ``http(s)://dx.doi.org/``, the same hosts without scheme, or ``doi:``) is
    stripped. If what remains starts with ``10.`` it is returned with
    ``DOI_PREFIX`` in front.

    Args:
        text: Raw identifier as found in the article.

    Returns:
        The canonical DOI URL; the cleaned lower-case text when it does not
        look like a DOI; ``''`` for empty input.
    """
    if not text:
        return ''
    # ':' and ',' are stripped too: the DOI regex in this notebook can capture
    # a trailing ':' and sentence punctuation is never part of the identifier.
    doi = text.strip().lower().rstrip('.,;:/')
    for prefix in _DOI_INPUT_PREFIXES:
        if doi.startswith(prefix):
            # lstrip covers the 'doi: 10.…' spelling with a space after the colon.
            doi = doi[len(prefix):].lstrip()
            break
    if doi.startswith('10.'):
        return f'{DOI_PREFIX}{doi}'
    return doi

def normalize_id(text: str) -> str:
    """Return ``text`` with leading and trailing whitespace removed.

    Case and inner characters are left exactly as given.
    """
    trimmed = text.strip()
    return trimmed

def normalize_section(name: str) -> str:
    """Map a section heading to its canonical name.

    The heading is trimmed and looked up case-insensitively in ``SECTION_MAP``;
    headings without an entry are returned trimmed but otherwise unchanged.
    """
    heading = name.strip()
    return SECTION_MAP.get(heading.lower(), heading)


In [None]:

import pickle


def build_index(xml_dir: Path, pdf_dir: Path, out_path: Path) -> dict:
    """Map each article id to the XML and/or PDF file found for it.

    The article id is the file name without its extension. Files are visited
    in sorted order so the index (and its pickle) has the same key order on
    every run, regardless of how the filesystem lists the directories.

    Args:
        xml_dir: Directory scanned for ``*.xml`` files.
        pdf_dir: Directory scanned for ``*.pdf`` files.
        out_path: Pickle file the index is written to; missing parent
            directories are created.

    Returns:
        ``{article_id: {'xml_path': str, 'pdf_path': str}}`` where each inner
        key is present only if the corresponding file exists.
    """
    index: dict = {}
    sources = (
        ('xml_path', xml_dir, '*.xml'),
        ('pdf_path', pdf_dir, '*.pdf'),
    )
    for key, directory, pattern in sources:
        for path in sorted(directory.glob(pattern)):
            index.setdefault(path.stem, {})[key] = str(path)
    # The cache location may not exist yet when this is called standalone.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'wb') as f:
        pickle.dump(index, f)
    return index

# Build one article index per split from the configured XML/PDF directories.
# Each call also pickles its index into the cache directory, so this cell
# touches the filesystem every time it is run.
index_train = build_index(
    CFG['train_xml_dir'], CFG['train_pdf_dir'], CFG['cache_dir'] / 'index_train.pkl'
)
# The test split is indexed second; index_train is already bound if this fails.
index_test = build_index(
    CFG['test_xml_dir'], CFG['test_pdf_dir'], CFG['cache_dir'] / 'index_test.pkl'
)


In [None]:

from lxml import etree


def parse_xml(xml_path: str) -> list[dict]:
    """Placeholder parser for JATS/PubMed XML articles.

    The file is parsed with lxml, so an unreadable or malformed file raises
    here, but no sections are extracted yet and the result is always an empty
    list of windows.
    """
    # Parsed for validation only; the resulting tree is not used yet.
    etree.parse(xml_path)
    # TODO: extract sections and sliding windows
    windows: list[dict] = []
    return windows


In [None]:

import fitz  # PyMuPDF


def parse_pdf(pdf_path: str) -> list[dict]:
    """Fallback PDF text extractor (placeholder).

    Only used when XML is missing. The text of every page is extracted and
    joined with newlines, but it is not yet cleaned or split, so the function
    currently always returns an empty list.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        An empty list until windowing is implemented.
    """
    doc = fitz.open(pdf_path)
    try:
        # Kept for the pending TODO below; not used yet.
        text = '\n'.join(page.get_text() for page in doc)
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()
    # TODO: clean text and split into windows
    return []


In [None]:
import re

# DOI: '10.', a 4-9 digit registrant code, '/', then a run of suffix characters.
# The suffix class includes '.', ';', ')' and ':', so trailing punctuation can
# end up inside a match.
DOI_RE = re.compile(r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+')
# Accessions: a fixed upper-case prefix followed by digits, or 'PDB' followed
# by exactly four alphanumerics. The pattern is case-sensitive and unanchored.
ACC_RE = re.compile(r'PRJNA\d+|PRJEB\d+|SRR\d+|SRX\d+|SRA\d+|GSE\d+|GSM\d+|PDB[0-9A-Za-z]{4}')


def extract_candidates(windows: list[dict]) -> list[dict]:
    """Scan text windows for DOI and accession matches.

    Args:
        windows: Dicts read through the keys ``'window_text'`` (defaults to
            ``''``) and ``'section'`` (defaults to ``None``).

    Returns:
        One dict per regex match with ``raw_id``, ``id_type`` (``'doi'`` or
        ``'accession'``), ``section`` and ``window_idx`` (position of the
        window in ``windows``). Within a window all DOIs come before all
        accessions; duplicates are kept.
    """
    matchers = ((DOI_RE, 'doi'), (ACC_RE, 'accession'))
    found = []
    for position, window in enumerate(windows):
        body = window.get('window_text', '')
        section = window.get('section')
        found.extend(
            {
                'raw_id': hit.group(0),
                'id_type': kind,
                'section': section,
                'window_idx': position,
            }
            for pattern, kind in matchers
            for hit in pattern.finditer(body)
        )
    return found


In [None]:

import pandas as pd


def normalize_candidates(article_id: str, candidates: list[dict]) -> pd.DataFrame:
    """Turn raw candidates of one article into a de-duplicated DataFrame.

    DOIs are normalised with ``normalize_doi`` and every other id type with
    ``normalize_id``. Rows sharing the same ``article_id``, ``dataset_id`` and
    ``window_idx`` are collapsed to the first occurrence.

    Args:
        article_id: Identifier stored on every output row.
        candidates: Dicts with ``raw_id`` and ``id_type``, optionally
            ``section`` and ``window_idx``.

    Returns:
        A frame with the columns ``article_id``, ``raw_id``, ``dataset_id``,
        ``id_type``, ``section`` and ``window_idx``. The columns are present
        even when ``candidates`` is empty.
    """
    columns = ['article_id', 'raw_id', 'dataset_id', 'id_type', 'section', 'window_idx']
    rows = []
    for c in candidates:
        dataset_id = (
            normalize_doi(c['raw_id']) if c['id_type'] == 'doi' else normalize_id(c['raw_id'])
        )
        rows.append({
            'article_id': article_id,
            'raw_id': c['raw_id'],
            'dataset_id': dataset_id,
            'id_type': c['id_type'],
            'section': c.get('section'),
            'window_idx': c.get('window_idx'),
        })
    # Fix the schema explicitly: without `columns`, an article with no
    # candidates yields a frame with no columns at all, which breaks any later
    # merge on article_id / dataset_id.
    df = pd.DataFrame(rows, columns=columns)
    return df.drop_duplicates(['article_id', 'dataset_id', 'window_idx'])


In [None]:

import pandas as pd


def candidate_report(df: pd.DataFrame, labels_path: Path) -> pd.DataFrame:
    """Print Stage A recall against the labels and return the joined frame.

    Recall is the share of distinct labelled ``(article_id, dataset_id)``
    pairs that also occur among the candidates. It is printed as
    ``Upper-bound recall: NN.NN%`` and is ``0.00%`` when the label file has
    no rows.

    Args:
        df: Candidate frame; must contain ``article_id`` and ``dataset_id``.
        labels_path: CSV file with at least ``article_id`` and ``dataset_id``.

    Returns:
        ``df`` left-joined to the labels on ``article_id`` and ``dataset_id``.
        Label-only columns are NaN on candidate rows that have no label.
    """
    keys = ['article_id', 'dataset_id']
    labels = pd.read_csv(labels_path)
    merged = df.merge(labels, on=keys, how='left')

    # Join keys are never suffixed by merge, so there is no 'dataset_id_y'
    # column to inspect; count label pairs covered by the candidates instead.
    label_pairs = labels[keys].drop_duplicates()
    candidate_pairs = df[keys].drop_duplicates()
    hits = label_pairs.merge(candidate_pairs, on=keys, how='inner')
    recall = len(hits) / len(label_pairs) if len(label_pairs) else 0.0

    print(f'Upper-bound recall: {recall:.2%}')
    return merged


In [None]:
# Cell 13 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 14 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 15 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 16 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 17 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 18 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 19 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 20 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 21 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 22 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 23 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 24 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 25 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 26 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 27 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 28 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 29 placeholder
# TODO: implement stage according to plan

In [None]:
# Cell 30 placeholder
# TODO: implement stage according to plan