# Полноценный бейзлайн для выделения металлургических сущностей и построения Knowledge Graph

Этот ноутбук реализует сквозной пайплайн:
1. Извлечение текста из PDF (только английский, без формул/изображений).
2. Автоматическая слабая разметка (без ручной аннотации).
3. Fine-tuning MatSciBERT для NER.
4. Извлечение функциональных и иерархических связей.
5. Построение и экспорт Knowledge Graph в JSON.

Все шаги основаны на предыдущих обсуждениях в чате.

In [None]:
# ШАГ 0. Установка зависимостей (запустить ОДИН раз)
!pip install -U torch torchvision torchaudio \
             transformers==4.41.0 datasets seqeval accelerate \
             snorkel networkx pymupdf tqdm rapidfuzz nltk openai py2neo

In [None]:
# ШАГ 1. Системные импорты и базовые настройки
import os, re, json, glob, logging, random, itertools
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import multiprocessing as mp

import fitz                           # PyMuPDF
from rapidfuzz import fuzz, process   # быстрые строчные сопоставления

from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          DataCollatorForTokenClassification, TrainingArguments,
                          Trainer, pipeline, AutoConfig)
from datasets import Dataset, DatasetDict, ClassLabel
from seqeval.metrics import classification_report
import torch, nltk
import openai
from snorkel.labeling import labeling_function, LabelModel, PandasLFApplier
import networkx as nx
from py2neo import Graph, Node, Relationship

# Resources needed by nltk's sent_tokenize / word_tokenize (and WordNet data).
# Recent NLTK releases load the Punkt sentence models from the 'punkt_tab'
# package instead of the pickled 'punkt' one, so both are fetched: with only
# 'punkt' present, tokenization raises LookupError on those versions.
for _nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# INFO-level logging for the whole notebook, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Общая конфигурация
class CFG:
    """Pipeline-wide settings, exposed as plain class attributes."""

    # --- input / output locations ---
    pdf_dir = "./data"  # folder holding the input PDFs
    weak_label_dir = "weak_labels"  # where intermediate weak labels are meant to go

    # --- encoder and fine-tuning hyper-parameters ---
    model_ckpt = "m3rg-iitd/matscibert"  # domain-specific, encoder-only checkpoint
    max_len = 192
    num_epochs = 4
    lr = 3e-5
    batch_size = 8

    # --- credentials ---
    # Read from the environment once, when the class body is executed.
    openai_key = os.getenv("OPENAI_API_KEY")


cfg = CFG()
# Hand the configured key to the openai module; lf_llm calls the API through it.
openai.api_key = cfg.openai_key

In [None]:
# ШАГ 2. Извлечение “чистого” текста (ENG only, без формул/рисунков)
from nltk.tokenize import sent_tokenize, word_tokenize

# Any run of Cyrillic letters marks the text as non-English.
non_eng_pattern = re.compile(r'[А-Яа-яЁё]+')

# LaTeX-style math markup. The display-math branch must come before the
# single-dollar branch: otherwise "$$...$$" is consumed as two empty "$$"
# spans and the formula body is left in the text.
latex_pattern = re.compile(
    r'\$\$.*?\$\$'       # display math  $$ ... $$
    r'|\$[^$]*\$'        # inline math   $ ... $
    r'|\\\(.*?\\\)'      # inline math   \( ... \)
    r'|\\\[.*?\\\]'      # display math  \[ ... \]
    r'|\\begin\{.*?\}',  # \begin{env} markers
    re.S,
)

def clean_text(page_txt:str)->str:
    """Strip math markup from one page and normalise its whitespace.

    Args:
        page_txt: Raw text of a single page.

    Returns:
        The page text with math spans replaced by a space and every run of
        whitespace collapsed to a single space, or "" when the page contains
        any Cyrillic letters (such pages are skipped entirely).
    """
    without_math = latex_pattern.sub(' ', page_txt)
    # A single Cyrillic run is enough to treat the page as non-English.
    if non_eng_pattern.search(without_math):
        return ""
    return re.sub(r'\s+', ' ', without_math)

def pdf_to_sentences(pdf_path:Path):
    """Extract cleaned sentences from one PDF.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        A list of stripped sentences; sentences of three words or fewer are
        discarded.
    """
    # The context manager closes the document even if text extraction raises,
    # so a broken page no longer leaks the open file handle.
    with fitz.open(pdf_path) as doc:
        raw = " ".join(clean_text(page.get_text("text")) for page in doc)
    # Sentence segmentation; very short fragments are dropped.
    return [s.strip() for s in sent_tokenize(raw) if len(s.split()) > 3]

# Every PDF below the configured input folder, searched recursively.
pdf_files = [*Path(cfg.pdf_dir).rglob("*.pdf")]
logger.info(f"Found {len(pdf_files)} pdfs")

# Flat list with the sentences of all documents, in file order.
sentences = []
for f in tqdm(pdf_files):
    sentences.extend(pdf_to_sentences(f))
logger.info(f"Total sentences: {len(sentences):,}")

In [None]:
# ШАГ 3. Словарь металлургических сущностей и labeling functions
# Seed vocabulary: entity type -> known surface forms.
gazetteer = {
    "MATERIAL": [
        "steel", "stainless steel", "carbon steel", "alloy", "copper",
        "aluminium", "nickel", "titanium", "bronze", "cast iron", "iron",
        "slag", "billet", "slab",
    ],
    "EQUIPMENT": [
        "furnace", "converter", "ladle", "rolling mill", "caster",
        "annealing line", "blast furnace", "basic oxygen furnace",
        "electric arc furnace",
    ],
    "PROCESS": [
        "smelting", "rolling", "casting", "annealing", "forging",
        "quenching", "tempering", "pickling", "hot rolling", "cold rolling",
        "heat treatment",
    ],
    "CHEMICAL": [
        "carbon", "manganese", "chromium", "silicon", "phosphorus", "sulfur",
        "vanadium",
    ],
    "STANDARD": ["ASTM", "EN", "ISO", "DIN", "JIS", "GOST"],
}

# Reverse index: lower-cased surface form -> entity type.
flat2type = dict(
    (term.lower(), entity_type)
    for entity_type, terms in gazetteer.items()
    for term in terms
)
# Surface forms ordered longest first (ties keep their insertion order).
lex_sorted = sorted(flat2type, key=lambda term: -len(term))

# Tokenizer loaded from the same checkpoint name as the encoder (cfg.model_ckpt).
tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)

# Label definitions
# ABSTAIN is what a labeling function returns when it has no vote for a token.
ABSTAIN = -1
# Integer class ids of the five entity types.
MATERIAL, EQUIPMENT, PROCESS, CHEMICAL, STANDARD = 0,1,2,3,4

@labeling_function()
def lf_gazetteer(x):
    """Vote by exact, case-insensitive lookup of ``x.token`` in the gazetteer.

    Returns the integer class id of the matching entity type, or ABSTAIN when
    the lower-cased token is not a known surface form.
    """
    entity_type = flat2type.get(x.token.lower())
    if entity_type is None:
        return ABSTAIN
    class_ids = {'MATERIAL':0, 'EQUIPMENT':1, 'PROCESS':2, 'CHEMICAL':3, 'STANDARD':4}
    return class_ids[entity_type]

@labeling_function()
def lf_llm(x):
    """Ask an OpenAI chat model to classify one token within its sentence.

    Reads ``x.token`` and ``x.sent``. Returns the integer class id (0-4) when
    the reply is one of the five entity labels, otherwise ABSTAIN (this
    includes a reply of "O" and any unexpected text).

    NOTE: one API request is issued per token, with no caching or retry.
    """
    prompt = f"Classify token '{x.token}' in sentence '{x.sent}' as one of: MATERIAL, EQUIPMENT, PROCESS, CHEMICAL, STANDARD or O. Return only the label."
    messages = [{"role":"user","content":prompt}]
    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed openai.ChatCompletion; the module-level client
        # below still picks up openai.api_key.
        rsp = openai.chat.completions.create(
            model="gpt-4o-mini", temperature=0.0, messages=messages
        )
    else:
        # Legacy (pre-1.0) SDK interface.
        rsp = openai.ChatCompletion.create(
            model="gpt-4o-mini", temperature=0.0, messages=messages
        )
    # `choices` is a list; the single requested completion is its first item.
    # Content may be None, and models sometimes answer in lower case.
    label = (rsp.choices[0].message.content or "").strip().upper()
    class_ids = {'MATERIAL':0, 'EQUIPMENT':1, 'PROCESS':2, 'CHEMICAL':3, 'STANDARD':4}
    return class_ids.get(label, ABSTAIN)

# Labeling functions passed to the Snorkel applier (one vote column each).
lfs = [lf_gazetteer, lf_llm]

In [None]:
# ШАГ 4. Применение Snorkel и генерация серебряных меток
def prepare_df(sentences):
    """Explode sentences into one row per token.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A DataFrame with columns ``sent`` (the full sentence) and ``token``
        (one word-level token of it), in reading order.
    """
    rows = [
        {'sent': sent, 'token': tok}
        for sent in sentences
        for tok in word_tokenize(sent)
    ]
    # Explicit columns keep the frame well-formed when there are no rows at
    # all; otherwise later `x.token` / groupby('sent') accesses would fail.
    return pd.DataFrame(rows, columns=['sent', 'token'])

df = prepare_df(sentences)
applier = PandasLFApplier(lfs=lfs)
L = applier.apply(df)  # label matrix: one row per token, one column per LF

if L.shape[1] >= 3:
    label_model = LabelModel(cardinality=5, verbose=True)
    label_model.fit(L_train=L, n_epochs=500, log_freq=50, seed=42)
else:
    # Snorkel's LabelModel refuses to fit with fewer than three labeling
    # functions, so with the current two LFs fall back to a majority vote
    # (ties between LFs resolve to ABSTAIN).
    from snorkel.labeling.model import MajorityLabelVoter
    label_model = MajorityLabelVoter(cardinality=5)
preds = label_model.predict(L)
df['label'] = preds
# Drop tokens without a label.
# NOTE(review): this also removes every would-be "O" token, so downstream
# sentences contain entity tokens only — confirm this is intended.
df = df[df['label'] != ABSTAIN]

In [None]:
# ШАГ 5. Преобразование в BIO-формат и Dataset
# Tag inventory: index 0 is 'O'; class k owns B- at 2k+1 and I- at 2k+2.
unique_tags = ['O', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT',
               'B-PROCESS', 'I-PROCESS', 'B-CHEMICAL', 'I-CHEMICAL',
               'B-STANDARD', 'I-STANDARD']
tag2id = {t:i for i,t in enumerate(unique_tags)}

def group_to_bio(group):
    """Convert the token rows of one sentence into BIO tags.

    Args:
        group: DataFrame slice with a ``token`` column and an integer
            ``label`` column (class id 0-4, or a negative value for
            "no label").

    Returns:
        ``{'tokens': [...], 'ner_tags': [...]}`` with one tag string per token.
        A token opens a new entity (B-) unless the previous row carries the
        same class, in which case it continues it (I-). Negative labels
        become 'O'.

    NOTE(review): adjacency is judged on the rows present in ``group``; if
    unlabeled tokens were filtered out beforehand, two same-class tokens that
    were not neighbours in the text are still merged into one span.
    """
    tokens = group['token'].tolist()
    labels = []
    prev = None
    for lbl in group['label']:
        lbl = int(lbl)
        if lbl < 0:
            tag = 'O'
        elif lbl == prev:
            tag = unique_tags[2 * lbl + 2]  # I- : same class continues
        else:
            tag = unique_tags[2 * lbl + 1]  # B- : new entity starts
        labels.append(tag)
        prev = lbl
    return {'tokens': tokens, 'ner_tags': labels}

# One record per distinct sentence: {'tokens': [...], 'ner_tags': [...]}.
grouped = df.groupby('sent').apply(group_to_bio).tolist()
ds = Dataset.from_list(grouped)
ds = ds.train_test_split(test_size=0.1, seed=42)
ds_dict = DatasetDict({"train": ds["train"], "validation": ds["test"]})
# ner_tags deliberately stays a list of tag *strings*: align_labels() maps
# them to ids through tag2id. The former cast of this list column to a scalar
# ClassLabel is rejected by `datasets` (a list cannot be cast to a scalar
# class label), and an integer-encoded column would break the tag2id lookup.

In [None]:
# ШАГ 6. Токенизация + выравнивание BIO
def align_labels(batch):
    """Tokenize a batch of pre-split sentences and align BIO labels to sub-tokens.

    Args:
        batch: Mapping with ``tokens`` (list of word lists) and ``ner_tags``
            (list of per-word tags, given as tag strings or integer tag ids).

    Returns:
        The tokenizer output with an added ``labels`` entry: one id per
        sub-token, -100 for positions without a source word (special tokens
        and padding).
    """
    tokenized = tokenizer(batch["tokens"],
                          is_split_into_words=True,
                          truncation=True,
                          padding="max_length",
                          max_length=cfg.max_len)
    new_labels = []
    for i, sent_tags in enumerate(batch["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        # Accept both tag strings and already-encoded integer ids.
        sent_labels = [t if isinstance(t, int) else tag2id[t] for t in sent_tags]
        label_ids = []
        prev = None
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)
            elif wid != prev:
                # First sub-token of a word carries the word's own tag.
                label_ids.append(sent_labels[wid])
            else:
                # Continuation sub-token: B-X (odd id) becomes I-X (id + 1);
                # 'O' and I-X (even ids) stay unchanged.
                word_label = sent_labels[wid]
                label_ids.append(word_label + 1 if word_label % 2 == 1 else word_label)
            prev = wid
        new_labels.append(label_ids)
    tokenized["labels"] = new_labels
    return tokenized

# Tokenize every split in batches; the original columns are removed so that
# only what align_labels returns (tokenizer outputs plus "labels") remains.
ds_tok = ds_dict.map(align_labels, batched=True, remove_columns=ds_dict["train"].column_names)

In [None]:
# ШАГ 7. Fine-tuning MatSciBERT
# Collator that batches token-classification examples for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(eval_pred):
    """Compute the seqeval weighted-average F1 for one evaluation pass.

    ``eval_pred`` unpacks into (logits, labels). Positions whose gold label is
    -100 are left out; the remaining ids are turned into tag strings through
    ``unique_tags`` before scoring. Returns ``{"f1": <weighted avg f1-score>}``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    gold_tags, predicted_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label.
        kept = [(g, p) for p, g in zip(pred_row, gold_row) if g != -100]
        gold_tags.append([unique_tags[g] for g, _ in kept])
        predicted_tags.append([unique_tags[p] for _, p in kept])

    report = classification_report(gold_tags, predicted_tags,
                                   output_dict=True, zero_division=0)
    weighted = report["weighted avg"]
    return {"f1": weighted["f1-score"]}

# Checkpoint configuration extended with the BIO label inventory.
model_config = AutoConfig.from_pretrained(
    cfg.model_ckpt,
    num_labels=len(unique_tags),
    id2label=dict(enumerate(unique_tags)),
    label2id=tag2id,
)
model = AutoModelForTokenClassification.from_pretrained(
    cfg.model_ckpt, config=model_config
)

# Evaluate and checkpoint once per epoch; mixed precision only when CUDA is present.
args = TrainingArguments(
    output_dir="ner_matsci_checkpoint",
    num_train_epochs=cfg.num_epochs,
    learning_rate=cfg.lr,
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then store model and tokenizer side by side in one folder.
trainer.train()
trainer.save_model("ner_matsci_final")
tokenizer.save_pretrained("ner_matsci_final")

In [None]:
# ШАГ 8. Инференс NER
# NER pipeline loaded from the folder the fine-tuned model and tokenizer were
# saved to. Runs on GPU 0 when CUDA is available, otherwise on CPU (-1).
ner_pipe = pipeline("ner",
                    model="ner_matsci_final",
                    tokenizer="ner_matsci_final",
                    aggregation_strategy="simple",
                    device=0 if torch.cuda.is_available() else -1)

def extract_entities(sentences):
    """Run the NER pipeline over every sentence and collect its entities.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A flat list of the pipeline's entity dicts, each extended with
        ``sentence`` (the source text) and ``sentence_id`` (an integer derived
        from the sentence text).
    """
    import hashlib

    entities = []
    for sent in tqdm(sentences):
        # Built-in hash() of a str is salted per interpreter process, so ids
        # would differ between runs; a content digest keeps them reproducible.
        digest = hashlib.blake2b(sent.encode("utf-8"), digest_size=8).digest()
        sent_id = int.from_bytes(digest, "big")
        res = ner_pipe(sent)
        for ent in res:
            ent['sentence'] = sent
            ent['sentence_id'] = sent_id
        entities.extend(res)
    return entities

all_entities = extract_entities(sentences)

In [None]:
# ШАГ 9. Извлечение связей (функциональные и иерархические)
# Relation emitted for an ordered (head type, tail type) pair that co-occurs
# in one sentence.
# NOTE(review): the original body of extract_functional_relations was lost
# (the source line was truncated mid-statement); this rule table and the 0.6
# confidence are a reconstruction — confirm the intended relation set.
_FUNCTIONAL_RELATIONS = {
    ('EQUIPMENT', 'PROCESS'): 'PERFORMS',
    ('PROCESS', 'MATERIAL'): 'APPLIED_TO',
    ('EQUIPMENT', 'MATERIAL'): 'PROCESSES',
    ('MATERIAL', 'CHEMICAL'): 'CONTAINS',
    ('MATERIAL', 'STANDARD'): 'CONFORMS_TO',
}

def extract_functional_relations(entities, sentences):
    """Derive typed relations between entities that share a sentence.

    Args:
        entities: Entity dicts with ``word``, ``entity_group`` and ``sentence``.
        sentences: Sentences to inspect; entities of other sentences are ignored.

    Returns:
        A list of ``{'head', 'relation', 'tail', 'conf'}`` dicts without
        duplicates, in first-seen order.
    """
    # Index entities by sentence once instead of rescanning them per sentence.
    by_sentence = {}
    for ent in entities:
        by_sentence.setdefault(ent['sentence'], []).append(ent)

    relations = []
    seen = set()
    for sent in dict.fromkeys(sentences):  # repeated sentences are visited once
        sent_ents = by_sentence.get(sent, [])
        if len(sent_ents) < 2:
            continue
        for e1, e2 in itertools.permutations(sent_ents, 2):
            relation = _FUNCTIONAL_RELATIONS.get((e1['entity_group'], e2['entity_group']))
            if relation is None or e1['word'] == e2['word']:
                continue
            key = (e1['word'], relation, e2['word'])
            if key in seen:
                continue
            seen.add(key)
            relations.append({
                'head': e1['word'],
                'relation': relation,
                'tail': e2['word'],
                'conf': 0.6
            })
    return relations

def extract_hierarchical_relations(entities):
    """Derive substring-based hierarchy links (e.g. 'stainless steel' -> 'steel').

    Two entities of the same ``entity_group`` are linked when the tail's text
    is contained in the head's text. Repeated mentions are collapsed first, so
    no self-loops or duplicate relations are produced.

    Args:
        entities: Entity dicts with ``word`` and ``entity_group``.

    Returns:
        A list of ``{'head', 'relation', 'tail', 'conf'}`` dicts.
    """
    # Distinct (text, type) pairs in first-seen order; empty texts are skipped
    # because the empty string is a substring of everything.
    mentions = list(dict.fromkeys(
        (e['word'], e['entity_group']) for e in entities if e['word']
    ))
    relations = []
    for head, head_group in mentions:
        for tail, tail_group in mentions:
            if head != tail and head_group == tail_group and tail in head:
                relations.append({
                    'head': head,
                    'relation': 'HIERARCHICAL',
                    'tail': tail,
                    'conf': 0.7
                })
    return relations

# Run both extractors on the NER output and pool the results (functional first).
func_rels = extract_functional_relations(all_entities, sentences)
hier_rels = extract_hierarchical_relations(all_entities)
all_rels = [*func_rels, *hier_rels]

In [None]:
# ШАГ 10. Построение Knowledge Graph и экспорт
def build_kg(entities, relations):
    """Assemble a directed knowledge graph from entities and relations.

    Args:
        entities: Dicts with ``word``, ``entity_group`` and optionally ``score``.
        relations: Dicts with ``head``, ``tail``, ``relation`` and ``conf``.

    Returns:
        An ``nx.DiGraph`` whose nodes are entity texts (attributes ``type`` and
        ``conf``) and whose edges carry ``relation`` and ``conf``.
    """
    G = nx.DiGraph()
    for ent in entities:
        name = ent['word']
        # Scores may arrive as numpy floats, which json cannot serialise;
        # store plain Python floats so the graph can be exported.
        conf = float(ent.get('score', 1.0))
        # When a text occurs several times, keep its most confident mention
        # instead of whichever happened to come last.
        if name in G and G.nodes[name]['conf'] >= conf:
            continue
        G.add_node(name, type=ent['entity_group'], conf=conf)
    for rel in relations:
        G.add_edge(rel['head'], rel['tail'],
                   relation=rel['relation'], conf=float(rel['conf']))
    return G

kg = build_kg(all_entities, all_rels)

# Export to JSON: node-link representation, pretty-printed with a 2-space indent.
kg_data = nx.readwrite.json_graph.node_link_data(kg)
with open('knowledge_graph.json', 'w') as f:
    f.write(json.dumps(kg_data, indent=2))

# Опционально: загрузка в Neo4j
# graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))
# for node, data in kg.nodes(data=True):
#     neo_node = Node(data['type'], name=node, conf=data['conf'])
#     graph.create(neo_node)
# for u, v, data in kg.edges(data=True):
#     rel = Relationship(kg.nodes[u], data['relation'], kg.nodes[v], conf=data['conf'])
#     graph.create(rel)

# Final status message for the notebook user (Russian: "Knowledge Graph built
# and saved to knowledge_graph.json").
print("Knowledge Graph построен и сохранён в knowledge_graph.json")