# Полноценный бейзлайн для выделения металлургических сущностей и построения Knowledge Graph

Этот ноутбук реализует сквозной пайплайн с использованием BIO-разметки без ручной аннотации:
1. Извлечение текста из PDF (только английский, без формул/изображений).
2. Автоматическая слабая разметка с помощью Snorkel и open-source модели Mistral-7B-Instruct (без OpenAI).
3. Fine-tuning MatSciBERT для NER.
4. Извлечение функциональных и иерархических связей.
5. Построение и экспорт Knowledge Graph в JSON.

Все шаги основаны на предыдущих обсуждениях в чате, с адаптацией под open-source подход.

In [1]:
%pip install -U torch transformers==4.41.0 datasets seqeval accelerate snorkel networkx pymupdf tqdm rapidfuzz nltk py2neo huggingface_hub[hf_xet]


Note: you may need to restart the kernel to use updated packages.


In [2]:
!nvidia-smi
!nvcc --version

Wed Jul 30 04:25:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 577.00                 Driver Version: 577.00         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   47C    P8              3W /   60W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

'nvcc' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
# Проверка доступности CUDA
if torch.cuda.is_available():
    print("CUDA подключена! Доступно GPU.")
    print(f"Количество GPU: {torch.cuda.device_count()}")
    print(f"Имя текущего GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda:0")  # Установка устройства на GPU
else:
    print("CUDA не доступна. Работа на CPU.")
    device = torch.device("cpu")

# Тест: Создайте тензор на GPU
try:
    test_tensor = torch.tensor([1.0], device=device)
    print("Тестовый тензор успешно создан на устройстве:", test_tensor.device)
except Exception as e:
    print("Ошибка при создании тензора на GPU:", e)


CUDA не доступна. Работа на CPU.
Тестовый тензор успешно создан на устройстве: cpu


In [5]:
# ШАГ 1. Системные импорты и базовые настройки
import os, re, json, glob, logging, random, itertools
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import multiprocessing as mp
import os
from dotenv import load_dotenv

import fitz                           # PyMuPDF
from rapidfuzz import fuzz, process   # быстрые строчные сопоставления

from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          DataCollatorForTokenClassification, TrainingArguments,
                          Trainer, pipeline, AutoConfig, pipeline as hf_pipeline)
from datasets import Dataset, DatasetDict, ClassLabel
from seqeval.metrics import classification_report
import torch, nltk
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model import LabelModel
import networkx as nx
from py2neo import Graph, Node, Relationship

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Общая конфигурация
class CFG:
    pdf_dir        = "./data"               # папка с входными PDF
    weak_label_dir = "weak_labels"           # куда сохранить промежуточную разметку
    model_ckpt     = "m3rg-iitd/matscibert"  # encoder-only доменная модель
    max_len        = 192
    num_epochs     = 20
    lr             = 3e-5
    batch_size     = 8
cfg = CFG()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\knyaz_ayotgwn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\knyaz_ayotgwn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\knyaz_ayotgwn\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# ШАГ 2. Извлечение “чистого” текста (ENG only, без формул/изображений)
from nltk.tokenize import sent_tokenize, word_tokenize

cyrillic_pattern = re.compile(r'[А-Яа-яЁё]+')
french_pattern = re.compile(r'[àâçéèêëîïôûüùœÆŒ]+', re.I)  # Общие французские акценты и лигатуры
latex_pattern = re.compile(r'\$[^$]*\$|\\$.*?\\$|\\begin\{.*?\}', re.S)

def clean_text(page_txt:str)->str:
    # убираем формулы, пустые строки
    t = latex_pattern.sub(' ', page_txt)
    t = re.sub(r'\s+', ' ', t)
    return t

def pdf_to_sentences(pdf_path:Path):
    doc = fitz.open(pdf_path)
    raw = " ".join(clean_text(p.get_text("text")) for p in doc)
    doc.close()
    # сегментация предложений
    all_sents = [s.strip() for s in sent_tokenize(raw) if len(s.split())>3]
    # Фильтрация: только предложения без кириллицы и французских символов
    eng_sents = [s for s in all_sents if not cyrillic_pattern.search(s) and not french_pattern.search(s)]
    return eng_sents

pdf_files = list(Path(cfg.pdf_dir).rglob("*.pdf"))
logger.info(f"Found {len(pdf_files)} pdfs")

sentences = []
for f in tqdm(pdf_files):
    sentences += pdf_to_sentences(f)
logger.info(f"Total sentences: {len(sentences):,}")
logger.info(f"Extracted {len(sentences)} English sentences from PDFs.")
if not sentences:
    logger.error("No English sentences found! Verify PDFs contain English text.")


INFO:__main__:Found 4 pdfs


  0%|          | 0/4 [00:00<?, ?it/s]

INFO:__main__:Total sentences: 1,382
INFO:__main__:Extracted 1382 English sentences from PDFs.


Модель Mistral-7B-Instruct-v0.2 (от Mistral AI) в предоставленном ноутбуке используется в рамках Snorkel для автоматизированной слабой разметки данных (weak supervision) без ручной аннотации. Конкретно:

Роль в labeling function (lf_mistral): Модель применяется для классификации отдельных токенов (слов) в предложениях из PDF-файлов. Она получает промпт с токеном и предложением, и возвращает метку (например, MATERIAL, EQUIPMENT, PROCESS, CHEMICAL, STANDARD или O), если токен относится к металлургической сущности.

Цель: Это часть пайплайна по созданию "серебряных" BIO-меток для обучения NER-модели (MatSciBERT). Mistral помогает генерировать метки на основе few-shot prompting, компенсируя отсутствие ручной разметки и дополняя правила (gazetteer и regex).

Преимущества: Как open-source модель, она позволяет обойти платные API (например, OpenAI), работает локально или на GPU, и хорошо справляется с инструктивными задачами, такими как классификация текста в домене материаловедения и металлургии.

В целом, Mistral интегрируется для повышения качества автоматической разметки, что критично для последующего fine-tuning и построения knowledge graph.

In [None]:
# Словарь металлургических сущностей и labeling functions (без OpenAI)
gazetteer = {
    "MATERIAL":  ["steel","stainless steel","carbon steel","alloy","copper","aluminium",
                  "nickel","titanium","bronze","cast iron","iron","slag","billet","slab"],
    "EQUIPMENT": ["furnace","converter","ladle","rolling mill","caster","annealing line",
                  "blast furnace","basic oxygen furnace","electric arc furnace"],
    "PROCESS":   ["smelting","rolling","casting","annealing","forging","quenching",
                  "tempering","pickling","hot rolling","cold rolling","heat treatment"],
    "CHEMICAL":  ["carbon","manganese","chromium","silicon","phosphorus","sulfur","vanadium"],
    "STANDARD":  ["ASTM","EN","ISO","DIN","JIS","GOST"],
}

temp_pattern = re.compile(r'\b\d{2,4}\s?°[CF]\b', flags=re.I)
std_pattern = re.compile(r'\b(ASTM|EN|ISO|DIN|JIS|GOST)\s?[A-Z0-9-]+\b', flags=re.I)

# быстрое обратное индексирование
flat2type = {v.lower():k for k,vs in gazetteer.items() for v in vs}
lex_sorted = sorted(flat2type, key=len, reverse=True)

tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)

# Определение меток
ABSTAIN = -1
MATERIAL, EQUIPMENT, PROCESS, CHEMICAL, STANDARD = 0,1,2,3,4

@labeling_function()
def lf_gazetteer(x):
    kw = x.token.lower()
    if kw in flat2type:
        return {'MATERIAL':0, 'EQUIPMENT':1, 'PROCESS':2, 'CHEMICAL':3, 'STANDARD':4}[flat2type[kw]]
    return ABSTAIN

@labeling_function()
def lf_regex_temp(x):
    return PROCESS if temp_pattern.match(x.token) else ABSTAIN

@labeling_function()
def lf_regex_standard(x):
    return STANDARD if std_pattern.match(x.token) else ABSTAIN

# Open-source модель Gemma
gemma_pipe = hf_pipeline("text-generation", model="google/gemma-2b-it", 
                         device=0 if torch.cuda.is_available() else -1, 
                         max_new_tokens=50, temperature=0.0, token=os.getenv("HF_TOKEN"))

@labeling_function()
def lf_gemma(x):
    prompt = f"Classify token '{x.token}' in sentence '{x.sent}' as one of: MATERIAL, EQUIPMENT, PROCESS, CHEMICAL, STANDARD or O. Return only the label."
    try:
        rsp = gemma_pipe(prompt)
        if not rsp or not isinstance(rsp, list) or not rsp[0].get('generated_text'):
            raise ValueError("Empty or invalid response from Gemma")
        generated = rsp[0]['generated_text'].strip()  # Исправленный доступ: rsp[0]
        label = generated.split('\n')[-1].strip()  # Берем последнюю строку ответа
        if label in ['MATERIAL', 'EQUIPMENT', 'PROCESS', 'CHEMICAL', 'STANDARD']:
            return {'MATERIAL':0, 'EQUIPMENT':1, 'PROCESS':2, 'CHEMICAL':3, 'STANDARD':4}[label]
    except Exception as e:
        logger.warning(f"Gemma inference failed: {e}")
    return ABSTAIN

lfs = [lf_gazetteer, lf_regex_temp, lf_regex_standard, lf_gemma]


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Применение Snorkel и генерация серебряных меток (адаптировано под GPU)
import joblib 
from joblib import Parallel, delayed

def prepare_df(sentences, sample_frac=0.05): 
    data = []
    sampled_sentences = random.sample(sentences, int(len(sentences) * sample_frac)) if sample_frac < 1 else sentences
    for sent in sampled_sentences:
        tokens = word_tokenize(sent)
        for tok in tokens:
            data.append({'sent': sent, 'token': tok})
    df = pd.DataFrame(data)
    logger.info(f"Prepared DataFrame with {len(df)} rows (sampled {sample_frac*100}% of data)")
    return df

df = prepare_df(sentences, sample_frac=0.05)  # ~5% для быстрого тестирования

if df.empty:
    logger.warning("DataFrame is empty! Skipping labeling.")
else:
    def batch_lf_gemma(batch):
        prompts = [f"Classify token '{row['token']}' in sentence '{row['sent']}' as one of: MATERIAL, EQUIPMENT, PROCESS, CHEMICAL, STANDARD or O. Return only the label." for _, row in batch.iterrows()]
        try:
            with torch.no_grad():
                rsps = gemma_pipe(prompts, batch_size=16)  # Увеличено для GPU (адаптируйте под память: 8-32)
            labels = []
            for rsp in rsps:
                generated = rsp[0]['generated_text'].strip()
                label = generated.split('\n')[-1].strip()
                if label in ['MATERIAL', 'EQUIPMENT', 'PROCESS', 'CHEMICAL', 'STANDARD']:
                    labels.append({'MATERIAL':0, 'EQUIPMENT':1, 'PROCESS':2, 'CHEMICAL':3, 'STANDARD':4}[label])
                else:
                    labels.append(ABSTAIN)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # Очистка GPU-памяти после батча
            return labels
        except Exception as e:
            logger.warning(f"Batch Gemma inference failed: {e}")
            return [ABSTAIN] * len(batch)

    applier = PandasLFApplier(lfs=[lf_gazetteer, lf_regex_temp, lf_regex_standard])
    L_fast = applier.apply(df)

 
    n_cores = min(mp.cpu_count(), 2) 
    df_chunks = np.array_split(df, n_cores * 4)
    try:
        gemma_results = Parallel(n_jobs=n_cores, backend='threading')(  
            delayed(batch_lf_gemma)(chunk) for chunk in df_chunks
        )
    except PicklingError as pe:
        logger.warning(f"Parallel failed due to pickling: {pe}. Falling back to sequential execution.")
        gemma_results = [batch_lf_gemma(chunk) for chunk in tqdm(df_chunks)] 

    gemma_labels = list(itertools.chain.from_iterable(gemma_results))

    L = np.column_stack((L_fast, np.array(gemma_labels)))

    label_model = LabelModel(cardinality=5, verbose=True)
    label_model.fit(L_train=L, n_epochs=500, log_freq=50, seed=42)
    preds = label_model.predict(L)
    df['label'] = preds
    df = df[df['label'] != ABSTAIN]


INFO:__main__:Prepared DataFrame with 1617 rows (sampled 5.0% of data)
100%|██████████| 1617/1617 [00:00<00:00, 37061.15it/s]
  return bound(*args, **kwds)
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.000]
 10%|▉         | 49/500 [00:00<00:01, 430.81epoch/s]INFO:root:[50 epochs]: TRAIN:[loss=0.000]
INFO:root:[100 epochs]: TRAIN:[loss=0.000]
INFO:root:[150 epochs]: TRAIN:[loss=0.000]
 31%|███       | 155/500 [00:00<00:00, 781.05epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.000]
INFO:root:[250 epochs]: TRAIN:[loss=0.000]
 53%|█████▎    | 265/500 [00:00<00:00, 920.61epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.000]
INFO:root:[350 epochs]: TRAIN:[loss=0.000]
 76%|███████▌  | 378/500 [00:00<00:00, 997.73epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
INFO:root:[450 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 939.92epoch/s] 
INFO:root:Finished Training


In [None]:
unique_tags = ['O', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT',
               'B-PROCESS', 'I-PROCESS', 'B-CHEMICAL', 'I-CHEMICAL',
               'B-STANDARD', 'I-STANDARD']
tag2id = {t:i for i,t in enumerate(unique_tags)}

def group_to_bio(group):
    tokens = group['token'].tolist()
    # !!! здесь конвертируем строки в индексы
    labels = []
    for i, lbl in enumerate(group['label']):
        tag = unique_tags[2 * lbl + 1] if i == 0 else unique_tags[2 * lbl + 2]  # B- / I-
        labels.append(tag2id[tag])
    return {'tokens': tokens, 'ner_tags': labels}

grouped = df.groupby('sent').apply(group_to_bio).tolist()

from datasets import Features, Sequence, ClassLabel, Value

features = Features({
    "tokens": Sequence(feature=Value("string")),
    "ner_tags": Sequence(feature=ClassLabel(names=unique_tags)),
})
ds = Dataset.from_list(grouped, features=features)

ds = ds.train_test_split(test_size=0.1, seed=42)
ds_dict = DatasetDict({"train": ds["train"], "validation": ds["test"]})


ds_dict = DatasetDict({
    "train": ds_dict["train"].cast(features),
    "validation": ds_dict["validation"].cast(features)
})



  grouped = df.groupby('sent').apply(group_to_bio).tolist()


Casting the dataset:   0%|          | 0/23 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

In [None]:
# ШАГ 6. Токенизация + выравнивание BIO
def align_labels(batch):
    tokenized = tokenizer(batch["tokens"],
                          is_split_into_words=True,
                          truncation=True,
                          padding="max_length",
                          max_length=cfg.max_len)
    new_labels = []
    for i in range(len(batch["ner_tags"])):
        word_ids = tokenized.word_ids(batch_index=i)
        sent_labels = batch["ner_tags"][i]  # индексы, не строка
        label_ids = []
        prev = None
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)
            elif wid != prev:
                label_ids.append(sent_labels[wid])
            else:
                # B → I сдвиг, только если не O (обычно B = нечетное, I = четное)
                label_ids.append(sent_labels[wid] if sent_labels[wid] % 2 == 1 else sent_labels[wid] + 1)
            prev = wid
        new_labels.append(label_ids)
    tokenized["labels"] = new_labels
    return tokenized


ds_tok = ds_dict.map(align_labels, batched=True, remove_columns=ds_dict["train"].column_names)

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [12]:
# ШАГ 7. Fine-tuning MatSciBERT
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    true, pred = [], []
    for p, l in zip(preds, labels):
        true_seq, pred_seq = [], []
        for pi, li in zip(p, l):
            if li != -100:
                true_seq.append(unique_tags[li])
                pred_seq.append(unique_tags[pi])
        true.append(true_seq)
        pred.append(pred_seq)
    report = classification_report(true, pred, output_dict=True, zero_division=0)
    return {"f1": report["weighted avg"]["f1-score"]}

model_config = AutoConfig.from_pretrained(cfg.model_ckpt,
                                          num_labels=len(unique_tags),
                                          id2label={i: t for i, t in enumerate(unique_tags)},
                                          label2id=tag2id)
model = AutoModelForTokenClassification.from_pretrained(cfg.model_ckpt, config=model_config)

args = TrainingArguments(
    output_dir="ner_matsci_checkpoint",
    learning_rate=cfg.lr,
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=ds_tok["train"],
                  eval_dataset=ds_tok["validation"],
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

trainer.train()
trainer.save_model("ner_matsci_final")
tokenizer.save_pretrained("ner_matsci_final")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at m3rg-iitd/matscibert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6635397672653198, 'eval_f1': 1.0, 'eval_runtime': 1.1426, 'eval_samples_per_second': 2.626, 'eval_steps_per_second': 0.875, 'epoch': 1.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0655632019042969, 'eval_f1': 1.0, 'eval_runtime': 0.7395, 'eval_samples_per_second': 4.057, 'eval_steps_per_second': 1.352, 'epoch': 2.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6856730580329895, 'eval_f1': 1.0, 'eval_runtime': 0.7628, 'eval_samples_per_second': 3.933, 'eval_steps_per_second': 1.311, 'epoch': 3.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5399779081344604, 'eval_f1': 1.0, 'eval_runtime': 0.7412, 'eval_samples_per_second': 4.048, 'eval_steps_per_second': 1.349, 'epoch': 4.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4649123251438141, 'eval_f1': 1.0, 'eval_runtime': 0.6831, 'eval_samples_per_second': 4.392, 'eval_steps_per_second': 1.464, 'epoch': 5.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.38519778847694397, 'eval_f1': 1.0, 'eval_runtime': 0.479, 'eval_samples_per_second': 6.263, 'eval_steps_per_second': 2.088, 'epoch': 6.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2710091471672058, 'eval_f1': 1.0, 'eval_runtime': 0.6399, 'eval_samples_per_second': 4.688, 'eval_steps_per_second': 1.563, 'epoch': 7.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.20995201170444489, 'eval_f1': 1.0, 'eval_runtime': 0.7191, 'eval_samples_per_second': 4.172, 'eval_steps_per_second': 1.391, 'epoch': 8.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.18599455058574677, 'eval_f1': 1.0, 'eval_runtime': 0.6008, 'eval_samples_per_second': 4.993, 'eval_steps_per_second': 1.664, 'epoch': 9.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.1711001843214035, 'eval_f1': 1.0, 'eval_runtime': 0.7289, 'eval_samples_per_second': 4.116, 'eval_steps_per_second': 1.372, 'epoch': 10.0}
{'train_runtime': 355.9071, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.084, 'train_loss': 1.425003178914388, 'epoch': 10.0}


('ner_matsci_final\\tokenizer_config.json',
 'ner_matsci_final\\special_tokens_map.json',
 'ner_matsci_final\\vocab.txt',
 'ner_matsci_final\\added_tokens.json',
 'ner_matsci_final\\tokenizer.json')

In [13]:
# ШАГ 8. Инференс NER
ner_pipe = pipeline("ner",
                    model="ner_matsci_final",
                    tokenizer="ner_matsci_final",
                    aggregation_strategy="simple",
                    device=0 if torch.cuda.is_available() else -1)

def extract_entities(sentences):
    entities = []
    for sent in tqdm(sentences):
        res = ner_pipe(sent)
        for ent in res:
            ent['sentence'] = sent
            ent['sentence_id'] = hash(sent)
        entities.extend(res)
    return entities

all_entities = extract_entities(sentences)

  0%|          | 0/1382 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
all_entities

[{'entity_group': 'CHEMICAL',
  'score': np.float32(0.18400662),
  'word': 'development',
  'start': 0,
  'end': 11,
  'sentence': 'Development of a model for copper converting A. Lennartsson*, F. Engstro¨m, B. Bjo¨rkman and C. Samuelsson Building on previous work reported in the literature, a dynamic model of the operation of the Peirce-Smith Converter has been developed to describe the distribution of the major elements present.',
  'sentence_id': 6412450471319604874},
 {'entity_group': 'STANDARD',
  'score': np.float32(0.1750614),
  'word': 'of',
  'start': 12,
  'end': 14,
  'sentence': 'Development of a model for copper converting A. Lennartsson*, F. Engstro¨m, B. Bjo¨rkman and C. Samuelsson Building on previous work reported in the literature, a dynamic model of the operation of the Peirce-Smith Converter has been developed to describe the distribution of the major elements present.',
  'sentence_id': 6412450471319604874},
 {'entity_group': 'EQUIPMENT',
  'score': np.float32(0.23

In [None]:

def extract_functional_relations(entities, sentences):
    relations = []
    for sent in sentences:
        sent_ents = [e for e in entities if e['sentence'] == sent]
        # Для одного предложения — функциональные связи между последовательными сущностями
        if len(sent_ents) >= 2:
            for i in range(len(sent_ents) - 1):
                e1, e2 = sent_ents[i], sent_ents[i+1]
                relations.append({
                    'head': e1['word'],
                    'relation': 'FUNCTIONAL',
                    'tail': e2['word'],
                    'verb': 'interacts',  
                    'conf': 0.8
                })
    return relations

def extract_hierarchical_relations(entities):
    relations = []
    # Находим все пары, где одно слово входит в другое среди одного типа сущностей
    for e1 in entities:
        for e2 in entities:
            if (
                e1 != e2
                and e2['word'].lower() in e1['word'].lower()
                and e1['entity_group'] == e2['entity_group']
                and e1['word'].lower() != e2['word'].lower()
            ):
                relations.append({
                    'head': e1['word'],
                    'relation': 'HIERARCHICAL',
                    'tail': e2['word'],
                    'conf': 0.7
                })
    return relations

func_rels = extract_functional_relations(all_entities, sentences)
hier_rels = extract_hierarchical_relations(all_entities)
all_rels = func_rels + hier_rels


In [None]:
# ШАГ 10. Построение Knowledge Graph и экспорт
def build_kg(entities, relations):
    G = nx.DiGraph()
    for ent in entities:
        G.add_node(ent['word'], type=ent['entity_group'], conf=ent.get('score', 1.0))
    for rel in relations:
        G.add_edge(rel['head'], rel['tail'], relation=rel['relation'], conf=rel['conf'])
    return G

import numpy as np
import json
import networkx as nx

def convert_floats(obj):
    """
    Рекурсивно привести все numpy.float и numpy.int к стандартным float/int.
    """
    if isinstance(obj, dict):
        return {k: convert_floats(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_floats(v) for v in obj]
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    else:
        return obj

# Ваш текущий код:
kg = build_kg(all_entities, all_rels)
kg_data = nx.readwrite.json_graph.node_link_data(kg)

# Добавьте строку:
kg_data = convert_floats(kg_data)

with open('knowledge_graph.json', 'w') as f:
    json.dump(kg_data, f, indent=2)

print("Knowledge Graph построен и сохранён в knowledge_graph.json")


print("Knowledge Graph построен и сохранён в knowledge_graph.json")

The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


Knowledge Graph построен и сохранён в knowledge_graph.json
Knowledge Graph построен и сохранён в knowledge_graph.json
