In [61]:
def char_to_word_index(words: list, char_idx: int) -> int:
    word_starts = []
    offset = 0
    for word in words:
        word_starts.append(offset)
        offset += len(word) + 1 

    for i in reversed(range(len(word_starts))):
        if word_starts[i] <= char_idx:
            return i

    raise ValueError("char_idx nằm ngoài phạm vi câu.")

def process_bert_results(words, ner_results):
    word = []
    for result in ner_results:
        word.append({'type': result['entity_group'], 'start': char_to_word_index(words, result['start']), 'end': char_to_word_index(words, result['end']) + 1, 'source': "ychenNLP/arabic-ner-ace"})
    return word

def process_gliner_results(words, ner_results):
    mapping = {
        "Person": "PER",
        "Organization": "ORG",
        "Location": "LOC",
        "Geo-Political Entity": "GPE",
        "Facility": "FAC",
        "Vehicle": "VEH",
        "Weapon": "WEA"
    }
    word = []
    for result in ner_results:
        word.append({'type': mapping[result['label']], 'start': char_to_word_index(words, result['start']), 'end': char_to_word_index(words, result['end']) + 1, 'source': "gliner-community/gliner_small-v2.5"})
    return word

In [64]:
import json
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch
from gliner import GLiNER

model_gliner = GLiNER.from_pretrained("gliner-community/gliner_small-v2.5", load_tokenizer=True)
labels = ["Person", "Organization", "Location", "Geo-Political Entity", "Facility", "Vehicle", "Weapon"]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-ner-ace")
model_bert = AutoModelForTokenClassification.from_pretrained("ychenNLP/arabic-ner-ace")

nlp = pipeline("ner", model=model_bert, tokenizer=tokenizer, device=device, aggregation_strategy="simple")


data = json.loads(open('data/datasets/ace2004/ace2004_test_context@2.json').read())
for i in tqdm(range(len(data))):
    sentence = ' '.join(data[i]['tokens'])
    
    glidner_predictions = model_gliner.predict_entities(sentence, labels)
    bert_predictions = nlp(sentence)

    glidner_results = process_gliner_results(data[i]['tokens'], glidner_predictions)
    bert_results = process_bert_results(data[i]['tokens'], bert_predictions)
    

    data[i]['entities_preds'] = glidner_results + bert_results
    
json.dump(data, open('data/datasets/ace2004/ace2004_test_context@3_pretrained.json', 'w'), ensure_ascii=False, indent=4)

Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 9988.82it/s]
  0%|          | 0/809 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 809/809 [05:14<00:00,  2.57it/s]


# Lightning

In [1]:
def read_conf_file(path):
    config = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):  # bỏ dòng trống hoặc comment
                continue
            if '=' in line:
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()
                # Chuyển giá trị sang đúng kiểu
                if value.lower() == "true":
                    value = True
                elif value.lower() == "false":
                    value = False
                elif value.lower() == "none":
                    value = None
                else:
                    try:
                        # Chuyển thành số nếu được
                        if '.' in value:
                            value = float(value)
                        else:
                            value = int(value)
                    except ValueError:
                        pass  # giữ nguyên là chuỗi
                config[key] = value
    return config

In [2]:
from transformers import AutoTokenizer
from diffusionner.input_reader import JsonInputReader
import logging
from torch.utils.data import DataLoader
from diffusionner import sampling
from diffusionner.entities import Dataset

config_path = "configs/ace2004.conf"
config = read_conf_file(config_path)
print(config)

tokenizer = AutoTokenizer.from_pretrained(config['model_path'])
logger = logging.getLogger()
input_reader = JsonInputReader(config['types_path'], tokenizer, logger, repeat_gt_entities=config['repeat_gt_entities'])
input_reader.read({'train': config['train_path'], 'dev': config['valid_path']})

train_dataset = input_reader.get_dataset('train')
dev_dataset = input_reader.get_dataset('dev')
# dev_dataset.switch_mode(Dataset.EVAL_MODE)

train_loader = DataLoader(
    train_dataset,
    batch_size=config['train_batch_size'],
    shuffle=True,
    num_workers=2,
    collate_fn=sampling.collate_fn_padding,
    persistent_workers=True
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=config['eval_batch_size'],
    shuffle=False,
    num_workers=2,
    collate_fn=sampling.collate_fn_padding,
    persistent_workers=True
)

for batch in train_loader:
    print(batch.keys())
    break

{'train_path': 'data/datasets/ace2004/ace2004_train_context@2.json', 'valid_path': 'data/datasets/ace2004/ace2004_dev_context@2.json', 'save_path': 'data/ace2004/', 'save_path_include_iteration': False, 'init_eval': False, 'save_optimizer': False, 'train_log_iter': 1, 'final_eval': False, 'train_batch_size': 2, 'epochs': 2, 'lr': '2e-05', 'lr_warmup': 0.1, 'weight_decay': 0.01, 'max_grad_norm': 1.0, 'match_solver': 'hungarian', 'type_loss': 'celoss', 'nil_weight': -1.0, 'match_boundary_weight': 1.0, 'match_class_weight': 1.0, 'loss_boundary_weight': 1.0, 'loss_class_weight': 1.0, 'match_boundary_type': 'logp', 'repeat_gt_entities': 60, 'eval_every_epochs': 8, 'eval_test': False, 'config': 'configs/ace2004.conf', 'local_rank': -1, 'world_size': -1, 'types_path': 'data/datasets/ace2004/ace2004_types.json', 'tokenizer_path': 'bert-base-cased', 'lowercase': False, 'sampling_processes': 4, 'label': 'ace2004_train', 'log_path': 'data/ace2004/', 'store_predictions': False, 'store_examples': F

Parse dataset 'train': 100%|██████████| 6198/6198 [01:13<00:00, 84.17it/s] 
Parse dataset 'dev': 100%|██████████| 742/742 [00:08<00:00, 83.38it/s] 


dict_keys(['encodings', 'context_masks', 'seg_encoding', 'context2token_masks', 'token_masks', 'gt_types', 'gt_spans', 'entity_masks', 'meta_doc'])


In [3]:
from diffusionner import models
import pytorch_lightning as pl
import torch
from transformers import AutoConfig
from diffusionner.loss import Criterion
from diffusionner.evaluator import Evaluator
import os

class DiffusionNERTrainer(pl.LightningModule):
    def __init__(self, config, logger):
        super().__init__()
        model_class = models.get_model(config['model_type'])
        model_config = AutoConfig.from_pretrained(config['model_path'], cache_dir=config['cache_path'])
        self.model = model_class.from_pretrained(
            config['model_path'],
            ignore_mismatched_sizes=True,
            # local_files_only = True,
            config = model_config,
            # Prompt4NER model parameters
            entity_type_count=input_reader.entity_type_count,
            lstm_layers = config['lstm_layers'],
            span_attn_layers = config['span_attn_layers'],
            timesteps = config['timesteps'],
            beta_schedule = config['beta_schedule'],
            sampling_timesteps = config['sampling_timesteps'],
            num_proposals = config['num_proposals'],
            scale = config['scale'],
            extand_noise_spans = config['extand_noise_spans'],
            span_renewal = config['span_renewal'],
            step_ensemble = config['step_ensemble'],
            prop_drop = config['prop_drop'],
            soi_pooling = config['soi_pooling'],
            pos_type =  config['pos_type'],
            step_embed_type = config['step_embed_type'],
            sample_dist_type = config['sample_dist_type'],
            split_epoch = config['split_epoch'],
            pool_type = config['pool_type'],
            wo_self_attn = config['wo_self_attn'],
            wo_cross_attn = config['wo_cross_attn'])
        
        self.config = config
        self.custom_logger = logger

        self.weight_dict = {'loss_ce': config['loss_class_weight'], 'loss_boundary': config['loss_boundary_weight']}
        losses = ['labels', 'boundary']
        self.criterion = Criterion(input_reader.entity_type_count, self.weight_dict, config['nil_weight'], losses, config['type_loss'], config['match_class_weight'], config['match_boundary_weight'], config['match_boundary_type'], config['match_solver'])
        self._predictions_path = os.path.join(config['log_path'], 'predictions_%s_epoch_%s.json')
        self._examples_path = os.path.join(config['log_path'], 'examples_%s_%s_epoch_%s.html')

    def compute(self, output, gt_types, gt_spans, entity_masks, epoch, batch = None):

        gt_types_wo_nil = gt_types.masked_select(entity_masks)
        
        if len(gt_types_wo_nil) == 0:
            return 0.1

        sizes = [i.sum() for i in entity_masks]
        entity_masks = entity_masks.unsqueeze(2).repeat(1, 1, 2)
        spans_wo_nil = gt_spans.masked_select(entity_masks).view(-1, 2)

        targets = {"labels": gt_types_wo_nil, "gt_left":spans_wo_nil[:, 0], "gt_right":spans_wo_nil[:, 1], "sizes":sizes}

        train_loss = []
        indices = None

        pred_logits, pred_left, pred_right, pred_left, pred_right = output["pred_logits"], output["pred_spans"][:, :, 0], output["pred_spans"][:, :, 1], output["pred_left"], output["pred_right"]

        outputs = {"pred_logits":pred_logits, "pred_left":pred_left, "pred_right":pred_right, "pred_left":pred_left, "pred_right":pred_right, "token_mask": batch["token_masks"]}
        loss_dict, indices = self.criterion(outputs, targets, epoch, indices = indices)
        
        train_loss = sum(loss_dict[k] * self.weight_dict[k] for k in loss_dict.keys())
        
        return train_loss
    
    def forward(self, batch, is_train = True):
        if is_train:
            output = self.model(
                encodings=batch['encodings'], 
                context_masks=batch['context_masks'], 
                seg_encoding = batch['seg_encoding'], 
                context2token_masks = batch['context2token_masks'], 
                token_masks = batch['token_masks'],
                entity_spans = batch['gt_spans'],
                entity_types = batch['gt_types'],
                entity_masks = batch['entity_masks'],
                meta_doc = batch['meta_doc'], 
                epoch = self.current_epoch)
        else:
            output = self.model(
                encodings=batch['encodings'], 
                context_masks=batch['context_masks'], 
                seg_encoding = batch['seg_encoding'], 
                context2token_masks = batch['context2token_masks'], 
                token_masks = batch['token_masks'],
                meta_doc = batch['meta_doc'])
        return output
        
    def training_step(self, batch, batch_idx):
        config = self.config
        output = self(batch, is_train = True)
        gt_types = batch["gt_types"]
        gt_spans = batch["gt_spans"]
        entity_masks = batch["entity_masks"]
        epoch = self.current_epoch
        train_loss = self.compute(output, gt_types, gt_spans, entity_masks, epoch, batch)
        self.log("train_loss", train_loss)
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        config = self.config
        output = self(batch, is_train = True)
        gt_types = batch["gt_types"]
        gt_spans = batch["gt_spans"]
        entity_masks = batch["entity_masks"]
        epoch = self.current_epoch
        val_loss = self.compute(output, gt_types, gt_spans, entity_masks, epoch, batch)
        self.log("val_loss", val_loss)

        self.evaluator.eval_batch(output, batch)
        
        return val_loss

    def on_validation_epoch_start(self):
        self.evaluator = Evaluator(dev_dataset, input_reader, tokenizer, self.custom_logger, self.config['no_overlapping'], self.config['no_partial_overlapping'], self.config['no_duplicate'], self._predictions_path, self._examples_path, self.config['example_count'], self.current_epoch, dev_dataset.label,  cls_threshold = self.config['cls_threshold'], boundary_threshold = self.config['boundary_threshold'], entity_threshold = self.config['entity_threshold'], save_prediction = self.config['store_predictions'])

    def on_validation_epoch_end(self):
        ner_eval, ner_loc_eval, ner_cls_eval = self.evaluator.compute_scores()
        print("NER Evaluation: ", ner_eval)
        print("NER Location Evaluation: ", ner_loc_eval)
        print("NER Classification Evaluation: ", ner_cls_eval)


    def configure_optimizers(self):
        config = self.config
        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=float(config['lr']),
            weight_decay=config['weight_decay']
        )

        return optimizer
    
# test model
test_model = DiffusionNERTrainer(config, logger)
test_model.eval()
out = test_model(batch)
print(out.keys())

Some weights of BertDiffusionNER were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['affine_end.bias', 'affine_end.weight', 'affine_start.bias', 'affine_start.weight', 'alphas_cumprod', 'alphas_cumprod_prev', 'bert.embeddings.position_ids', 'betas', 'downlinear.bias', 'downlinear.weight', 'entity_classifier.classifier.0.bias', 'entity_classifier.classifier.0.weight', 'entity_classifier.classifier.2.bias', 'entity_classifier.classifier.2.weight', 'left_boundary_predictor.boundary_predictor.bias', 'left_boundary_predictor.boundary_predictor.weight', 'left_boundary_predictor.entity_embedding_linear.0.bias', 'left_boundary_predictor.entity_embedding_linear.0.weight', 'left_boundary_predictor.token_embedding_linear.0.bias', 'left_boundary_predictor.token_embedding_linear.0.weight', 'log_one_minus_alphas_cumprod', 'lstm.bias_hh_l0', 'lstm.bias_hh_l0_reverse', 'lstm.bias_hh_l1', 'lstm.bias_hh_l1_reverse', 'lstm.bias_ih_l0', 'lstm.bias_ih_l0_reverse',

dict_keys(['pred_logits', 'pred_spans', 'pred_left', 'pred_right'])


In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

model = DiffusionNERTrainer(config, logger)
model.train()  
tslogger = TensorBoardLogger(config['log_path'], name='ace2004_train')
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=os.path.join(config['log_path'], 'checkpoints'),
    filename='ace04-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)
trainer = pl.Trainer(
    max_epochs=config['epochs'],
    logger=tslogger,
    callbacks=[checkpoint_callback],
    log_every_n_steps=1,
)
trainer.fit(model, train_loader, dev_loader)