In [None]:
!pip install seqeval

In [None]:
import json
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset

LABEL_LIST = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
LABEL_MAPPING = {label: i for i, label in enumerate(LABEL_LIST)}
MAX_LEN = 512

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")

with open(r"C:\Users\bauke\OneDrive - KU Leuven\Documents\Documenten\5 digital humanities\stage\Marcel150_clean.json", "r", encoding="utf-8") as f:
    label_studio_data = json.load(f)

def split_into_chunks(tokens, labels, max_len=512):
    chunks = []
    for i in range(0, len(tokens), max_len):
        chunk_tokens = tokens[i:i+max_len]
        chunk_labels = labels[i:i+max_len]
        chunks.append({
            "tokens": chunk_tokens,
            "ner_tags": chunk_labels
        })
    return chunks

def convert_to_hf_format(data):
    hf_dataset = []

    for item in data:
        text = item["data"]["cleaned_text"]
        annotations = item["annotations"][0]["result"]

        encoding = tokenizer(text, return_offsets_mapping=True, truncation=False)
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
        offsets = encoding["offset_mapping"]

        labels = ["O"] * len(tokens)

        for ann in annotations:
            start = ann["value"]["start"]
            end = ann["value"]["end"]
            label_text = ann["value"]["labels"][0].upper() 
            if label_text not in {"PER", "LOC"}:
                continue  

            for i, (token_start, token_end) in enumerate(offsets):
                if token_start is None or token_end is None:
                    continue
                if token_start >= start and token_end <= end:
                    labels[i] = f"B-{label_text}" if token_start == start else f"I-{label_text}"

        filtered = [
            (token, LABEL_MAPPING.get(label, 0))
            for token, label in zip(tokens, labels)
            if token not in tokenizer.all_special_tokens
        ]

        if filtered:
            token_list, tag_list = zip(*filtered)
            chunks = split_into_chunks(list(token_list), list(tag_list), MAX_LEN)
            hf_dataset.extend(chunks)

    return hf_dataset

hf_data = convert_to_hf_format(label_studio_data)

train_data, eval_data = train_test_split(hf_data, test_size=0.2, random_state=42)

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

with open("hf_ner_data150.json", "w", encoding="utf-8") as f:
    json.dump(hf_data, f, indent=4, ensure_ascii=False)

print("Clean and label-aware dataset saved to hf_ner_data150.json")


Token indices sequence length is longer than the specified maximum sequence length for this model (1280 > 512). Running this sequence through the model will result in indexing errors


✅ Clean and label-aware dataset saved to hf_ner_data150.json


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import torch
from seqeval.metrics import f1_score, precision_score, recall_score
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"],
        truncation=True,
        padding="max_length",
        max_length=512,
        is_split_into_words=True,
    )
    labels = []
    word_ids = tokenized.word_ids()
    prev_word_id = None
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        else:
            labels.append(example["ner_tags"][word_id])
        prev_word_id = word_id

    tokenized["labels"] = labels
    return tokenized

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[["O", "B-PER", "I-PER", "B-LOC", "I-LOC"][l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [["O", "B-PER", "I-PER", "B-LOC", "I-LOC"][p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=False)

id2label = {i: l for i, l in enumerate(LABEL_LIST)}
label2id = {l: i for i, l in enumerate(LABEL_LIST)}

model = AutoModelForTokenClassification.from_pretrained(
    "Jean-Baptiste/camembert-ner",
    num_labels=len(LABEL_LIST),
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./ner_camembert_finetuned4",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=20,
    lr_scheduler_type='linear',
    save_total_limit=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2747,0.136714,0.25,0.304762,0.274678
2,0.1237,0.086277,0.517544,0.561905,0.538813
3,0.0802,0.073686,0.603524,0.652381,0.627002
4,0.0611,0.066196,0.668161,0.709524,0.688222
5,0.0508,0.062235,0.688073,0.714286,0.700935
6,0.0432,0.062576,0.686364,0.719048,0.702326
7,0.04,0.06128,0.714286,0.738095,0.725995
8,0.0343,0.057371,0.730594,0.761905,0.745921
9,0.0297,0.056736,0.722727,0.757143,0.739535
10,0.0257,0.061067,0.727273,0.761905,0.744186


TrainOutput(global_step=370, training_loss=0.07632879566501927, metrics={'train_runtime': 5697.455, 'train_samples_per_second': 0.52, 'train_steps_per_second': 0.13, 'total_flos': 386729688883200.0, 'train_loss': 0.07632879566501927, 'epoch': 10.0})

In [17]:
model.save_pretrained("./ner_camembert_finetuned4")
tokenizer.save_pretrained("./ner_camembert_finetuned4")

('./ner_camembert_finetuned4\\tokenizer_config.json',
 './ner_camembert_finetuned4\\special_tokens_map.json',
 './ner_camembert_finetuned4\\sentencepiece.bpe.model',
 './ner_camembert_finetuned4\\added_tokens.json',
 './ner_camembert_finetuned4\\tokenizer.json')

In [None]:
from transformers import pipeline
import os
import json
import re
import unicodedata

ner_pipeline = pipeline(
    "ner",
    model="./ner_camembert_finetuned4",
    tokenizer="./ner_camembert_finetuned4",
    aggregation_strategy="simple"
)

folder_path = r"C:\Users\bauke\OneDrive - KU Leuven\Documents\Documenten\5 digital humanities\stage\articles-verbetering"
output_data = []

def preprocess_text(text):
    text = unicodedata.normalize("NFKC", text)  
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)  
    text = re.sub(r"\n+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()  
    text = re.sub(r"(\w)([.,;!?])", r"\1 \2", text)  
    return text

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, encoding="utf-8") as f:
            text = f.read()
        
        clean_text = preprocess_text(text)

        entities_raw = ner_pipeline(clean_text)

        entities = []
        for ent in entities_raw:
            if ent["entity_group"] in ["PER", "LOC"]:
                entities.append({
                    "text": ent["word"],
                    "start": ent["start"],
                    "end": ent["end"],
                    "label": ent["entity_group"],
                    "score": round(float(ent["score"]), 4)
                })

        output_data.append({
            "filename": filename,
            "entities": entities
        })

with open("ner_results_trainedformllm.json", "w", encoding="utf-8") as out_file:
    json.dump(output_data, out_file, ensure_ascii=False, indent=2)

print("Saved NER results with confidence scores to ner_results_trainedformllm.json")


Device set to use cpu


✅ Saved NER results with confidence scores to ner_results_trainedformllm.json
