In [1]:
import tensorflow as tf
import os
import pandas as pd
import json
import ast
import json
from datasets import Dataset
from transformers import CamembertTokenizerFast, DataCollatorForTokenClassification, CamembertForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report
import torch
from torchsummary import summary

# Set the TOKENIZERS_PARALLELISM environment variable for this process.
# NOTE(review): presumably read by the Hugging Face tokenizers backend to allow
# multi-threaded tokenization — confirm it is still honoured when set after
# `transformers` has already been imported above.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

2026-02-02 10:01:40.320705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770026503.279505 3516150 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770026503.918795 3516150 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-02 10:01:48.408000: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_json_safely(file_path):
    """Load annotation records from a JSON file, tolerating JSON Lines input.

    The file is first parsed as a single JSON document. If that fails with a
    JSON syntax error, it is re-parsed as JSON Lines (one document per line):
    blank lines are ignored and lines that are not valid JSON are skipped, with
    a single warning reporting how many were dropped.

    If the path cannot be opened or decoded as UTF-8 text at all, the work is
    delegated to ``pd.read_json(file_path, lines=True)``; any error raised by
    pandas propagates to the caller.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed JSON document, or a list of the records parsed from the
        individual lines (possibly empty).
    """
    import warnings  # local import so the cell does not depend on a new top-level import

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except (OSError, UnicodeDecodeError):
        # Not readable as plain UTF-8 text: let pandas try (it raises if it cannot).
        return pd.read_json(file_path, lines=True).to_dict('records')

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Not a single JSON document; fall through to line-by-line parsing.
        pass

    records = []
    skipped = 0
    # Text mode already normalised newlines to "\n". str.splitlines() is avoided
    # because it also splits on characters that may legally appear inside JSON
    # strings (e.g. U+2028).
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            skipped += 1

    if skipped:
        warnings.warn(
            f"{file_path}: skipped {skipped} line(s) that are not valid JSON",
            stacklevel=2,
        )
    return records


# Load the training and validation annotation files with the tolerant loader,
# then show how many records each one holds.
train_data, test_data = map(
    load_json_safely,
    (
        './annotations/train_extended_bio_feb.json',
        './annotations/val_extended_bio_feb.json',
    ),
)

(len(train_data), len(test_data))

(59900, 14758)

In [3]:
# Tokenizer for the pretrained checkpoint and the token-classification collator
# built on top of it.
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")
data_collator = DataCollatorForTokenClassification(tokenizer)

# Column-oriented datasets: each record's "tokens" are kept under the same name,
# its "tags" are exposed as the "ner_tags" column.
train_dataset = Dataset.from_dict(
    dict(
        tokens=[item["tokens"] for item in train_data],
        ner_tags=[item["tags"] for item in train_data],
    )
)
test_dataset = Dataset.from_dict(
    dict(
        tokens=[item["tokens"] for item in test_data],
        ner_tags=[item["tags"] for item in test_data],
    )
)


# BIO tag inventory; a tag's position in this list is its integer id.
label_list = [
    "O",
    "B-country", "I-country",
    "B-region", "I-region",
    "B-departement", "I-departement",
    "B-province", "I-province",
    "B-village", "I-village",
]
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))


def convert_tags_to_ids(examples):
    """Replace the string tags of a batch with their integer ids.

    Args:
        examples: Batch mapping whose "ner_tags" entry is a list of tag-string
            sequences. The mapping is modified in place.

    Returns:
        The same ``examples`` object, with "ner_tags" holding id sequences.

    Raises:
        KeyError: If a tag is not a key of ``label2id``.
    """
    encoded = []
    for tags in examples["ner_tags"]:
        encoded.append([label2id[tag] for tag in tags])
    examples["ner_tags"] = encoded
    return examples


# Encode the string tags of both splits as integer ids (train first, then validation).
train_dataset, test_dataset = (
    split.map(convert_tags_to_ids, batched=True)
    for split in (train_dataset, test_dataset)
)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level labels to sub-word tokens.

    Only the first sub-word token of each word keeps that word's label id and
    its offset pair; special tokens and the remaining sub-word tokens of a word
    get the label ``-100`` and the offset ``(None, None)``.

    No ``padding`` is requested from the tokenizer. The notebook builds a
    ``DataCollatorForTokenClassification`` that pads inputs and labels per
    batch, so padding inside ``Dataset.map(batched=True)`` would only pad every
    example to the longest sequence of its map batch and store that padding in
    the dataset.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of label-id lists, one id per word).

    Returns:
        The tokenizer output with two extra entries, "labels" and
        "char_offsets", each holding one list per example that is aligned with
        that example's tokens.
    """
    tokenized = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        return_offsets_mapping=True,
    )

    batch_labels = []
    batch_offsets = []

    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        offsets_i = tokenized["offset_mapping"][i]
        label_ids, offs = [], []

        prev_word_idx = None
        for idx, word_idx in enumerate(word_ids):
            if word_idx is None or word_idx == prev_word_idx:
                # Special token, or a continuation piece of the previous word:
                # excluded from the labels.
                label_ids.append(-100)
                offs.append((None, None))
            else:
                # First piece of a new word carries the word's label.
                label_ids.append(labels[word_idx])
                offs.append(offsets_i[idx])
            prev_word_idx = word_idx

        batch_labels.append(label_ids)
        batch_offsets.append(offs)

    tokenized["labels"] = batch_labels
    tokenized["char_offsets"] = batch_offsets

    return tokenized


# Tokenize both splits and attach the aligned labels (train first, then validation).
train_dataset, test_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, test_dataset)
)

Map: 100%|██████████| 59900/59900 [00:07<00:00, 7493.14 examples/s]
Map: 100%|██████████| 14758/14758 [00:01<00:00, 7604.18 examples/s]
Map: 100%|██████████| 59900/59900 [01:09<00:00, 864.32 examples/s] 
Map: 100%|██████████| 14758/14758 [00:17<00:00, 827.48 examples/s]


In [4]:
# Pass the tag names along with the label count so the model config (and every
# checkpoint saved from it) carries its own id <-> label mapping instead of
# anonymous LABEL_<n> names.
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Move the model to the first CUDA device when one is available, otherwise to
# the CPU; the returned module is displayed as the cell output.
model.to(torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))

CamembertForTokenClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

In [None]:
# Freeze embedding layers: turn off gradient tracking for every parameter under
# model.roberta.embeddings.
for param in model.roberta.embeddings.parameters():
    param.requires_grad_(False)

In [None]:
# Count trainable vs frozen parameters
def _param_counts(module):
    """Return ``(trainable, total)`` element counts over ``module.parameters()``."""
    trainable = total = 0
    for tensor in module.parameters():
        size = tensor.numel()
        total += size
        if tensor.requires_grad:
            trainable += size
    return trainable, total


trainable_params, total_params = _param_counts(model)
frozen_params = total_params - trainable_params

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Frozen parameters: {frozen_params:,}")
print(f"Trainable percentage: {100 * trainable_params / total_params:.2f}%")

# Details per component: the same tally for each direct child module of the model.
print("\n--- Details per component ---")
for name, module in model.named_children():
    module_trainable, module_params = _param_counts(module)
    print(f"{name}: {module_trainable:,}/{module_params:,} trainable parameters")

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir="/data/charles/agile/camembert-ner-finetuned",
    push_to_hub=False,
    # Optimisation settings.
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=70,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Step-based evaluation and checkpointing.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=100,
    # Run-mode flags.
    do_train=True,
    do_predict=True,
)



# Compute evaluation metrics
def compute_metrics(p):
    """Compute micro-averaged precision, recall and F1 from an evaluation pair.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain predicted label ids; positions whose label is
            ``-100`` are ignored.

    Returns:
        Dict with "precision", "recall" and "f1", read from the "micro avg" row
        of the seqeval classification report.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    def decode(id_row, gold_row):
        # Keep only positions with a real label, mapped back to tag strings.
        return [id2label[i] for i, gold in zip(id_row, gold_row) if gold != -100]

    true_labels = [decode(gold_row, gold_row) for gold_row in gold_ids]
    true_predictions = [
        decode(pred_row, gold_row)
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    micro = classification_report(true_labels, true_predictions, output_dict=True)["micro avg"]
    return {
        "precision": micro["precision"],
        "recall": micro["recall"],
        "f1": micro["f1-score"],
    }

# Initialize Trainer
trainer = Trainer(
    # Model and run configuration.
    args=training_args,
    model=model,
    # Data: `test_dataset` here is the split built from val_extended_bio_feb.json
    # and serves as the evaluation set during training.
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Batching and scoring.
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Launch fine-tuning; the value returned by train() is the cell output.
trainer.train()

In [30]:
# Run an evaluation pass with the trainer's configured evaluation dataset and
# metric function; the returned metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.004938560537993908,
 'eval_precision': 0.9345902626580812,
 'eval_recall': 0.950933257918552,
 'eval_f1': 0.9426909328785364}

In [None]:
import json
from datasets import Dataset
from transformers import CamembertTokenizerFast, DataCollatorForTokenClassification, CamembertForTokenClassification, Trainer
import numpy as np
from seqeval.metrics import classification_report

def load_json(file_path):
    """Read ``file_path`` as UTF-8 text and return the parsed JSON document."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

# Initialize tokenizer
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

# Load test data
test_data = load_json('./annotations/test_extended_bio_feb.json')

# Convert to Dataset format: "tokens" keeps its name, "tags" becomes "ner_tags".
test_dataset = Dataset.from_dict(
    dict(
        tokens=[item["tokens"] for item in test_data],
        ner_tags=[item["tags"] for item in test_data],
    )
)

# Define label list and create label <-> id mappings (id = position in the list)
label_list = [
    "O",
    "B-country", "I-country",
    "B-region", "I-region",
    "B-departement", "I-departement",
    "B-province", "I-province",
    "B-village", "I-village",
]
label2id = {}
for position, label_name in enumerate(label_list):
    label2id[label_name] = position
id2label = dict(enumerate(label_list))

# Convert string tags to numeric IDs
def convert_tags_to_ids(examples):
    """Replace each tag string in ``examples["ner_tags"]`` with its integer id.

    The batch mapping is updated in place and returned; an unknown tag raises
    ``KeyError``.
    """
    examples["ner_tags"] = [
        list(map(label2id.__getitem__, tags)) for tags in examples["ner_tags"]
    ]
    return examples

# Apply conversion on test dataset (batched, so the function receives lists of examples)
test_dataset = test_dataset.map(function=convert_tags_to_ids, batched=True)

# Tokenize inputs and align labels with subword tokens
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level labels to sub-word tokens.

    The first sub-word token of each word receives the word's label id; special
    tokens and the remaining sub-word tokens of a word receive ``-100``.

    No ``padding`` is requested from the tokenizer. The evaluation ``Trainer``
    in this cell is given a ``DataCollatorForTokenClassification`` that pads
    inputs and labels per batch, so padding inside ``Dataset.map(batched=True)``
    would only pad every example to the longest sequence of its map batch and
    store that padding in the dataset.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of label-id lists, one id per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        list per example, aligned with that example's tokens.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token or continuation piece of the same word: ignored.
                label_ids.append(-100)
            else:
                # First piece of a new word carries the word's label.
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization and label alignment to the test split, one batch of examples at a time
test_dataset = test_dataset.map(function=tokenize_and_align_labels, batched=True)

# Load fine-tuned model from a saved training checkpoint
model = CamembertForTokenClassification.from_pretrained(
    "/data/charles/agile/camembert-ner-finetuned/checkpoint-15000",
    num_labels=len(label_list),
)

# Initialize Trainer for evaluation only (no training): no TrainingArguments and
# no datasets are passed, only what prediction needs.
trainer = Trainer(
    data_collator=DataCollatorForTokenClassification(tokenizer),
    tokenizer=tokenizer,
    model=model,
)

# Run predictions on test set and take the highest-scoring label id per token
predictions, labels, _ = trainer.predict(test_dataset)
predictions = predictions.argmax(axis=2)

# Convert IDs back to labels, dropping every position whose gold label is -100
true_labels = [
    [id2label[gold] for gold in gold_row if gold != -100]
    for gold_row in labels
]
true_predictions = [
    [id2label[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

In [None]:
# Classification report on test set (gold tag sequences vs. predicted tag sequences)
print(classification_report(y_true=true_labels, y_pred=true_predictions))