In [1]:
# Download NER datasets from GitHub
! wget https://raw.githubusercontent.com/Alexpbunea/book_recommendation_RAG/main/data/ner_train_merged.jsonl
! wget https://raw.githubusercontent.com/Alexpbunea/book_recommendation_RAG/main/data/ner_eval_merged.jsonl

# Install required packages
! pip install accelerate -U
! pip install datasets transformers seqeval

--2025-12-01 11:51:32--  https://raw.githubusercontent.com/Alexpbunea/book_recommendation_RAG/main/data/ner_train_merged.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 871562 (851K) [text/plain]
Saving to: ‘ner_train_merged.jsonl’


2025-12-01 11:51:32 (32.3 MB/s) - ‘ner_train_merged.jsonl’ saved [871562/871562]

--2025-12-01 11:51:32--  https://raw.githubusercontent.com/Alexpbunea/book_recommendation_RAG/main/data/ner_eval_merged.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97425 (95K) [text/plain]
Saving to: ‘ner

### Hugging Face Login and Dataset Loading

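First, log in to the Hugging Face Hub if you haven't already (the trained model is pushed to the Hub later in this notebook). Then we load the NER data from the JSONL files downloaded above (`ner_train_merged.jsonl` and `ner_eval_merged.jsonl`), rather than the `empathyai/books-ner-dataset` Hub dataset.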

In [2]:
import huggingface_hub
# Authenticate with the Hugging Face Hub (shows an interactive login widget).
# The push_to_hub calls later in this notebook rely on this login.
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
import json
import numpy as np
from datasets import Dataset
from transformers import TrainingArguments

# Load the JSONL files
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The message
            names the file and the 1-based line number of the offending line.
    """
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Blank / whitespace-only lines (e.g. an extra newline at the end of
            # the file) are not JSON documents; json.loads would raise on them.
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type, but point at the line inside the file.
                raise json.JSONDecodeError(
                    f"{exc.msg} ({path}, line {line_no})", exc.doc, exc.pos
                ) from exc
    return data

# Read both splits fetched by the download cell into plain Python lists
# (train first, then eval).
print("Loading datasets from JSONL files...")
train_data, eval_data = map(load_jsonl, ("ner_train_merged.jsonl", "ner_eval_merged.jsonl"))

# Report the size of each split.
print(f"Train examples: {len(train_data)}")
print(f"Eval examples: {len(eval_data)}")

# Show the first raw record so the span-based annotation format is visible.
print("\nSample train example:", train_data[0], sep="\n")

Loading datasets from JSONL files...
Train examples: 4519
Eval examples: 502

Sample train example:
{'tokenized_text': ['Indigo', 'Blue', 'is', 'the', 'kind', 'of', 'cozy', 'read', 'I', 'need', 'right', 'now', '.'], 'ner': [[0, 1, 'title']]}


In [5]:
from datasets import Dataset, Features, Sequence, Value, ClassLabel
from collections import Counter

# BIO tag inventory: a tag's position in this list is its integer class id.
custom_labels = ["O", "B-AUTHOR", "I-AUTHOR", "B-TITLE", "I-TITLE"]
# Forward (id -> name) map first, then its inverse (name -> id).
id2label = dict(enumerate(custom_labels))
label2id = {name: class_id for class_id, name in id2label.items()}

print(f"Labels: {custom_labels}")
print(f"label2id: {label2id}")

def convert_to_bio_format(data, labels):
    """
    Convert span-based NER to BIO token classification format.
    Input format: {"tokenized_text": [...], "ner": [[start, end, label], ...]}

    An entity may also be given as a dict with 'start', 'end' and 'label' keys.
    `end` is an INCLUSIVE token index and is clamped to the last token.

    Args:
        data: Iterable of examples in the input format above.
        labels: BIO tag names; must contain "O". A tag's position in this list
            is its class id.

    Returns:
        A `Dataset` with a 'tokens' column (strings) and a 'ner_tags' column
        (ClassLabel ids over `labels`), one row per input example.

    An entity is skipped, and counted in the printed summary, when
      * its start index lies outside the sentence, or
      * `labels` has no B-/I- tags for its (upper-cased) type. Such an entity
        leaves the tags untouched, so it cannot overwrite tokens that another
        entity already tagged.
    """
    tokens_list = []
    ner_tags_list = []

    label2id_local = {label: idx for idx, label in enumerate(labels)}
    outside_id = label2id_local["O"]

    skipped_entities = 0
    total_entities = 0

    for ex in data:
        tokens = ex['tokenized_text']

        # Initialize all tokens as "O"
        bio_tags = [outside_id] * len(tokens)

        for entity in ex['ner']:
            total_entities += 1
            # Handle both sequence format [start, end, label] and dict format
            if isinstance(entity, (list, tuple)):
                token_start, token_end, entity_label = entity
            else:
                token_start = int(entity['start'])
                token_end = int(entity['end'])
                entity_label = entity['label']

            entity_label = entity_label.upper()

            # Validate indices
            if token_start >= len(tokens) or token_start < 0:
                skipped_entities += 1
                continue

            # Entity types outside the label set are skipped instead of being
            # written as "O" over whatever is already tagged in that span.
            begin_id = label2id_local.get(f"B-{entity_label}")
            inside_id = label2id_local.get(f"I-{entity_label}")
            if begin_id is None or inside_id is None:
                skipped_entities += 1
                continue

            # Clamp end to valid range (end is INCLUSIVE)
            actual_end = min(token_end, len(tokens) - 1)

            # First token gets B-, tokens start+1..end (inclusive) get I-.
            # For a single-token entity the range below is empty.
            bio_tags[token_start] = begin_id
            for k in range(token_start + 1, actual_end + 1):
                bio_tags[k] = inside_id

        tokens_list.append(list(tokens))
        ner_tags_list.append(bio_tags)

    print(f"Total entities: {total_entities}, Skipped: {skipped_entities}")

    features = Features({
        'tokens': Sequence(Value('string')),
        'ner_tags': Sequence(ClassLabel(names=labels))
    })

    return Dataset.from_dict(
        {'tokens': tokens_list, 'ner_tags': ner_tags_list},
        features=features
    )

# Build the BIO-tagged datasets for both splits (train first, then eval).
print("\nConverting datasets to BIO format...")
train_dataset, eval_dataset = (
    convert_to_bio_format(split_records, custom_labels)
    for split_records in (train_data, eval_data)
)

print(f"\nTrain: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# Preview the first ten tokens of the first training example together with
# their tag ids and the decoded tag names.
first_example = train_dataset[0]
preview_tags = first_example['ner_tags'][:10]
print("Sample tokens:", first_example['tokens'][:10])
print("Sample tags:", preview_tags)
print("Decoded tags:", list(map(id2label.__getitem__, preview_tags)))

Labels: ['O', 'B-AUTHOR', 'I-AUTHOR', 'B-TITLE', 'I-TITLE']
label2id: {'O': 0, 'B-AUTHOR': 1, 'I-AUTHOR': 2, 'B-TITLE': 3, 'I-TITLE': 4}

Converting datasets to BIO format...
Total entities: 7544, Skipped: 2
Total entities: 825, Skipped: 0

Train: 4519, Eval: 502
Sample tokens: ['Indigo', 'Blue', 'is', 'the', 'kind', 'of', 'cozy', 'read', 'I', 'need']
Sample tags: [3, 4, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded tags: ['B-TITLE', 'I-TITLE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [6]:
# Diagnostic: Check label distribution and data quality
print("="*60)
print("DIAGNOSTIC: Checking data quality")
print("="*60)

# Class id of the "outside" tag; every other id marks an entity token.
outside_id = custom_labels.index("O")

# 1. Check label distribution (word-level tags of the training split).
# `tag_counts` is reused later to derive the class weights for the loss.
all_tags = []
for example in train_dataset:
    all_tags.extend(example['ner_tags'])

tag_counts = Counter(all_tags)
total_tags = len(all_tags)

print("\n1. Label Distribution in Training Data:")
for tag_id, count in sorted(tag_counts.items()):
    label_name = custom_labels[tag_id]
    percentage = (count / total_tags) * 100
    print(f"   {label_name}: {count:,} ({percentage:.2f}%)")

# 2. Check if entities are being captured.
# Report against the number of samples actually inspected, which is smaller
# than 100 when the dataset itself has fewer than 100 rows.
n_checked = min(100, len(train_dataset))
entity_samples = 0
for i in range(n_checked):
    if any(tag != outside_id for tag in train_dataset[i]['ner_tags']):
        entity_samples += 1

print(f"\n2. Samples with at least one entity (first {n_checked}): {entity_samples}/{n_checked}")

# 3. Check a few examples with entities: the first three examples that
# contain an entity, showing at most eight entity tokens each.
print("\n3. Sample examples with entities:")
shown = 0
for i in range(len(train_dataset)):
    if shown >= 3:
        break
    example = train_dataset[i]
    entity_indices = [j for j, tag in enumerate(example['ner_tags']) if tag != outside_id]
    if entity_indices:
        shown += 1
        print(f"\n   Example {i}:")
        tokens = example['tokens']
        tags = example['ner_tags']
        for idx in entity_indices[:8]:
            token = tokens[idx]
            tag = custom_labels[tags[idx]]
            print(f"      Token[{idx}]: '{token}' -> {tag}")

DIAGNOSTIC: Checking data quality

1. Label Distribution in Training Data:
   O: 33,906 (44.68%)
   B-AUTHOR: 3,730 (4.92%)
   I-AUTHOR: 14,599 (19.24%)
   B-TITLE: 3,808 (5.02%)
   I-TITLE: 19,843 (26.15%)

2. Samples with at least one entity (first 100): 100/100

3. Sample examples with entities:

   Example 0:
      Token[0]: 'Indigo' -> B-TITLE
      Token[1]: 'Blue' -> I-TITLE

   Example 1:
      Token[3]: 'The' -> B-TITLE
      Token[4]: 'Day' -> I-TITLE
      Token[5]: 'After' -> I-TITLE
      Token[6]: 'Tomorrow' -> I-TITLE
      Token[9]: 'Allan' -> B-AUTHOR
      Token[10]: 'Folsom' -> I-AUTHOR

   Example 2:
      Token[5]: 'Illustrations' -> B-TITLE
      Token[6]: 'Of' -> I-TITLE
      Token[7]: 'Political' -> I-TITLE
      Token[8]: 'Economy' -> I-TITLE
      Token[9]: ',' -> I-TITLE
      Token[10]: 'Volume' -> I-TITLE
      Token[11]: '6' -> I-TITLE
      Token[12]: '(' -> I-TITLE


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

# Load tokenizer and model
# The same checkpoint name is used for the tokenizer and for the model weights.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token-classification model with one output class per BIO tag. The id<->name
# maps are passed along so predictions can be reported by tag name.
# The classifier layer is newly initialized (see the warning in this cell's
# output) and is learned during fine-tuning.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(custom_labels),
    id2label=id2label,
    label2id=label2id
)

# Tokenization with label alignment
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and project word-level tags onto subwords.

    Label assignment per subword position:
      * special tokens (no word id)   -> -100
      * first subword of a word       -> the word's own tag
      * later subwords of that word   -> the word's tag, with a B-* tag turned
                                         into the matching I-* tag

    Reads `examples["tokens"]` and `examples["ner_tags"]`; returns the tokenizer
    output with an added "labels" entry (one label list per example).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=256,
        padding=False
    )

    def continuation_label(tag_id):
        # Tag for a non-initial subword: B-X becomes I-X, anything else is kept.
        tag_name = custom_labels[tag_id]
        if not tag_name.startswith("B-"):
            return tag_id
        return label2id.get("I-" + tag_name[2:], tag_id)

    labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        aligned = []
        last_word_idx = None

        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                aligned.append(-100)  # Special tokens
            elif word_idx == last_word_idx:
                aligned.append(continuation_label(word_tags[word_idx]))
            else:
                aligned.append(word_tags[word_idx])
            last_word_idx = word_idx

        labels.append(aligned)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize datasets
# The original 'tokens' / 'ner_tags' columns are dropped, so only the columns
# returned by tokenize_and_align_labels remain.
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval = eval_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=eval_dataset.column_names)

# Verify device
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is also read later by evaluate_ner_proper.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model.to(device)

print(f"\nTokenized train samples: {len(tokenized_train)}")
print(f"Tokenized eval samples: {len(tokenized_eval)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing datasets...


Map:   0%|          | 0/4519 [00:00<?, ? examples/s]

Map:   0%|          | 0/502 [00:00<?, ? examples/s]

Using device: cuda

Tokenized train samples: 4519
Tokenized eval samples: 502


In [8]:
from transformers import Trainer
import torch.nn as nn

# Data collator
# Tokenization above used padding=False, so examples have different lengths;
# presumably this collator pads each batch at collation time — confirm against
# the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Custom Trainer with class weights to handle imbalanced O tag
class WeightedNERTrainer(Trainer):
    """Trainer that computes its own token-level cross-entropy, optionally class-weighted.

    Args:
        class_weights: Optional sequence with one weight per label id. When None,
            the cross-entropy is unweighted.
        *args, **kwargs: Forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and compute the cross-entropy here.

        "labels" is removed from `inputs` before the forward pass. Positions
        labelled -100 are ignored by the loss. Returns `(loss, outputs)` when
        `return_outputs` is true, otherwise just `loss`. Extra keyword
        arguments are accepted and ignored.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        token_logits = outputs.logits

        # Build the per-class weight tensor on the logits' device/dtype, or
        # leave it as None for the unweighted loss.
        per_class_weight = None
        if self.class_weights is not None:
            per_class_weight = torch.tensor(
                self.class_weights,
                device=token_logits.device,
                dtype=token_logits.dtype,
            )
        criterion = nn.CrossEntropyLoss(weight=per_class_weight, ignore_index=-100)

        # Flatten to (positions, classes) vs. (positions,) for the loss.
        n_labels = self.model.config.num_labels
        loss = criterion(token_logits.view(-1, n_labels), targets.view(-1))

        if return_outputs:
            return (loss, outputs)
        return loss

# Class weights: square root of the balanced inverse frequency, capped at 10.
# The counts come from `tag_counts`, i.e. the word-level tags of the training
# split; a label id that never occurs is counted as 1.
num_classes = len(custom_labels)
tag_counts_list = [tag_counts.get(class_id, 1) for class_id in range(num_classes)]
total = sum(tag_counts_list)

class_weights = []
for count in tag_counts_list:
    balanced_inverse_freq = total / (num_classes * count)
    class_weights.append(min(10.0, balanced_inverse_freq ** 0.5))

print("Class weights:", {name: f"{w:.2f}" for name, w in zip(custom_labels, class_weights)})

# Training arguments
# Evaluation and checkpointing both happen every 500 steps; the best checkpoint
# (lowest eval_loss) is reloaded when training ends and at most two checkpoints
# are kept on disk. Mixed precision (fp16) is enabled only when CUDA is available.
args = TrainingArguments(
    output_dir="./ner-books-model",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_steps=500,
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# NOTE(review): the cell output shows a warning raised from
# `super().__init__(*args, **kwargs)`; presumably it concerns the `tokenizer=`
# argument below — confirm against the installed transformers version.
trainer = WeightedNERTrainer(
    class_weights=class_weights,
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train!
print("\n" + "="*50)
print("Starting training...")
print("="*50)

trainer.train()

# Save model
# `model` is the same object that was handed to the trainer above; it is saved
# together with the tokenizer to the directory the later cells load from.
model.save_pretrained("./ner-books-model-final")
tokenizer.save_pretrained("./ner-books-model-final")
print("\nModel saved to ./ner-books-model-final")

Class weights: {'O': '0.67', 'B-AUTHOR': '2.02', 'I-AUTHOR': '1.02', 'B-TITLE': '2.00', 'I-TITLE': '0.87'}

Starting training...


  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss
500,0.0681,0.082245
1000,0.0391,0.082771



Model saved to ./ner-books-model-final


In [9]:
# Better evaluation using the Trainer's built-in evaluation
print("\n" + "="*60)
print("Evaluation using Trainer:")
print("="*60)

# Evaluate BEFORE publishing: an upload failure (no login, no write access to
# the target repository, network error) raises, and with the upload placed
# first it would prevent the metrics below from ever being computed.
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

# Publish the fine-tuned model and its tokenizer to the Hugging Face Hub.
hub_repo_id = "aicoral048/ner-books-model-final"
model.push_to_hub(hub_repo_id, private=False)
tokenizer.push_to_hub(hub_repo_id, private=False)


Evaluation using Trainer:


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...l-final/model.safetensors:   0%|          |  555kB /  709MB            

README.md: 0.00B [00:00, ?B/s]


Evaluation Results:
eval_loss: 0.0822
eval_runtime: 0.6645
eval_samples_per_second: 755.4050
eval_steps_per_second: 48.1530
epoch: 5.0000


In [12]:
# Compute proper metrics using seqeval (standard for NER)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Section banner: blank line, rule, title, rule.
print("\n" + "="*60, "Proper NER evaluation with seqeval:", "="*60, sep="\n")

def evaluate_ner_proper(model_path, eval_dataset, num_samples=500):
    """Score a saved token-classification model against word-level tags with seqeval.

    Every example is tokenized and run through the model on its own. The
    prediction of the first subword of each word is taken as that word's tag.
    Prints the seqeval classification report followed by the overall F1,
    precision and recall; nothing is returned.

    Args:
        model_path: Location passed to `AutoModelForTokenClassification.from_pretrained`.
        eval_dataset: Indexable dataset whose items provide 'tokens' and 'ner_tags'.
        num_samples: Upper bound on how many leading examples are evaluated.
    """
    from transformers import AutoModelForTokenClassification, AutoTokenizer
    import torch

    model = AutoModelForTokenClassification.from_pretrained(model_path)
    # Use original tokenizer to avoid config compatibility issues
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model.eval()
    model.to(device)

    gold_sequences = []
    predicted_sequences = []
    n_examples = min(num_samples, len(eval_dataset))

    for example_idx in range(n_examples):
        example = eval_dataset[example_idx]
        gold = [custom_labels[tag] for tag in example['ner_tags']]

        # Encode the pre-split words as a single-item batch on the target device.
        encoded = tokenizer(
            example['tokens'],
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
            max_length=256
        ).to(device)

        # Highest-scoring class id for every subword position.
        with torch.no_grad():
            model_output = model(**encoded)
            predicted_ids = torch.argmax(model_output.logits, dim=-1)[0].cpu().numpy()

        # Keep one prediction per word: the one made at its first subword.
        # Special tokens (word id None) and later subwords are skipped.
        predicted = []
        last_word_idx = None
        for word_idx, pred_id in zip(encoded.word_ids(), predicted_ids):
            if word_idx is None or word_idx == last_word_idx:
                continue
            predicted.append(custom_labels[pred_id])
            last_word_idx = word_idx

        # Force equal length: cut any surplus, pad a shortfall (e.g. after
        # truncation) with 'O'. A non-positive pad count adds nothing.
        predicted = predicted[:len(gold)] + ['O'] * (len(gold) - len(predicted))

        gold_sequences.append(gold)
        predicted_sequences.append(predicted)

    # Entity-level report and summary scores.
    print(classification_report(gold_sequences, predicted_sequences))
    print(f"\nOverall F1 Score: {f1_score(gold_sequences, predicted_sequences):.4f}")
    print(f"Overall Precision: {precision_score(gold_sequences, predicted_sequences):.4f}")
    print(f"Overall Recall: {recall_score(gold_sequences, predicted_sequences):.4f}")

# Use local model path
# Evaluate the model saved after training on every example of the eval split
# (num_samples is set to the full dataset length).
evaluate_ner_proper("./ner-books-model-final", eval_dataset, num_samples=len(eval_dataset))


Proper NER evaluation with seqeval:
              precision    recall  f1-score   support

      AUTHOR       0.92      0.95      0.93       401
       TITLE       0.92      0.94      0.93       424

   micro avg       0.92      0.94      0.93       825
   macro avg       0.92      0.94      0.93       825
weighted avg       0.92      0.94      0.93       825


Overall F1 Score: 0.9294
Overall Precision: 0.9174
Overall Recall: 0.9418


In [13]:
# Quick inference test on new examples
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

print("\n" + "="*60)
print("Quick inference test on new examples:")
print("="*60)

# Fine-tuned weights come from the local save directory; the tokenizer is the
# original base-checkpoint tokenizer (avoids config issues).
model_for_inference = AutoModelForTokenClassification.from_pretrained("./ner-books-model-final")
tokenizer_for_inference = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# First GPU when CUDA is available, otherwise CPU (-1).
pipeline_device = 0 if torch.cuda.is_available() else -1
ner_pipeline = pipeline(
    "token-classification",
    model=model_for_inference,
    tokenizer=tokenizer_for_inference,
    aggregation_strategy="simple",
    device=pipeline_device
)

# Free-text probes: mixed casing, with and without an author mention.
test_texts = [
    "I love reading One Hundred Years of Solitude by Gabriel García Márquez.",
    "The Harry Potter series was written by J.K. Rowling.",
    "Don Quixote is a novel by Miguel de Cervantes.",
    "have you read The Lord of the Rings?",
    "anything like 1984 by george orwell would be great",
    "looking for books similar to Pride and Prejudice",
]

for text in test_texts:
    print(f"\nText: {text}")
    entities = ner_pipeline(text)
    if not entities:
        print("  → No entities detected")
        continue
    for entity in entities:
        print(f"  → {entity['word']}: {entity['entity_group']} (confidence: {entity['score']:.3f})")


Quick inference test on new examples:


Device set to use cuda:0



Text: I love reading One Hundred Years of Solitude by Gabriel García Márquez.
  → One Hundred Years of Solitude: TITLE (confidence: 0.997)
  → Gabriel García Márquez: AUTHOR (confidence: 0.991)

Text: The Harry Potter series was written by J.K. Rowling.
  → Harry Potter: TITLE (confidence: 0.847)
  → J. K. Rowling: AUTHOR (confidence: 0.992)

Text: Don Quixote is a novel by Miguel de Cervantes.
  → Don Quixote: TITLE (confidence: 0.997)
  → Miguel de Cervantes: AUTHOR (confidence: 0.996)

Text: have you read The Lord of the Rings?
  → The Lord of the Rings: TITLE (confidence: 0.997)

Text: anything like 1984 by george orwell would be great
  → 1984: TITLE (confidence: 0.997)
  → george orwell: AUTHOR (confidence: 0.996)

Text: looking for books similar to Pride and Prejudice
  → Pride and Prejudice: TITLE (confidence: 0.995)
