In [None]:
# Install transformers for the BERT model, datasets for data handling,
# and seqeval for NER-specific evaluation metrics.
!pip install transformers datasets evaluate seqeval -q


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
from datasets import Dataset

# 1. Define all unique NER tags your model should learn (BIO scheme).
#    NOTE: "B-ID" has no "I-ID" partner, so with this label set an ID can only
#    be annotated when it is a single token (as in the examples below).
labels_list = [
    "O",
    "B-NAME", "I-NAME",
    "B-ID",
    "B-ADDRESS", "I-ADDRESS"
]

# 2. Create mappings between the string labels and integer IDs.
#    The position of a tag in labels_list is its integer ID.
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = dict(enumerate(labels_list))

print("--- Label to ID Mapping ---")
print(label2id)

# 3. Create your raw dataset. This is the smaller, original dataset.
#    The dictionary is column-oriented: entry k of every list belongs to the
#    same record, and each tag list is parallel to its token list.
raw_data = {
    'id': ['0', '1', '2', '3'],
    'tokens': [
        ["My", "name", "is", "Priya", "Sharma", "."],
        ["Please", "use", "ID", "number", "AX-451-22", "."],
        ["He", "lives", "at", "123", "Main", "St", "."],
        ["Forward", "mail", "for", "John", "Doe", "to", "123", "Main", "St", "."]
    ],
    'ner_tags_str': [
        ["O", "O", "O", "B-NAME", "I-NAME", "O"],
        ["O", "O", "O", "O", "B-ID", "O"],
        ["O", "O", "O", "B-ADDRESS", "I-ADDRESS", "I-ADDRESS", "O"],
        ["O", "O", "O", "B-NAME", "I-NAME", "O", "B-ADDRESS", "I-ADDRESS", "I-ADDRESS", "O"]
    ]
}

# Fail fast on annotation mistakes. A tag list that is shorter or longer than
# its token list would otherwise only show up later as an IndexError during
# label alignment, or as silently shifted labels.
if not (len(raw_data['id']) == len(raw_data['tokens']) == len(raw_data['ner_tags_str'])):
    raise ValueError("raw_data columns must all contain the same number of records")

misaligned_ids = [
    record_id
    for record_id, record_tokens, record_tags in zip(
        raw_data['id'], raw_data['tokens'], raw_data['ner_tags_str']
    )
    if len(record_tokens) != len(record_tags)
]
if misaligned_ids:
    raise ValueError(
        f"tokens / ner_tags_str length mismatch in record(s): {misaligned_ids}"
    )

# 4. Convert the string tags to their integer IDs.
#    An unknown tag raises KeyError here, which is the desired behaviour.
raw_data['ner_tags'] = [
    [label2id[tag] for tag in tags] for tags in raw_data['ner_tags_str']
]

# 5. Wrap the column-oriented dictionary in a Hugging Face Dataset object so
#    it can be processed with Dataset.map() later on.
dataset = Dataset.from_dict(raw_data)

# Show the first record as a quick sanity check of the tag -> ID conversion.
sample_record = dataset[0]
print("\n--- Sample Dataset Record ---")
print(sample_record)


--- Label to ID Mapping ---
{'O': 0, 'B-NAME': 1, 'I-NAME': 2, 'B-ID': 3, 'B-ADDRESS': 4, 'I-ADDRESS': 5}

--- Sample Dataset Record ---
{'id': '0', 'tokens': ['My', 'name', 'is', 'Priya', 'Sharma', '.'], 'ner_tags_str': ['O', 'O', 'O', 'B-NAME', 'I-NAME', 'O'], 'ner_tags': [0, 0, 0, 1, 2, 0]}


In [None]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer. 'bert-base-cased' is a robust choice
# as it respects capitalization, which can be important for names.
# The same checkpoint name is passed to the model loader further down, so the
# tokenizer vocabulary and the model weights come from the same checkpoint.
model_checkpoint = "bert-base-cased"
# Fetches the tokenizer files for the checkpoint (tokenizer_config.json,
# vocab.txt, tokenizer.json, ... as listed in the cell output).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and project the word-level NER
    tags onto the resulting sub-word tokens.

    Args:
        examples: A batch with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one list of integer tag IDs
            per sentence, parallel to the words).

    Returns:
        The tokenizer output with an extra "labels" entry. Within each
        sentence, the first sub-word token of a word carries that word's tag;
        special tokens and the remaining sub-words of a word carry -100.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # word_ids maps every token position to the index of the word it came
        # from, or None for special tokens such as [CLS] and [SEP].
        word_ids = encoding.word_ids(batch_index=row)
        # Shift by one so each position can be compared with its predecessor.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            # Only the first token of a real word gets the word's tag;
            # everything else is masked out with -100.
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, preceding_ids)
        ])

    # Attach the aligned labels to the tokenizer output.
    encoding["labels"] = aligned_labels
    return encoding


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Original columns that the model doesn't need for training.
columns_to_drop = ['tokens', 'ner_tags_str', 'ner_tags', 'id']

# Apply the tokenization/alignment function to every record (in batches) and
# drop the raw columns from the result in one chained expression.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels, batched=True
).remove_columns(columns_to_drop)

# Preview the first record: the sub-word tokens next to their aligned labels.
first_tokenized = tokenized_dataset[0]
print("\n--- Tokenized and Aligned Record ---")
print("Tokens:", tokenizer.convert_ids_to_tokens(first_tokenized['input_ids']))
print("Labels:", first_tokenized['labels'])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]


--- Tokenized and Aligned Record ---
Tokens: ['[CLS]', 'My', 'name', 'is', 'P', '##riya', 'Sharma', '.', '[SEP]']
Labels: [-100, 0, 0, 0, 1, -100, 2, 0, -100]


In [None]:
import numpy as np
import evaluate
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# A Data Collator creates batches of data. This one will dynamically pad
# sentences to the same length for every batch.
# NOTE(review): presumably the label sequences are padded as well (with the
# ignore index) — confirm against the collator documentation.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load the seqeval metric, which is the standard for NER tasks.
# The metric's builder script is downloaded when it is loaded (see
# "Downloading builder script" in the cell output); compute_metrics below
# calls seqeval.compute() on the decoded tag sequences.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """
    Compute precision, recall, F1 and accuracy for the NER task with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-label
            scores whose argmax over axis 2 is the predicted label ID;
            ``labels`` holds the reference label IDs, with -100 marking
            positions that must be ignored (special tokens and non-initial
            sub-words).

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from seqeval's overall scores.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Convert IDs back to string labels in a single pass, skipping every
    # position whose reference label is the -100 ignore index.
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, label_row)
            if gold != -100
        ]
        true_predictions.append([labels_list[predicted] for predicted, _ in kept])
        true_labels.append([labels_list[gold] for _, gold in kept])

    # zero_division=0 reports undefined precision/recall as 0.0, exactly like
    # the default "warn" setting, but without emitting a warning on every
    # evaluation while the model still predicts no entities at all.
    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Load the pre-trained BERT model for token classification.
# Pass our custom label mappings to it.
# num_labels is the number of entries in labels_list; id2label / label2id are
# stored with the model so predictions can be reported as tag names.
# The "newly initialized" warning in the cell output for classifier.weight and
# classifier.bias is expected: the classification head is not part of the
# checkpoint and is learned during fine-tuning.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id
)

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the directory where the model checkpoints will be saved.
# NOTE(review): this is a Google Drive path; it presumably requires Drive to be
# mounted at /content/drive earlier in the session — confirm, otherwise the
# checkpoints will not be persisted to Drive.
model_output_dir = "/content/drive/MyDrive/bert_pii_ner_model"

# Define the training hyperparameters using TrainingArguments.
training_args = TrainingArguments(
    output_dir=model_output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Increase epochs for better performance on real data
    weight_decay=0.01,
    # NOTE: `eval_strategy` is the current name of this argument; older
    # transformers releases called it `evaluation_strategy`.
    eval_strategy="epoch",        # Evaluate performance at the end of each epoch
    save_strategy="epoch",        # Save a model checkpoint at the end of each epoch
    logging_strategy="epoch",     # Report the training loss every epoch instead of "No log"
    load_best_model_at_end=True,  # Load the best model found during training
    # Disable third-party experiment trackers so that "Run All" does not stop
    # at an interactive Weights & Biases API-key prompt. Set to "wandb" to
    # opt back in to W&B logging.
    report_to="none",
)

# Initialize the Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset, # The dataset is small, so we use it for both
    eval_dataset=tokenized_dataset,  # In practice, create a separate validation set
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start the training process!
print("--- Starting Model Training ---")
trainer.train()
print("--- Training Complete ---")

  trainer = Trainer(


--- Starting Model Training ---


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mishant3366[0m ([33mishant3366-srm-institute-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.569215,0.0,0.0,0.0,0.448276
2,No log,1.345324,0.0,0.0,0.0,0.689655
3,No log,1.159911,0.0,0.0,0.0,0.724138
4,No log,1.019813,0.0,0.0,0.0,0.724138
5,No log,0.911639,0.0,0.0,0.0,0.724138
6,No log,0.827464,0.0,0.0,0.0,0.758621
7,No log,0.764217,0.0,0.0,0.0,0.758621
8,No log,0.719343,0.0,0.0,0.0,0.758621
9,No log,0.690515,0.25,0.2,0.222222,0.827586
10,No log,0.676255,1.0,0.6,0.75,0.896552


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


--- Training Complete ---
--- Training Complete ---


In [None]:
# Location of the final, ready-to-use model inside the checkpoint directory.
final_model_path = f"{model_output_dir}/final_model"

# Write the model first, then the tokenizer, into the same directory so the
# folder can be loaded on its own later.
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(final_model_path)

print(f"Final model and tokenizer saved to: {final_model_path}")

Final model and tokenizer saved to: /content/drive/MyDrive/bert_pii_ner_model/final_model


In [None]:
from transformers import pipeline

# Load the inference pipeline with your custom model. Both the model and the
# tokenizer are read back from the directory saved in the previous step.
ner_pipeline = pipeline("ner", model=final_model_path, tokenizer=final_model_path)

# Test the pipeline on new text.
text = "Please send the documents for Jane Doe to 456 Park Ave. Her reference is GZ-123-45."

# The `aggregation_strategy` groups subword tokens back into single entities.
# "first" gives every word the tag predicted for its first sub-word token,
# which matches training: only the first sub-word of each word carried a label
# there, the remaining sub-words were masked with -100.
results = ner_pipeline(text, aggregation_strategy="first")

print("\n--- Inference Results ---")
if not results:
    # Every token was tagged "O"; say so instead of printing nothing.
    print("No entities detected.")
for entity in results:
    print(f"Entity: {entity['word']}\nGroup: {entity['entity_group']}\nScore: {entity['score']:.4f}\n")

Device set to use cuda:0



--- Inference Results ---
