In [1]:
!pip install torch torchvision torchaudio transformers datasets seqeval numpy pandas tqdm python-crfsuite torchcrf



In [8]:
!curl -L -o conll2003_train.json https://huggingface.co/datasets/mesolitica/conll2003/resolve/main/conll2003_train.json
!curl -L -o conll2003_validation.json https://huggingface.co/datasets/mesolitica/conll2003/resolve/main/conll2003_validation.json
!curl -L -o conll2003_test.json https://huggingface.co/datasets/mesolitica/conll2003/resolve/main/conll2003_test.json


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    29  100    29    0     0     80      0 --:--:-- --:--:-- --:--:--    80
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    29  100    29    0     0     84      0 --:--:-- --:--:-- --:--:--    84
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    29  100    29    0     0    101      0 --:--:-- --:--:-- --:--:--   101


In [2]:
!pip install -q datasets
from datasets import load_dataset

dataset = load_dataset("wikiann", "en")
print(dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})


In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=num_labels)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [5]:
!pip install -q evaluate
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [6]:
!pip install -U transformers accelerate




In [7]:
# Upgrade huggingface libs to versions that support evaluation_strategy, save_strategy, etc.
!pip install -q --upgrade transformers accelerate datasets evaluate
# show installed versions for quick sanity check
import pkgutil, importlib
import transformers, datasets, evaluate, accelerate
print("transformers", transformers.__version__)
print("datasets", datasets.__version__)
print("evaluate", evaluate.__version__)
print("accelerate", accelerate.__version__)


transformers 4.57.1
datasets 4.3.0
evaluate 0.4.6
accelerate 1.11.0


In [14]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import os

os.environ["WANDB_DISABLED"] = "true"

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(5000)),
    eval_dataset=tokenized_datasets["validation"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ✅ Train and evaluate
trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
50,1.2158
100,0.5945
150,0.4347
200,0.3672
250,0.3589
300,0.359
350,0.2527
400,0.2348
450,0.2411
500,0.2329


{'eval_loss': 0.2552184760570526, 'eval_precision': 0.8099337748344371, 'eval_recall': 0.8463667820069204, 'eval_f1': 0.8277495769881558, 'eval_accuracy': 0.9321244792942907, 'eval_runtime': 7.3659, 'eval_samples_per_second': 135.761, 'eval_steps_per_second': 8.553, 'epoch': 3.0}


In [17]:
# ✅ Explicitly save full model + tokenizer with configuration
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

In [18]:
import torch
from transformers import pipeline

# Load the fine-tuned model directly into a NER pipeline
ner_pipeline = pipeline(
    "token-classification",
    model="./ner_model",     # path to your trained model
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # groups subword tokens together
)

# Test sentences
texts = [
    "Barack Obama was born in Hawaii.",
    "Apple Inc. is headquartered in Cupertino, California.",
    "Sachin Tendulkar played for Mumbai Indians in the IPL.",
    "OpenAI developed ChatGPT for natural language processing."
]

# Run predictions
for text in texts:
    print(f"\n🔹 Input: {text}")
    results = ner_pipeline(text)
    for entity in results:
        print(f"  - {entity['word']} → {entity['entity_group']} (score: {entity['score']:.2f})")

Device set to use cuda:0



🔹 Input: Barack Obama was born in Hawaii.
  - Barack → LABEL_1 (score: 0.96)
  - Obama → LABEL_2 (score: 0.98)
  - was born in → LABEL_0 (score: 1.00)
  - Hawaii → LABEL_5 (score: 0.97)
  - . → LABEL_0 (score: 1.00)

🔹 Input: Apple Inc. is headquartered in Cupertino, California.
  - Apple → LABEL_3 (score: 0.88)
  - Inc. → LABEL_4 (score: 0.94)
  - is headquartered in → LABEL_0 (score: 1.00)
  - Cup → LABEL_5 (score: 0.97)
  - ##ertino, California → LABEL_6 (score: 0.90)
  - . → LABEL_0 (score: 1.00)

🔹 Input: Sachin Tendulkar played for Mumbai Indians in the IPL.
  - Sachin → LABEL_1 (score: 0.80)
  - Tendulkar → LABEL_2 (score: 0.96)
  - played for → LABEL_0 (score: 1.00)
  - Mumbai → LABEL_3 (score: 0.98)
  - Indians → LABEL_4 (score: 0.98)
  - in the → LABEL_0 (score: 1.00)
  - IP → LABEL_3 (score: 0.90)
  - ##L → LABEL_4 (score: 0.94)
  - . → LABEL_0 (score: 1.00)

🔹 Input: OpenAI developed ChatGPT for natural language processing.
  - Open → LABEL_3 (score: 0.78)
  - ##AI → LABEL

In [19]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("./ner_model")
tokenizer = AutoTokenizer.from_pretrained("./ner_model")

# View label mappings
label2id = model.config.label2id
id2label = model.config.id2label

print("🔹 Label → ID mapping:")
for label, idx in label2id.items():
    print(f"  {label:15} → {idx}")

print("\n🔹 ID → Label mapping:")
for idx, label in id2label.items():
    print(f"  {idx:2} → {label}")

🔹 Label → ID mapping:
  LABEL_0         → 0
  LABEL_1         → 1
  LABEL_2         → 2
  LABEL_3         → 3
  LABEL_4         → 4
  LABEL_5         → 5
  LABEL_6         → 6

🔹 ID → Label mapping:
   0 → LABEL_0
   1 → LABEL_1
   2 → LABEL_2
   3 → LABEL_3
   4 → LABEL_4
   5 → LABEL_5
   6 → LABEL_6


In [20]:
from transformers import AutoModelForTokenClassification

# Path to your trained model folder
model_dir = "./ner_model"

# Use these labels (you can adjust if using a different dataset)
labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

# Create label mappings
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Load your trained model
model = AutoModelForTokenClassification.from_pretrained(model_dir)

# Attach label info
model.config.id2label = id2label
model.config.label2id = label2id

# Save back the updated config
model.save_pretrained(model_dir)

print("✅ Labels updated successfully!")

✅ Labels updated successfully!


In [21]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("./ner_model")
print("🔹 ID → Label mapping:")
print(model.config.id2label)

🔹 ID → Label mapping:
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}


In [22]:
from transformers import pipeline

ner_pipeline = pipeline(
    "token-classification",
    model="./ner_model",
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

texts = [
    "Barack Obama was born in Hawaii.",
    "Apple Inc. is headquartered in Cupertino, California.",
    "Sachin Tendulkar played for Mumbai Indians in the IPL."
]

for text in texts:
    print(f"\n🔹 Input: {text}")
    results = ner_pipeline(text)
    for entity in results:
        print(f"  - {entity['word']} → {entity['entity_group']} (score: {entity['score']:.2f})")

Device set to use cuda:0



🔹 Input: Barack Obama was born in Hawaii.
  - Barack Obama → PER (score: 0.97)
  - Hawaii → LOC (score: 0.97)

🔹 Input: Apple Inc. is headquartered in Cupertino, California.
  - Apple Inc. → ORG (score: 0.92)
  - Cupertino, California → LOC (score: 0.91)

🔹 Input: Sachin Tendulkar played for Mumbai Indians in the IPL.
  - Sa → PER (score: 0.85)
  - ##chin Tendulkar → PER (score: 0.92)
  - Mumbai Indians → ORG (score: 0.98)
  - IPL → ORG (score: 0.92)


In [23]:
!ls -R /content

/content:
conll2003_test.json   conll2003_validation.json  ner_model
conll2003_train.json  logs			 sample_data

/content/logs:
events.out.tfevents.1761813177.6c14dae8594f.4484.0
events.out.tfevents.1761813258.6c14dae8594f.4484.1
events.out.tfevents.1761813318.6c14dae8594f.4484.2
events.out.tfevents.1761813661.6c14dae8594f.4484.3

/content/ner_model:
checkpoint-500	config.json	   special_tokens_map.json  tokenizer.json
checkpoint-939	model.safetensors  tokenizer_config.json    vocab.txt

/content/ner_model/checkpoint-500:
config.json	   scheduler.pt		    trainer_state.json
model.safetensors  special_tokens_map.json  training_args.bin
optimizer.pt	   tokenizer_config.json    vocab.txt
rng_state.pth	   tokenizer.json

/content/ner_model/checkpoint-939:
config.json	   scheduler.pt		    trainer_state.json
model.safetensors  special_tokens_map.json  training_args.bin
optimizer.pt	   tokenizer_config.json    vocab.txt
rng_state.pth	   tokenizer.json

/content/sample_data:
anscombe.json		     

In [24]:
!rm /content/conll2003_*.json


In [25]:
!rm /content/conll2003_*.json


rm: cannot remove '/content/conll2003_*.json': No such file or directory


In [26]:
!ls -R /content

/content:
logs  ner_model  sample_data

/content/logs:
events.out.tfevents.1761813177.6c14dae8594f.4484.0
events.out.tfevents.1761813258.6c14dae8594f.4484.1
events.out.tfevents.1761813318.6c14dae8594f.4484.2
events.out.tfevents.1761813661.6c14dae8594f.4484.3

/content/ner_model:
checkpoint-500	config.json	   special_tokens_map.json  tokenizer.json
checkpoint-939	model.safetensors  tokenizer_config.json    vocab.txt

/content/ner_model/checkpoint-500:
config.json	   scheduler.pt		    trainer_state.json
model.safetensors  special_tokens_map.json  training_args.bin
optimizer.pt	   tokenizer_config.json    vocab.txt
rng_state.pth	   tokenizer.json

/content/ner_model/checkpoint-939:
config.json	   scheduler.pt		    trainer_state.json
model.safetensors  special_tokens_map.json  training_args.bin
optimizer.pt	   tokenizer_config.json    vocab.txt
rng_state.pth	   tokenizer.json

/content/sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
cali

In [27]:
from google.colab import drive
drive.mount('/content/drive')

# Copy the model and logs to Drive
!cp -r /content/ner_model /content/drive/MyDrive/
!cp -r /content/logs /content/drive/MyDrive/

Mounted at /content/drive
