In [1]:

!pip install datasets --no-build-isolation
!pip install seqeval
!pip install transformers[torch]




In [2]:
%pip install evaluate



In [3]:
!pip install evaluate



In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import numpy as np
import evaluate

# CoNLL-2003 Dataset Description

The CoNLL-2003 dataset is widely used for training and evaluating Named Entity Recognition (NER) models. The dataset focuses on four types of named entities: persons (PER), locations (LOC), organizations (ORG), and miscellaneous entities (MISC).

## Dataset Structure:
Each data file contains four columns separated by a single space:
1. Word
2. Part-of-Speech (POS) tag
3. Syntactic chunk tag
4. Named entity tag

Words are listed on separate lines, and sentences are separated by a blank line.
The chunk and named entity tags follow the IOB2 tagging scheme:
- `B-TYPE`: Beginning of a phrase of type TYPE
- `I-TYPE`: Inside a phrase of type TYPE
- `O`: Outside any named entity phrase

## Example:
```python
{
    "chunk_tags": [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17, 11, 21, 17, 11, 12, 12, 21, 22, 22, 13, 11, 0],
    "id": "0",
    "ner_tags": [0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    "pos_tags": [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7],
    "tokens": ["The", "European", "Commission", "said", "on", "Thursday", "it", "disagreed", "with", "German", "advice", "to", "consumers", "to", "shun", "British", "lamb", "until", "scientists", "determine", "whether", "mad", "cow", "disease", "can", "be", "transmitted", "to", "sheep", "."]
}


## Named Entity Tags
- **O**: Outside a named entity
- **B-PER**: Beginning of a person's name
- **I-PER**: Inside a person's name
- **B-ORG**: Beginning of an organization name
- **I-ORG**: Inside an organization name
- **B-LOC**: Beginning of a location name
- **I-LOC**: Inside a location name
- **B-MISC**: Beginning of miscellaneous entity
- **I-MISC**: Inside a miscellaneous entity


In [5]:
 !pip install "datasets==2.19.0"



In [6]:
from google.colab import userdata
from datasets import load_dataset

# Load dataset with latin-1 encoding
hf_token = userdata.get('HF_TOKEN') # Assuming your token is stored as 'HF_TOKEN' in Colab secrets
dataset = load_dataset("conll2003", token=hf_token, encoding='latin-1')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

ValueError: BuilderConfig Conll2003Config(name='conll2003', version=1.0.0, data_dir=None, data_files=None, description='Conll2003 dataset') doesn't have a 'encoding' key.

In [7]:
dataset = load_dataset("conll2003")

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/datasets/conll2003/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98 (last modified on Sat Aug 23 07:13:43 2025) since it couldn't be found locally at conll2003, or remotely on the Hugging Face Hub.


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [10]:
# Model checkpoint
checkpoint = "bert-base-cased"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## Token Labeling in NER: Use of `-100`

In Named Entity Recognition (NER) tasks, the label `-100` is commonly used to signify that certain tokens should be ignored during the loss calculation in model training. This approach helps focus the learning on meaningful parts of the data. Here's an overview of the types of tokens typically assigned a `-100` label:

### 1. **Subsequent Sub-tokens**
After a word is split into multiple sub-tokens, only the first sub-token receives the actual entity label. Subsequent sub-tokens receive `-100` to ensure that entity labels are not incorrectly assigned to fragments of words.

### 2. **Special Tokens**
Special tokens such as `[CLS]`, `[SEP]`, and `[PAD]` used for managing sequence boundaries and lengths in models like BERT are also assigned `-100` as they do not correspond to real words in the text.

### 3. **Non-Entity Tokens**
In certain training setups, tokens that do not correspond to any entity and are not the focus of the task might also be marked with `-100`, especially in cases of imbalanced datasets.

### Example
- **Sentence**: "John lives in New York"
- **Tokens**: ["[CLS]", "John", "lives", "in", "New", "York", "[SEP]"]
- **Labels**: [-100, "B-PER", "O", "O", "B-LOC", "I-LOC", -100]

This labeling strategy is critical for efficient model training, ensuring that the model focuses only on relevant tokens.


In [11]:
# Tokenize and align labels without fixed padding
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [12]:
# Load pre-trained model
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=9)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


`seqeval` is a Python framework for sequence labeling evaluation. seqeval can evaluate the performance of chunking tasks such as named-entity recognition, part-of-speech tagging, semantic role labeling and so on.

In [18]:
# Load seqeval metric
metric = evaluate.load("seqeval")

# Define compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

In [19]:
# Get label list
label_list = dataset["train"].features["ner_tags"].feature.names

# Set up data collator for dynamic padding
data_collator = DataCollatorForTokenClassification(tokenizer)

In [28]:
# Get label list
label_list = dataset["train"].features["ner_tags"].feature.names

# Add these lines to set label mappings
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

In [29]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model.config.id2label = id2label
model.config.label2id = label2id
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

In [20]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [21]:
# Initialize Trainer with compute_metrics and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [22]:
# Train model
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcsd1-19-54[0m ([33mcsd1-19-54-octaloop[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1613,0.041455,0.919179,0.934029,0.926544,0.988396
2,0.0275,0.039589,0.939936,0.945473,0.942697,0.990304
3,0.0139,0.038142,0.94557,0.950185,0.947872,0.990966


TrainOutput(global_step=2634, training_loss=0.05083486910742526, metrics={'train_runtime': 622.9262, 'train_samples_per_second': 67.621, 'train_steps_per_second': 4.228, 'total_flos': 1050534559887048.0, 'train_loss': 0.05083486910742526, 'epoch': 3.0})

In [23]:
# Evaluate model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.03814166784286499, 'eval_precision': 0.945570256238486, 'eval_recall': 0.9501851228542578, 'eval_f1': 0.9478720725258123, 'eval_accuracy': 0.9909660838752385, 'eval_runtime': 10.4535, 'eval_samples_per_second': 310.901, 'eval_steps_per_second': 19.515, 'epoch': 3.0}


In [24]:
# 1. Install necessary libraries
!pip install -q streamlit pyngrok
!pip install -q "datasets==2.19.0" "transformers==4.40.1" "torch==2.3.0" "seqeval"

# 2. Define the Streamlit app content as a Python string
app_code = """
import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np

# --- CONFIGURATION ---
MODEL_DIR = "./model"
st.set_page_config(page_title="NER with BERT", page_icon="🤖", layout="wide")

# --- MODEL LOADING ---
@st.cache_resource
def load_model_and_tokenizer(model_path):
    '''Load the fine-tuned model and tokenizer.'''
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForTokenClassification.from_pretrained(model_path)
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model from {model_path}: {e}")
        return None, None

tokenizer, model = load_model_and_tokenizer(MODEL_DIR)
if model is None:
    st.error("Model and/or tokenizer could not be loaded. Please ensure the './model' directory exists and contains the correct files.")
    st.stop()

# --- NER VISUALIZATION ---
ENTITY_COLORS = {
    "PER": "#ffc107",  # Yellow
    "ORG": "#007bff",  # Blue
    "LOC": "#28a745",  # Green
    "MISC": "#dc3545", # Red
}
LABEL_NAMES = model.config.id2label

def get_entity_html(text, label):
    '''Generates HTML for a single entity with a colored background.'''
    entity_type = label.split('-')[-1]
    color = ENTITY_COLORS.get(entity_type, "#adb5bd")
    return f'<span style="background-color: {color}; color: white; padding: 0.2em 0.4em; margin: 0 0.2em; border-radius: 0.3em; font-weight: bold;">{text} <span style="font-size: 0.8em; opacity: 0.7;">{entity_type}</span></span>'

# --- STREAMLIT APP LAYOUT ---
st.title("Named Entity Recognition (NER) with BERT")
st.markdown("Enter text below to identify entities like Persons (PER), Organizations (ORG), Locations (LOC), and Miscellaneous (MISC).")

text_input = st.text_area("Input Text", height=150, placeholder="Example: Elon Musk, the CEO of SpaceX, announced a new mission to Mars from their headquarters in California.")

if st.button("Analyze Text"):
    if not text_input:
        st.warning("Please enter some text to analyze.")
    elif not tokenizer or not model:
        st.error("Model is not loaded. Cannot perform analysis.")
    else:
        with st.spinner("Analyzing..."):
            # 1. Tokenization and Prediction
            inputs = tokenizer(text_input, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

            # 2. Post-process to align tokens with words and labels
            word_predictions = []
            current_word = ""
            current_label_id = -1
            word_ids = inputs.word_ids()

            for i, token in enumerate(tokens):
                if token in (tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token):
                    continue

                word_id = word_ids[i]
                if word_id is not None:
                    start, end = inputs.token_to_chars(i)
                    word = text_input[start:end]

                    # New word begins
                    if word_id != (word_ids[i-1] if i > 0 else None):
                        if current_word: # Append previous word
                            word_predictions.append((current_word, LABEL_NAMES[current_label_id]))
                        current_word = word
                        current_label_id = predictions[i]
                    # Word continues (subword)
                    else:
                        # The label for a multi-token word is determined by its first token
                        pass

            # Add the last word
            if current_word:
                word_predictions.append((current_word, LABEL_NAMES[current_label_id]))

            # 3. Group recognized entities
            display_text = text_input
            grouped_entities = []
            current_entity_text = ""
            current_entity_label = ""

            for word, label in word_predictions:
                if label.startswith("B-"):
                    if current_entity_text:
                        grouped_entities.append({"text": current_entity_text, "label": current_entity_label})
                    current_entity_text = word
                    current_entity_label = label.split('-')[1]
                elif label.startswith("I-") and current_entity_label == label.split('-')[1]:
                    current_entity_text += " " + word
                else:
                    if current_entity_text:
                        grouped_entities.append({"text": current_entity_text, "label": current_entity_label})
                        current_entity_text = ""
                        current_entity_label = ""

            if current_entity_text:
                grouped_entities.append({"text": current_entity_text, "label": current_entity_label})

            # 4. Display Results
            st.subheader("Analysis Results")
            # Highlight entities in the text
            highlighted_text = text_input
            for entity in reversed(grouped_entities):
                highlighted_text = highlighted_text.replace(entity["text"], get_entity_html(entity["text"], entity["label"]), 1)
            st.markdown(highlighted_text, unsafe_allow_html=True)

            # List extracted entities
            st.subheader("Extracted Entities")
            if grouped_entities:
                for entity in grouped_entities:
                    st.markdown(f"- **{entity['text']}** (`{entity['label']}`)")
            else:
                st.info("No entities were found in the text.")
"""

# 3. Write the app code to a file named app.py
with open("app.py", "w") as f:
    f.write(app_code)

# 4. Save the fine-tuned model and tokenizer from your trainer
# This assumes your 'trainer' and 'tokenizer' variables are already defined and the model is trained.
model_save_path = "./model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# 5. Setup ngrok and run Streamlit
from pyngrok import ngrok

# Terminate any existing tunnels
ngrok.kill()

# Get your ngrok authtoken from https://dashboard.ngrok.com/get-started/your-authtoken
# It's recommended to set this as a secret in Colab
NGROK_AUTH_TOKEN = "31fsIBq4OPDzMgH7CMSxZp239nc_5jjBM7CDN7XxU8PBTkG6e" #@param {type:"string"}
if not NGROK_AUTH_TOKEN:
    print("Please enter your ngrok authtoken.")
else:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    # Run streamlit in background
    !nohup streamlit run app.py --server.port 8501 &
    # Open a tunnel to the streamlit port
    public_url = ngrok.connect(addr="8501", proto="http")
    print(f"🎉 Your Streamlit app is live at: {public_url}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [26]:
 # Cell to zip the model folder
!zip -r model.zip ./model

  adding: model/ (stored 0%)
  adding: model/tokenizer.json (deflated 70%)
  adding: model/training_args.bin (deflated 54%)
  adding: model/model.safetensors (deflated 7%)
  adding: model/config.json (deflated 56%)
  adding: model/special_tokens_map.json (deflated 42%)
  adding: model/vocab.txt (deflated 49%)
  adding: model/tokenizer_config.json (deflated 75%)


In [27]:
from transformers import pipeline
nlp = pipeline("ner", model="./model", tokenizer="./model", aggregation_strategy="simple")
print(nlp("Elon Musk is the CEO of SpaceX."))

Device set to use cuda:0


[{'entity_group': 'LABEL_1', 'score': np.float32(0.9977156), 'word': 'El', 'start': 0, 'end': 2}, {'entity_group': 'LABEL_2', 'score': np.float32(0.90362686), 'word': '##on Musk', 'start': 2, 'end': 9}, {'entity_group': 'LABEL_0', 'score': np.float32(0.99985623), 'word': 'is the CEO of', 'start': 10, 'end': 23}, {'entity_group': 'LABEL_3', 'score': np.float32(0.9982938), 'word': 'Space', 'start': 24, 'end': 29}, {'entity_group': 'LABEL_4', 'score': np.float32(0.99671626), 'word': '##X', 'start': 29, 'end': 30}, {'entity_group': 'LABEL_0', 'score': np.float32(0.99987435), 'word': '.', 'start': 30, 'end': 31}]
