In [None]:
!pip install -q transformers torch

In [None]:
!pip install -q transformers datasets seqeval accelerate

In [None]:
!pip install -q sklearn-crfsuite seqeval joblib

In [21]:
import csv, os
from sklearn_crfsuite import CRF
from seqeval.metrics import classification_report
import joblib

TRAIN_CSV = "roman_train_5500.csv"
DEV_CSV   = "roman_dev_300.csv"
TEST_CSV  = "roman_test_300.csv"


def read_csv_as_lists(csv_path):
    """
    CSV expected columns: sentence_id, sentence, labels
    sentence = tokens joined by space
    labels   = BIO labels joined by space (aligned to tokens)
    Returns list_of_tokens, list_of_labels
    """
    sents = []
    labs = []
    with open(csv_path, encoding="utf8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if 'sentence' in row and 'labels' in row:
                sent = row['sentence'].strip()
                labels = row['labels'].strip()
            else:
                vals = list(row.values())
                if len(vals) >= 3:
                    sent = vals[1].strip()
                    labels = vals[2].strip()
                else:
                    continue
            tokens = sent.split()
            label_tokens = labels.split()
            if len(tokens) != len(label_tokens):
                # If alignment mismatch, try to best-effort fix: truncate to min length
                m = min(len(tokens), len(label_tokens))
                tokens = tokens[:m]
                label_tokens = label_tokens[:m]
            sents.append(tokens)
            labs.append(label_tokens)
    return sents, labs

def word2features(sent, i):
    word = sent[i]
    f = {
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i>0:
        w1=sent[i-1]
        f.update({'-1:word.lower()':w1.lower(), '-1:istitle()':w1.istitle()})
    else:
        f['BOS']=True
    if i<len(sent)-1:
        w1=sent[i+1]
        f.update({'+1:word.lower()':w1.lower(), '+1:istitle()':w1.istitle()})
    else:
        f['EOS']=True
    f['prefix3']=word[:3]
    f['suffix3']=word[-3:]
    return f

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

assert os.path.exists(TRAIN_CSV), f"Train CSV not found: {TRAIN_CSV}"
assert os.path.exists(DEV_CSV), f"Dev CSV not found: {DEV_CSV}"
assert os.path.exists(TEST_CSV), f"Test CSV not found: {TEST_CSV}"

train_sents, train_labels = read_csv_as_lists(TRAIN_CSV)
dev_sents, dev_labels = read_csv_as_lists(DEV_CSV)
test_sents, test_labels = read_csv_as_lists(TEST_CSV)

print(f"Loaded: train={len(train_sents)}, dev={len(dev_sents)}, test={len(test_sents)}")

# Build feature matrices
X_train = [sent2features(s) for s in train_sents]
X_dev   = [sent2features(s) for s in dev_sents]

# Train CRF
crf = CRF(
    algorithm='lbfgs',
    c1=0.1, c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
print("Fitting CRF on train set...")
crf.fit(X_train, train_labels)

# Predict & report
y_pred = crf.predict(X_dev)
print("\nCRF dev set classification report:\n")
print(classification_report(dev_labels, y_pred, digits=4))

# Save model to disk
joblib.dump(crf, "crf_model.joblib")
print("\nSaved CRF model to crf_model.joblib in the notebook working directory.")


Loaded: train=5501, dev=300, test=300
Fitting CRF on train set...

CRF dev set classification report:

              precision    recall  f1-score   support

        DATE     0.6071    0.5667    0.5862        30
       EVENT     1.0000    0.4800    0.6486        25
         LOC     0.8953    0.7404    0.8105       104
         ORG     0.5895    1.0000    0.7417        56
         PER     0.8813    0.9800    0.9280       250
     PRODUCT     0.8919    0.8919    0.8919        37

   micro avg     0.8209    0.8765    0.8478       502
   macro avg     0.8109    0.7765    0.7678       502
weighted avg     0.8420    0.8765    0.8459       502


Saved CRF model to crf_model.joblib in the notebook working directory.


In [22]:
# MBERT
import os
os.environ["WANDB_DISABLED"] = "true"

import csv
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import precision_score, recall_score, f1_score

TRAIN_CSV = "roman_train_5500.csv"
DEV_CSV   = "roman_dev_300.csv"
TEST_CSV  = "roman_test_300.csv"


def read_csv(csv_path):
    sents, labs = [], []
    with open(csv_path, encoding="utf8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            toks  = row["sentence"].split()
            labels = row["labels"].split()
            if len(toks) != len(labels):
                m = min(len(toks), len(labels))
                toks   = toks[:m]
                labels = labels[:m]
            sents.append(toks)
            labs.append(labels)
    return sents, labs

train_sents, train_labels = read_csv(TRAIN_CSV)
dev_sents, dev_labels = read_csv(DEV_CSV)
test_sents, test_labels = read_csv(TEST_CSV)

# Create label set
label_set = sorted({l for seq in (train_labels + dev_labels + test_labels) for l in seq})
if "O" in label_set:
    label_set.remove("O")
label_set = ["O"] + label_set

label_to_id = {l:i for i,l in enumerate(label_set)}
id_to_label = {i:l for l,i in label_to_id.items()}

MODEL = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

def encode_examples(sents, labels):
    all_input_ids = []
    all_attention = []
    all_labels = []

    for tokens, labs in zip(sents, labels):
        tok = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=128
        )

        word_ids = tok.word_ids()
        lab_ids = []

        for w in word_ids:
            if w is None:
                lab_ids.append(-100)
            else:
                lab_ids.append(label_to_id[labs[w]])

        all_input_ids.append(tok["input_ids"])
        all_attention.append(tok["attention_mask"])
        all_labels.append(lab_ids)

    return Dataset.from_dict({
        "input_ids": all_input_ids,
        "attention_mask": all_attention,
        "labels": all_labels
    })

train_ds = encode_examples(train_sents, train_labels)
dev_ds   = encode_examples(dev_sents, dev_labels)
test_ds  = encode_examples(test_sents, test_labels)

model = AutoModelForTokenClassification.from_pretrained(
    MODEL,
    num_labels=len(label_set),
    id2label=id_to_label,
    label2id=label_to_id
)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=2)

    true_labels = [[id_to_label[l] for l in lab_row if l != -100] for lab_row in labels]
    true_preds  = [[id_to_label[p] for p,l in zip(pred_row, lab_row) if l != -100]
                   for pred_row, lab_row in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

data_collator = DataCollatorForTokenClassification(tokenizer)

args = TrainingArguments(
    output_dir="mbert_out1",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_steps=50,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

print("\nDEV RESULTS:", trainer.evaluate())

test_out = trainer.predict(test_ds)
print("\nTEST RESULTS:", test_out.metrics)

trainer.save_model("mbert_model")
print("\nSaved model to ./mbert_model")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.7829
100,0.1827
150,0.0856
200,0.0636
250,0.0392
300,0.0456
350,0.0257
400,0.0353
450,0.0331
500,0.018



DEV RESULTS: {'eval_loss': 0.4220834970474243, 'eval_precision': 0.8380281690140845, 'eval_recall': 0.9054347826086957, 'eval_f1': 0.87042842215256, 'eval_runtime': 2.1608, 'eval_samples_per_second': 138.837, 'eval_steps_per_second': 17.586, 'epoch': 4.0}

TEST RESULTS: {'test_loss': 0.44631582498550415, 'test_precision': 0.8341658341658341, 'test_recall': 0.9115720524017468, 'test_f1': 0.8711528429838289, 'test_runtime': 1.6114, 'test_samples_per_second': 186.176, 'test_steps_per_second': 23.582}

Saved model to ./mbert_model


In [23]:
from seqeval.metrics import classification_report

preds = np.argmax(test_out.predictions, axis=2)

true_labels = []
true_preds  = []

for pred_row, label_row in zip(preds, test_out.label_ids):
    t_labels = []
    t_preds  = []
    for p,l in zip(pred_row, label_row):
        if l != -100:
            t_labels.append(id_to_label[l])
            t_preds.append(id_to_label[p])
    true_labels.append(t_labels)
    true_preds.append(t_preds)

print(classification_report(true_labels, true_preds, digits=4))


              precision    recall  f1-score   support

        DATE     0.5730    0.7183    0.6375        71
       EVENT     1.0000    1.0000    1.0000        37
         LOC     0.9306    0.7090    0.8048       189
         ORG     0.5588    1.0000    0.7170        57
         PER     0.9089    0.9961    0.9505       511
     PRODUCT     0.6812    0.9216    0.7833        51

   micro avg     0.8342    0.9116    0.8712       916
   macro avg     0.7754    0.8908    0.8155       916
weighted avg     0.8566    0.9116    0.8743       916



In [24]:
#XLM-RoBERTa NER

import os
os.environ["WANDB_DISABLED"] = "true"

import csv
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import precision_score, recall_score, f1_score

TRAIN_CSV = "roman_train_5500.csv"
DEV_CSV   = "roman_dev_300.csv"
TEST_CSV  = "roman_test_300.csv"


def read_csv(csv_path):
    sents, labs = [], []
    with open(csv_path, encoding="utf8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            toks  = row["sentence"].split()
            labels = row["labels"].split()
            if len(toks) != len(labels):
                m = min(len(toks), len(labels))
                toks   = toks[:m]
                labels = labels[:m]
            sents.append(toks)
            labs.append(labels)
    return sents, labs

train_sents, train_labels = read_csv(TRAIN_CSV)
dev_sents, dev_labels = read_csv(DEV_CSV)
test_sents, test_labels = read_csv(TEST_CSV)

# Build label set
label_set = sorted({l for seq in (train_labels + dev_labels + test_labels) for l in seq})
if "O" in label_set:
    label_set.remove("O")
label_set = ["O"] + label_set

label_to_id = {l:i for i,l in enumerate(label_set)}
id_to_label = {i:l for l,i in label_to_id.items()}

MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def encode_examples(sents, labels):
    all_input_ids = []
    all_attention = []
    all_labels = []

    for tokens, labs in zip(sents, labels):
        tok = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=128
        )

        word_ids = tok.word_ids()
        aligned = []
        for w in word_ids:
            if w is None:
                aligned.append(-100)
            else:
                aligned.append(label_to_id[labs[w]])

        all_input_ids.append(tok["input_ids"])
        all_attention.append(tok["attention_mask"])
        all_labels.append(aligned)

    return Dataset.from_dict({
        "input_ids": all_input_ids,
        "attention_mask": all_attention,
        "labels": all_labels
    })

train_ds = encode_examples(train_sents, train_labels)
dev_ds   = encode_examples(dev_sents, dev_labels)
test_ds  = encode_examples(test_sents, test_labels)

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_set),
    id2label=id_to_label,
    label2id=label_to_id
)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=2)

    true_labels = [[id_to_label[l] for l in lab_row if l != -100] for lab_row in labels]
    true_preds  = [[id_to_label[p] for p,l in zip(pred_row, lab_row) if l != -100]
                   for pred_row, lab_row in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

data_collator = DataCollatorForTokenClassification(tokenizer)

args = TrainingArguments(
    output_dir="xlmr_out1",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_steps=50,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

print("\nDEV RESULTS:", trainer.evaluate())

test_out = trainer.predict(test_ds)
print("\nTEST RESULTS:", test_out.metrics)

trainer.save_model("xlmr_model")
print("\nSaved model to ./xlmr_model")


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,1.3271
100,0.327
150,0.1303
200,0.0759
250,0.0608
300,0.0413
350,0.0329
400,0.0451
450,0.0397
500,0.0311



DEV RESULTS: {'eval_loss': 0.33201727271080017, 'eval_precision': 0.8770114942528736, 'eval_recall': 0.9585427135678392, 'eval_f1': 0.9159663865546218, 'eval_runtime': 0.5892, 'eval_samples_per_second': 509.182, 'eval_steps_per_second': 64.496, 'epoch': 4.0}

TEST RESULTS: {'test_loss': 0.3613591492176056, 'test_precision': 0.8760045924225028, 'test_recall': 0.9670468948035488, 'test_f1': 0.919277108433735, 'test_runtime': 0.6394, 'test_samples_per_second': 469.179, 'test_steps_per_second': 59.429}

Saved model to ./xlmr_model


In [25]:
from seqeval.metrics import classification_report

preds = np.argmax(test_out.predictions, axis=2)

true_labels = []
true_preds  = []

for pred_row, label_row in zip(preds, test_out.label_ids):
    t_labels = []
    t_preds  = []
    for p,l in zip(pred_row, label_row):
        if l != -100:
            t_labels.append(id_to_label[l])
            t_preds.append(id_to_label[p])
    true_labels.append(t_labels)
    true_preds.append(t_preds)

print(classification_report(true_labels, true_preds, digits=4))


              precision    recall  f1-score   support

        DATE     0.5909    0.7222    0.6500        54
       EVENT     1.0000    0.9730    0.9863        37
         LOC     0.9444    0.9551    0.9497       178
         ORG     0.5377    1.0000    0.6994        57
         PER     0.9928    0.9976    0.9952       412
     PRODUCT     0.7246    0.9804    0.8333        51

   micro avg     0.8760    0.9670    0.9193       789
   macro avg     0.7984    0.9380    0.8523       789
weighted avg     0.9045    0.9670    0.9290       789



In [None]:
# TESTING

from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

BASE_MODEL = "bert-base-multilingual-cased"   # original tokenizer source
MODEL_DIR = "./mbert_model"                   # fine-tuned model weights folder

print("Loading tokenizer from:", BASE_MODEL)
print("Loading fine-tuned model from:", MODEL_DIR)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Build id2label mapping
if hasattr(model.config, "id2label") and model.config.id2label:
    id2label = model.config.id2label
else:
    id2label = {i: f"LABEL_{i}" for i in range(model.config.num_labels)}

def predict_sentence_entities(sentence, threshold_softmax=None):
    """
    Predict token labels for a single sentence and return entity spans.
    Returns: dict with tokens, words, predicted_word_labels, entities list
    """
    # Split into words for is_split_into_words=True alignment
    words = sentence.split()
    # Tokenize with word-level mapping and return tensors
    enc = tokenizer(words,
                    is_split_into_words=True,
                    return_offsets_mapping=True,
                    truncation=True,
                    max_length=256,
                    return_tensors="pt",
                    padding=True)
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out.logits  # shape: (1, seq_len, num_labels)
        probs = None
        if threshold_softmax is not None:
            probs = torch.softmax(logits, dim=-1).cpu().numpy()

    pred_ids = torch.argmax(logits, dim=-1).squeeze().cpu().tolist()

    # Build mapping from tokens->word index
    word_ids = enc.word_ids(batch_index=0)  # list of word_id per token (None for special tokens)
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().cpu().tolist())

    # Collect predicted label ids for each token, then pick first token label for the word
    word_to_subtoken_labels = {}
    token_index = 0
    for idx, wid in enumerate(word_ids):
        if wid is None:
            continue
        lbl = pred_ids[idx]
        if wid not in word_to_subtoken_labels:
            word_to_subtoken_labels[wid] = []
        word_to_subtoken_labels[wid].append((idx, lbl))
        token_index += 1

    # Decide per-word label: use first subtoken's predicted label
    word_labels = []
    for wid in range(len(words)):
        if wid in word_to_subtoken_labels:
            first_label_id = word_to_subtoken_labels[wid][0][1]
            word_labels.append(first_label_id)
        else:
            # If a word had no tokens (unlikely), mark as O if 'O' exists
            # fall back to label id 0
            word_labels.append(0)

    # Convert label ids to label strings
    word_label_names = [id2label.get(lid, str(lid)) for lid in word_labels]

    # Now extract entities based on BIO scheme (B-XXX, I-XXX, O)
    entities = []
    current_ent = None  # (label, start_word_idx, end_word_idx, tokens list)
    for i, lab in enumerate(word_label_names):
        if lab == "O" or lab == "o":
            if current_ent is not None:
                # close entity
                label, start, toks = current_ent
                entities.append({
                    "label": label,
                    "start_word": start,
                    "end_word": i-1,
                    "text": " ".join(words[start:i])
                })
                current_ent = None
            continue

        # label like B-PER or I-LOC etc.
        if lab.startswith("B-"):
            if current_ent is not None:
                # close previous
                label_prev, start_prev, toks_prev = current_ent
                entities.append({
                    "label": label_prev,
                    "start_word": start_prev,
                    "end_word": i-1,
                    "text": " ".join(words[start_prev:i])
                })
            ent_label = lab.split("-", 1)[1]
            current_ent = (ent_label, i, [words[i]])
        elif lab.startswith("I-"):
            ent_label = lab.split("-", 1)[1]
            if current_ent is None:
                # treat I- as B-
                current_ent = (ent_label, i, [words[i]])
            else:
                # continue only if same label
                if current_ent[0] == ent_label:
                    pass  # continue
                else:
                    # close previous and start new
                    label_prev, start_prev, toks_prev = current_ent
                    entities.append({
                        "label": label_prev,
                        "start_word": start_prev,
                        "end_word": i-1,
                        "text": " ".join(words[start_prev:i])
                    })
                    current_ent = (ent_label, i, [words[i]])
        else:
            # Unknown format (maybe label without B-/I-). If it's like 'PER' assume B-
            if "-" not in lab:
                # close previous if exists
                if current_ent is not None:
                    label_prev, start_prev, toks_prev = current_ent
                    entities.append({
                        "label": label_prev,
                        "start_word": start_prev,
                        "end_word": i-1,
                        "text": " ".join(words[start_prev:i])
                    })
                current_ent = (lab, i, [words[i]])
            else:
                # fallback treat as O
                if current_ent is not None:
                    label_prev, start_prev, toks_prev = current_ent
                    entities.append({
                        "label": label_prev,
                        "start_word": start_prev,
                        "end_word": i-1,
                        "text": " ".join(words[start_prev:i])
                    })
                    current_ent = None

    # Close last entity if present
    if current_ent is not None:
        label, start, toks = current_ent
        entities.append({
            "label": label,
            "start_word": start,
            "end_word": len(words)-1,
            "text": " ".join(words[start:len(words)])
        })

    return {
        "words": words,
        "word_labels": word_label_names,
        "entities": entities
    }

def pretty_print_result(res):
    print("\nInput words:")
    for i,w in enumerate(res["words"]):
        print(f"{i:02d}: {w:20}  --> {res['word_labels'][i]}")
    print("\nDetected entities (count={}):".format(len(res["entities"])))
    for e in res["entities"]:
        print(f" - {e['text']!r} \t label={e['label']} \t span=({e['start_word']},{e['end_word']})")
    print("")

# Interactive loop
if __name__ == "__main__":
    print("\n** mBERT NER tester **")
    print("Type a sentence and press Enter. Type 'exit' or empty line to quit.\n")
    while True:
        s = input("Enter sentence(Identifying Entities Totally Depend on the Dataset. ) > ").strip()
        if s.lower() in ("exit", "quit", ""):
            print("Exiting.")
            break
        try:
            r = predict_sentence_entities(s)
            pretty_print_result(r)
        except Exception as ex:
            print("Error during prediction:", ex)
            print("Make sure MODEL_DIR points to a valid fine-tuned mBERT model folder.")


Loading tokenizer from: bert-base-multilingual-cased
Loading fine-tuned model from: ./mbert_model

** mBERT NER tester **
Type a sentence and press Enter. Type 'exit' or empty line to quit.

Enter sentence > Kal subha Areeba Khan Pindi Cantt se office ja rhi thi.

Input words:
00: Kal                   --> O
01: subha                 --> O
02: Areeba                --> B-PER
03: Khan                  --> I-PER
04: Pindi                 --> B-LOC
05: Cantt                 --> I-LOC
06: se                    --> O
07: office                --> O
08: ja                    --> O
09: rhi                   --> O
10: thi.                  --> O

Detected entities (count=2):
 - 'Areeba Khan' 	 label=PER 	 span=(2,3)
 - 'Pindi Cantt' 	 label=LOC 	 span=(4,5)

Enter sentence > Aaj Bilal ne apna Nikon camera repair krawaya

Input words:
00: Aaj                   --> O
01: Bilal                 --> B-PER
02: ne                    --> O
03: apna                  --> O
04: Nikon                 --> 