In [None]:
from datasets import load_dataset

ds = load_dataset("AliFartout/PEYMA-ARMAN-Mixed")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.31M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/431k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/423k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3296 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3296 [00:00<?, ? examples/s]

In [None]:
ds["train"].column_names
ds["train"][0]


{'tokens': ['یوکوویچ',
  'متولد',
  'دانمارک',
  'است',
  'اما',
  'والدین',
  'او',
  'صرب',
  'هستند',
  '.'],
 'ner_tags': [5, 20, 1, 20, 20, 20, 20, 1, 20, 20],
 'ner_tags_names': ['B_PER',
  'O',
  'B_LOC',
  'O',
  'O',
  'O',
  'O',
  'B_LOC',
  'O',
  'O']}

In [None]:
train_ds = ds["train"]
val_ds = ds["validation"]
test_ds = ds["test"]

In [None]:
# Extract all unique label names
all_labels = set()
for sample in train_ds:
    all_labels.update(sample["ner_tags_names"])

label_list = sorted(list(all_labels))

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

print(label2id)


{'B_DAT': 0, 'B_EVE': 1, 'B_FAC': 2, 'B_LOC': 3, 'B_MON': 4, 'B_ORG': 5, 'B_PCT': 6, 'B_PER': 7, 'B_PRO': 8, 'B_TIM': 9, 'I_DAT': 10, 'I_EVE': 11, 'I_FAC': 12, 'I_LOC': 13, 'I_MON': 14, 'I_ORG': 15, 'I_PCT': 16, 'I_PER': 17, 'I_PRO': 18, 'I_TIM': 19, 'O': 20}


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

model = AutoModelForTokenClassification.from_pretrained(
    "HooshvareLab/bert-base-parsbert-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

def ner_collate_fn(batch, tokenizer, max_len=128):
    input_ids_list = []
    attention_masks_list = []
    labels_list = []

    for sample in batch:
        tokens = sample["tokens"]
        labels = sample["ner_tags"]

        encoding = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        word_ids = encoding.word_ids(0)

        aligned_labels = []
        prev_word = None

        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)
            elif word_idx != prev_word:
                aligned_labels.append(labels[word_idx])
            else:
                aligned_labels.append(-100)
            prev_word = word_idx

        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)
        labels_list.append(torch.tensor(aligned_labels))

    return {
        "input_ids": pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id),
        "attention_mask": pad_sequence(attention_masks_list, batch_first=True, padding_value=0),
        "labels": pad_sequence(labels_list, batch_first=True, padding_value=-100),
    }


In [None]:
from torch.utils.data import DataLoader

batch_size = 8

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                          collate_fn=lambda b: ner_collate_fn(b, tokenizer))

val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
                        collate_fn=lambda b: ner_collate_fn(b, tokenizer))

test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False,
                         collate_fn=lambda b: ner_collate_fn(b, tokenizer))


In [None]:
import torch
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
optimizer = AdamW(model.parameters(), lr=3e-5)

total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)


In [None]:
best_val_loss = float("inf")
patience = 2
patience_counter = 0

for epoch in range(epochs):
    print(f"\n🔵 Epoch {epoch+1}/{epochs}")
    model.train()

    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device)
        )

        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train = train_loss / len(train_loader)

    # VALIDATION
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device)
            )
            val_loss += outputs.loss.item()

    avg_val = val_loss / len(val_loader)
    print(f"Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f}")

    # EARLY STOPPING
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        patience_counter = 0
        torch.save(model.state_dict(), "best_ner_model.pt")
        print("💾 Saved best model!")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("⛔ Early stopping triggered!")
            break



🔵 Epoch 1/3
Train Loss: 0.1888 | Val Loss: 0.0568
💾 Saved best model!

🔵 Epoch 2/3
Train Loss: 0.0313 | Val Loss: 0.0347
💾 Saved best model!

🔵 Epoch 3/3
Train Loss: 0.0091 | Val Loss: 0.0347
💾 Saved best model!


In [None]:
from transformers import AutoModelForTokenClassification
import torch

model = AutoModelForTokenClassification.from_pretrained(
    "HooshvareLab/bert-base-parsbert-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

model.load_state_dict(torch.load("best_ner_model.pt", map_location=device))
model.to(device)
model.eval()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(100000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [None]:
!pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=d296a57711ab82adb2302e24f2c2231417117dc47671f05fdd0b3c9c7ca6c280
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

true_labels = []
pred_labels = []

for batch in test_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    labels = batch["labels"].cpu().numpy()

    for pred_seq, label_seq in zip(predictions, labels):
        true_seq = []
        pred_seq_clean = []

        for p, l in zip(pred_seq, label_seq):
            if l == -100:
                continue  # skip subword predictions
            true_seq.append(id2label[l])
            pred_seq_clean.append(id2label[p])

        true_labels.append(true_seq)
        pred_labels.append(pred_seq_clean)


In [None]:
print(classification_report(true_labels, pred_labels, digits=4))




              precision    recall  f1-score   support

        _DAT     0.9331    0.9208    0.9269      1515
        _EVE     0.9686    0.9562    0.9624      1874
        _FAC     0.9773    0.9762    0.9768       926
        _LOC     0.9548    0.9670    0.9609      2033
        _MON     0.9412    0.9536    0.9474       151
        _ORG     0.9754    0.9849    0.9801      1651
        _PCT     0.9482    0.9714    0.9597       245
        _PER     0.8976    0.8341    0.8647       452
        _PRO     0.8852    0.9307    0.9074       779
        _TIM     0.9051    0.9709    0.9368       275

   micro avg     0.9497    0.9528    0.9512      9901
   macro avg     0.9387    0.9466    0.9423      9901
weighted avg     0.9498    0.9528    0.9512      9901



In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score

# First, get all unique labels
unique_labels = sorted(label_list)

per_tag_metrics = []

for tag in unique_labels:
    # Collect all true/pred labels for this tag
    y_true_tag = []
    y_pred_tag = []
    for t_seq, p_seq in zip(true_labels, pred_labels):
        for t, p in zip(t_seq, p_seq):
            if t == tag:
                y_true_tag.append(t)
                y_pred_tag.append(p)

    # Compute metrics
    precision = precision_score([y_true_tag], [y_pred_tag], average='macro')
    recall = recall_score([y_true_tag], [y_pred_tag], average='macro')
    f1 = f1_score([y_true_tag], [y_pred_tag], average='macro')
    accuracy = sum([t==p for t,p in zip(y_true_tag, y_pred_tag)]) / len(y_true_tag)

    per_tag_metrics.append({
        "tag": tag,
        "precision": precision,
        "recall": recall,
        "f1-score": f1,
        "accuracy": accuracy,
        "support": len(y_true_tag)
    })

# Print nicely
print(f"{'Tag':<6} {'Precision':>9} {'Recall':>7} {'F1-score':>9} {'Accuracy':>9} {'Support':>8}")
for m in per_tag_metrics:
    print(f"{m['tag']:<6} {m['precision']*100:9.4f} {m['recall']*100:7.4f} {m['f1-score']*100:9.4f} {m['accuracy']*100:9.4f} {m['support']:8}")


  _warn_prf(average, modifier, msg_start, len(result))


Tag    Precision  Recall  F1-score  Accuracy  Support
B_DAT    33.3333 23.8361   27.7959   71.5084      179
B_EVE    14.2857 13.6982   13.9858   95.8874     1848
B_FAC    50.0000 46.0784   47.9592   92.1569       51
B_LOC    14.2857 13.8662   14.0728   97.0632     2009
B_MON   100.0000 74.0741   85.1064   74.0741       27
B_ORG    20.0000 19.6745   19.8359   98.3724     1536
B_PCT    50.0000 38.8889   43.7500   77.7778       27
B_PER    33.1276 22.8369   27.0361   68.9362      235
B_PRO    16.6667 14.9278   15.7494   89.5669      508
B_TIM    50.0000 47.0370   48.4733   94.0741      135
I_DAT     0.0000  0.0000    0.0000   95.5187     2834
I_EVE     0.0000  0.0000    0.0000   80.6452       31
I_FAC     0.0000  0.0000    0.0000   97.9943     1047
I_LOC     0.0000  0.0000    0.0000   78.3784       37
I_MON   100.0000 100.0000  100.0000  100.0000      124
I_ORG   100.0000 100.0000  100.0000  100.0000      237
I_PCT   100.0000 100.0000  100.0000  100.0000      218
I_PER   100.0000 100.0000

In [None]:
acc = accuracy_score(true_labels, pred_labels)
print("Accuracy:", acc)

Accuracy: 0.9921635073599492


In [None]:
def get_wrong_predictions(true_labels, pred_labels, token_seqs):
    """
    true_labels: list of lists of true tags
    pred_labels: list of lists of predicted tags
    token_seqs: list of lists of tokens (words)
    """
    wrong_examples = []

    for tokens, t_seq, p_seq in zip(token_seqs, true_labels, pred_labels):
        for token, true, pred in zip(tokens, t_seq, p_seq):
            if true != pred:
                wrong_examples.append({
                    "token": token,
                    "true_label": true,
                    "pred_label": pred
                })

    return wrong_examples


In [None]:
# Collect tokens from your test dataset
token_seqs = [sample["tokens"] for sample in test_ds]

wrong_examples = get_wrong_predictions(true_labels, pred_labels, token_seqs)

# Show first 10 wrong predictions
for example in wrong_examples[:10]:
    print(f"Token: {example['token']:15} True: {example['true_label']:7} Pred: {example['pred_label']:7}")


Token: رویس            True: B_ORG   Pred: B_LOC  
Token: شهر             True: O       Pred: B_EVE  
Token: آلومینیوم       True: O       Pred: B_PRO  
Token: کهف             True: O       Pred: B_EVE  
Token: حصین            True: O       Pred: B_PRO  
Token: ثامن            True: B_ORG   Pred: B_PRO  
Token: الائمه          True: I_FAC   Pred: B_PRO  
Token: (ع)             True: I_FAC   Pred: B_PRO  
Token: 17              True: B_DAT   Pred: B_PER  
Token: نگارخانه        True: B_EVE   Pred: I_MON  


In [None]:
def show_sentence_errors(tokens, true_seq, pred_seq):
    output = []
    for token, true, pred in zip(tokens, true_seq, pred_seq):
        if true != pred:
            output.append(f"[{token} | T:{true} P:{pred}]")
        else:
            output.append(token)
    print(" ".join(output))

# Example with first 5 test sentences
for i in range(5):
    show_sentence_errors(token_seqs[i], true_labels[i], pred_labels[i])


[رویس | T:B_ORG P:B_LOC] و تمسخر بایرن بخاطر باخت به اتلتیکو ( عکس ) .
! حال جای این سوال باقی است چرا در ورودی این شهر اقتصادی نوشته\u200cاند " به بندرعباس [شهر | T:O P:B_EVE] [آلومینیوم | T:O P:B_PRO] خوش آمدید "؟ !
عبدالمطلب ، در خواب می\u200cبیند كه مأمور حفر چاه زمزم شده است .
این منبع امنیتی که خواست نامش فاش نشود تاکید کرد بعد از فرار این 9 کودک از نبردهای فلوجه ، داعش آنها را بازداشت و اقدام به اعدام آنها کرده است .
وی تاکید کرد تعرض به آزادی\u200cهای اساسی در [کهف | T:O P:B_EVE] [حصین | T:O P:B_PRO] [ثامن | T:B_ORG P:B_PRO] [الائمه | T:I_FAC P:B_PRO] [(ع) | T:I_FAC P:B_PRO] مایه شرمساری است .
