In [43]:
from datasets import load_from_disk
train = load_from_disk("train_25k.hf")
test = load_from_disk("test_25k.hf")

In [44]:
train = train.shuffle(seed=42)
test = test.shuffle(seed=42)

Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\train_25k.hf\cache-fb1c627d3ac8e13f.arrow
Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\test_25k.hf\cache-52664ed69309d356.arrow


In [45]:
from datasets import load_dataset
import random


# Get the total number of rows in the dataset
total_rows = len(test)

# Set the number of rows you want to select
num_rows_to_select = 2000

# Generate a list of random indices without duplicates
random_indices = random.sample(range(total_rows), num_rows_to_select)

# Select the rows based on the random indices
test = test.select(random_indices)

# Now `selected_rows` contains your randomly selected 2000 rows


In [46]:
test

Dataset({
    features: ['id', 'lang', 'words', 'ner', 'ratio'],
    num_rows: 2000
})

In [47]:
from transformers import AutoTokenizer
from pathlib import Path
parsbert_model_name = "HooshvareLab/bert-fa-zwnj-base"
parsbert_tokenizer = AutoTokenizer.from_pretrained(parsbert_model_name,cache_dir=Path.cwd())

text = "Jack Sparrow loves New York!"
print(parsbert_tokenizer.tokenize(text))

text = "سلام خوب هستید؟"
print(parsbert_tokenizer.tokenize(text))

['Jack', 'Sp', '##ar', '##row', 'love', '##s', 'New', 'York', '!']
['سلام', 'خوب', 'هستید', '؟']


In [48]:
from transformers import AutoModel, AutoConfig
from pathlib import Path
parsbert_model = AutoModel.from_pretrained(parsbert_model_name,cache_dir=Path.cwd())
parsbert_config = AutoConfig.from_pretrained(parsbert_model_name,cache_dir=Path.cwd())

In [49]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.bert.modeling_bert import BertPreTrainedModel,BertModel 
from torch import nn


class CustomBertModel(BertModel):
    def __init__(self, config):
        super(CustomBertModel, self).__init__(config)
        # Remove the pooler part
        self.pooler = None

# Load the pre-trained BERT model
model_name = "HooshvareLab/bert-fa-zwnj-base"  # Replace with the appropriate model name
config = AutoConfig.from_pretrained(model_name,cache_dir=Path.cwd())


class ParsBertForTokenClassification(BertPreTrainedModel):
    config_class = config
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.parsbert = CustomBertModel(config)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()
    
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        
        # Use model body to get encoder representations
        outputs = self.parsbert(input_ids, attention_mask=attention_mask,
        token_type_ids=token_type_ids, **kwargs)
        
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions)
        

In [50]:
ner_tags = train["ner"]
ner_tag_names = set(tag for tags in ner_tags for tag in tags)
                
index2tag = {idx: tag for idx, tag in enumerate(ner_tag_names)}
tag2index = {tag: idx for idx, tag in enumerate(ner_tag_names)}

In [51]:
from transformers import AutoConfig

parsbert_config = AutoConfig.from_pretrained(parsbert_model_name,num_labels=4,
                                            id2label=index2tag, label2id=tag2index)

In [52]:
import pandas as pd

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
per_text = " ".join(train['words'][0])
input_ids = parsbert_tokenizer.encode(per_text, return_tensors="pt")
bert_tokens = parsbert_tokenizer(per_text).tokens()
pd.DataFrame([bert_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,[CLS],It,was,rel,##e,##ase,##d,on,October,6,",",2003,.,[SEP]
Input IDs,2,16159,20588,25308,1145,10814,1159,9384,40002,129,119,22323,121,3


In [53]:
from transformers import RobertaTokenizerFast
parsbert_tokenizer = AutoTokenizer.from_pretrained(parsbert_model_name,
                                                   use_fast = True,
                                                   add_prefix_space=True)

def tokenize_and_align_labels(examples):
    tokenized_inputs = parsbert_tokenizer(examples["words"], truncation=True,
    is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_token = label[word_idx]
                # Use the label map to get the numerical value for each entity
                label_ids.append(tag2index[label_token])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [54]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels,batched=True,
                      remove_columns=['id','words','lang','ner','ratio'])

In [55]:
train_encoded = encode_panx_dataset(train)
test_encoded = encode_panx_dataset(test)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [56]:
train_encoded

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [57]:
test_encoded

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

In [58]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

In [59]:
def model_init():
    return (ParsBertForTokenClassification
                  .from_pretrained(parsbert_model_name, config=parsbert_config,cache_dir=Path.cwd())
                  .to(device))

In [66]:
from transformers import TrainingArguments
import torch

num_epochs = 3
batch_size = 24

logging_steps = len(train_encoded) // batch_size

training_args = TrainingArguments(
    output_dir="output", log_level="error", num_train_epochs=num_epochs,
    gradient_checkpointing=True,
#     fp16=True,
    eval_accumulation_steps=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    seed=42,
    logging_strategy="steps",
    evaluation_strategy="epoch",
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False)


In [67]:
from transformers import DataCollatorForTokenClassification

class CustomDataCollator(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = super().__call__(features)
        # Filter out the '-100' label IDs (padding tokens)
        batch["labels"] = torch.where(batch["labels"] != -100, batch["labels"], -100)
        return batch

# Use the custom data collator
data_collator = DataCollatorForTokenClassification(tokenizer=parsbert_tokenizer)

In [68]:
from seqeval.metrics import f1_score,recall_score,precision_score,accuracy_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
    eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred),"Recall":recall_score(y_true, y_pred),"Precision":precision_score(y_true, y_pred),"Accuracy":accuracy_score(y_true, y_pred)}

In [69]:
from transformers import Trainer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

training_args.set_optimizer(learning_rate=5e-05,epsilon=1e-08)
# print(training_args)
trainer = Trainer(model_init=model_init, args=training_args,
                        data_collator=data_collator, compute_metrics=compute_metrics,
                        train_dataset=train_encoded,
                        eval_dataset=test_encoded,
                        tokenizer=parsbert_tokenizer)
trainer.train()
  



Epoch,Training Loss,Validation Loss,F1,Recall,Precision,Accuracy
1,0.3074,0.227603,0.374849,0.327696,0.437853,0.91577
2,0.2098,0.204557,0.472063,0.4537,0.491976,0.923104
3,0.1726,0.196831,0.503502,0.50148,0.505541,0.926503




TrainOutput(global_step=2502, training_loss=0.22982711100178085, metrics={'train_runtime': 441.464, 'train_samples_per_second': 135.911, 'train_steps_per_second': 5.668, 'total_flos': 3199416167499648.0, 'train_loss': 0.22982711100178085, 'epoch': 3.0})

In [72]:
example = "در سال ۲۰۱۳ درگذشت و آندرتیکر و کین برای او مراسم یادبود گرفتند."
tokens = parsbert_tokenizer(example).tokens()
input_ids = parsbert_tokenizer(example, return_tensors="pt").input_ids.to(device)
outputs = trainer.model(input_ids)[0]
# Take argmax to get most likely class per token
predictions = torch.argmax(outputs, dim=2)
# Convert to DataFrame
tags = list(ner_tag_names)
preds = [tags[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
Tokens,[CLS],در,سال,۲۰۱۳,درگذشت,و,آ,##ندر,##تیک,##ر,و,کین,برای,او,مراسم,یادبود,گرفتند,.,[SEP]
Tags,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [71]:
words, labels = train[100]["words"], train[100]["ner"]
tokenized_input = parsbert_tokenizer(words, is_split_into_words=True)
word_ids = tokenized_input.word_ids()
tokens = parsbert_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

#hide_output
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx
    
labels = [tag2index[l] if l != -100 else "IGN" for l in label_ids ]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
Tokens,[CLS],این,مجله,با,نگاهی,تحلیلی,به,رویدادها,و,جریان,[ZWNJ],های,مهم,موسیقی,ایران,و,جهان,می,[ZWNJ],پردازد,.,[SEP]
Word IDs,,0,1,2,3,4,5,6,7,8,8,8,9,10,11,12,13,14,14,14,15,
Label IDs,-100,O,O,O,O,O,O,O,O,O,-100,-100,O,O,O,O,O,O,-100,-100,O,-100
Labels,IGN,1,1,1,1,1,1,1,1,1,IGN,IGN,1,1,1,1,1,1,IGN,IGN,1,IGN
