In [1]:
from datasets import load_from_disk
data = load_from_disk("PEYMA_ARMAN_MIXED.hf")

In [2]:
data = data.shuffle(seed=42)

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 26417
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 3303
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 3302
    })
})

In [4]:
from transformers import AutoTokenizer
from pathlib import Path
parsbert_model_name = "HooshvareLab/bert-fa-zwnj-base"
parsbert_tokenizer = AutoTokenizer.from_pretrained(parsbert_model_name,cache_dir=Path.cwd())

text = "Jack Sparrow loves New York!"
print(parsbert_tokenizer.tokenize(text))

text = "سلام خوب هستید؟"
print(parsbert_tokenizer.tokenize(text))

['Jack', 'Sp', '##ar', '##row', 'love', '##s', 'New', 'York', '!']
['سلام', 'خوب', 'هستید', '؟']


In [5]:
from transformers import AutoModel, AutoConfig
from pathlib import Path
parsbert_model = AutoModel.from_pretrained(parsbert_model_name,cache_dir=Path.cwd())
parsbert_config = AutoConfig.from_pretrained(parsbert_model_name,cache_dir=Path.cwd())

Some weights of BertModel were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.bert.modeling_bert import BertPreTrainedModel,BertModel 
from torch import nn


class CustomBertModel(BertModel):
    def __init__(self, config):
        super(CustomBertModel, self).__init__(config)
        # Remove the pooler part
        self.pooler = None

# Load the pre-trained BERT model
model_name = "HooshvareLab/bert-fa-zwnj-base"  # Replace with the appropriate model name
config = AutoConfig.from_pretrained(model_name,cache_dir=Path.cwd())


class ParsBertForTokenClassification(BertPreTrainedModel):
    config_class = config
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.parsbert = CustomBertModel(config)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()
    
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        
        # Use model body to get encoder representations
        outputs = self.parsbert(input_ids, attention_mask=attention_mask,
        token_type_ids=token_type_ids, **kwargs)
        
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions)
        

In [7]:
ner_tags = data["train"]["ner_tags_names"]
ner_tag_names = set(tag for tags in ner_tags for tag in tags)
                
index2tag = {idx: tag for idx, tag in enumerate(ner_tag_names)}
tag2index = {tag: idx for idx, tag in enumerate(ner_tag_names)}

In [12]:
from transformers import AutoConfig

parsbert_config = AutoConfig.from_pretrained(parsbert_model_name,num_labels=len(index2tag),
                                            id2label=index2tag, label2id=tag2index)

In [14]:
import pandas as pd

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
per_text = " ".join(data["train"]['tokens'][0])
input_ids = parsbert_tokenizer.encode(per_text, return_tensors="pt")
bert_tokens = parsbert_tokenizer(per_text).tokens()
pd.DataFrame([bert_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
Tokens,[CLS],رهبر,کوبا,در,خاتمه,گفت,:,هرچند,[UNK],داشتن,قوانین,عادلانه,به,نظر,بسیاری,ناممکن,می,[ZWNJ],رسد,ولی,به,اعتقاد,ما,مبارزه,برای,ناممکن,باید,شعار,این,نهاد,بین,[ZWNJ],المللی,باشد,[UNK],امروز,ما,را,گرد,هم,آ,##ورده,است,.,[SEP]
Input IDs,2,4055,10867,1921,10127,2228,133,4459,1,3328,4227,12847,1923,2161,2489,18590,1924,9,2784,2515,1923,5567,2121,4779,1959,18590,2129,7614,1930,5501,2136,9,3166,2094,1,2902,2121,1937,2309,1951,595,4832,1933,121,3


In [23]:
from transformers import RobertaTokenizerFast
parsbert_tokenizer = AutoTokenizer.from_pretrained(parsbert_model_name,
                                                   use_fast = True,
                                                   add_prefix_space=True)

def tokenize_and_align_labels(examples):
    tokenized_inputs = parsbert_tokenizer(examples["tokens"], truncation=True,
    is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags_names"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 921f1a10-d5d5-48f7-a802-7d69e65d8fbb)')' thrown while requesting HEAD https://huggingface.co/HooshvareLab/bert-fa-zwnj-base/resolve/main/tokenizer_config.json


In [24]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels,batched=True,
                      remove_columns=['ner_tags',"ner_tags_names",'tokens'])

In [25]:
encoded_data = encode_panx_dataset(data)

Map:   0%|          | 0/26417 [00:00<?, ? examples/s]

Map:   0%|          | 0/3303 [00:00<?, ? examples/s]

Map:   0%|          | 0/3302 [00:00<?, ? examples/s]

In [26]:
encoded_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 26417
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3303
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3302
    })
})

In [27]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

In [28]:
def model_init():
    return (ParsBertForTokenClassification
                  .from_pretrained(parsbert_model_name, config=parsbert_config,cache_dir=Path.cwd())
                  .to(device))

In [39]:
from transformers import TrainingArguments
import torch

num_epochs = 3
batch_size = 24

logging_steps = len(data["train"]) // batch_size

training_args = TrainingArguments(
    output_dir="output", log_level="error", num_train_epochs=num_epochs,
    gradient_checkpointing=True,
#     fp16=True,
    eval_accumulation_steps=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    seed=42,
    logging_strategy="steps",
    evaluation_strategy="epoch",
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False)


In [40]:
from transformers import DataCollatorForTokenClassification
# Use the custom data collator
data_collator = DataCollatorForTokenClassification(tokenizer=parsbert_tokenizer)

In [41]:
from seqeval.metrics import f1_score,recall_score,precision_score,accuracy_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
    eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred),"Recall":recall_score(y_true, y_pred),"Precision":precision_score(y_true, y_pred),"Accuracy":accuracy_score(y_true, y_pred)}

In [42]:
from transformers import Trainer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# print(training_args)
trainer = Trainer(model_init=model_init, args=training_args,
                        data_collator=data_collator, compute_metrics=compute_metrics,
                        train_dataset=encoded_data["train"],
                        eval_dataset=encoded_data["validation"],
                        tokenizer=parsbert_tokenizer)
trainer.train()
  



Epoch,Training Loss,Validation Loss,F1,Recall,Precision,Accuracy
1,0.3277,0.223917,0.391096,0.46396,0.338012,0.931491
2,0.1716,0.148905,0.558523,0.629623,0.501851,0.952535
3,0.1128,0.122653,0.626144,0.69465,0.569936,0.961441




TrainOutput(global_step=3303, training_loss=0.20393840521077766, metrics={'train_runtime': 572.0385, 'train_samples_per_second': 138.541, 'train_steps_per_second': 5.774, 'total_flos': 4077424220431086.0, 'train_loss': 0.20393840521077766, 'epoch': 3.0})

In [38]:
example = 'ناسا در تاریخ 28 شهریور با کمک شرکت اسپیس ایکس به مالکیت ایلان ماسک، فضانوردان خود را از ایالت مینه سوتا به سمت ایستگاه فضایی بین المللی فرستاد.'
tokens = parsbert_tokenizer(example).tokens()
input_ids = parsbert_tokenizer(example, return_tensors="pt").input_ids.to(device)
outputs = trainer.model(input_ids)[0]
# Take argmax to get most likely class per token
predictions = torch.argmax(outputs, dim=2)
# Convert to DataFrame
tags = list(ner_tag_names)
preds = [tags[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
Tokens,[CLS],ناسا,در,تاریخ,28,شهریور,با,کمک,شرکت,اسپیس,ایکس,به,مالکیت,ایلان,ماسک,،,فضانوردان,خود,را,از,ایالت,مینه,سوتا,به,سمت,ایستگاه,فضایی,بین,المللی,فرستاد,.,[SEP]
Tags,O,B_PER,O,O,O,I_DAT,O,O,B_ORG,I_ORG,I_ORG,O,O,I_ORG,I_ORG,O,O,O,O,O,B_LOC,O,O,O,O,O,O,O,O,O,O,O
