In [1]:
import os
import pandas as pd
from collections import defaultdict , Counter
import torch
import torch.nn as nn
from torch.nn.functional import cross_entropy
from datasets import Dataset , DatasetDict , Sequence , Value , Features , ClassLabel
from transformers import AutoTokenizer , XLMRobertaConfig , AutoConfig , TrainingArguments , DataCollatorForTokenClassification , Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from seqeval.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay , confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_from_disk
import warnings
from transformers import AutoTokenizer
from pathlib import Path


# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Load the dataset previously saved to disk at the relative path "PEYMA_ARMAN_MIXED.hf".
data = load_from_disk("PEYMA_ARMAN_MIXED.hf")

In [2]:
# Shuffle with a fixed seed so the resulting order is repeatable; rebinds `data`.
data = data.shuffle(seed=42)

Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\PEYMA_ARMAN_MIXED.hf\train\cache-57424060d946b530.arrow
Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\PEYMA_ARMAN_MIXED.hf\test\cache-adcc49eea328fe12.arrow
Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\PEYMA_ARMAN_MIXED.hf\validation\cache-430e59aedb1cb7f5.arrow


In [3]:
# Checkpoint name shared by the tokenizer, config and model cells below.
roberta_model_name = 'xlm-roberta-base'
# Tokenizer with default options; used for the preview table and the tagging demos.
tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)

In [4]:
class ParsXMLRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa-style encoder with a dropout + linear head for token classification.

    The encoder is a ``RobertaModel`` built without its pooling layer; every
    token's final hidden state is passed through dropout and a linear layer
    that produces ``config.num_labels`` logits.
    """

    # `config_class` is the attribute name the Hugging Face base class reads to
    # know which configuration type belongs to this model. The original code
    # spelled it `class_config`, so the XLM-R config was never registered and
    # the class silently inherited its parent's `config_class`.
    config_class = XLMRobertaConfig
    # Legacy alias kept so any code that read the old (misspelled) attribute
    # keeps working.
    class_config = XLMRobertaConfig

    def __init__(self , config):
        """Build the encoder body and the classification head from ``config``.

        Args:
            config: model configuration; ``num_labels``, ``hidden_size`` and
                ``hidden_dropout_prob`` are read from it.
        """
        super().__init__(config)

        self.num_labels = config.num_labels

        # Encoder body; the pooling layer is skipped because only per-token
        # hidden states are used.
        self.roberta = RobertaModel(config , add_pooling_layer=False)

        # Classification head: dropout followed by a per-token linear layer.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size , config.num_labels)

        # Initialise the weights (pretrained weights are loaded on top of this
        # when the model is created through `from_pretrained`).
        self.init_weights()

    def forward(self , input_ids=None , attention_mask=None , token_type_ids=None , labels=None , **kwargs):
        """Run the encoder and head; compute the loss when ``labels`` is given.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.
            token_type_ids: segment ids forwarded to the encoder.
            labels: optional target label ids. They are flattened and compared
                with the flattened logits using ``nn.CrossEntropyLoss`` with
                its default settings (whose default ``ignore_index`` is -100).
            **kwargs: forwarded unchanged to the encoder.

        Returns:
            TokenClassifierOutput with ``loss`` (``None`` when no labels were
            passed), ``logits``, and the encoder's ``hidden_states`` and
            ``attentions``.
        """
        outputs = self.roberta(input_ids , attention_mask=attention_mask , token_type_ids=token_type_ids , **kwargs)

        # outputs[0] is the first element of the encoder output (the per-token
        # hidden states); dropout is applied before classification.
        sequence_output = self.dropout(outputs[0])

        logits = self.classifier(sequence_output)

        loss = None

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Collapse all leading dimensions so every token is one row.
            loss = loss_fct(logits.view(-1 , self.num_labels) , labels.view(-1))

        return TokenClassifierOutput(
            loss = loss,
            logits = logits,
            hidden_states = outputs.hidden_states,
            attentions = outputs.attentions
        )

In [5]:
# All tag sequences of the training split (one list of tag names per example).
ner_tags = data['train']['ner_tags_names']

# Unique tag names in a deterministic (sorted) order. Enumerating a bare `set`
# of strings gives an order that can change between interpreter runs, which
# would make the tag <-> id mapping (and therefore any saved model's label
# ids) non-reproducible after a kernel restart.
ner_tag_names = sorted({tag for tags in ner_tags for tag in tags})

# Lookup tables between integer class ids and tag names.
index2tag = dict(enumerate(ner_tag_names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [6]:
import pandas as pd

# Lift pandas' display limits so the wide token table below is not truncated.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
# Rebuild the first training example as one space-separated string.
per_text = " ".join(data['train']['tokens'][0])
# Encode it to ids (as a PyTorch tensor) and, separately, to token strings.
input_ids = tokenizer.encode(per_text, return_tensors="pt")
roberta_tokens = tokenizer(per_text).tokens()
# Two-row table: token strings on top, their ids underneath.
pd.DataFrame([roberta_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
Tokens,<s>,▁رهبر,▁کو,با,▁در,▁خاتم,ه,▁گفت,▁:,▁هرچند,▁,رؤ,ی,ای,▁داشتن,▁قوانین,▁عادل,انه,▁به,▁نظر,▁بسیاری,▁نام,م,کن,▁می,▁رسد,▁ولی,▁به,▁اعتقاد,▁ما,▁مبارزه,▁برای,▁نام,م,کن,▁باید,▁شعار,▁این,▁نهاد,▁بین,▁المللی,▁باشد,▁كه,▁امروز,▁ما,▁را,▁گرد,▁هم,▁آورده,▁است,▁,.,</s>
Input IDs,0,50773,554,6779,175,118483,176,5228,152,180173,6,47044,140,6223,47613,32642,106257,7189,178,2580,30778,2618,376,15329,383,54606,11174,178,114251,877,56636,1012,2618,376,15329,3969,75543,498,73830,5184,35033,3105,6695,15199,877,406,28849,1149,109008,477,6,5,2


In [7]:
from transformers import RobertaTokenizerFast
# Second tokenizer instance, created with add_prefix_space=True; this is the one
# used to encode the pre-split word lists in tokenize_and_align_labels.
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name, add_prefix_space=True)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "ner_tags_names" (one list of tag names per example).

    Returns:
        The tokenizer output with an added "labels" entry. For every example,
        the first sub-token of each word carries ``tag2index`` of that word's
        tag; positions without a word id, and sub-tokens whose word id equals
        that of the preceding position, carry -100.
    """
    encodings = roberta_tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags_names"]):
        row_word_ids = encodings.word_ids(batch_index=row)
        # Word id seen at the previous position (None before the first one).
        preceding_ids = [None] + list(row_word_ids[:-1])
        aligned_labels.append([
            -100 if (wid is None or wid == prev) else tag2index[word_tags[wid]]
            for wid, prev in zip(row_word_ids, preceding_ids)
        ])

    encodings["labels"] = aligned_labels
    return encodings

In [8]:
def encode_panx_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

    The raw 'tokens', 'ner_tags' and 'ner_tags_names' columns are removed from
    the result of ``corpus.map``.
    """
    raw_columns = ['tokens' , 'ner_tags' , 'ner_tags_names']
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )

In [9]:
# Tokenize the dataset and attach aligned label ids (raw text columns are dropped).
encoded_data = encode_panx_dataset(data)

Map:   0%|          | 0/26417 [00:00<?, ? examples/s]

Map:   0%|          | 0/3303 [00:00<?, ? examples/s]

Map:   0%|          | 0/3302 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoConfig

# Load the checkpoint's configuration and override the label-related fields so
# the classification head size and the id <-> tag maps match this dataset.
roberta_config  = AutoConfig.from_pretrained(
    roberta_model_name,
    num_labels = len(index2tag),
    id2label = index2tag,
    label2id = tag2index
)

In [11]:
# Display the encoded dataset to inspect its splits, columns and row counts.
encoded_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 26417
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3303
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3302
    })
})

In [12]:
import numpy as np

def align_predictions(predictions, label_ids):
    """Convert raw model outputs and label ids into per-example tag-name lists.

    Args:
        predictions: array whose axis 2 is the class axis; the arg-max over
            that axis is taken as the predicted class id.
        label_ids: 2-D array of gold label ids; positions equal to -100 are
            skipped in both outputs.

    Returns:
        ``(preds_list, labels_list)``: two lists with one list of tag names
        (looked up in ``index2tag``) per example.
    """
    predicted_ids = np.argmax(predictions, axis=2)
    num_examples, num_positions = predicted_ids.shape

    gold_tags, predicted_tags = [], []
    for row in range(num_examples):
        # Positions that carry a real label (i.e. not the -100 marker).
        kept = [col for col in range(num_positions) if label_ids[row, col] != -100]
        gold_tags.append([index2tag[label_ids[row][col]] for col in kept])
        predicted_tags.append([index2tag[predicted_ids[row][col]] for col in kept])

    return predicted_tags, gold_tags

In [13]:
from transformers import TrainingArguments
import torch

# Schedule and batching values shared with the Trainer configuration below.
num_epochs = 6
batch_size = 24
# Number of training examples divided by the batch size, i.e. roughly one
# logging event per epoch when training on a single device.
logging_steps = len(encoded_data['train']) // batch_size

training_args = TrainingArguments(
    output_dir="output",
    log_level="error",
    num_train_epochs=num_epochs,
    gradient_checkpointing=True,
    # fp16=True,   # left disabled
    eval_accumulation_steps=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    seed=42,
    logging_strategy="steps",
    evaluation_strategy="epoch",
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
)


In [14]:
def model_init():
    """Create a fresh model from the pretrained checkpoint and move it to ``device``.

    The checkpoint named by ``roberta_model_name`` is loaded with
    ``roberta_config``, using the current working directory as cache directory.
    """
    model = ParsXMLRobertaForTokenClassification.from_pretrained(
        roberta_model_name,
        config=roberta_config,
        cache_dir=Path.cwd(),
    )
    return model.to(device)

In [15]:
from transformers import DataCollatorForTokenClassification
# Collator for token classification, built from the same tokenizer that encoded
# the dataset. (This is the library-provided class, not a custom one.)
data_collator = DataCollatorForTokenClassification(tokenizer=roberta_tokenizer)

In [16]:
from seqeval.metrics import f1_score,recall_score,precision_score,accuracy_score
import wandb

def compute_metrics(eval_pred):
    """Compute seqeval F1 / recall / precision / accuracy for an evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``; both are
            handed to ``align_predictions`` to obtain tag-name sequences.

    Returns:
        dict with the keys "f1", "Recall", "Precision" and "Accuracy".

    The same values are also sent to Weights & Biases, but only while a run is
    active, so evaluating before ``wandb.init()`` or after ``wandb.finish()``
    no longer fails inside this function.
    """
    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)

    # Compute every metric exactly once and reuse the values for both the
    # W&B log entry and the return value.
    metrics = {
        "f1": f1_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Accuracy": accuracy_score(y_true, y_pred),
    }

    # wandb.run is None when no run is active.
    if wandb.run is not None:
        # Log a copy so later changes to the returned dict cannot affect it.
        wandb.log(dict(metrics))

    return metrics

In [17]:
from transformers import Trainer
import torch


# Use the GPU when CUDA is available, otherwise the CPU. `model_init` and
# `tag_text` read this global, so this cell must run before they are called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Trainer that builds its model through `model_init` and evaluates on the
# validation split with `compute_metrics`.
# NOTE(review): `tokenizer` (default options) is passed here, whereas the data
# was encoded and is collated with `roberta_tokenizer` — confirm this is intended.
trainer = Trainer(model_init=model_init, args=training_args,
                    data_collator=data_collator, compute_metrics=compute_metrics,
                    train_dataset=encoded_data['train'],
                    eval_dataset=encoded_data['validation'],
                    tokenizer=tokenizer)


In [18]:
import wandb
# Open a Weights & Biases run for this training session.
wandb.init(project="XMLRoberta_number1")
try:
    result = trainer.train()
finally:
    # Always close the run, even if training raises or is interrupted, so it
    # is not left open in the kernel.
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mali-fartout[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Epoch,Training Loss,Validation Loss,F1,Recall,Precision,Accuracy
1,0.1417,0.075124,0.805203,0.841218,0.772146,0.974965
2,0.0521,0.049409,0.868926,0.884397,0.853987,0.985016
3,0.0274,0.038347,0.908677,0.926028,0.891964,0.98957
4,0.0151,0.036428,0.927879,0.938414,0.917578,0.991048
5,0.0083,0.035513,0.937579,0.952176,0.923423,0.992074
6,0.0048,0.037769,0.939868,0.951832,0.9282,0.992277


0,1
Accuracy,▁▅▇███
Precision,▁▅▆███
Recall,▁▄▆▇██
eval/Accuracy,▁▅▇███
eval/Precision,▁▅▆███
eval/Recall,▁▄▆▇██
eval/f1,▁▄▆▇██
eval/loss,█▃▂▁▁▁
eval/runtime,▁▃▂█▃▂
eval/samples_per_second,█▆▇▁▅▆

0,1
Accuracy,0.99228
Precision,0.9282
Recall,0.95183
eval/Accuracy,0.99228
eval/Precision,0.9282
eval/Recall,0.95183
eval/f1,0.93987
eval/loss,0.03777
eval/runtime,7.7726
eval/samples_per_second,424.825


In [27]:
def tag_text(text,tags,tokenizer,model=None):

    """
    Output a DataFrame showing tokens with their predicted label

        text : [string] User-supplied text
        tags : sequence indexed by class id that yields the tag name
        tokenizer : Model tokenizer; called as tokenizer(text, return_tensors="pt")
        model : optional model to run; defaults to the global ``trainer.model``

    Returns a two-row DataFrame: row 'token' holds the tokenizer's tokens and
    row 'predicted label' the tag predicted for each of them.
    """

    if model is None:
        model = trainer.model

    # Tokenize once and reuse the encoding for both the token strings and ids.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids

    # Put the inputs on the same device as the model's parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    # Inference only: switch to eval mode (so dropout is inactive) and skip
    # autograd bookkeeping; the previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids)[0]
    finally:
        model.train(was_training)

    predictions = torch.argmax(outputs, dim=2)
    preds = [tags[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens , preds] , index=['token' , 'predicted label'])

In [28]:
# Hand-written Persian sentence used as a quick qualitative check of the tagger.
text = 'ناسا در تاریخ 28 شهریور با کمک شرکت اسپیس ایکس به مالکیت ایلان ماسک، فضانوردان خود را از ایالت مینه سوتا به سمت ایستگاه فضایی بین المللی فرستاد.'
# Tag names are passed as a list so they can be indexed by predicted class id.
tag_text(text , list(ner_tag_names) , tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
token,<s>,▁ناس,ا,▁در,▁تاریخ,▁28,▁شهریور,▁با,▁کمک,▁شرکت,▁اسپ,یس,▁ایکس,▁به,▁مالک,یت,▁ایل,ان,▁ما,سک,،,▁فضا,نورد,ان,▁خود,▁را,▁از,▁ایالت,▁می,نه,▁سو,تا,▁به,▁سمت,▁ایستگاه,▁فضایی,▁بین,▁المللی,▁فرستاد,.,</s>
predicted label,O,B_ORG,I_ORG,O,O,B_DAT,I_DAT,O,O,B_ORG,I_ORG,I_ORG,I_ORG,O,O,O,B_PER,I_PER,I_PER,I_PER,O,O,O,O,O,O,O,B_LOC,I_LOC,I_LOC,I_LOC,I_LOC,O,O,B_LOC,I_LOC,I_LOC,I_LOC,O,O,O


In [29]:
# Rebuild test example 3 as a string: all tokens but the last are joined with
# spaces, and the last token is appended without a preceding space.
text = " ".join(data['test']['tokens'][3][:-1]) + data['test']['tokens'][3][-1]
tag_text(text , list(ner_tag_names) , tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
token,<s>,▁در,▁بیماری,▁,لائم,▁،,▁درد,های,▁بند,ها,▁ناشی,▁از,▁عفونت,▁می,کر,بی,▁است,.,</s>
predicted label,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [22]:
# Compare the number of tags with the number of tokens for test example 3.
len(data['test']['ner_tags_names'][3]),len(data['test']['tokens'][3])

(12, 12)

In [23]:
# Run the tagger again on whatever `text` currently holds.
tag_text(text , list(ner_tag_names) , tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
token,<s>,▁در,▁بیماری,▁,لائم,▁،,▁درد,های,▁بند,ها,▁ناشی,▁از,▁عفونت,▁می,کر,بی,▁است,.,</s>
predicted label,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
