In [70]:
from datasets import load_from_disk
train = load_from_disk("train_25k.hf")
test = load_from_disk("train_25k.hf")

In [2]:
train = train.shuffle(seed=42)
test = test.shuffle(seed=42)

Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\train.hf\cache-aab69560c24eae2b.arrow
Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\test.hf\cache-6c7c52e5435094d8.arrow


In [5]:
from transformers import AutoTokenizer
from pathlib import Path
roberta_model_name = "HooshvareLab/roberta-fa-zwnj-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name,cache_dir=Path.cwd())

In [6]:
from transformers import RobertaConfig, RobertaModel
from pathlib import Path
roberta_model = RobertaModel.from_pretrained(roberta_model_name)
roberta_config = RobertaConfig.from_pretrained(roberta_model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at HooshvareLab/roberta-fa-zwnj-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from transformers.models.roberta.modeling_roberta import RobertaModel
from torch import nn
class ParsRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = roberta_config
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.parsbert = RobertaModel(config)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()
    
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        
        # Use model body to get encoder representations
        outputs = self.parsbert(input_ids, attention_mask=attention_mask,
        token_type_ids=token_type_ids, **kwargs)
        
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions)
        

In [8]:
ner_tags = train["ner"]
ner_tag_names = set(tag for tags in ner_tags for tag in tags)
                
index2tag = {idx: tag for idx, tag in enumerate(ner_tag_names)}
tag2index = {tag: idx for idx, tag in enumerate(ner_tag_names)}

In [9]:
from transformers import AutoConfig

roberta_config = AutoConfig.from_pretrained(roberta_model_name,num_labels=4,
                                            id2label=index2tag, label2id=tag2index)

In [10]:
import pandas as pd

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
per_text = " ".join(train['words'][0])
input_ids = roberta_tokenizer.encode(per_text, return_tensors="pt")
roberta_tokens = roberta_tokenizer(per_text).tokens()
pd.DataFrame([roberta_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128
Tokens,<s>,U,ers,us,ĠO,ro,d,in,um,Ġet,ĠCh,and,in,um,Ġpro,ced,ere,Ġin,ce,per,unt,"Ġ,",Ġin,Ġmed,io,Ġe,or,um,"Ġ,",ĠN,ou,a,-,u,ill,i,Ġd,im,iss,a,"Ġ,",Ġn,omen,Ġc,u,ius,ĠPer,eg,Ġf,u,er,at,"Ġ,",Ġin,Ġqu,a,Ġh,om,ines,Ġs,ept,u,ag,int,a,Ġu,ill,ar,um,Ġf,u,er,ant,Ġc,ong,reg,ati,"Ġ,",Ġet,Ġd,im,iss,o,Ġmon,aster,io,ĠE,g,res,ĠC,ister,ci,ens,is,Ġ,ord,in,is,"Ġ,",Ġin,Ġqu,od,Ġt,an,qu,am,Ġm,un,it,um,Ġc,ast,rum,Ġse,Ġm,il,ites,Ġet,Ġmult,e,Ġd,om,ine,Ġre,ce,per,ant,Ġ.,</s>
Input IDs,0,63,4219,2831,2083,1692,78,949,3751,23590,5475,2885,949,3751,6467,28654,11916,4142,3707,4682,16346,4221,4142,37180,6232,6088,1172,3751,4221,1881,2088,75,23,95,6078,83,3094,3515,12046,75,4221,4120,36980,2578,95,21077,14288,7048,2927,95,837,1276,4221,4142,16492,75,4472,2221,24707,2529,31578,95,2963,7689,75,19506,6078,1201,3751,2927,95,837,5744,2578,8682,24941,35610,4221,23590,3094,3515,12046,89,35230,13078,6232,1696,81,13190,1040,27467,30634,7841,2433,231,6543,949,2433,4221,4142,16492,3591,1753,1105,6240,2350,2807,3171,1582,3751,2578,5345,40186,20796,2807,2544,27639,23590,27793,79,3094,2221,5372,5750,3707,4682,5744,19225,2


In [62]:
from transformers import RobertaTokenizerFast
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name, add_prefix_space=True)

def tokenize_and_align_labels(examples):
    tokenized_inputs = roberta_tokenizer(examples["words"], truncation=True,
    is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_token = label[word_idx]
                # Use the label map to get the numerical value for each entity
                label_ids.append(tag2index[label_token])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels,batched=True,
                      remove_columns=['id','words','lang','ner'])

In [13]:
train_encoded = encode_panx_dataset(train)
test_encoded = encode_panx_dataset(test)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [14]:
from transformers import AutoConfig

roberta_config = AutoConfig.from_pretrained(roberta_model_name,num_labels=4,
                                            id2label=index2tag, label2id=tag2index)

In [15]:
train_encoded

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500000
})

In [16]:
import numpy as np

def align_predictions(predictions, label_ids):
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        labels_list, preds_list = [], []
        for batch_idx in range(batch_size):
            example_labels, example_preds = [], []
            for seq_idx in range(seq_len):
                # Ignore label IDs = -100
                if label_ids[batch_idx, seq_idx] != -100:
                    example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                    example_preds.append(index2tag[preds[batch_idx][seq_idx]])
            labels_list.append(example_labels)
            preds_list.append(example_preds)
        return preds_list, labels_list

In [17]:
from transformers import TrainingArguments
import torch

num_epochs = 2
batch_size = 18
logging_steps = len(train_encoded) 

training_args = TrainingArguments(
    output_dir="output", log_level="error", num_train_epochs=num_epochs,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    eval_accumulation_steps=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=18,
    learning_rate = 1e-4,
    seed=42,
    logging_strategy="steps", evaluation_strategy="epoch",
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False)


In [18]:
def model_init():
    return (ParsRobertaForTokenClassification
                  .from_pretrained(roberta_model_name, config=roberta_config,cache_dir=Path.cwd())
                  .to(device))

In [19]:
from transformers import DataCollatorForTokenClassification

class CustomDataCollator(DataCollatorForTokenClassification):
    def __call__(self, features):
        batch = super().__call__(features)
        # Filter out the '-100' label IDs (padding tokens)
        batch["labels"] = torch.where(batch["labels"] != -100, batch["labels"], -100)
        return batch

# Use the custom data collator
data_collator = DataCollatorForTokenClassification(tokenizer=roberta_tokenizer)

In [20]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
    eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [21]:
from transformers import Trainer
import torch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainer = Trainer(model_init=model_init, args=training_args,
                    data_collator=data_collator, compute_metrics=compute_metrics,
                    train_dataset=train_encoded,
                    eval_dataset=test_encoded,
                    tokenizer=roberta_tokenizer)


In [22]:
result = trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.09258,0.576666
2,No log,0.082258,0.618711




In [63]:
text_de = "Jack Sparrow loves New York!"
tokens = roberta_tokenizer(text_de).tokens()
input_ids = roberta_tokenizer(text_de, return_tensors="pt").input_ids.to(device)
outputs = trainer.model(input_ids)[0]
# Take argmax to get most likely class per token
predictions = torch.argmax(outputs, dim=2)
# Convert to DataFrame
tags = list(ner_tag_names)
preds = [tags[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,<s>,ĠJack,ĠSp,ar,row,Ġl,ov,es,ĠNew,ĠYork,!,</s>
Tags,O,O,O,O,PER,O,O,O,O,O,LOC,O
