In [2]:
!pip install transformers[torch]

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer,AutoTokenizer, BertConfig, BertForTokenClassification,AutoModelForTokenClassification,DataCollatorWithPadding

In [4]:
def gpu_check():
  torch.__version__
  #If ther 's a GPU available...
  if torch.cuda.is_available():
    #Tell Pytorch to use GPU.
    device = torch.device("cuda")
    print("There are %d GPU(s) avalilable." % torch.cuda.device_count())
    print('We will use the GPU:',torch.cuda.get_device_name(0))
    #If not..
  else:
    print('No GPU available,using the CPU instead.')
    device = torch.device("cpu")
    return device

In [5]:
device=gpu_check()
print(device)

There are 1 GPU(s) avalilable.
We will use the GPU: Tesla T4
None


In [6]:
data = pd.read_csv("/content/ArmanPersoNERCorpus.csv", encoding='utf-8-sig')

In [7]:
data.drop(columns='Unnamed: 0',inplace=True)

In [8]:
data.head()

Unnamed: 0,text,label,label_entity_word
0,افقی : 0 ـ از عوامل دوران پهلوی و نخست‌وزیر ای...,O O O O O O O O O O B-loc O O O O O O O O O O ...,"[{'Entity': 'O', 'Word': 'افقی'}, {'Entity': '..."
1,طاهایی گفت : 0 طرح عمرانی ، 0 طرح مدرسه‌سازی د...,B-pers O O O O O O O O O O O O O O O O O O O O...,"[{'Entity': 'B-PERS', 'Word': 'طاهایی'}, {'Ent..."
2,وي افزود : از آن جا كه منطقه یادشده به عنوان ي...,O O O O O O O O O O O O O O O O O O O B-loc I-...,"[{'Entity': 'O', 'Word': 'وي'}, {'Entity': 'O'..."
3,هیأت باستانی منطقه سمیرم همچنین موفق به کشف 0 ...,O O B-loc I-loc O O O O O O O O O O O O O,"[{'Entity': 'O', 'Word': 'هیأت'}, {'Entity': '..."
4,شهرضا ـ وزیر نیرو ، زمان بهره‌برداری از طرح تأ...,B-loc O O B-org O O O O O O O O B-loc I-loc O ...,"[{'Entity': 'B-LOC', 'Word': 'شهرضا'}, {'Entit..."


In [9]:
label2id ={'O':0,'B-pers':1,'I-pers':2, 'B-pro':3, 'I-pro':4,'B-loc':5,'I-loc':6,'B-fac':7,'I-fac':8,'B-event':9,'I-event':10,'B-org':11,'I-org':12}
id2label = {v: k for k, v in label2id.items()}
label2id

{'O': 0,
 'B-pers': 1,
 'I-pers': 2,
 'B-pro': 3,
 'I-pro': 4,
 'B-loc': 5,
 'I-loc': 6,
 'B-fac': 7,
 'I-fac': 8,
 'B-event': 9,
 'I-event': 10,
 'B-org': 11,
 'I-org': 12}

In [10]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(" ")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [11]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.text[index]
        word_labels = self.data.label[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [12]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

In [13]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (23043, 3)
TRAIN Dataset: (18434, 3)
TEST Dataset: (4609, 3)


In [14]:
training_set[0]["ids"]

tensor([    2,  4531, 28283, 65939,  4237, 26419,  2786, 26321,  2847,  4113,
         1014,  4985,  3322,     1,  3033,  1368,  2038,  3372,  2991, 26679,
         2834,  1379,  3099,  2831,  4615,  2791,  5926,  2808,  3013,  2789,
         4860,  4851,  2910,  8390,  1012,     4,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [15]:
training_set[0]["mask"]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [16]:
training_set[0]["targets"]

tensor([0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [17]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
روزنامه     B-pro
انگلیسیزبان  I-pro
نیشن        I-pro
چاپ         I-pro
اسلاماباد   I-pro
در          O
سرمقاله     O
خود         O
نوشت        O
:           O
کشته        O
شدن         O
[UNK]       O
تن          O
ط           O
##ي         O
چهار        O
روز         O
هشداردهنده  O
بود         O
و           O
دولت        O
برای        O
جلوگیری     O
از          O
تکرار       O
ان          O
باید        O
به          O


In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
model= AutoModelForTokenClassification.from_pretrained(
    'HooshvareLab/bert-fa-base-uncased',num_labels=13,id2label=id2label,label2id=label2id
).to(device)
data_collector = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.6547, grad_fn=<NllLossBackward0>)

In [21]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 13])

In [22]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [26]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [39]:
def valid(model):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    return labels, predictions

In [32]:
for epoch in range(EPOCHS):
    print("شروع آموزش...")
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

شروع آموزش...
Training epoch: 1
Training loss per 100 training steps: 0.0639883503317833
Training loss per 100 training steps: 0.14157950797920474
Training loss per 100 training steps: 0.117106607209298
Training loss per 100 training steps: 0.10483638394625541
Training loss per 100 training steps: 0.0921778612964069
Training loss per 100 training steps: 0.08242060954611889
Training loss per 100 training steps: 0.07550579705478982
Training loss per 100 training steps: 0.06957629896272254
Training loss per 100 training steps: 0.06442215605394755
Training loss per 100 training steps: 0.06111088871911166
Training loss per 100 training steps: 0.05824234040028841
Training loss per 100 training steps: 0.055795691575315554
Training loss per 100 training steps: 0.05298421479788063
Training loss per 100 training steps: 0.05075500330512897
Training loss per 100 training steps: 0.04879907761803239
Training loss per 100 training steps: 0.046800594018414465
Training loss per 100 training steps: 0.04

In [33]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=5edffd906af160e418b1959c27e3de1556dc81a59dc1dddbab5ddc3966821af5
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [47]:
from seqeval.metrics import classification_report

labels, predictions = valid(model)

print(classification_report([labels], [predictions]))

Validation loss per 100 evaluation steps: 0.0006449534557759762
Validation loss per 100 evaluation steps: 0.011401472311479323
Validation loss per 100 evaluation steps: 0.010721187446311951
Validation loss per 100 evaluation steps: 0.009722951968204904
Validation loss per 100 evaluation steps: 0.009862083781187001
Validation loss per 100 evaluation steps: 0.009537956475168051
Validation loss per 100 evaluation steps: 0.010037708946567088
Validation loss per 100 evaluation steps: 0.010147155808701529
Validation loss per 100 evaluation steps: 0.010077157447139985
Validation loss per 100 evaluation steps: 0.01003164639515872
Validation loss per 100 evaluation steps: 0.009869450099833083
Validation loss per 100 evaluation steps: 0.009599580936300185
Validation loss per 100 evaluation steps: 0.0096291652455017
Validation loss per 100 evaluation steps: 0.009629251601747803
Validation loss per 100 evaluation steps: 0.009666557916320559
Validation loss per 100 evaluation steps: 0.0094875000027

In [42]:
tokenizer.save_pretrained('Persian_NER_tockenizer')

('Persian_NER_tockenizer/tokenizer_config.json',
 'Persian_NER_tockenizer/special_tokens_map.json',
 'Persian_NER_tockenizer/vocab.txt',
 'Persian_NER_tockenizer/added_tokens.json',
 'Persian_NER_tockenizer/tokenizer.json')

In [35]:
model.save_pretrained('Persian_NER')

In [43]:
!zip -r persian_ner_model.zip 'Persian_NER'

  adding: Persian_NER/ (stored 0%)
  adding: Persian_NER/config.json (deflated 56%)
  adding: Persian_NER/pytorch_model.bin (deflated 8%)


In [44]:
!zip -r persian_ner_tockenizer.zip 'Persian_NER_tokenizer'

  adding: Persian_NER_tokenizer/ (stored 0%)
  adding: Persian_NER_tokenizer/vocab.txt (deflated 62%)
  adding: Persian_NER_tokenizer/special_tokens_map.json (deflated 42%)
  adding: Persian_NER_tokenizer/tokenizer_config.json (deflated 45%)
  adding: Persian_NER_tokenizer/tokenizer.json (deflated 72%)


In [41]:
sentence = "مسئول سیاست خارجه اتحادیه اروپا تحریماتی علیه ایران وضع کرد"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

مسيول سیاست خارجه اتحادیه اروپا تحریماتی علیه ایران وضع کرد
['O', 'B-org', 'I-org', 'I-org', 'I-org', 'O', 'O', 'O', 'B-org', 'O', 'O']


In [46]:
from transformers import pipeline

pipline = pipeline(task="token-classification", model=model.to(device), tokenizer=tokenizer, aggregation_strategy="simple")
pipline("مسئول سیاست خارجه اتحادیه اروپا تحریماتی علیه ایران وضع کرد")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'org',
  'score': 0.7772873,
  'word': 'سیاست خارجه اتحادیه اروپا',
  'start': 6,
  'end': 31},
 {'entity_group': 'org',
  'score': 0.9475831,
  'word': 'ایران',
  'start': 46,
  'end': 51}]