In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers seqeval[gpu]

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m108.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetens

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [4]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [8]:
def read_data(path):
    sentences = []
    ner_tags = []
    with open(path, "r",  encoding='utf-8') as file:
        sentence = []
        ner_tag = []
        flag = 0
        for line in file:
            line = line.strip()
            # print(line.strip())
            if line != "":
                word, tag = line.split("\t")
                sentence.append(word)
                ner_tag.append(tag)
                flag = 0
            elif line=="" and flag==0:
                flag = 1
            elif line =="" and flag==1:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
                sentence = []
                ner_tag = []
                flag = 2
            elif flag==2:
                flag = 0

    return sentences, ner_tags

In [9]:
train_sentences, train_ner_tags = read_data("/content/drive/MyDrive/LLM/NER_DATA/train.conll")
test_sentences, test_ner_tags = read_data("/content/drive/MyDrive/LLM/NER_DATA/test.conll")

print(train_sentences[1])
print(train_ner_tags[1])

['دکتر', 'اصغری', 'دبیر', 'چهارمین', 'همایش', 'انجمن', 'زمین\u200cشناسی', 'ایران', 'در', 'این', 'زمینه', 'گفت', ':', 'از', 'مجموع', 'چهار', 'صد', 'مقاله', 'رسیده', 'به', 'دبیرخانه', 'همایش', '،', 'يك', 'صد', 'و', 'هشتاد', 'مقاله', 'ظرف', 'مدت', 'دو', 'روز', 'در', 'هشت', 'سالن', 'همایش', 'برگزار', 'شد', '.']
['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [10]:
def get_counts(sentences, ner_tags):
    # Total counts
    total_words = sum(len(sentence) for sentence in sentences)
    total_tags = sum(len(tag_list) for tag_list in ner_tags)

    # Unique counts
    unique_words = set(word for sentence in sentences for word in sentence)
    unique_tags = set(tag for tag_list in ner_tags for tag in tag_list)

    return total_words, total_tags, len(unique_words), len(unique_tags)


# Get counts for train data
train_total_words, train_total_tags, train_unique_words, train_unique_tags = get_counts(train_sentences, train_ner_tags)
print(f"Train data - Total words: {train_total_words}, Total tags: {train_total_tags}, Unique words: {train_unique_words}, Unique tags: {train_unique_tags}")

# Get counts for test data
test_total_words, test_total_tags, test_unique_words, test_unique_tags = get_counts(test_sentences, test_ner_tags)
print(f"Test data - Total words: {test_total_words}, Total tags: {test_total_tags}, Unique words: {test_unique_words}, Unique tags: {test_unique_tags}")

Train data - Total words: 24981986, Total tags: 24981986, Unique words: 361051, Unique tags: 7
Test data - Total words: 164503, Total tags: 164503, Unique words: 17824, Unique tags: 7


In [11]:
from collections import Counter

def get_tag_counts(ner_tags):
    # Flatten the tag lists and compute counts
    flat_tags = [tag for tag_list in ner_tags for tag in tag_list]
    tag_counts = Counter(flat_tags)
    return tag_counts

# Using the function on your train and test data
train_tag_counts = get_tag_counts(train_ner_tags)
test_tag_counts = get_tag_counts(test_ner_tags)

# Printing the counts for train data
print("Train data tag counts:")
for tag, count in train_tag_counts.items():
    print(f"{tag}: {count}")

print("\nTest data tag counts:")
for tag, count in test_tag_counts.items():
    print(f"{tag}: {count}")

Train data tag counts:
O: 22405858
B-PER: 381480
I-PER: 427473
B-LOC: 618852
I-LOC: 384798
B-ORG: 288365
I-ORG: 475160

Test data tag counts:
O: 153522
B-LOC: 3186
I-LOC: 702
B-PER: 1154
I-PER: 648
B-ORG: 2382
I-ORG: 2909


In [12]:
# Create label2id and id2label using the keys (tags) from train_tag_counts
label2id = {tag: idx for idx, tag in enumerate(train_tag_counts.keys())}
id2label = {idx: tag for tag, idx in label2id.items()}

print("label2id:", label2id)

label2id: {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-ORG': 5, 'I-ORG': 6}


In [None]:
# label2id = {k: v for v, k in enumerate(data.Tag.unique())}
# id2label = {v: k for v, k in enumerate(data.Tag.unique())}
# label2id

{'B-geo': 1,
 'B-gpe': 2,
 'B-org': 5,
 'B-per': 3,
 'B-tim': 7,
 'I-geo': 4,
 'I-gpe': 9,
 'I-org': 6,
 'I-per': 8,
 'I-tim': 10,
 'O': 0}

In [76]:
def data_to_dataframe(sentences, ner_tags):
    """Convert lists of words and NER tags into a DataFrame."""
    # Convert lists of words and tags to single strings
    sentence_strs = [' '.join(sentence) for sentence in sentences]
    ner_tag_strs = [','.join(tags) for tags in ner_tags]

    # Create and return a DataFrame
    return pd.DataFrame({
        'sentence': sentence_strs,
        'word_labels': ner_tag_strs
    })


train_df = data_to_dataframe(train_sentences, train_ner_tags)
test_df = data_to_dataframe(test_sentences, test_ner_tags)

In [50]:
train_df.head()

Unnamed: 0,sentence,word_labels
0,به عنوان مثال وقتی نشریات مدافع اصول و ارزشها ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,دکتر اصغری دبیر چهارمین همایش انجمن زمین‌شناسی...,"O,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,دکتر اکبر میرعرب در همایش بررسی و پیشگیری از ب...,"O,B-PER,I-PER,O,O,O,O,O,O,O,O,O,B-LOC,O,O,O,O,..."
3,اردبیل ـ استاندار اردبیل گفت : به مناسبت هفته ...,"B-LOC,O,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,حمید طاهایی افزود : برای اجرای این طرحها 0 میل...,"B-PER,I-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [51]:
test_df.head()

Unnamed: 0,sentence,word_labels
0,بنابراین نمی‌شود با ارزشها شوخی کرد و باید آن ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,اردبیل ـ استاندار اردبیل گفت : به مناسبت هفته ...,"B-LOC,O,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,اصفهان ـ 0 تپه و محوطه باستانی دوره‌های تاریخی...,"B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-LOC,I-LOC,..."
3,پیشینه این محوطه‌های باستانی ، مربوط به پیش از...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,محسن جاوری ، سرپرست هیأت باستان‌شناسی این مناط...,"B-PER,I-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [52]:
train_df.iloc[41].sentence

'اعضای هیأت\u200cرئیسه مجمع نمایندگان شهر تهران و شهرستانهای استان تهران در اولین نشست این مجمع كه به دعوت مهدی کروبی رئیس مجلس برگزار شد ، برگزیده شدند .'

In [53]:
train_df.iloc[41].word_labels

'O,O,B-ORG,I-ORG,I-ORG,I-ORG,O,B-LOC,I-LOC,I-LOC,O,O,O,O,O,O,O,O,B-PER,I-PER,O,B-ORG,O,O,O,O,O,O'

#### **Preparing the dataset and dataloader**

In [54]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [55]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-zwnj-base")

In [56]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [57]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [58]:
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

training_set = dataset(train_df, tokenizer, MAX_LEN)
testing_set = dataset(test_df, tokenizer, MAX_LEN)

TRAIN Dataset: (100, 2)
TEST Dataset: (20, 2)


In [59]:
training_set[1]

{'ids': tensor([    2,  4215, 41665,  4788,  8091,  8052,  4672,  2311,     9,  4228,
          2141,  1921,  1930,  2681,  2228,   133,  1925,  2654,  2569,  2857,
          3382,  3472,  1923, 12963,  8052,   590,     1,  2857,   623, 10988,
          3382,  3333,  2729,  2008,  2090,  1921,  3651,  5674,  8052,  3182,
          1948,   121,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [60]:
training_set[1]["ids"]

tensor([    2,  4215, 41665,  4788,  8091,  8052,  4672,  2311,     9,  4228,
         2141,  1921,  1930,  2681,  2228,   133,  1925,  2654,  2569,  2857,
         3382,  3472,  1923, 12963,  8052,   590,     1,  2857,   623, 10988,
         3382,  3333,  2729,  2008,  2090,  1921,  3651,  5674,  8052,  3182,
         1948,   121,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [61]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
به          O
عنوان       O
مثال        O
وقتی        O
نشریات      O
مدافع       O
اصول        O
و           O
ارزش        O
##ها        O
و           O
منا         O
##دی        O
انقلاب      O
و           O
اسلام       O
در          O
بالاترین    O
درجه        O
،           O
اولین       O
و           O
درشت        O
[ZWNJ]      O
ترین        O
تیتر        O
نشریه       O
خود         O
را          O


In [62]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

#### **Defining the model**

In [63]:
model = BertForTokenClassification.from_pretrained("HooshvareLab/bert-fa-zwnj-base",
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [64]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.2274, device='cuda:0', grad_fn=<NllLossBackward0>)

In [65]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 7])

In [66]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [67]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

And let's train the model!

In [68]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 1.8624955415725708
Training loss epoch: 0.5834132045507431
Training accuracy epoch: 0.770858608660387


#### **Evaluating the model**

In [69]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [70]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.3050558567047119
Validation Loss: 0.257312048971653
Validation Accuracy: 0.8839773650349775


In [71]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00        31
         ORG       0.00      0.00      0.00         9
         PER       0.00      0.00      0.00        10

   micro avg       0.00      0.00      0.00        50
   macro avg       0.00      0.00      0.00        50
weighted avg       0.00      0.00      0.00        50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### **Inference**

In [72]:
sentence = "ابراهیم رییسی به همدان رفت."

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

ابراهیم رییسی به همدان رفت .
['O', 'O', 'O', 'O', 'O', 'O', 'O']


In [75]:
from transformers import pipeline

pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe("سلام اسم من علی است و در ایران زندگی میکنم.")

[]

#### **Saving the model for future use**

In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 2s (899 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155332 files and directories cur

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
model_name = "bert-finetuned-ner"

# upload files to the hub
tokenizer.push_to_hub(
    repo_path_or_name=model_name,
    organization="nielsr",
    commit_message="Add tokenizer",
    use_temp_dir=True,
)
model.push_to_hub(
    repo_path_or_name=model_name,
    organization="nielsr",
    commit_message="Add model",
    use_temp_dir=True,
)

Cloning https://huggingface.co/nielsr/bert-finetuned-ner into local empty directory.
To https://huggingface.co/nielsr/bert-finetuned-ner
   7fb33a6..12404bd  main -> main

Cloning https://huggingface.co/nielsr/bert-finetuned-ner into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.39k/415M [00:00<?, ?B/s]

To https://huggingface.co/nielsr/bert-finetuned-ner
   12404bd..77117f5  main -> main



'https://huggingface.co/nielsr/bert-finetuned-ner/commit/77117f560fc015ea11bb2b1f0496bfc09cf792b3'

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "nielsr/bert-finetuned-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/369 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/415M [00:00<?, ?B/s]