# Load dataset

In [264]:
from datasets import load_from_disk
train = load_from_disk("train.hf")
test = load_from_disk("test.hf")

In [265]:
train = train.shuffle(seed=42)

Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\train.hf\cache-aab69560c24eae2b.arrow


In [266]:
test = test.shuffle(seed=42)

Loading cached shuffled indices for dataset at E:\ML\NLP_Toolbox\NER\test.hf\cache-6c7c52e5435094d8.arrow


# Processing Bert without cleaning

In [267]:
from transformers import AutoTokenizer
bert_model_name = "HooshvareLab/bert-fa-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

In [268]:
text = " ".join(train["words"][0])
bert_tokens = bert_tokenizer(text).tokens()
bert_tokens

['[CLS]',
 'جمعیت',
 'جمعیت',
 'این',
 'دهستان',
 'بر',
 'اساس',
 'سرشماری',
 'سال',
 '۱۳۸۵',
 '(',
 '۱',
 '٬',
 '۲۱۲',
 '##خانوار',
 ')',
 '۵',
 '٬',
 '۱۱۸',
 '##نفر',
 'بوده',
 'است',
 '.',
 '[SEP]']

In [269]:
" ".join(train["words"][0])

'جمعیت جمعیت این دهستان بر اساس سرشماری سال ۱۳۸۵ ( ۱٬۲۱۲خانوار ) ۵٬۱۱۸نفر بوده است .'

## Create a custom model for token classification

In [270]:
from transformers import BertModel, BertConfig

from pathlib import Path

bert_model = BertModel.from_pretrained("HooshvareLab/bert-fa-base-uncased",cache_dir=Path.cwd())
bert_config = BertConfig.from_pretrained("HooshvareLab/bert-fa-base-uncased",cache_dir=Path.cwd())

In [271]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.models.bert.modeling_bert import BertModel
from torch import nn
class ParsBertForTokenClassification(BertPreTrainedModel):
    config_class = bert_config
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.parsbert = BertModel(config)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()
    
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        
        # Use model body to get encoder representations
        outputs = self.parsbert(input_ids, attention_mask=attention_mask,
        token_type_ids=token_type_ids, **kwargs)
        
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions)
        

In [272]:
ner_tags = train["ner"]
ner_tag_names = set(tag for tags in ner_tags for tag in tags)
                
index2tag = {idx: tag for idx, tag in enumerate(ner_tag_names)}
tag2index = {tag: idx for idx, tag in enumerate(ner_tag_names)}

In [273]:
from transformers import AutoConfig

bert_config = AutoConfig.from_pretrained(bert_model_name,num_labels=4,
                                            id2label=index2tag, label2id=tag2index)

In [274]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
parsbert_model = (ParsBertForTokenClassification
                  .from_pretrained(bert_model_name, config=bert_config,cache_dir=Path.cwd())
                  .to(device))

Some weights of ParsBertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['bert.parsbert.encoder.layer.0.attention.self.query.weight', 'bert.parsbert.encoder.layer.10.attention.self.value.bias', 'bert.parsbert.encoder.layer.10.output.LayerNorm.weight', 'bert.parsbert.pooler.dense.weight', 'bert.parsbert.encoder.layer.9.attention.self.value.weight', 'bert.parsbert.encoder.layer.5.attention.self.key.bias', 'bert.parsbert.encoder.layer.10.intermediate.dense.bias', 'bert.parsbert.encoder.layer.3.attention.output.dense.bias', 'bert.parsbert.embeddings.position_embeddings.weight', 'bert.parsbert.encoder.layer.11.output.LayerNorm.bias', 'bert.parsbert.encoder.layer.0.output.dense.weight', 'bert.parsbert.encoder.layer.1.intermediate.dense.bias', 'bert.parsbert.encoder.layer.5.attention.self.value.weight', 'bert.parsbert.encoder.layer.4.attention.output.LayerNorm.bias', 'bert.parsbert.encoder.layer.7.attention.

In [275]:
import pandas as pd
en_text = "Jack Sparrow loves New York!"
input_ids = bert_tokenizer.encode(en_text, return_tensors="pt")
bert_tokens = bert_tokenizer(en_text).tokens()
pd.DataFrame([bert_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,[CLS],jack,spar,##row,love,##s,new,york,!,[SEP]
Input IDs,2,37223,77258,36783,27594,2032,14745,45148,1001,4


In [276]:
import pandas as pd

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
per_text = " ".join(train['words'][0])
input_ids = bert_tokenizer.encode(per_text, return_tensors="pt")
bert_tokens = bert_tokenizer(per_text).tokens()
pd.DataFrame([bert_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
Tokens,[CLS],جمعیت,جمعیت,این,دهستان,بر,اساس,سرشماری,سال,۱۳۸۵,(,۱,٬,۲۱۲,##خانوار,),۵,٬,۱۱۸,##نفر,بوده,است,.,[SEP]
Input IDs,2,4172,4172,2802,8806,2801,3561,8907,2844,9815,1006,1455,1394,30064,92751,1007,1459,1394,15658,18783,3225,2806,1012,4


In [277]:
outputs = parsbert_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"Number of tokens in sequence: {len(bert_tokens)}")
print(f"Shape of outputs: {outputs.shape}")

Number of tokens in sequence: 24
Shape of outputs: torch.Size([1, 24, 4])


In [278]:
def tag_text(text, tags, model, tokenizer):
    tags = list(ner_tag_names)
    
    # Get tokens with special characters
    tokens = bert_tokenizer(text).tokens()
    # Encode the sequence into IDs
    input_ids = bert_tokenizer(text, return_tensors="pt").input_ids.to(device)
    # Get predictions as distribution over 7 possible classes
    outputs = model(inputs)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [300]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = bert_tokenizer(examples["words"], truncation=True,
    is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append('-100')
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [301]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels,batched=True,
                      remove_columns=['id','words','lang','ner'])

In [302]:
train_encoded = encode_panx_dataset(train)

Map:   0%|          | 0/733507 [00:00<?, ? examples/s]

In [None]:
print(train_encoded['words'][0])
print('----------------------')
print(train_encoded['labels'][0])
print('----------------------')
print(train['ner'][0])


In [None]:
errors = []
for i in range(1000):
    if len(train_encoded['labels'][i]) ==  len(train_encoded['input_ids'][i]):
        errors.append(i)

errors

In [None]:
def align_predictions(predictions, label_ids):
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        labels_list, preds_list = [], []
        for batch_idx in range(batch_size):
            example_labels, example_preds = [], []
            for seq_idx in range(seq_len):
                # Ignore label IDs = -100
                if label_ids[batch_idx, seq_idx] != -100:
                    example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                    example_preds.append(index2tag[preds[batch_idx][seq_idx]])
            labels_list.append(example_labels)
            preds_list.append(example_preds)
        return preds_list, labels_list