In [1]:
import torch
import pandas as pd
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel, BertModel


In [2]:
PATH_TRAIN = "data/train/train_dataset.txt"
PATH_VAL = "data/val/val_dataset.txt"
PATH_AQMAR_TEST = "data/test/aqmar_test_dataset.txt"
PATH_NEWS_TEST = "data/test/news_test_dataset.txt"

In [3]:
label_to_id = {"O":0, "B-ORG":1, "I-ORG":2, "B-PER":3, "I-PER":4, "B-LOC":5, "I-LOC":6}
id_to_label = {value: key for key, value in label_to_id.items()}

In [4]:
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01",do_lower_case=False)

In [5]:
def preprocess_data(PATH_DATASET, tokenizer, max_length=512):
    data = pd.read_csv(PATH_DATASET, delim_whitespace=True, header=None, skip_blank_lines=False)
    dataset = []
    tweet = ["[CLS]"]
    labels = ["O"]
    for w, l in zip(data[0], data[1]):
        if str(w) == "nan" and str(l) == "nan":
            tweet.append("[SEP]")
            labels.append("O")
            
            str_tweet = " ".join(tweet)
            tokenized_tweet = arabert_tokenizer.tokenize(str_tweet)
            
            cnt = 0 
            new_labels = []
            label_ids = []
            for i in tokenized_tweet:
                if "##" in i:
                    tok_label = labels[cnt - 1]
                    if "B-" in tok_label:
                        tok_label = tok_label.replace("B-", "I-")
                    new_labels.append(tok_label)
                    label_ids.append(label_to_id[tok_label])
                else:
                    new_labels.append(labels[cnt])
                    label_ids.append(label_to_id[labels[cnt]])
                    cnt += 1
                                    
            input_ids = tokenizer.convert_tokens_to_ids(tokenized_tweet)
                
            input_mask = [1] * len(input_ids)
            
            while len(input_ids) < max_length:
                input_ids.append(0)
                input_mask.append(0)
            
            dataset.append([tokenized_tweet, input_ids, input_mask, new_labels, label_ids])

            tweet = ["[CLS]"]
            labels = ["O"]
            continue
            
        
        tweet.append(str(w))
        labels.append(str(l))
        
        
    return dataset

In [6]:
class ModifiedBertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=7):
        super().__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [7]:
arabert_model = ModifiedBertForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv01")

In [8]:
dataset = preprocess_data(PATH_TRAIN, arabert_tokenizer)

In [9]:
print(dataset[0][0])
print(dataset[0][1])
print(dataset[0][2])
print(dataset[0][3])

arabert_model(torch.tensor([dataset[0][1]]))

['[CLS]', 'ففي', 'مساف', '##ه', '300', '##0', 'متر', 'موانع', 'خالف', 'العداء', 'المغربي', 'عبد', 'الغني', 'آيت', 'باح', '##ما', '##د', 'التوقعات', 'التي', 'كانت', 'ترشحه', 'للفوز', 'باحد', '##ي', 'الميداليات', 'واكتف', '##ي', 'بالمرتب', '##ه', 'الرابع', '##ه', 'في', 'المس', '##ابق', '##ه', 'النهائي', '##ه', 'لهذه', 'المس', '##ا', '##ف', '##ه', 'مسجلا', 'توقيت', '8', '##د', '##و', '##20', '##ث', '##و', '##05', '##ج', '[SEP]']
[17028, 3020, 11335, 909, 1128, 808, 3475, 25813, 8167, 33405, 47140, 2811, 19131, 1259, 1561, 4768, 891, 53091, 5878, 10348, 20847, 24464, 6251, 912, 59421, 26498, 912, 48065, 909, 32741, 909, 660, 6081, 14660, 909, 47500, 909, 10929, 6081, 883, 903, 909, 25262, 21440, 18, 891, 910, 4288, 887, 910, 4271, 888, 17030, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

(tensor([[[ 0.2745,  0.6423, -0.8280,  ..., -0.4021,  0.4501,  0.4468],
          [ 0.0801,  0.4459, -0.9089,  ..., -0.4047,  0.1387,  0.3629],
          [ 0.1287,  0.4684, -0.9252,  ..., -0.3287,  0.3051,  0.3348],
          ...,
          [ 0.2671,  0.7023, -0.8001,  ..., -0.3694,  0.4411,  0.4452],
          [ 0.2644,  0.6985, -0.8136,  ..., -0.3863,  0.4299,  0.4377],
          [ 0.2750,  0.7018, -0.8385,  ..., -0.4575,  0.4643,  0.4280]]],
        grad_fn=<AddBackward0>),)