In [28]:
import torch
import pandas as pd
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel, BertModel


In [29]:
PATH_TRAIN = "data/train/train_dataset.txt"
PATH_VAL = "data/val/val_dataset.txt"
PATH_AQMAR_TEST = "data/test/aqmar_test_dataset.txt"
PATH_NEWS_TEST = "data/test/news_test_dataset.txt"

In [None]:
label_to_id = {"O":0, "B-ORG":1, "I-ORG":2, "B-PER":3, "I-PER":4, "B-LOC":5, "I-LOC":6}
id_to_label = {value: key for key, value in label_to_id.items()}

In [30]:
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01",do_lower_case=False)

In [64]:
def preprocess_data(PATH_DATASET, tokenizer, max_length=512):
    data = pd.read_csv(PATH_DATASET, delim_whitespace=True, header=None, skip_blank_lines=False)
    dataset = []
    tweet = ["[CLS]"]
    labels = ["O"]
    for w, l in zip(data[0], data[1]):
        if str(w) == "nan" and str(l) == "nan":
            tweet.append("[SEP]")
            labels.append("O")
            
            str_tweet = " ".join(tweet)
            tokenized_tweet = arabert_tokenizer.tokenize(str_tweet)
            
            cnt = 0 
            new_labels = []
            label_ids = []
            for i in tokenized_tweet:
                if "##" in i:
                    tok_label = labels[cnt - 1]
                    if "B-" in tok_label:
                        tok_label = tok_label.replace("B-", "I-")
                    new_labels.append(tok_label)
                    label_ids.append(label_to_id[tok_label])
                else:
                    new_labels.append(labels[cnt])
                    label_ids.append(label_to_id[labels[cnt]])
                    cnt += 1
                                    
            input_ids = tokenizer.convert_tokens_to_ids(tokenized_tweet)
                
            input_mask = [1] * len(input_ids)
            
            while len(input_ids) < max_length:
                input_ids.append(0)
                input_mask.append(0)
            
            dataset.append([tokenized_tweet, input_ids, input_mask, new_labels, label_ids])

            tweet = ["[CLS]"]
            labels = ["O"]
            continue
            
        
        tweet.append(str(w))
        labels.append(str(l))
        
        
    return dataset

In [65]:
class ModifiedBertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=7):
        super().__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [66]:
arabert_model = ModifiedBertForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv01")

In [67]:
dataset = preprocess_data(PATH_NEWS_TEST, arabert_tokenizer)

In [72]:
for i,j, in  zip(dataset[0][1], dataset[0][2]):
    print(i,j)

17028 1
1429 1
500 1
894 1
909 1
34272 1
6131 1
33774 1
52817 1
17030 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0


In [49]:
arabert_model(torch.tensor([dataset[0][1]]))

(tensor([[[-0.5815, -0.2678,  0.9537, -0.1876, -0.0735, -0.3350, -0.2344,
            0.0920],
          [-0.2125, -0.1957, -0.1214, -0.6950, -0.5942, -0.4129, -0.4650,
           -0.3354],
          [-0.7344,  0.1268, -0.3726, -0.6327, -0.5517,  0.2383, -0.4185,
           -0.2734],
          [ 0.1305,  0.0089,  0.1024, -0.1013, -0.3829,  0.2336, -0.5354,
           -0.3834],
          [-0.2254, -0.2040, -0.4411, -0.2509, -0.0516,  0.0088, -0.1244,
           -0.2285],
          [ 0.4821, -0.1052,  0.1394, -0.4682, -0.6113, -0.4717, -0.2342,
           -0.3640],
          [-0.5591, -0.2826,  0.6281, -0.3305, -0.4688, -0.3615, -0.1165,
           -0.0826],
          [-0.1495, -0.6609,  0.2846, -0.4250,  0.2454,  0.0622, -0.8206,
            0.4120],
          [ 0.5827, -0.3020, -0.2854, -0.2232,  0.2105, -0.3705, -0.2747,
            0.1033],
          [ 0.5592, -0.2954,  0.7138, -0.0664, -0.1785,  0.0428, -0.1397,
            0.0924]]], grad_fn=<AddBackward0>),)

In [50]:
import torch

labels = torch.tensor([1, 2, 3, 5])
one_hot = torch.zeros(4, 6)
one_hot[torch.arange(4), labels] = 1

reverted = torch.argmax(one_hot, dim=1)
assert (labels == reverted).all().item()


In [61]:
one_hot = torch.zeros(1, 6)

In [63]:
arabert_tokenizer.vocab

OrderedDict([('', 0),
             ('!', 1),
             ('"', 2),
             ('%', 3),
             ("'", 4),
             ('(', 5),
             (')', 6),
             ('*', 7),
             (',', 8),
             ('-', 9),
             ('/', 10),
             ('1', 11),
             ('2', 12),
             ('3', 13),
             ('4', 14),
             ('5', 15),
             ('6', 16),
             ('7', 17),
             ('8', 18),
             ('9', 19),
             (':', 20),
             ('<', 21),
             ('>', 22),
             ('?', 23),
             ('A', 24),
             ('B', 25),
             ('C', 26),
             ('D', 27),
             ('E', 28),
             ('F', 29),
             ('G', 30),
             ('H', 31),
             ('I', 32),
             ('J', 33),
             ('K', 34),
             ('L', 35),
             ('M', 36),
             ('N', 37),
             ('O', 38),
             ('P', 39),
             ('Q', 40),
             ('R', 41),
   