In [None]:
from collections import namedtuple
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
from conlleval import eval_f1score
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel, BertModel, AdamW

In [None]:
# Select the compute device once for the whole notebook: GPU when CUDA is
# usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Dataset locations (relative to the notebook's working directory).
PATH_TRAIN = "data/train/train_dataset.txt"
PATH_VAL = "data/val/val_dataset.txt"
# Three separate test sets, each evaluated on its own at the end of the notebook.
PATH_AQMAR_TEST = "data/test/aqmar_test_dataset.txt"
PATH_NEWS_TEST = "data/test/news_test_dataset.txt"
PATH_TWEETS_TEST = "data/test/tweets_test_dataset.txt"
# True: optimise every model parameter; False: optimise only the classifier head
# (see the optimizer setup cell, which branches on this flag).
FULL_FINETUNE = True

In [None]:
# Canonical tag set; the position in this list is the integer class id.
_NER_TAGS = ["O", "B-ORG", "I-ORG", "B-PER", "I-PER", "B-LOC", "I-LOC"]

# Forward (tag -> id) and reverse (id -> tag) lookup tables.
label_to_id = {tag: idx for idx, tag in enumerate(_NER_TAGS)}
id_to_label = dict((idx, tag) for tag, idx in label_to_id.items())

In [None]:
# Load the tokenizer that matches the AraBERT v0.1 checkpoint used below.
# Lower-casing is explicitly disabled.
arabert_tokenizer = AutoTokenizer.from_pretrained(
    "aubmindlab/bert-base-arabertv01",
    do_lower_case=False,
)

In [None]:
def clean_label(label):
    """Normalise a raw tag string to one of the seven canonical NER tags.

    The canonical tags are tried in a fixed order and the first one that
    occurs as a substring of `label` is returned, so a noisy tag such as
    "B-PERS" maps to "B-PER". "O" is tried last because the letter also
    occurs inside the ORG and LOC tags.

    Returns:
        The matching canonical tag, or None when no canonical tag occurs in
        `label`.
    """
    canonical_tags = ("B-ORG", "I-ORG", "B-PER", "I-PER", "B-LOC", "I-LOC", "O")
    for tag in canonical_tags:
        if tag in label:
            return tag
    return None

In [None]:
def preprocess_data(PATH_DATASET, tokenizer, max_length=512):
    """Read a two-column (token, tag) file and build padded model inputs.

    The file is parsed with whitespace as the column separator; a row whose
    two columns are both missing (a blank line) ends the current sentence.
    Each sentence is wrapped in "[CLS]" ... "[SEP]", tokenized word by word
    with `tokenizer`, and every sub-token receives a label:

    * the first piece of a word keeps the word's raw tag;
    * every further piece of the same word gets the cleaned continuation
      tag (a "B-" tag becomes the matching "I-" tag).

    Tokenizing per word keeps labels aligned even when a word is split into
    several pieces that do not start with "##".

    Args:
        PATH_DATASET: Path of the dataset file.
        tokenizer: Object exposing `tokenize(str)` and
            `convert_tokens_to_ids(list)`.
        max_length: Fixed output length. Longer sentences are truncated
            (the final "[SEP]" is kept); shorter ones are padded with id 0,
            mask 0 and the "O" label id.

    Returns:
        A list of `Instance` namedtuples with fields `tokenized_text`,
        `input_ids`, `input_mask`, `labels` and `label_ids`.
        `tokenized_text` and `labels` are unpadded; the other three have
        exactly `max_length` entries.

    Raises:
        ValueError: If `max_length` is smaller than 2.
        KeyError: If a tag cannot be mapped to a canonical label.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold [CLS] and [SEP]")

    # sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
    data = pd.read_csv(PATH_DATASET, encoding="utf-8", sep=r"\s+", header=None, skip_blank_lines=False)
    Instance = namedtuple("Instance", ["tokenized_text", "input_ids", "input_mask", "labels", "label_ids"])
    pad_label_id = label_to_id["O"]

    def build_instance(words, word_labels):
        # Turn one sentence (parallel word / tag lists) into a padded Instance.
        tokens = ["[CLS]"]
        token_labels = ["O"]
        for word, label in zip(words, word_labels):
            pieces = tokenizer.tokenize(word)
            if not pieces:
                # The tokenizer produced nothing for this word; skip it so
                # tokens and labels stay the same length.
                continue
            tokens.extend(pieces)
            token_labels.append(label)
            if len(pieces) > 1:
                continuation = clean_label(label.replace("B-", "I-"))
                token_labels.extend([continuation] * (len(pieces) - 1))

        # Reserve the last slot for [SEP] when the sentence is too long.
        tokens = tokens[:max_length - 1]
        token_labels = token_labels[:max_length - 1]
        tokens.append("[SEP]")
        token_labels.append("O")

        label_ids = [label_to_id[clean_label(tag)] for tag in token_labels]
        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
        input_mask = [1] * len(input_ids)

        padding = max_length - len(input_ids)
        input_ids.extend([0] * padding)
        input_mask.extend([0] * padding)
        label_ids.extend([pad_label_id] * padding)

        return Instance(tokens, input_ids, input_mask, token_labels, label_ids)

    dataset = []
    words = []
    word_labels = []
    for w, l in zip(data[0], data[1]):
        # pandas yields NaN in both columns for a blank separator line.
        if str(w) == "nan" and str(l) == "nan":
            dataset.append(build_instance(words, word_labels))
            words = []
            word_labels = []
            continue

        words.append(str(w))
        word_labels.append(str(l))

    # Keep the last sentence when the file does not end with a blank line.
    if words:
        dataset.append(build_instance(words, word_labels))

    return dataset

In [None]:
def transform_to_tensors(dataset):
    """Stack the per-instance id, mask and label lists into three tensors.

    Args:
        dataset: Iterable of objects exposing `input_ids`, `input_mask` and
            `label_ids` attributes.

    Returns:
        A tuple `(input_ids, input_mask, label_ids)` of tensors, each built
        from the corresponding attribute of every instance in order.
    """
    instances = list(dataset)
    all_input_ids = [inst.input_ids for inst in instances]
    all_input_masks = [inst.input_mask for inst in instances]
    all_label_ids = [inst.label_ids for inst in instances]

    return (
        torch.tensor(all_input_ids),
        torch.tensor(all_input_masks),
        torch.tensor(all_label_ids),
    )

In [None]:
class ModifiedBertForTokenClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear per-token classifier.

    `forward` returns a single tensor rather than a tuple: the logits when
    no labels are supplied, otherwise only the cross-entropy loss.
    """

    def __init__(self, config, num_labels=7):
        """Build the encoder, dropout and classification head.

        Args:
            config: Model configuration; `hidden_dropout_prob` and
                `hidden_size` are read from it.
            num_labels: Number of output classes of the linear head.
        """
        super().__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and head; return logits, or the loss if labelled.

        All arguments except `labels` are forwarded unchanged to the
        encoder.

        Returns:
            The classifier logits when `labels` is None. Otherwise the
            scalar cross-entropy loss, computed over every position, with
            positions whose `attention_mask` is not 1 excluded when a mask
            is given.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # First encoder output -> dropout -> per-token class scores.
        hidden_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(hidden_states)

        # Inference path: no labels, hand back the raw scores.
        if labels is None:
            return logits

        loss_fct = nn.CrossEntropyLoss()
        flat_logits = logits.view(-1, self.num_labels)
        flat_labels = labels.view(-1)

        if attention_mask is not None:
            # Replace the label of every masked-out position with the loss's
            # ignore_index so it does not contribute to the loss.
            is_active = attention_mask.view(-1) == 1
            ignored = torch.tensor(loss_fct.ignore_index).type_as(labels)
            flat_labels = torch.where(is_active, flat_labels, ignored)

        return loss_fct(flat_logits, flat_labels)


In [None]:
def evaluate(model, filename, dataset, dataloader):
    """Write per-token predictions to `filename` and score them.

    For every sequence the function writes one "token true_label
    pred_label" line per entry of the instance's `tokenized_text`
    (padding positions are therefore not written), followed by a blank
    line. The finished file is then passed to `eval_f1score`.

    Args:
        model: Model called as `model(input_ids=..., attention_mask=...)`
            and expected to return per-token class scores.
        filename: Path of the prediction file to (over)write.
        dataset: Indexable collection of instances exposing
            `tokenized_text` and `labels`; must be in the same order as
            the rows yielded by `dataloader`.
        dataloader: Iterable of `(input_ids, input_mask, labels)` batches.
            Any batch size is supported.

    Returns:
        The second element of the tuple returned by `eval_f1score`.
    """
    model.eval()

    # Run on the device the model actually lives on; fall back to the
    # notebook-level `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else device

    out_path = "{}".format(filename)
    sample_idx = 0
    # The context manager guarantees the file is closed even if inference fails.
    # UTF-8 matches the encoding used when the dataset was read.
    with torch.no_grad(), open(out_path, "w", encoding="utf-8") as fw:
        for batch in tqdm(dataloader):
            input_ids, input_mask, _ = batch
            input_ids = input_ids.to(model_device)
            input_mask = input_mask.to(model_device)
            output = model(input_ids=input_ids, attention_mask=input_mask)

            # Highest-scoring class per token, one row per sequence in the batch.
            batch_pred_ids = output.argmax(dim=-1).tolist()
            for pred_ids in batch_pred_ids:
                instance = dataset[sample_idx]
                for word, raw_label, pred_id in zip(instance.tokenized_text, instance.labels, pred_ids):
                    true_label = clean_label(raw_label)
                    pred_label = id_to_label[pred_id]
                    fw.write("{} {} {}\n".format(word, true_label, pred_label))
                fw.write("\n")
                sample_idx += 1

    _, f1_score_arr = eval_f1score(out_path)

    return f1_score_arr
    

In [None]:
def train(model, optimizer, train_dataloader, val_dataloader, accumulation_steps=32, epochs=1, device="cpu", val_dataset=None):
    """Fine-tune `model` with gradient accumulation and keep the best weights.

    After every epoch the average training and validation loss are printed
    and the model is scored with `evaluate` (predictions go to "val.txt").
    Whenever the score improves, a CPU copy of the weights is stored; the
    best weights are loaded back into `model` before returning.

    Args:
        model: Model returning a scalar loss when called with
            `input_ids`, `attention_mask` and `labels`.
        optimizer: Optimizer over the parameters to update.
        train_dataloader: Iterable of `(input_ids, input_mask, label_ids)`
            training batches.
        val_dataloader: Same, for validation; must support `len()`.
        accumulation_steps: Number of batches whose gradients are summed
            before one optimizer step.
        epochs: Number of passes over the training data.
        device: Device the model and batches are moved to.
        val_dataset: Instances matching `val_dataloader` row for row, as
            required by `evaluate`. Defaults to the notebook-level
            `dataset_val`.

    Returns:
        `model`, holding the weights of the best-scoring epoch (or its
        current weights if no epoch was run).
    """
    if val_dataset is None:
        # Backward-compatible default: the notebook's global validation set.
        val_dataset = dataset_val

    model.to(device)
    # -inf so that the first evaluated epoch always becomes the baseline.
    best_f1_score = float("-inf")
    best_state = None

    for epoch in range(epochs):
        training_loss = 0.0
        val_loss = 0.0

        model.train()
        # Start every epoch from clean gradients.
        optimizer.zero_grad()
        cnt_step = 0
        for batch in tqdm(train_dataloader):

            input_ids, input_mask, label_ids = batch
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)

            loss = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            training_loss += loss.item()

            # Scale so the accumulated gradient is an average over the group.
            loss = loss / accumulation_steps
            loss.backward()

            cnt_step += 1
            if cnt_step % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Apply the gradients of a trailing, incomplete accumulation group
        # instead of dropping them or leaking them into the next epoch.
        if cnt_step % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        training_loss /= max(cnt_step, 1)

        model.eval()
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                input_ids, input_mask, label_ids = batch
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                label_ids = label_ids.to(device)

                loss = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
                val_loss += loss.item()

            val_loss /= max(len(val_dataloader), 1)

            print("epoch {}: training loss {}, val loss {}".format(epoch, training_loss, val_loss))

        f1_score_arr = evaluate(model, "val.txt", val_dataset, val_dataloader)

        # NOTE(review): index 3 is presumably the overall F1 in conlleval's
        # result array -- confirm against eval_f1score.
        current_f1_score = f1_score_arr[3]
        if current_f1_score > best_f1_score:
            best_f1_score = current_f1_score
            # Snapshot the weights: `model` keeps changing in later epochs,
            # so holding a reference to it would not preserve this state.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            print("We have a better model with an F1 Score: {}".format(best_f1_score))

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

In [None]:
# Instantiate the token-classification model from the AraBERT v0.1 checkpoint.
arabert_model = ModifiedBertForTokenClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv01"
)

In [None]:
# Tokenize and label-align every split with the same tokenizer.
dataset_train = preprocess_data(PATH_DATASET=PATH_TRAIN, tokenizer=arabert_tokenizer)
dataset_val = preprocess_data(PATH_DATASET=PATH_VAL, tokenizer=arabert_tokenizer)

# The three test sets are kept separate so each can be scored on its own.
dataset_aqmar_test = preprocess_data(PATH_DATASET=PATH_AQMAR_TEST, tokenizer=arabert_tokenizer)
dataset_news_test = preprocess_data(PATH_DATASET=PATH_NEWS_TEST, tokenizer=arabert_tokenizer)
dataset_tweets_test = preprocess_data(PATH_DATASET=PATH_TWEETS_TEST, tokenizer=arabert_tokenizer)

In [None]:
# Convert each split's instances into (input_ids, input_mask, label_ids) tensors.
(train_tensors_input_ids,
 train_tensors_input_mask,
 train_tensors_label_ids) = transform_to_tensors(dataset_train)

(val_tensors_input_ids,
 val_tensors_input_mask,
 val_tensors_label_ids) = transform_to_tensors(dataset_val)

(test_aqmar_tensors_input_ids,
 test_aqmar_tensors_input_mask,
 test_aqmar_tensors_label_ids) = transform_to_tensors(dataset_aqmar_test)

(test_news_tensors_input_ids,
 test_news_tensors_input_mask,
 test_news_tensors_label_ids) = transform_to_tensors(dataset_news_test)

(test_tweets_tensors_input_ids,
 test_tweets_tensors_input_mask,
 test_tweets_tensors_label_ids) = transform_to_tensors(dataset_tweets_test)

In [None]:
# Bundle the three tensors of each split into one indexable dataset.
train_tensor_dataset = TensorDataset(
    train_tensors_input_ids,
    train_tensors_input_mask,
    train_tensors_label_ids,
)
val_tensor_dataset = TensorDataset(
    val_tensors_input_ids,
    val_tensors_input_mask,
    val_tensors_label_ids,
)
test_aqmar_tensor_dataset = TensorDataset(
    test_aqmar_tensors_input_ids,
    test_aqmar_tensors_input_mask,
    test_aqmar_tensors_label_ids,
)
test_news_tensor_dataset = TensorDataset(
    test_news_tensors_input_ids,
    test_news_tensors_input_mask,
    test_news_tensors_label_ids,
)
test_tweets_tensor_dataset = TensorDataset(
    test_tweets_tensors_input_ids,
    test_tweets_tensors_input_mask,
    test_tweets_tensors_label_ids,
)

In [None]:
# Training batches are drawn in random order so the model does not see the
# sentences in file order; nothing indexes the training set by position.
train_dataloader = DataLoader(train_tensor_dataset, sampler=RandomSampler(train_tensor_dataset), batch_size=1)

# Validation and test loaders must stay sequential: `evaluate` pairs the
# i-th row of the loader with the i-th instance of the matching dataset.
val_dataloader = DataLoader(val_tensor_dataset, sampler=SequentialSampler(val_tensor_dataset), batch_size=1)
test_aqmar_dataloader = DataLoader(test_aqmar_tensor_dataset, sampler=SequentialSampler(test_aqmar_tensor_dataset), batch_size=1)
test_news_dataloader = DataLoader(test_news_tensor_dataset, sampler=SequentialSampler(test_news_tensor_dataset), batch_size=1)
test_tweets_dataloader = DataLoader(test_tweets_tensor_dataset, sampler=SequentialSampler(test_tweets_tensor_dataset), batch_size=1)

In [None]:
param_optimizer = list(arabert_model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
# 'LayerNorm.weight' covers the layer-norm scale; 'gamma' / 'beta' are kept
# for checkpoints that still use the legacy parameter names.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

# The per-group option must be spelled 'weight_decay': the optimizer fills in
# its default for that key and leaves unknown keys such as 'weight_decay_rate'
# unused, which would disable weight decay entirely.
if FULL_FINETUNE:
    print('ALL FINETUNE')
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    print('NO ALL FINETUNE')
    # Only the classification head is handed to the optimizer.
    optimizer_grouped_parameters = [
        {'params': list(arabert_model.classifier.parameters()),
         'weight_decay': 0.01}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

In [None]:
# Run fine-tuning for a single epoch on the selected device.
trained_model = train(
    model=arabert_model,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=1,
    device=device,
)

In [None]:
# Score the trained model on the validation set and on each test set; every
# call writes its own prediction file. The last call's result is the cell output.
evaluate(model=trained_model, filename="val.txt", dataset=dataset_val, dataloader=val_dataloader)
evaluate(model=trained_model, filename="test_aqmar.txt", dataset=dataset_aqmar_test, dataloader=test_aqmar_dataloader)
evaluate(model=trained_model, filename="test_news.txt", dataset=dataset_news_test, dataloader=test_news_dataloader)
evaluate(model=trained_model, filename="test_tweets.txt", dataset=dataset_tweets_test, dataloader=test_tweets_dataloader)