In [1]:
!pip install torch transformers conllu unidecode pandas numpy datasets evaluate colorama lime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
base_path = 'gdrive/MyDrive/NLP_Projects/lexical_resources'
lang = 'maltese'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
!ls 

gdrive	sample_data


In [4]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Change dirs

Key to change directories for imports of modules and facilitates file import

In [5]:

%cd gdrive/MyDrive/NLP_Projects/lexical_resources  

/content/gdrive/.shortcut-targets-by-id/13KkCD2fkNEO2nduVuPwn1HhOMzKANE0K/lexical_resources


In [6]:
import numpy as np
from tqdm import tqdm
import torch
import torch.nn.functional as F
import transformers
from transformers import PreTrainedTokenizerFast, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoTokenizer
from lime.lime_text import LimeTextExplainer
from datasets import Dataset
import pandas as pd
import conllu
from unidecode import unidecode
import matplotlib.pyplot as plt
from utils.dataloader import PreDataCollator
from utils.utils import *

In [7]:
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

True
cuda


In [8]:
#model_name = "MLRS/mBERTu"  # DONE
#model_name = "Zappandy/mBERTu-arabic"  # DONE
#model_name = "bert-base-multilingual-cased"  # DONE
model_name = "Zappandy/mBERT-rom-arabic"

#model_name = "distilbert-base-uncased"  # "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # AutoTokenizerFast



Downloading (…)okenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

# Offset mapping

Example of masking and why it works

In [9]:
encoding = tokenizer(['Il-', 'Membru', 'tal-', 'Kumitat'], is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=512)  # using it for aligning
print(encoding.keys())
tokenizer.convert_ids_to_tokens(encoding['input_ids'])
encoded_tags = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
encoded_tags
print(tokenizer.convert_ids_to_tokens(encoding['input_ids']))
print(encoding['offset_mapping'])
for idx, mapping in enumerate(encoding['offset_mapping']):
  if mapping[0] == 0 and mapping[1] != 0:
    print(idx)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])
['[CLS]', 'Il', '-', 'Mem', '##bru', 'tal', '-', 'Ku', '##mita', '##t', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

# Helper functions

In [10]:
def get_tag_mappings(unique_tags):
    """
    It creates a dictionary that maps each tag to a unique integer, and another dictionary that maps
    each unique integer to a tag
    :return: A dictionary of tags to ids and a dictionary of ids to tags.
    """    
    
    tags_to_ids = {k: v for v, k in enumerate(unique_tags)}
    ids_to_tags = {v: k for v, k in enumerate(unique_tags)}

    return tags_to_ids, ids_to_tags


In [11]:
import re

def sentence_generator(tokens, spaces):
    sentences = []
    for sent, space in zip(tokens, spaces):  
      join_sent = sent[0]
      for tok, status in zip(sent[1:], space):
        if status:
            if 'SpaceAfter' in status.keys():  #  sometimes this can be conj?
              
              if status['SpaceAfter'] == 'No':
                  join_sent += tok
              else:
                  join_sent += (' ' + tok)
            else:
                join_sent += (' ' + tok)
        else:
            join_sent += (' ' + tok)
      sentences.append(join_sent)     
    return sentences

def read_conllu(file):
    """
    file: conllu format file
    returns: sentences and upostags per sentence
    """
    corpus = open(file, "r", encoding="utf-8")
    data = conllu.parse(corpus.read())
    tokens = [[unidecode(token['form']) for token in sentence] for sentence in data]
    spaces = [[token['misc'] for token in sentence] for sentence in data]
    tags = [[token['upos'] for token in sentence] for sentence in data]  # xpos or upos
    sentences = sentence_generator(tokens, spaces)
    #align_tokenizations(sentences, tags)

    return {"tokens": tokens, "labels": tags, "space": spaces, "sentences": sentences}


In [12]:
malt_base_path = os.getcwd() + f"/Languages/{lang}/UD_{lang}/"
#malt_base_path = f"/Languages/{lang}/UD_{lang}/"

train_data = read_conllu(malt_base_path + "mt_mudt-ud-train.conllu")
dev_data = read_conllu(malt_base_path + "mt_mudt-ud-dev.conllu")
malt_tags = train_data["labels"]



malt_tag_set = list(set(tag for tags in malt_tags for tag in tags))


# Defining Model - Important!


In [13]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(malt_tag_set))  # AutoModelForTokenClassification

Downloading (…)lve/main/config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at Zappandy/mBERT-rom-arabic were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at Zappandy/mBERT-rom-arabic and a

In [14]:
MAX_LEN = 256
tags_to_ids, ids_to_tags = get_tag_mappings(malt_tag_set)
collator = PreDataCollator(tokenizer=tokenizer, max_len=MAX_LEN, tags_to_ids=tags_to_ids)

In [15]:

train_dataset = Dataset.from_dict(train_data)
dev_dataset = Dataset.from_dict(dev_data)
train_tokenized = train_dataset.map(collator, remove_columns=train_dataset.column_names, batch_size=4, num_proc=4, batched=True)
dev_tokenized = dev_dataset.map(collator, remove_columns=dev_dataset.column_names, batch_size=4, num_proc=4, batched=True)

      

#0:   0%|          | 0/71 [00:00<?, ?ba/s]

#2:   0%|          | 0/71 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/71 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/70 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/28 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/27 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/27 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/27 [00:00<?, ?ba/s]

In [16]:
EPOCHS = 7
LEARNING_RATE = 5e-4
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
SAVE_STEPS = 100
EVAL_STEPS = 50
SAVE_LIMIT = 1  # was 2
WARMUP_STEPS = 50

output_dir = f"./Languages/{lang}/outputs/{model_name}-{lang}"

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='pt')



from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=output_dir,
  group_by_length=True,
  per_device_train_batch_size=TRAIN_BATCH_SIZE,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=EPOCHS,
  fp16=False,
  save_steps=SAVE_STEPS,
  eval_steps=EVAL_STEPS,
  logging_steps=EVAL_STEPS,
  learning_rate=LEARNING_RATE,
  warmup_steps=WARMUP_STEPS,
  save_total_limit=SAVE_LIMIT,
)




Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [18]:
from transformers import Trainer


trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer
)



In [19]:

# If you want to continue training from a checkpoint
# CHECKPOINT = 2500
# chkpt_model = f'{output_dir}/checkpoint-{CHECKPOINT}'
# trainer.train(chkpt_model)



In [20]:
trainer.train()

***** Running training *****
  Num examples = 1123
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 490
  Number of trainable parameters = 177275921
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.1027,0.491105,0.862376,0.748685
100,0.4017,0.378671,0.894407,0.818692
150,0.326,0.409189,0.888138,0.773624
200,0.2226,0.373746,0.90479,0.831589
250,0.1515,0.373348,0.910667,0.82271
300,0.1235,0.395273,0.915761,0.834014
350,0.0839,0.354213,0.919483,0.845498
400,0.0584,0.367828,0.926046,0.851107
450,0.0351,0.349599,0.930258,0.845134


***** Running Evaluation *****
  Num examples = 433
  Batch size = 8
***** Running Evaluation *****
  Num examples = 433
  Batch size = 8
Saving model checkpoint to ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/checkpoint-100
Configuration saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/checkpoint-100/config.json
Model weights saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 433
  Batch size = 8
***** Running Evaluation *****
  Num examples = 433
  Batch size = 8
Saving model checkpoint to ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/checkpoint-200
Configuration saved in ./L

TrainOutput(global_step=490, training_loss=0.25737274398609084, metrics={'train_runtime': 507.7089, 'train_samples_per_second': 15.483, 'train_steps_per_second': 0.965, 'total_flos': 1026774183373824.0, 'train_loss': 0.25737274398609084, 'epoch': 6.99})

In [21]:
trainer.save_model(f"{output_dir}/Final")

Saving model checkpoint to ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final
Configuration saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final/config.json
Model weights saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final/pytorch_model.bin
tokenizer config file saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final/tokenizer_config.json
Special tokens file saved in ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final/special_tokens_map.json


In [22]:
import collections
tagset = collections.defaultdict(int)

for tagging in malt_tags:
  for tag in tagging:
    tagset[tag] += 1

print('number of different tags:', len(tagset))

# print count and tag sorted by decreasing count
for tag, count in sorted(tagset.items(), reverse=True, key=lambda x: x[1]):
  print(count, tag)


number of different tags: 17
4459 NOUN
3197 VERB
2636 ADP
2560 PUNCT
1857 DET
1298 SCONJ
1215 ADJ
1209 PRON
892 ADV
867 AUX
786 PROPN
776 CCONJ
336 NUM
298 X
228 SYM
227 PART
39 INTJ


In [23]:
test_data = read_conllu(malt_base_path + "mt_mudt-ud-test.conllu")
test_dataset = Dataset.from_dict(test_data)
test_tokenized = test_dataset.map(collator, remove_columns=test_dataset.column_names, batch_size=4, num_proc=4, batched=True)

     

#0:   0%|          | 0/33 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/33 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/33 [00:00<?, ?ba/s]

#2:   0%|          | 0/33 [00:00<?, ?ba/s]

In [24]:
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

True
cuda


# mbert + MLM Arabic

In [25]:
unique_tags = malt_tag_set
# outputs, vis = feval(test_dataset, test_tokenized, model, device, unique_tags)

In [26]:
def outputs_to_csv(eval_model):
  eval_device = device
  eval_device = 'cpu'
  modelRegex = re.compile(r'(\w+(-)?\w+)+$')
  pattern = modelRegex.search(model_name)
  mo = pattern.group() if pattern else None
  print(mo)
  with torch.no_grad():
    eval_model.eval()
    outputs, vis = feval(test_dataset, test_tokenized, eval_model, eval_device, unique_tags)
  print(vis[10])
  sents = [outputs[i][0] for i in range(len(outputs))]
  tokens = [outputs[i][1] for i in range(len(outputs))]
  preds = [outputs[i][2] for i in range(len(outputs))]
  truths = [outputs[i][-1] for i in range(len(outputs))]
  data_output = {'Sentence': sents, 'Tokenized': tokens, 'Predictions': preds, 'Gold truths': truths}
  df = pd.DataFrame(data_output)
  df.to_csv(f"{mo}.csv", index=False)
  return df

In [27]:
print(vis[10])

NameError: ignored

In [28]:
test_model = AutoModelForTokenClassification.from_pretrained(output_dir + "/Final", num_labels=len(malt_tag_set))
#test_outputs, test_vis = feval(test_dataset, test_tokenized, test_model, device, unique_tags)
outputs_to_csv(test_model)

loading configuration file ./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final/config.json
Model config BertConfig {
  "_name_or_path": "./Languages/maltese/outputs/Zappandy/mBERT-rom-arabic-maltese/Final",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
 

mBERT-rom-arabic


100%|██████████| 518/518 [05:16<00:00,  1.64it/s]

Accuracy: 0.9336223245732864
F1: 0.8367187367823087
Tonio [42m[PROPN][PROPN][0m Fenech [42m[PROPN][PROPN][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m hu [42m[PRON][PRON][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m oggezzjoni [42m[NOUN][NOUN][0m ghat- [42m[ADP][ADP][0m talbiet [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m Oppozizzjoni [42m[NOUN][NOUN][0m imma [42m[CCONJ][CCONJ][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m wiehed [42m[NUM][NUM][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m jorbot [42m[VERB][VERB][0m il- [42m[DET][DET][0m kwistjoni [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m mercaptan [42m[PROPN][0m[41m[NOUN][0m mal- [42m[ADP][ADP][0m proceduri [42m[NOUN][NOUN][0m jekk [42m[SCONJ][SCONJ][0m certu [42m[ADJ][ADJ][0m xiri [42m[NOUN][NOUN][0m li [42m[SCONJ][SCONJ][0m sar [42m[VERB][VERB][0m mill- [42m[ADP][ADP][0m Enemalta [42m[PROPN][PROPN][0m hux [42m[PRON][PRON][0m korrett [42




Unnamed: 0,Sentence,Tokenized,Predictions,Gold truths
0,"Philip Schembri, ic-Chairman tal-Bord ta' Inkj...","[Philip, Schembri, ,, ic-, Chairman, tal-, Bor...",PROPN PROPN PUNCT DET NOUN ADP NOUN ADP NOUN A...,PROPN PROPN PUNCT DET NOUN ADP NOUN ADP NOUN A...
1,Ic-Chairman qal li l-Bord qal lil Joe Mizzi li...,"[Ic-, Chairman, qal, li, l-, Bord, qal, lil, J...",DET NOUN VERB SCONJ DET NOUN VERB ADP PROPN PR...,DET NOUN VERB SCONJ DET NOUN VERB ADP PROPN PR...
2,Dan hareg meta l-Kumitat Parlamentari dwar il-...,"[Dan, hareg, meta, l-, Kumitat, Parlamentari, ...",PRON VERB SCONJ DET NOUN ADJ ADP DET NOUN ADJ ...,PRON VERB SCONJ DET NOUN ADJ ADP DET NOUN ADJ ...
3,Ir-rapport kien tpogga fuq il-Mejda tal-Kamra ...,"[Ir-, rapport, kien, tpogga, fuq, il-, Mejda, ...",DET NOUN AUX VERB ADP DET NOUN ADP NOUN ADP NO...,DET NOUN AUX VERB ADP DET NOUN ADP NOUN ADP NO...
4,"Fil-bidu tal-laqgha, li tmexxiet mid-Deputat L...","[Fil-, bidu, tal-, laqgha, ,, li, tmexxiet, mi...",ADP NOUN ADP NOUN PUNCT SCONJ VERB ADP NOUN AD...,ADP NOUN ADP NOUN PUNCT SCONJ VERB ADP NOUN AD...
...,...,...,...,...
513,Il-Unicode hu sistema li qablu fuqha hafna paj...,"[Il-, Unicode, hu, sistema, li, qablu, fuqha, ...",DET NOUN PRON NOUN SCONJ VERB PRON DET NOUN SC...,DET PROPN PRON NOUN SCONJ VERB PRON DET NOUN S...
514,"Kwazi kull lingwa tad-dinja hija inkluza fih, ...","[Kwazi, kull, lingwa, tad-, dinja, hija, inklu...",NOUN DET NOUN ADP NOUN PRON VERB PRON PUNCT CC...,ADV DET NOUN ADP NOUN PRON VERB PRON PUNCT CCO...
515,It-tastiera Maltija li attivajt tuza din is-si...,"[It-, tastiera, Maltija, li, attivajt, tuza, d...",DET NOUN ADJ SCONJ VERB NUM PRON DET NOUN ADJ ...,DET NOUN ADJ SCONJ VERB VERB PRON DET NOUN ADJ...
516,Bil-Unicode tista' facilment tikteb u tikkorri...,"[Bil-, Unicode, tista', facilment, tikteb, u, ...",ADP PROPN VERB NOUN VERB CCONJ VERB ADP NOUN C...,ADP PROPN VERB ADV VERB CCONJ VERB ADP NOUN CC...


# Lime

In [29]:
class NERExplainerGenerator:
    
    def __init__(self, model_dir, number_of_labels, device):
        #self.model = AutoModelForTokenClassification.from_pretrained(model_dir, num_labels=number_of_labels)
        self.model = test_model
        #self.model = self.model.to(device)
        #self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)  # bug
        self.tokenizer = tokenizer
        
    def clean(self, sent):
        
        sentence = sent.strip().split() 
        pattern = re.compile("\ufffd|\u200e|\u200b\u200b|\u200b|\u200c|\u200f|\xad|\u0654|\u0652|\u0651|\u0650|\u0657|\u0656|\u064e|\u064b|\u0670|\u064f|\u064f",re.UNICODE)
        sentence = [pattern.sub('-',e) for e in sentence]
        sentence = [e.replace('-','') if len(e)>1 else e for e in sentence]
        
        return sentence
    
    def tokenize(self, sent):
        
        tokenized = self.tokenizer(sent,
         is_split_into_words=True, 
         return_offsets_mapping=True, 
         padding='max_length', 
         truncation=True, 
         max_length=256)
        
        
        return tokenized
    

    def get_predict_function(self, word_index, batch_size = 8):
        def predict_func(texts):
            
            tokenized = [self.tokenize(self.clean(text)) for text in texts]
            
            probas = None
            for i in range(0,len(tokenized),batch_size):
                
                if i+batch_size > len(tokenized):
                    j = len(tokenized)
                else:
                    j = i+batch_size
                    
                batch = tokenized[i:j]
                
                inp_ids = torch.as_tensor([b['input_ids'] for b in batch])#.to(device)
                mask = torch.as_tensor([b['attention_mask'] for b in batch])#.to(device)
                logits = self.model(input_ids=inp_ids, attention_mask=mask).logits
                probas_batch = F.softmax(logits, dim=-1).detach().numpy()
                
                if probas is None:
                    probas = probas_batch
                else:
                    probas = np.vstack((probas,probas_batch))
      

            print(probas.shape)
            return probas[:,word_index,:]
        
        return predict_func

In [30]:
# test_sent = "Ir-rapport kien tpogga fuq il-Mejda tal-Kamra tar-Rapprezentanti fit-30 ta' April li ghadda."
# test_labels = "DET NOUN AUX VERB ADP DET NOUN ADP NOUN ADP NOUN ADP NUM ADP PROPN SCONJ VERB PUNCT"
# get_token_idx(lime_model, test_sent, test_labels, unique_tags)

In [31]:
def get_token_idx(lime_model, sent, labels, tags):
    tokenized = lime_model.tokenize(sent.split())
    offset = tokenized['offset_mapping']
    index = [i for i,(a,b) in enumerate(offset) if a==0 and b!=0 and tokenized['input_ids'][i]!=6]
    tag_to_id = {t:i for i,t in enumerate(tags)}
    labels = labels.split()
    wordIds_to_tokenidx = [(ti,tag_to_id[labels[wi]]) for wi,ti in enumerate(index)]
    
    return wordIds_to_tokenidx

def explain(lime_model, explainer, tags, data, idx):
    
    original_sent = data.iloc[idx].Sentence
    tokens = eval(data.iloc[idx].Tokenized)
    #augmented_sent = data.iloc[idx].augmented_sen
    labels = data.iloc[idx].Predictions
    
    ids = get_token_idx(lime_model, original_sent, labels, tags)    
    
    for i, (word_index, label_index) in enumerate(ids):
        
        func = lime_model.get_predict_function(word_index)
        
        
        #exp = explainer.explain_instance(augmented_sent, func, 
                                         #num_features=20, num_samples=20, labels=(label_index,))
        exp = explainer.explain_instance(original_sent, func, 
                                         num_features=20, num_samples=20, labels=(label_index,))
        
        
        dir_ = './visualizations'
        if not os.path.exists(dir_):
            os.mkdir(dir_)
        if not os.path.exists(f'{dir_}/{str(idx)}'):
            os.mkdir(f'{dir_}/{str(idx)}')   
        
        
        filename = f'{dir_}/{str(idx)}/{tokens[i]}.html'
        exp.save_to_file(filename, text=original_sent)

In [32]:
#model = NERExplainerGenerator(f'../Experiment/output/{MODEL_NAME}-{LANG}-{SET}', len(tags), device)
lime_model = NERExplainerGenerator(test_model, unique_tags, device)
#lime_model = NERExplainerGenerator(model, unique_tags, device)
explainer = LimeTextExplainer(class_names=unique_tags, random_state=42)

In [33]:
modelRegex = re.compile(r'(\w+(-)?\w+)+$')
pattern = modelRegex.search(model_name)
mo = pattern.group() if pattern else None
data = pd.read_csv(mo + '.csv')

In [34]:
explain(lime_model, explainer, unique_tags, data, 10)


(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)
(20, 256, 17)


In [35]:
data.iloc[5]

Sentence       Il-Membru tal-Kumitat Leo Brincat talab li bha...
Tokenized      ['Il-', 'Membru', 'tal-', 'Kumitat', 'Leo', 'B...
Predictions    DET NOUN ADP NOUN PROPN PROPN VERB SCONJ ADP N...
Gold truths    DET NOUN ADP NOUN PROPN PROPN VERB SCONJ ADP N...
Name: 5, dtype: object

# mbert out of the box

In [None]:
unique_tags = malt_tag_set
outputs, vis = feval(test_dataset, test_tokenized, model, device, unique_tags)

100%|██████████| 518/518 [00:29<00:00, 17.85it/s]


Accuracy: 0.9264878533369457
F1: 0.8449541368490094


In [None]:
print(len(outputs))
sents = [outputs[i][0] for i in range(len(outputs))]
tokens = [outputs[i][1] for i in range(len(outputs))]
preds = [outputs[i][2] for i in range(len(outputs))]
truths = [outputs[i][-1] for i in range(len(outputs))]

df = pd.DataFrame(data_output)
df.to_csv("mBERT.csv", index=False)

518


In [None]:
print(vis[10])

Tonio [42m[PROPN][PROPN][0m Fenech [42m[PROPN][PROPN][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m hu [42m[PRON][PRON][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m oggezzjoni [42m[NOUN][NOUN][0m ghat- [42m[ADP][ADP][0m talbiet [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m Oppozizzjoni [42m[NOUN][NOUN][0m imma [42m[CCONJ][CCONJ][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m wiehed [42m[NUM][NUM][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m jorbot [42m[VERB][VERB][0m il- [42m[DET][DET][0m kwistjoni [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m mercaptan [42m[PROPN][0m[41m[NOUN][0m mal- [42m[ADP][ADP][0m proceduri [42m[NOUN][NOUN][0m jekk [42m[SCONJ][SCONJ][0m certu [42m[ADJ][ADJ][0m xiri [42m[NOUN][NOUN][0m li [42m[SCONJ][SCONJ][0m sar [42m[VERB][VERB][0m mill- [42m[ADP][ADP][0m Enemalta [42m[PROPN][PROPN][0m hux [42m[PRON][PRON][0m korrett [42m[ADJ][0m[41m[NOUN][0m . [42m[PUNCT][PUNCT][0m


In [None]:
device = 'cpu'
test_model = AutoModelForTokenClassification.from_pretrained(output_dir + "/Final", num_labels=len(malt_tag_set))
test_outputs, test_vis = feval(test_dataset, test_tokenized, test_model, device, unique_tags)

loading configuration file ./Languages/maltese/outputs/bert-base-multilingual-cased-maltese/Final/config.json
Model config BertConfig {
  "_name_or_path": "./Languages/maltese/outputs/bert-base-multilingual-cased-maltese/Final",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12":

Accuracy: 0.9326289171859478
F1: 0.8434185518183983





#mbertu arabic

In [None]:
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

True
cuda


In [None]:
unique_tags = malt_tag_set
outputs, vis = feval(test_dataset, test_tokenized, model, device, unique_tags)

100%|██████████| 518/518 [00:32<00:00, 15.95it/s]


Accuracy: 0.9546644992323671
F1: 0.862520006579891


In [None]:
print(vis[10])

Tonio [42m[PROPN][PROPN][0m Fenech [42m[PROPN][PROPN][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m hu [42m[PRON][PRON][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m oggezzjoni [42m[NOUN][NOUN][0m ghat- [42m[ADP][ADP][0m talbiet [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m Oppozizzjoni [42m[NOUN][NOUN][0m imma [42m[CCONJ][CCONJ][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m wiehed [42m[NUM][NUM][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m jorbot [42m[VERB][VERB][0m il- [42m[DET][DET][0m kwistjoni [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m mercaptan [42m[PROPN][0m[41m[NOUN][0m mal- [42m[ADP][ADP][0m proceduri [42m[NOUN][NOUN][0m jekk [42m[SCONJ][SCONJ][0m certu [42m[ADJ][ADJ][0m xiri [42m[NOUN][NOUN][0m li [42m[SCONJ][SCONJ][0m sar [42m[VERB][VERB][0m mill- [42m[ADP][ADP][0m Enemalta [42m[PROPN][PROPN][0m hux [42m[PRON][PRON][0m korrett [42m[ADJ][ADJ][0m . [42m[PUNCT][PUNCT][0m


# MBERTU NON-ARABIC RESUTLS

In [None]:
unique_tags = malt_tag_set
outputs, vis = feval(test_dataset, test_tokenized, model, device, unique_tags)

100%|██████████| 518/518 [00:24<00:00, 21.07it/s]


Accuracy: 0.9653210512056354
F1: 0.9060362201619968


In [None]:
print(vis[10])

Tonio [42m[PROPN][PROPN][0m Fenech [42m[PROPN][PROPN][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m hu [42m[PRON][PRON][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m oggezzjoni [42m[NOUN][NOUN][0m ghat- [42m[ADP][ADP][0m talbiet [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m Oppozizzjoni [42m[NOUN][NOUN][0m imma [42m[CCONJ][CCONJ][0m qal [42m[VERB][VERB][0m li [42m[SCONJ][SCONJ][0m wiehed [42m[NUM][NUM][0m m' [42m[PART][PART][0m ghandux [42m[VERB][VERB][0m jorbot [42m[VERB][VERB][0m il- [42m[DET][DET][0m kwistjoni [42m[NOUN][NOUN][0m tal- [42m[ADP][ADP][0m mercaptan [42m[PROPN][0m[41m[NOUN][0m mal- [42m[ADP][ADP][0m proceduri [42m[NOUN][NOUN][0m jekk [42m[SCONJ][SCONJ][0m certu [42m[ADJ][ADJ][0m xiri [42m[NOUN][NOUN][0m li [42m[SCONJ][SCONJ][0m sar [42m[VERB][VERB][0m mill- [42m[ADP][ADP][0m Enemalta [42m[PROPN][PROPN][0m hux [42m[PRON][PRON][0m korrett [42m[ADJ][ADJ][0m . [42m[PUNCT][PUNCT][0m


#Zero shot

In [None]:
from transformers import TokenClassificationPipeline

In [None]:
z_model_name = "bert-base-multilingual-cased"
z_tokenizer = AutoTokenizer.from_pretrained(z_model_name)
z_model = AutoModelForTokenClassification.from_pretrained(z_model_name)

pipeline = TokenClassificationPipeline(model=z_model, tokenizer=z_tokenizer)
outputs = pipeline("A test example")
print(outputs)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

[{'entity': 'LABEL_1', 'score': 0.55929786, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, {'entity': 'LABEL_1', 'score': 0.56222904, 'index': 2, 'word': 'test', 'start': 2, 'end': 6}, {'entity': 'LABEL_1', 'score': 0.5032979, 'index': 3, 'word': 'example', 'start': 7, 'end': 14}]


In [None]:
MAX_LEN = 256
tags_to_ids, ids_to_tags = get_tag_mappings(malt_tag_set)
z_collator = PreDataCollator(tokenizer=z_tokenizer, max_len=MAX_LEN, tags_to_ids=tags_to_ids)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [None]:
test_data = read_conllu(malt_base_path + "mt_mudt-ud-test.conllu")
test_dataset = Dataset.from_dict(test_data)
test_tokenized = test_dataset.map(z_collator, remove_columns=test_dataset.column_names, batch_size=4, num_proc=4, batched=True)

     

#0:   0%|          | 0/33 [00:00<?, ?ba/s]

   

#3:   0%|          | 0/33 [00:00<?, ?ba/s]

#1:   0%|          | 0/33 [00:00<?, ?ba/s]

#2:   0%|          | 0/33 [00:00<?, ?ba/s]

In [None]:
unique_tags = malt_tag_set
outputs, vis = feval(test_dataset, test_tokenized, z_model, device, unique_tags)

100%|██████████| 518/518 [04:57<00:00,  1.74it/s]

Accuracy: 0.010114693398356363
F1: 0.004451648938665045



