# Translate to English

In [1]:
from easynmt import EasyNMT

In [2]:
def translate_sentence(sent, model):
    """Translate ``sent`` into English using ``model``.

    Args:
        sent: Text handed to ``model.translate`` unchanged.
        model: Object exposing ``translate(text, target_lang=...)``; the
            notebook passes an ``EasyNMT`` instance.

    Returns:
        Whatever ``model.translate`` returns for the input.
    """
    return model.translate(sent, target_lang='en')

# Create alignment

In [3]:
import transformers
import itertools
import torch

In [4]:
def _encode_words(sentence, tokenizer):
    """Split ``sentence`` on whitespace and encode it sub-word by sub-word.

    Returns:
        ``(words, input_ids, sub2word)`` where ``words`` is the whitespace
        split, ``input_ids`` is the tensor produced by
        ``tokenizer.prepare_for_model`` and ``sub2word[k]`` is the index of
        the word that sub-word ``k`` belongs to.
    """
    words = sentence.strip().split()
    pieces = [tokenizer.tokenize(word) for word in words]
    piece_ids = [tokenizer.convert_tokens_to_ids(word_pieces) for word_pieces in pieces]
    input_ids = tokenizer.prepare_for_model(
        list(itertools.chain.from_iterable(piece_ids)),
        return_tensors='pt',
        model_max_length=tokenizer.model_max_length,
        truncation=True,
    )['input_ids']
    sub2word = [idx for idx, word_pieces in enumerate(pieces) for _ in word_pieces]
    return words, input_ids, sub2word


def create_alignment(src, tgt, model, tokenizer, *, align_layer=8, threshold=1e-3):
    """Align the words of ``src`` and ``tgt`` via encoder hidden states.

    Both sentences are split on whitespace, sub-tokenized and encoded
    separately.  Sub-word vectors from hidden layer ``align_layer`` are
    compared with a dot product; a sub-word pair is kept when its softmax
    probability exceeds ``threshold`` in both directions (source-to-target
    and target-to-source).  Sub-word pairs are then collapsed to word pairs.

    Args:
        src: Source sentence as a whitespace-separated string.
        tgt: Target sentence as a whitespace-separated string.
        model: Encoder called as ``model(ids, output_hidden_states=True)``
            whose output item ``[2]`` is the sequence of per-layer hidden
            states.  It is switched to eval mode as a side effect.
        tokenizer: Tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids``, ``prepare_for_model`` and
            ``model_max_length``.
        align_layer: Index of the hidden-state layer to compare (default 8).
        threshold: Minimum softmax probability, required in both
            directions, for a sub-word pair to count (default 1e-3).

    Returns:
        ``(src_words, tgt_words, pairs)`` where ``pairs`` is a sorted list of
        ``(src_word_index, tgt_word_index)`` tuples.
    """
    sent_src, ids_src, sub2word_map_src = _encode_words(src, tokenizer)
    sent_tgt, ids_tgt, sub2word_map_tgt = _encode_words(tgt, tokenizer)

    model.eval()
    with torch.no_grad():
        # [0, 1:-1] drops the first and last positions of the single batch
        # item, i.e. the special tokens added by prepare_for_model
        # (presumably [CLS]/[SEP] for BERT-style tokenizers).
        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]

        similarity = torch.matmul(out_src, out_tgt.transpose(-1, -2))
        prob_src_to_tgt = torch.softmax(similarity, dim=-1)
        prob_tgt_to_src = torch.softmax(similarity, dim=-2)
        mutual = (prob_src_to_tgt > threshold) * (prob_tgt_to_src > threshold)

    # .tolist() yields plain Python ints, so the word pairs are ordinary
    # hashable int tuples rather than 0-d tensors.
    align_words = {
        (sub2word_map_src[i], sub2word_map_tgt[j])
        for i, j in torch.nonzero(mutual, as_tuple=False).tolist()
    }
    return sent_src, sent_tgt, sorted(align_words)

# Named entity recognition

In [6]:
from deeppavlov import configs, build_model



In [13]:
# Build DeepPavlov's OntoNotes NER model from the multilingual BERT config;
# download=True fetches the model files when they are not available locally.
ner_model_m = build_model(configs.ner.ner_ontonotes_bert_mult_torch, download=True)

2021-10-21 20:01:05.699 INFO in 'deeppavlov.core.data.utils'['utils'] at line 95: Downloading from http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch.tar.gz to /home/tootiredone/.deeppavlov/ner_ontonotes_bert_mult_torch.tar.gz
100%|██████████| 1.38G/1.38G [03:29<00:00, 6.58MB/s]
2021-10-21 20:04:35.503 INFO in 'deeppavlov.core.data.utils'['utils'] at line 272: Extracting /home/tootiredone/.deeppavlov/ner_ontonotes_bert_mult_torch.tar.gz archive into /home/tootiredone/.deeppavlov/models
2021-10-21 20:04:54.321 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /home/tootiredone/.deeppavlov/models/ner_ontonotes_bert_mult_torch/bert-base-multilingual-cased/tag.dict]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias

# Propagate labels to the original language

In [20]:
def _merge_hyphenated_labels(ner_sent, ner_tags):
    """Collapse the tags of ``X - Y (- Z ...)`` token runs into one tag each.

    A run is detected whenever the token following the current one is ``'-'``.
    The surviving tag for a run is chosen by scanning its tags left to right:
    a ``B*`` tag always replaces the current choice, an ``I*`` tag replaces
    it only while the current choice is not a ``B*`` tag.
    """
    merged = []
    total = len(ner_sent)
    start = 0
    while start < total - 1:
        end = start
        while ner_sent[end + 1] == '-':
            if end + 3 < total:
                end += 2
            else:
                # The hyphen run reaches the end of the sentence.
                end = total - 1
                break
        label = ner_tags[start]
        for k in range(start + 1, end + 1):
            tag = ner_tags[k]
            if tag[0] == 'B' or (tag[0] == 'I' and label[0] != 'B'):
                label = tag
        merged.append(label)
        start = end + 1
    if start < total:
        # A single trailing token that was not part of any run.
        merged.append(ner_tags[-1])
    return merged


def propagate_labels(src, tgt, ner_sent, ner_tags, model, tokenizer):
    """Project NER tags predicted on ``src`` onto the words of ``tgt``.

    Args:
        src: Sentence the NER tags were predicted on (the notebook passes the
            English translation).
        tgt: Sentence that should receive the tags (the notebook passes the
            original sentence).
        ner_sent: Tokens of ``src`` as produced by the NER model.
        ner_tags: One tag per entry of ``ner_sent``.
        model: Encoder forwarded to ``create_alignment``.
        tokenizer: Tokenizer forwarded to ``create_alignment``.

    Returns:
        ``[tgt_words, tgt_tags]``: the whitespace-split words of ``tgt`` and
        one tag per word.  Words without an aligned source word, or whose
        aligned index lies beyond the merged tag list, get ``'O'``.
    """
    _, sent_tgt, alignment = create_alignment(src, tgt, model, tokenizer)
    # Invert to target-index -> source-index.  The pairs arrive sorted, so
    # when several source words align to one target word the highest source
    # index wins.
    tgt_to_src = {tgt_idx: src_idx for src_idx, tgt_idx in alignment}

    # Hyphenated compounds are merged so that tag positions line up with the
    # whitespace-split source words used by the alignment (presumably the NER
    # tokenizer emits the hyphen as its own token).
    new_labels = _merge_hyphenated_labels(ner_sent, ner_tags)

    tags = []
    for tgt_idx in range(len(sent_tgt)):
        src_idx = tgt_to_src.get(tgt_idx)
        if src_idx is not None and src_idx < len(new_labels):
            tags.append(new_labels[src_idx])
        else:
            tags.append('O')
    return [list(sent_tgt), tags]

# Read the dataset

In [8]:
def parse_dataset(filename):
    """Read a CoNLL-style file into ``(id, tokens, labels)`` triples.

    Expected layout, as handled by the code below: a header line starting
    with ``#`` that carries the sentence id from column 5 up to the character
    before ``domain``; then one line per token whose first
    whitespace-separated field is the token and whose last field is the
    label; sentences are separated by blank lines.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of ``(sentence_id, tokens, labels)`` tuples, one per non-empty
        sentence, in file order.  Each sentence owns its own lists.
    """
    sentences = []
    sent_id = ''
    tokens = []
    labels = []
    with open(filename, 'r', encoding='utf8') as f:
        for raw_line in f:
            line = raw_line.rstrip('\n')
            if not line.strip():
                # Blank line: the current sentence is complete.  Resetting
                # the buffers keeps repeated blank lines from emitting the
                # same sentence twice.
                if tokens:
                    sentences.append((sent_id, tokens, labels))
                    tokens, labels = [], []
            elif line[0] == '#':
                # Header line: keep any sentence still pending, then start a
                # new one.  NOTE(review): a token line that itself starts
                # with '#' is also treated as a header - confirm the data
                # never contains one.
                if tokens:
                    sentences.append((sent_id, tokens, labels))
                cut = line.find('domain')
                sent_id = line[5:cut - 1] if cut != -1 else line[5:].strip()
                tokens, labels = [], []
            else:
                # Token line: first field is the token, last is the label.
                columns = line.split()
                tokens.append(columns[0])
                labels.append(columns[-1])
    # Keep the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append((sent_id, tokens, labels))
    return sentences

In [9]:
# Maps both dataset tags and DeepPavlov tags onto one shared tag set.
# GRP is deliberately absent: DeepPavlov has no comparable tag.
label_map = {
    alias: target
    for target, aliases in (
        ("CORP", ("CORP", "ORG", "ORGANIZATION")),
        ("LOC", ("GPE", "LOCATION", "LOC")),
        ("PER", ("PER", "PERSON")),
        ("CW", ("WORK_OF_ART", "CW")),
        ("PROD", ("PRODUCT", "PROD")),
    )
    for alias in aliases
}

In [10]:
# Load the Russian dev split as a list of (id, tokens, labels) triples.
sentences = parse_dataset('data/RU-Russian/ru_dev.conll')

In [11]:
# Number of sentences parsed from the dev split.
len(sentences)

800

In [12]:
# Inspect the first parsed (id, tokens, labels) triple.
sentences[0]

('01997b3e-1ab7-431f-a461-309a8226586a',
 ['важным',
  'традиционным',
  'промыслом',
  'является',
  'производство',
  'пальмового',
  'масла',
  '.'],
 ['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O'])

# Run translation -> NER -> alignment pipeline

In [13]:
def prepare_target(labels, label_map):
    """Reduce BIO tags to the plain entity classes used for scoring.

    The ``B-``/``I-`` prefix is dropped and the remaining tag is looked up in
    ``label_map``.  Tags without such a prefix, and tags missing from
    ``label_map``, become ``"O"``.

    Args:
        labels: Iterable of tag strings such as ``"B-PER"`` or ``"O"``.
        label_map: Mapping from raw entity tag to the shared class name.

    Returns:
        A list with one class name (or ``"O"``) per input tag.
    """
    new_lbls = []
    for label in labels:
        # Split on the first hyphen only, so a tag that itself contains a
        # hyphen cannot break the unpacking.
        prefix, sep, tag = label.partition("-")
        if sep and prefix in ("B", "I"):
            new_lbls.append(label_map.get(tag, "O"))
        else:
            new_lbls.append("O")
    return new_lbls

In [14]:
# Translation model (EasyNMT, 'mbart50_m2m') passed to translate_sentence.
translation_model = EasyNMT('mbart50_m2m')
# Multilingual BERT encoder and tokenizer passed to create_alignment / propagate_labels.
model = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# NER tagger that the pipeline below runs on the translated text.
ner_model = build_model(configs.ner.ner_ontonotes_bert_torch, download=True)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2021-10-25 23:20:50.538 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_torc

In [15]:
from tqdm.notebook import tqdm

In [21]:
# Translate each sentence, tag the translation, then project the tags back
# onto the original words.  gts / preds collect one tag list per sentence
# that was processed successfully.
gts = []
preds = []
for _, sent, tags in tqdm(sentences):
    source_text = ' '.join(sent)
    translation = translate_sentence(source_text, translation_model)
    ner_res = ner_model([translation])
    try:
        propagation = propagate_labels(translation, source_text, ner_res[0][0], ner_res[1][0], model, tokenizer)
        sent_gts = prepare_target(tags, label_map)
        sent_preds = prepare_target(propagation[1], label_map)
    except Exception as exc:
        # Best effort: skip the sentence but report why.  Catching Exception
        # rather than everything keeps Ctrl-C able to stop the loop.
        print(f'FAILED {source_text}: {exc!r}')
    else:
        # Append both only after both succeeded so the lists stay in lockstep.
        gts.append(sent_gts)
        preds.append(sent_preds)

  0%|          | 0/800 [00:00<?, ?it/s]

In [23]:
# Flatten the per-sentence tag lists into two parallel token-level sequences.
ground_truth = [tag for sentence_tags in gts for tag in sentence_tags]
predictions = [tag for sentence_tags in preds for tag in sentence_tags]

## Calculate metrics

In [24]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [26]:
# Overall token-level scores for the pipeline.  With every label included,
# the micro-averaged precision, recall and F1 come out equal to accuracy
# (the recorded output shows four identical values).
acc = accuracy_score(ground_truth, predictions)
pr, rec, f1 = (
    scorer(ground_truth, predictions, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

In [27]:
# Report the overall scores (acc, pr, rec, f1) for the translation pipeline.
print(f'Accuracy = {acc}')
print(f'Precision = {pr}')
print(f'Recall = {rec}')
print(f'F1-score = {f1}')

Accuracy = 0.8258928571428571
Precision = 0.8258928571428571
Recall = 0.8258928571428571
F1-score = 0.8258928571428571


In [30]:
# Per-label scores for the pipeline: each metric is restricted to a single
# label via labels=[label].  Note that this rebinds the module-level `f1`.
result = {}
avg = "micro"
labels = ["CORP", "LOC", "PER", "CW", "PROD", "O"]
for label in labels:
    precision, recall, f1 = (
        scorer(ground_truth, predictions, labels=[label], average=avg)
        for scorer in (precision_score, recall_score, f1_score)
    )
    result[label] = dict(precision=precision, recall=recall, f1=f1)

In [31]:
import pandas as pd

In [32]:
# Show the pipeline's per-label metrics as a table: one column per label,
# one row per metric.
pd.DataFrame(result)

Unnamed: 0,CORP,LOC,PER,CW,PROD,O
precision,0.156177,0.229358,0.372849,0.321739,0.266667,0.914493
recall,0.240143,0.395257,0.505181,0.318966,0.038647,0.891014
f1,0.189266,0.290276,0.429043,0.320346,0.067511,0.902601


# Run multilingual NER

In [33]:
# Build the multilingual OntoNotes NER model, which the cells below apply
# directly to the original tokens (no translation step).
ner_model_m = build_model(configs.ner.ner_ontonotes_bert_mult_torch, download=True)

2021-10-26 00:42:49.397 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch.tar.gz download because of matching hashes
2021-10-26 00:42:56.884 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /home/tootiredone/.deeppavlov/models/ner_ontonotes_bert_mult_torch/bert-base-multilingual-cased/tag.dict]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g

In [36]:
# Tag the original tokens directly with the multilingual NER model.
# gts / preds are reset here, replacing the lists built for the pipeline.
gts = []
preds = []
for _, sent, tags in tqdm(sentences):
    ner_res = ner_model_m([sent])
    try:
        sent_gts = prepare_target(tags, label_map)
        sent_preds = prepare_target(ner_res[1][0], label_map)
    except Exception as exc:
        # Best effort: skip the sentence but report why.  Catching Exception
        # rather than everything keeps Ctrl-C able to stop the loop.
        print(f'FAILED {" ".join(sent)}: {exc!r}')
    else:
        # Append both only after both succeeded so the lists stay in lockstep.
        gts.append(sent_gts)
        preds.append(sent_preds)

  0%|          | 0/800 [00:00<?, ?it/s]

In [37]:
# Flatten the per-sentence tag lists of the multilingual run into two
# parallel token-level sequences.
ground_truth_m = [tag for sentence_tags in gts for tag in sentence_tags]
predictions_m = [tag for sentence_tags in preds for tag in sentence_tags]

## Calculate metrics

In [38]:
# Overall token-level scores for the multilingual NER run.  This reuses the
# names acc / pr / rec / f1, overwriting any values they held before.
acc = accuracy_score(ground_truth_m, predictions_m)
pr, rec, f1 = (
    scorer(ground_truth_m, predictions_m, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

In [39]:
# Report the overall scores (acc, pr, rec, f1) for the multilingual NER run.
print(f'Accuracy = {acc}')
print(f'Precision = {pr}')
print(f'Recall = {rec}')
print(f'F1-score = {f1}')

Accuracy = 0.8818922305764411
Precision = 0.8818922305764411
Recall = 0.8818922305764411
F1-score = 0.881892230576441


In [40]:
# Per-label scores for the multilingual NER run: each metric is restricted
# to a single label via labels=[label].  Note that this rebinds `f1`.
result_m = {}
avg = "micro"
labels = ["CORP", "LOC", "PER", "CW", "PROD", "O"]
for label in labels:
    precision, recall, f1 = (
        scorer(ground_truth_m, predictions_m, labels=[label], average=avg)
        for scorer in (precision_score, recall_score, f1_score)
    )
    result_m[label] = dict(precision=precision, recall=recall, f1=f1)

In [41]:
# Show the multilingual model's per-label metrics as a table: one column per
# label, one row per metric.
pd.DataFrame(result_m)

Unnamed: 0,CORP,LOC,PER,CW,PROD,O
precision,0.424242,0.381679,0.564516,0.4375,0.52,0.91661
recall,0.301075,0.395257,0.453368,0.201149,0.125604,0.956618
f1,0.352201,0.38835,0.502874,0.275591,0.202335,0.936187


In [42]:
# Show the translation pipeline's per-label table again (presumably for
# side-by-side comparison with the multilingual results).
pd.DataFrame(result)

Unnamed: 0,CORP,LOC,PER,CW,PROD,O
precision,0.156177,0.229358,0.372849,0.321739,0.266667,0.914493
recall,0.240143,0.395257,0.505181,0.318966,0.038647,0.891014
f1,0.189266,0.290276,0.429043,0.320346,0.067511,0.902601
