In [None]:

# !git clone https://github.com/nerel-ds/NEREL

In [None]:

# !unzip NEREL-v1.0.zip -d NEREL

In [1]:

## Read the data# Reading files
from collections import namedtuple
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import nltk

Ann = namedtuple('annotation', ['tag', 'start1', 'end1', 'start2', 'end2', 'text'])
Rel = namedtuple('relationship', ['tag', 'arg1', 'arg2'])

def read_files(folder):
    filenames = sorted(set(e[:e.rfind('.')] for e in os.listdir(folder)))[1:]
    
    texts, entities, relationships  = [], [], []
    for file in tqdm(filenames):
       # if (not file): continue
        path1 = os.path.join(folder, file+'.txt')
        path2 = os.path.join(folder, file+'.ann')
        if (not os.path.exists(path2)):
            print(f'{path2} not found')
            continue
            #with open(path2, 'w'):
            #    pass
            
        with open(path1, 'r', encoding="utf8") as text, open(path2, 'r', encoding="utf8") as ann:
            texts.append(text.read())

            file_entities = {}
            file_relationship = []
            regex_ent = r'T(?P<id>\d+)\s(?P<tag>\w+)\s(?P<start1>\d+) (?P<end1>\d+)(;(?P<start2>\d+) (?P<end2>\d+))?\s(?P<text>.*)'
            regex_rel = r'R(?P<id>\d+)\s(?P<tag>\w+)\sArg1:T(?P<arg1>\d+) Arg2:T(?P<arg2>\d+)'
            
            
            for row in sorted(ann.readlines(), reverse=True):
                #print(row)
                match_ent = re.match(regex_ent, row)
                match_rel = re.match(regex_rel, row)
                if (match_ent):
                    res = match_ent.groupdict()
                    res['start1'] = int(res['start1'])
                    res['end1'] = int(res['end1'])
                    if (res['start2'] is not None):
                        res['start2'] = int(res['start2'])
                        res['end2'] = int(res['end2'])
                    id = res.pop('id')
                    file_entities[id] = Ann(**res)
                elif (match_rel):
                    try:
                        res = match_rel.groupdict()
                        res['arg1'] = file_entities[res['arg1']]
                        res['arg2'] = file_entities[res['arg2']]
                        id = res.pop('id')
                        file_relationship.append(Rel(**res))
                    except KeyError as e:
                        print(f'not found T{e} row={row}')
                else:
                    print(f'incorrect format in: row={row} file={file}')
            entities.append(file_entities)
            relationships.append(file_relationship)
    entities = [sorted(e.values(), key = lambda x: (x.start1, x.end1)) for e in entities]
    return texts, entities, relationships, filenames

In [2]:

folder = 'NEREL/NEREL-v1.0/train'
texts, entities, relationships, filenames = read_files(folder)

incorrect format in: row=По словам очевидцев пешехо
 file=21013_text


100%|██████████| 745/745 [00:02<00:00, 289.36it/s]


In [3]:

def in_range(range1, range2):
    # range1 in range2
    if range1[0] is None: return True
    if range2[0] is None: return False
    return int(range2[0]) <= int(range1[0]) and int(range2[1]) >= int(range1[1])

def is_nested_anns(ent1: Ann, ent2: Ann):
    # ent 1 in ent 2
    res = True
    ent1_range1 = (ent1.start1, ent1.end1)
    ent1_range2 = (ent1.start2, ent1.end2)
    ent2_range1 = (ent2.start1, ent2.end1)
    ent2_range2 = (ent2.start2, ent2.end2)
    #print(f'{ent1_range1=} {ent1_range2=} {ent1_range2=} {ent2_range2=}')
    res = res and (in_range(ent1_range1, ent2_range1) or in_range(ent1_range1, ent2_range2))
    res = res and (in_range(ent1_range2, ent2_range1) or in_range(ent1_range2, ent2_range2))
    return res


def is_nested_anns2(ent1: Ann, ent2: Ann):
    return is_nested_anns(ent1,ent2) or is_nested_anns(ent2, ent1)

def is_nested(rel: Rel):
    return is_nested_anns(rel.arg1, rel.arg2) or is_nested_anns(rel.arg2, rel.arg1)

![](https://i.imgur.com/tgDfc8i.png)             | ![](https://i.imgur.com/oWa5vWo.png)
:-------------------------:|:-------------------------:


In [4]:
pre_dataset = []

for text_id in range(len(texts)):
    relationships_nested = {(e.arg1, e.arg2):e for e in relationships[text_id] if is_nested(e)}
    nes = []
    for i in range(len(entities[text_id])):
        # O(n^2) eeeeeeeeee
        for j in range(i+1, len(entities[text_id])):
            if (is_nested_anns2(entities[text_id][i], entities[text_id][j])):
                nes.append((entities[text_id][i], entities[text_id][j]))
    for e in nes:
        if (e in relationships_nested):
            pre_dataset.append((*e, relationships_nested[e].tag))
        elif ((e[1], e[0]) in relationships_nested):
            pre_dataset.append((e[1], e[0], relationships_nested[(e[1], e[0])].tag))
        else:
            pre_dataset.append((*e, 'None'))
#             pre_dataset.append((e[1], e[0], 'None'))

# AAA

In [18]:
from random import shuffle
from copy import copy
rs = copy(pre_dataset)
l = len(rs)
shuffle(rs)
split = int(l*0.85)
train = rs[:split]
test = rs[split:]
len(train), len(test)

(9498, 1677)

In [20]:
from collections import defaultdict
d = defaultdict(int)
for e in train:
    if (e[2] == 'None'):
        d[tuple(sorted([e[0].tag, e[1].tag]))] -= 1
    else:
        d[tuple(sorted([e[0].tag, e[1].tag]))] += 1

In [21]:
d

defaultdict(int,
            {('ORGANIZATION', 'PROFESSION'): 413,
             ('PROFESSION', 'PROFESSION'): -1289,
             ('ORGANIZATION', 'ORGANIZATION'): -366,
             ('COUNTRY', 'ORGANIZATION'): 489,
             ('LOCATION', 'ORGANIZATION'): 8,
             ('COUNTRY', 'LAW'): -35,
             ('PROFESSION', 'STATE_OR_PROVINCE'): 50,
             ('LAW', 'LOCATION'): 1,
             ('EVENT', 'NUMBER'): -100,
             ('AWARD', 'EVENT'): 6,
             ('EVENT', 'EVENT'): -248,
             ('DATE', 'PENALTY'): -62,
             ('ORDINAL', 'PROFESSION'): -18,
             ('LAW', 'ORDINAL'): -181,
             ('COUNTRY', 'PROFESSION'): 257,
             ('IDEOLOGY', 'ORGANIZATION'): 56,
             ('ORGANIZATION', 'PRODUCT'): 8,
             ('EVENT', 'ORDINAL'): -36,
             ('PROFESSION', 'RELIGION'): -2,
             ('CITY', 'ORGANIZATION'): 208,
             ('LAW', 'LAW'): -121,
             ('PERSON', 'WORK_OF_ART'): -26,
             ('ORGANIZAT

In [22]:
correct = 0
wrong = 0
undeterment = 0
for e in test:
    ture = e[2] != 'None' # True = relation present
    pred = d[tuple(sorted([e[0].tag, e[1].tag]))] > 0 # False = relation abcent
    if (d[tuple(sorted([e[0].tag, e[1].tag]))] == 0):
        undeterment+=1
    if (ture == pred):
        correct +=1
    else:
        wrong += 1

In [23]:
correct, wrong, undeterment

(1322, 355, 7)

In [24]:
(correct)/(correct + wrong + undeterment)

0.7850356294536817

In [27]:
y_true = []
y_pred = []
for e in test:
    ture = e[2] != 'None' # True = relation present
    pred = d[tuple(sorted([e[0].tag, e[1].tag]))] > 0 # False = relation abcent
    y_true.append(ture)
    y_pred.append(pred)
    
from sklearn.metrics import f1_score
f1_score(y_true, y_pred)

0.7491166077738516