In [None]:
!pip install torch torchvision datasets evaluate transformers[torch] --quiet

In [None]:
import pandas as pd
import os
import torch
import random
import pyarrow as pa
import numpy as np
import evaluate
import json


from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from datasets import Dataset


from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Which dataset/component this notebook run targets; it selects the data
# sub-folder and the set of admissible entity-type pairs further down.
component = 2

# Component 1 uses the cased DistilBERT checkpoint, every other value the uncased one.
if component == 1:
    model_type = "distilbert-base-cased"
else:
    model_type = "distilbert-base-uncased"

In [None]:
# Resolve the base data folder: the current working directory locally, or the
# mounted Google Drive folder when the notebook runs inside Colab.
if 'google.colab' not in str(get_ipython()):
    data_dir = os.getcwd()
else:
    from google.colab import drive
    drive.mount('/content/drive')
    data_dir = '/content/drive/MyDrive/Colab Notebooks/text_mining_re_data/'

# All inputs and outputs of this run live in the per-component sub-folder.
data_dir = os.path.join(data_dir, 'component_{}'.format(component))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the id -> label mapping stored next to the data. JSON object keys are
# always strings, so they are converted back to integer class ids here.
with open(os.path.join(data_dir, 'id2label.json'), 'r') as fp:
    id2label = {int(label_id): label for label_id, label in json.load(fp).items()}

# Inverse mapping: label name -> integer class id.
label2id = {label: label_id for label_id, label in id2label.items()}

In [None]:
def get_latest_annotations(test_raw):
    """Keep only the most recently updated annotation of every document.

    Each cell of ``test_raw['annotations']`` is expected to be a non-empty
    sequence of dicts carrying an ``'updated_at'`` value. The column is
    overwritten in place with one-element lists holding the annotation whose
    ``'updated_at'`` is greatest (the earliest one wins on ties).

    Returns the same ``test_raw`` object that was passed in.
    """
    def most_recent(candidates):
        # Linear scan; a later entry only replaces the current pick when it is
        # strictly newer.
        newest = candidates[0]
        for candidate in candidates[1:]:
            if not candidate['updated_at'] > newest['updated_at']:
                continue
            newest = candidate
        return [newest]

    test_raw['annotations'] = test_raw['annotations'].apply(most_recent)
    return test_raw

In [None]:
def get_relation_text(text, first, second):
    """Build the classifier input for one candidate entity pair.

    Both entity spans are wrapped in ``[TYPE]...[TYPE]`` markers (the same
    marker opens and closes a span) and joined by the text that lies between
    the end of ``first`` and the start of ``second``.

    ``first`` / ``second`` are dicts with ``'start'``, ``'end'`` (character
    offsets into ``text``) and ``'entity'`` (the entity type).
    """
    def marked(span):
        tag = '[{}]'.format(span['entity'])
        return tag + text[span['start']: span['end']] + tag

    head = marked(first)
    tail = marked(second)
    between = text[first['end']: second['start']]
    return head + between + tail

def existing_relation(relations, ent1_id, ent2_id):
    """Return the first relation going from ``ent1_id`` to ``ent2_id``.

    ``relations`` is an iterable of dicts with ``'from'`` and ``'to'`` ids.
    Direction matters: a relation in the opposite direction does not match.
    Returns the matching relation dict, or ``False`` when there is none.
    """
    matching = (
        rel for rel in relations
        if rel['from'] == ent1_id and rel['to'] == ent2_id
    )
    return next(matching, False)

# Admissible (source entity type -> target entity types) combinations for the
# component selected when this cell runs. The rules are directional.
if component == 1:
    _RELATION_RULES = (
        ('PLAYER', ('PLAYER', 'CLUB', 'NATIONALITY', 'COUNTRY', 'POSITION', 'DATE')),
        ('CLUB', ('CLUB',)),
    )
else:
    _RELATION_RULES = (
        ('UNEXPECTED EVENT', ('EMPLOYEE', 'TIME', 'DATE', 'LOCATION', 'CAUSE', 'SOLUTION')),
        ('EXPECTED EVENT', ('EMPLOYEE', 'TIME', 'DATE', 'LOCATION')),
        ('ACTIVITY', ('EMPLOYEE', 'TIME', 'DATE', 'LOCATION')),
        ('CAUSE', ('ACTIVITY',)),
    )


def possible_relation(entities, ent1_id, ent2_id):
    """Tell whether a relation from entity ``ent1_id`` to ``ent2_id`` is admissible.

    ``entities`` maps entity ids to dicts holding an ``'entity'`` type. The pair
    is admissible when the type of the first entity is a rule source and the
    type of the second entity is one of that rule's targets.
    """
    ent1, ent2 = entities[ent1_id], entities[ent2_id]
    for source_type, target_types in _RELATION_RULES:
        # The target type is only inspected once the source type matched.
        if ent1['entity'] == source_type and ent2['entity'] in target_types:
            return True
    return False

def generate_possible_relations(entities, entities_list, relations=None, ratio=None, max_distance=10):
    """Enumerate candidate (first id, second id) entity pairs of one document.

    Parameters
    ----------
    entities : dict
        Entity id -> entity dict, forwarded to ``possible_relation``.
    entities_list : list
        One ``[id, start, ...]`` entry per entity. It is sorted IN PLACE by its
        second element (the start offset), as before.
    relations : list of dict, optional
        Already known relations. Pairs for which ``existing_relation`` finds a
        match are left out. ``None`` disables this filter.
    ratio : number, optional
        When truthy and ``relations`` is given, the candidates are randomly
        down-sampled to at most ``ratio * len(relations)`` pairs.
    max_distance : int, optional
        How many following entities (in start-offset order) each entity is
        paired with. Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    list of [first_id, second_id] lists; pairs only go forward in the text.
    """
    entities_list.sort(key=lambda x: x[1])
    possible_relations = []
    for i, ent_one in enumerate(entities_list):
        # Only look ahead a bounded number of entities to keep the number of
        # candidate pairs manageable.
        for ent_two in entities_list[i + 1:i + 1 + max_distance]:
            if not possible_relation(entities, ent_one[0], ent_two[0]):
                continue
            if relations is not None and existing_relation(relations, ent_one[0], ent_two[0]):
                continue
            possible_relations.append([ent_one[0], ent_two[0]])
    # Down-sampling is relative to the number of known relations, so it is only
    # meaningful when `relations` was supplied; previously `ratio` without
    # `relations` crashed on `len(None)`.
    if ratio and relations is not None:
        # random.sample needs an integer sample size, so a float ratio is truncated.
        sample_size = min(len(possible_relations), int(ratio * len(relations)))
        possible_relations = random.sample(possible_relations, sample_size)
    return possible_relations

def generate_test_relations(documents):
    """Turn annotated documents into classifier inputs for every candidate pair.

    ``documents`` must offer ``iterrows()``; every row needs
    ``row['annotations'][0]['result']`` (a list of annotation items) and
    ``row['data']['text']``. Only items whose ``'type'`` is ``'labels'`` are
    treated as entities.

    Returns a DataFrame with two columns: ``'text'`` (the marked-up string from
    ``get_relation_text``) and ``'entity_pairs'`` (``[first text, second text]``).
    """
    texts = []
    entity_pairs = []
    for _, doc in documents.iterrows():
        text = doc['data']['text']
        label_items = [
            item for item in doc['annotations'][0]['result']
            if item['type'] == 'labels'
        ]

        # Lookup table: entity id -> span, type and surface text.
        entities = {
            item['id']: {
                'start': item['value']['start'],
                'end': item['value']['end'],
                'entity': item['value']['labels'][0],
                'text': item['value']['text'],
            }
            for item in label_items
        }
        # Flat [id, start, end, type] records ordered by start offset.
        entities_list = sorted(
            (
                [item['id'], item['value']['start'], item['value']['end'], item['value']['labels'][0]]
                for item in label_items
            ),
            key=lambda record: record[1],
        )

        for first_id, second_id in generate_possible_relations(entities, entities_list):
            first = entities[first_id]
            second = entities[second_id]
            texts.append(get_relation_text(text, first, second))
            entity_pairs.append([first['text'], second['text']])

    return pd.DataFrame(data={'text': texts, 'entity_pairs': entity_pairs})

def parse_ner_output(output):
    """Wrap one document's NER predictions in the annotation layout used downstream.

    Parameters
    ----------
    output : pandas.DataFrame
        One row per predicted entity, with a ``'label'`` column of dicts
        (``'start'``, ``'end'``, ``'text'``, ``'labels'``) and a ``'text'``
        column holding the document text. The document text is taken from the
        first row.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with ``'annotations'`` (``[{'result': [...]}]``) and
        ``'data'`` (``{'text': ...}``), i.e. the fields that
        ``generate_test_relations`` reads. Every entity receives a random
        10-letter uppercase id.

    Raises
    ------
    IndexError
        If ``output`` has no rows (there is then no document text to read).
    """
    import random
    import string

    out_list = []
    for lab in output['label']:
        new_lab = {
            'id': ''.join(random.choices(string.ascii_uppercase, k=10)),
            'type': 'labels',
            'value': {'end': lab['end'], 'text': lab['text'], 'start': lab['start'], 'labels': lab['labels']},
        }
        out_list.append(new_lab)

    # Positional access: `output['text'][0]` was a label lookup and raised
    # KeyError whenever the frame's index did not contain the label 0.
    document_text = output['text'].iloc[0]

    return pd.DataFrame({'annotations': [[{'result': out_list}]], 'data': [{'text': document_text}]})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer for the checkpoint name selected in `model_type`.
tokenizer = AutoTokenizer.from_pretrained(model_type)


def tokenize_function(examples):
    """Tokenize ``examples['text']`` and attach the model inputs to ``examples``.

    The text is padded to the tokenizer's maximum length and truncated. The
    resulting ``'input_ids'`` and ``'attention_mask'`` are stored on the same
    ``examples`` object, which is then returned.
    """
    encoded = tokenizer(examples['text'], padding="max_length", truncation=True)
    # Pick the fields by name: unpacking `.values()` only works while the
    # tokenizer returns exactly these two fields in exactly this order, and
    # breaks for tokenizers that also emit e.g. `token_type_ids`.
    examples['input_ids'] = encoded['input_ids']
    examples['attention_mask'] = encoded['attention_mask']

    return examples


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
model_dir = os.path.join(data_dir, 'models')

# Load the fine-tuned classifier stored under "<model_dir>/best_model", wire in
# the label mappings, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(model_dir, "best_model"),
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

In [None]:
import re

def output_select(logits, texts):
    """Pick the predicted class id for every row of ``logits``.

    The prediction is a plain argmax over the last dimension. ``texts`` is
    accepted but currently unused: an earlier, now disabled variant parsed the
    entity-type markers out of each text and masked the logits so that only the
    labels compatible with that entity-type pair (plus 'None') could win.
    """
    predicted_ids = logits.argmax(dim=-1)
    return predicted_ids

In [None]:
import json

# Read the NER predictions: one row per document, with the document `text` and
# its predicted `entities`.
with open(os.path.join(data_dir, 'predictions.json'), 'r') as f:
    data = pd.read_json(f)


def _entities_frame(row_text, spans):
    """Build the per-entity frame that `parse_ner_output` consumes.

    Each span is indexed as (start, end, label); the document text is repeated
    on every row.
    """
    labels = [
        {'end': ent[1], 'text': row_text[ent[0]:ent[1]], 'start': ent[0], 'labels': [ent[2]]}
        for ent in spans
    ]
    return pd.DataFrame({'text': [row_text] * len(labels), 'label': labels})


# One annotation-style frame per document, in the order of `data`.
files_dfs = [
    parse_ner_output(_entities_frame(row.text, row.entities))
    for _, row in data.iterrows()
]

In [None]:
# Sanity check: number of documents that were parsed into `files_dfs`.
len(files_dfs)

5

In [None]:
class REset(torch.utils.data.Dataset):
    """Dataset over tokenized relation candidates.

    ``data`` is a column-oriented container (anything offering ``items()`` and
    ``data['input_ids']``). ``'text'`` and ``'entity_pairs'`` are returned
    as stored; every other column is converted to a tensor per item.
    """

    # Columns handed through untouched instead of being turned into tensors.
    _RAW_COLUMNS = ('entity_pairs', 'text')

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        item = {}
        for key, val in self.data.items():
            if key in self._RAW_COLUMNS:
                item[key] = val[idx]
            else:
                item[key] = torch.tensor(val[idx])
        return item

In [None]:
output_dir = os.path.join(data_dir, 'test_output')
# Create the folder up front: the per-document result files are opened for
# writing inside it, which fails with FileNotFoundError if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

In [None]:
import pickle

# Classify every candidate entity pair of every document and pickle the
# predicted (first entity, relation, second entity) triples, one file per document.
model.eval()  # loop-invariant, so evaluation mode is set once up front
for i in range(len(files_dfs)):
    file_raw = generate_test_relations(files_dfs[i])
    results = []
    # A document without any candidate pair yields an empty frame; with no rows
    # to tokenize there is no `input_ids` column and REset.__len__ would raise,
    # so inference is skipped and an empty result list is written instead.
    if not file_raw.empty:
        file_raw = file_raw.apply(tokenize_function, axis=1)
        test_dataset = REset(file_raw)
        test_loader = DataLoader(test_dataset, batch_size=16)
        with torch.no_grad():
            for batch in tqdm(test_loader):
                text_tensor, mask = batch['input_ids'].to(model.device), batch['attention_mask'].to(model.device)
                output = model(text_tensor, mask).logits.cpu()
                output = output_select(output, batch['text'])
                output = [id2label[out] for out in output.tolist()]
                # batch['entity_pairs'][0] / [1] are zipped with the predictions as
                # the first / second entity texts of the batch.
                results.extend(list(zip(batch['entity_pairs'][0],  output, batch['entity_pairs'][1])))

    # Drop the pairs that were classified as having no relation.
    final_results = [res for res in results if res[1] not in ['None']]

    with open(os.path.join(output_dir, f'file_{i}'), "wb") as fp:   #Pickling
        pickle.dump(final_results, fp)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Inspect the tokenized candidate pairs of the most recently processed document.
test_dataset.data

Unnamed: 0,text,entity_pairs,input_ids,attention_mask
0,"[ACTIVITY]May 1988,[ACTIVITY] in [LOCATION]Mod...","[May 1988,, Mode 1]","[101, 1031, 4023, 1033, 2089, 2997, 1010, 1031...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[ACTIVITY]May 1988,[ACTIVITY] in Mode 1 at ful...","[May 1988,, The operators]","[101, 1031, 4023, 1033, 2089, 2997, 1010, 1031...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[ACTIVITY]May 1988,[ACTIVITY] in Mode 1 at ful...","[May 1988,, 2330 hours]","[101, 1031, 4023, 1033, 2089, 2997, 1010, 1031...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[ACTIVITY]May 1988,[ACTIVITY] in Mode 1 at ful...","[May 1988,, 0005 hours]","[101, 1031, 4023, 1033, 2089, 2997, 1010, 1031...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Inspect the relation triples kept for the most recently processed document.
final_results

[('May 1988,', 'happened_during', 'Mode 1')]