In [1]:
import os
import re
import spacy
import math
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchimpl.src.misc import save_as_pickle, load_pickle, get_subject_objects
from tqdm import tqdm
import logging
import nltk
from nltk.tokenize import word_tokenize
from cleantext import clean
import transformers
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, AutoModelForMaskedLM
import datasets
from datasets import Features, Value, Sequence, load_metric, load_dataset
from datasets import Dataset
import torch
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
import random
import seaborn as sns

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


# preprocessing functions

In [2]:
def process_sent(sent):
    if sent not in [" ", "\n", ""]:
        sent = sent.strip("\n")            
        sent = re.sub('<[A-Z]+/*>', '', sent) # remove special tokens eg. <FIL/>, <S>
        sent = re.sub(r"[\*\"\n\\…\+\-\/\=\(\)‘•€\[\]\|♫:;—”“~`#]", " ", sent)
        sent = re.sub(' {2,}', ' ', sent) # remove extra spaces > 1
        sent = re.sub("^ +", "", sent) # remove space in front
        sent = re.sub(r"([\.\?,!]){2,}", r"\1", sent) # remove multiple puncs
        sent = re.sub(r" +([\.\?,!])", r"\1", sent) # remove extra spaces in front of punc
        sent = re.sub(r"([A-Z]{2,})", lambda x: x.group(1).capitalize(), sent) # Replace all CAPS with capitalize
        sent=sent.replace("?","")
        sent=sent.replace("@","")
        sent=sent.replace("®","")
        return sent
    return

In [3]:
def contractions(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase) # 's could mean possession
    phrase = re.sub(r"won't", "will not", phrase)  
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"can't", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"n't", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"'re", " are", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"'m", " am", phrase)
    phrase = re.sub(r"wont", "will not", phrase)
    phrase = re.sub(r"dont", "do not", phrase)
    phrase = re.sub(r"werent", "were not", phrase)
    phrase = re.sub(r"'m", " am", phrase)

    return phrase

In [4]:
def cleanfunc(t):
    return clean(t,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=False,  #if YES lowercase targets            # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="<URL>",
    replace_with_email="<EMAIL>",
    replace_with_phone_number="<PHONE>",
    replace_with_number="<NUMBER>",
    replace_with_digit="0",
    replace_with_currency_symbol="<CUR>",
    lang="en"                       
)

# NER pipeline

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_loaded = AutoModelForTokenClassification.from_pretrained(os.getcwd()+"/ner/distilbert_merged_.pt",local_files_only=True)
label_list=['O', 'B-ORG','I-ORG','B-PSM','I-PSM','B-AMNT','I-AMNT','B-PSN','I-PSN','B-LOC','I-LOC']
trainer = Trainer(
      model=model_loaded,
      tokenizer=tokenizer,
  )

In [6]:
def make_sents(text): # has to be strings (handles multi-sentence strings)
    r={}
    r['text']=[]
    i=0
    text=process_sent(contractions(cleanfunc(text)))
    split=text.split('\n')
    for ss in split:
        sents=nltk.sent_tokenize(ss)
        for s_ in sents:
            r['text'].append(s_)
    return pd.DataFrame(r)

def tokenize_(examples): # examples has to be a list
    tokenized_inputs = tokenizer(examples["text"], truncation=True, is_split_into_words=False, max_length=385,padding="max_length")
    return tokenized_inputs

In [172]:
def idx_inarow(predictions): # get predicted entity from tokenized sequence
    ids_idxs=[]
    for si in range(predictions.shape[0]):
        ids_idx=[]
        i=1
        for z in range(len(predictions[si])): #loop necessary to fix start and end token mismatch
            if predictions[si][z]>0:
                if predictions[si][z]%2==1:
                    predictions[si][z]=predictions[si][z]+1
        while i<len(predictions[si])-1:
            if predictions[si][i]!=0:
                ii=i
                i=i+1
                j=i
                while predictions[si][ii]==predictions[si][j]:
                    j=j+1
                    i=i+1
                    if j>=len(predictions[si]):
                        break
                ids_idx.append([ii,j])
            else:
                i=i+1
        ids_idxs.append(ids_idx)
    return ids_idxs

def seq_withEnts(ds,predictions):   # outputs sequence with list of predicted entities
    to_pred=[]
    inarow=idx_inarow(predictions) # compute ents indexes
    for s in range(len(ds)): # sentence loop
        text=ds[s]['text'].lower()
        ents=[]
        for id in inarow[s]:
            if tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ds[s]['input_ids'][id[0]:id[1]])) in text: # translate sequence of token ids to string
                ents.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ds[s]['input_ids'][id[0]:id[1]])))
        uniqueents=list(set(ents)) # get unique list of entities
        to_pred.append([text,uniqueents])
    return to_pred

In [158]:
def search_idxs(e,s): # find index range of entity 
    spec=['$','¥','€','£','(',')'] # deals with characters that do not work with the re.search method
    ss=s
    c=0
    presence=False
    for i in e:
        if i in spec:
            c=c+1 
            presence=True     
    if presence==True:
        for ii in spec:
            ss=ss.replace(ii,'')
            e=e.replace(ii,'')
        idx=re.search(e,ss).span()
        idx_=[idx[0],idx[0]+c]  
    else:
        idx_=re.search(e,ss).span()       
    return idx_

def combination(l): # combine all possible matches of entities
    ll=[]
    for i in range(len(l)):
        for j in range(len(l)):
            if j==i:
                continue
            else:
                ll.append([l[i],l[j]])
    return ll

def create_combinedSeqs(seqs): # add special tokens to the relation candidates
    data=[]
    for s in seqs:
        text=s[0]
        combs=combination(s[1])
        d=[]
        for c in combs:
            dd={}
            idx1=search_idxs(c[0],text)
            idx2=search_idxs(c[1],text)
            if idx1[0]<idx2[0]:
                input=text[:idx1[0]]+"[E1] "+c[0]+" [/E1] "+text[idx1[1]:idx2[0]]+"[E2] "+c[1]+" [/E2] "+text[idx2[1]:]
                dd['text']=[text]
                dd['inputs']=[input]
                dd['headFirst']=[True]
                dd["head"]=[c[0]]
                dd["child"]=[c[1]]
                d.append(dd)
            else:
                input=text[:idx2[0]]+"[E2] "+c[1]+" [/E2] "+text[idx2[1]:idx1[0]]+"[E1] "+c[0]+" [/E1] "+text[idx1[1]:]
                dd['text']=[text]
                dd['inputs']=[input]
                dd['headFirst']=[False]
                dd["head"]=[c[1]]
                dd["child"]=[c[0]]
                d.append(dd)
        data.append(d)
    return data
         

# train relation classifier

In [9]:
# import fine-tuned model and tokenizer, prepare dataset for outputing embeddings from hidden layers (data to be sent to the classifier)
dataset = load_dataset('json',data_files={'train':'labelledData/reldatatrain.json', 'test': 'labelledData/reldatatest.json'}, field='data')
tokenizer_ = AutoTokenizer.from_pretrained(os.getcwd()+"/tokenizer_fine_tuned_.pt",local_files_only=True)
model_loaded_ = AutoModelForMaskedLM.from_pretrained(os.getcwd()+"/fine_tuned_.pt",local_files_only=True, output_hidden_states=True)

Using custom data configuration default-fb6f12f54e4b24af
Reusing dataset json (/Users/romainbourgeois/.cache/huggingface/datasets/json/default-fb6f12f54e4b24af/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 2/2 [00:00<00:00, 689.46it/s]
loading file /Users/romainbourgeois/Desktop/matching_the_blanks/tokenizer_fine_tuned_.pt/vocab.txt
loading file /Users/romainbourgeois/Desktop/matching_the_blanks/tokenizer_fine_tuned_.pt/tokenizer.json
loading file /Users/romainbourgeois/Desktop/matching_the_blanks/tokenizer_fine_tuned_.pt/added_tokens.json
loading file /Users/romainbourgeois/Desktop/matching_the_blanks/tokenizer_fine_tuned_.pt/special_tokens_map.json
loading file /Users/romainbourgeois/Desktop/matching_the_blanks/tokenizer_fine_tuned_.pt/tokenizer_config.json
loading configuration file /Users/romainbourgeois/Desktop/matching_the_blanks/fine_tuned_.pt/config.json
Model config BertConfig {
  "_name_or_path": "yiyanghkust/finbert-pretrain",
  "ar

In [12]:
labels=['0','PARTNERSHIP','RESEARCH_PROJECT','SUBSIDIARY','PURCHASE','FINANCING','RECRUITMENT','LAUNCH_PRODUCT-SERVICE','HAS_PRODUCT-SERVICE',
'OPERATES_IN_MARKET','BASED_IN','WORKS_IN']

def firsttoken(output):
    return output[0]

def EntTokens(output, reverse, e11, e22):
    if reverse==True:
        return np.concatenate((output[e11],output[e22]),axis=0)
    else:
        return np.concatenate((output[e22],output[e11]),axis=0)

def maxpool_ents(output, reverse, e11, _e11, e22, _e22):
    ee1=output[e11+1:_e11]
    if ee1.shape[0]==0:
        ee1=np.zeros(768)
    elif ee1.shape[0]>1:
        ee1=ee1.max(axis=0)
    else:
        ee1=ee1[0]
    ee2=output[e22+1:_e22]
    if ee2.shape[0]==0:
        ee2=np.zeros(768)
    elif ee2.shape[0]>1:
        ee2=ee2.max(axis=0)
    else:
        ee2=ee2[0]
    if reverse==True:
        return np.concatenate((ee1,ee2),axis=0)
    else:
        return np.concatenate((ee1,ee2),axis=0)

def data(dataset, e1=30873, _e1=30875, e2=30874, _e2=30876): 
    inp=tokenizer_(dataset["inputs"], is_split_into_words=True, truncation=True, max_length=512, padding='max_length')
    outputs=model_loaded_(torch.tensor(inp['input_ids']).reshape(1,-1))
    output=outputs[-1][-1].detach().numpy()[0,:,:]
    reverse=dataset['head_first']
    label=dataset['label']
    e11=inp['input_ids'].index(e1)
    _e11=inp['input_ids'].index(_e1)
    e22=inp['input_ids'].index(e2)
    _e22=inp['input_ids'].index(_e2)
    l=labels.index(label)
    firsttokendata=firsttoken(output)
    EntTokensdata=EntTokens(output, reverse, e11, e22)
    maxpool_entsdata=maxpool_ents(output, reverse,e11,_e11,e22,_e22)
    return l,firsttokendata,EntTokensdata,maxpool_entsdata

In [13]:
y_train=np.array(data(dataset['train'][0])[0]).reshape(1,-1)
X_train1=data(dataset['train'][0])[1].reshape(1,-1)
X_train2=data(dataset['train'][0])[2].reshape(1,-1)
X_train3=data(dataset['train'][0])[3].reshape(1,-1)

for i in range(len(dataset['train'])):
    o,one,two,three=data(dataset['train'][i])
    y_train=np.concatenate((y_train,np.array(o).reshape(1,-1)),axis=0)
    X_train1=np.concatenate((X_train1,one.reshape(1,-1)),axis=0)
    X_train2=np.concatenate((X_train2,two.reshape(1,-1)),axis=0)
    X_train3=np.concatenate((X_train3,three.reshape(1,-1)),axis=0)

y_test=np.array(data(dataset['test'][0])[0]).reshape(1,-1)
X_test1=data(dataset['test'][0])[1].reshape(1,-1)
X_test2=data(dataset['test'][0])[2].reshape(1,-1)
X_test3=data(dataset['test'][0])[3].reshape(1,-1)

for i in range(len(dataset['test'])):
    o,one,two,three=data(dataset['test'][i])
    y_test=np.concatenate((y_test,np.array(o).reshape(1,-1)),axis=0)
    X_test1=np.concatenate((X_test1,one.reshape(1,-1)),axis=0)
    X_test2=np.concatenate((X_test2,two.reshape(1,-1)),axis=0)
    X_test3=np.concatenate((X_test3,three.reshape(1,-1)),axis=0)


choosing nuber of nearest neighbors and selecting method 2 for feature extraction

In [14]:
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train2, y_train)
y_pred=neigh.predict(X_test2)
print(f1_score(y_test, y_pred, average=None))
print(f1_score(y_test, y_pred, average=None)[1:].mean())

  return self._fit(X, y)


[0.87915937 0.10909091 0.         0.4        0.         0.
 0.18604651 0.11428571 0.26666667 0.16666667]
0.13808405203754043


In [15]:
neigh = KNeighborsClassifier(n_neighbors=4)
neigh.fit(X_train2, y_train)
y_pred=neigh.predict(X_test2)
print(f1_score(y_test, y_pred, average=None))
print(f1_score(y_test, y_pred, average=None)[1:].mean())

  return self._fit(X, y)


[0.88734835 0.08823529 0.         0.4        0.         0.
 0.16216216 0.20512821 0.23529412 0.16666667]
0.13972071619130444


In [16]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train2, y_train)
y_pred=neigh.predict(X_test2)
print(f1_score(y_test, y_pred, average=None))
print(f1_score(y_test, y_pred, average=None)[1:].mean())

  return self._fit(X, y)


[0.86163142 0.17857143 0.         0.5        0.         0.
 0.17142857 0.15789474 0.35294118 0.15384615]
0.16829800746209417


# predicted random sequences

In [175]:
def make_pred(test_, label, e1=30873, _e1=30875, e2=30874, _e2=30876):
    features=Features({'text': datasets.Sequence(datasets.Value("string"))})
    dataset = Dataset.from_pandas(make_sents(test_))
    ds=dataset.map(tokenize_)
    predictions, labels, _=trainer.predict(ds)
    predictions = np.argmax(predictions, axis=2)
    seq_ents=seq_withEnts(ds,predictions)
    print(seq_ents)
    data_=create_combinedSeqs(seq_ents)
    rel=[]
    for i in data_:
        print("predicting sentence :",i[0]['text'])
        print("predicting on ",len(i)," different combinations of entities")
        for j in i:
            datast=Dataset.from_dict(j)
            inp=tokenizer_(datast["inputs"], is_split_into_words=False, truncation=True, max_length=512, padding='max_length')
            outputs=model_loaded_(torch.tensor(inp['input_ids']).reshape(1,-1))
            output=outputs[-1][-1].detach().numpy()[0,:,:]
            reverse=datast['headFirst']
            e11=inp['input_ids'][0].index(e1)
            _e11=inp['input_ids'][0].index(_e1)
            e22=inp['input_ids'][0].index(e2)
            _e22=inp['input_ids'][0].index(_e2)
            EntTokensdata=EntTokens(output, reverse, e11, e22)
            y_pred=neigh.predict(EntTokensdata.reshape(1,-1))
            if y_pred[0]==0:
                print("no relations predicted for head entity ",datast['head']," and child entity ",datast['child'])
            else: 
                l=int(y_pred[0])
                print("predicted (",datast['head'][0],",",label[l],",",datast['child'])
                r={}
                r['relation']=label[l]
                r['head']=datast['head'][0]
                r['child']=datast['child']
                r['text']=i[0]['text']
                rel.append(r)
    return rel
    


In [176]:
inputs="Instagram was bought by Facebook to compete with Tiktok. Eurecom is based in France and offers educational courses. Eurecom will partner with INRIA for various research projects."
rel=make_pred(inputs,labels)


100%|██████████| 3/3 [00:00<00:00, 836.35ex/s]
The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 3
  Batch size = 8
63it [1:10:47, 92.72s/it]

[['instagram was bought by facebook to compete with tiktok.', ['tiktok', 'instagram', 'facebook']], ['eurecom is based in france and offers educational courses.', ['eurecom', 'france']], ['eurecom will partner with inria for various research projects.', ['eurecom', 'inria']]]
predicting sentence : ['instagram was bought by facebook to compete with tiktok.']
predicting on  6  different combinations of entities
predicted ( instagram , PARTNERSHIP , ['tiktok']
predicted ( facebook , PARTNERSHIP , ['tiktok']
no relations predicted for head entity  ['instagram']  and child entity  ['tiktok']
predicted ( instagram , PARTNERSHIP , ['facebook']
predicted ( facebook , PARTNERSHIP , ['tiktok']
predicted ( instagram , PARTNERSHIP , ['facebook']
predicting sentence : ['eurecom is based in france and offers educational courses.']
predicting on  2  different combinations of entities
no relations predicted for head entity  ['eurecom']  and child entity  ['france']
predicted ( eurecom , BASED_IN , ['f