In [1]:
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import spacy
import json
import nltk
from cleantext import clean
nltk.download('punkt')
import re
import random
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Load the relation-labelled documents and the NER annotations.
# Context managers close each handle as soon as it has been parsed (the
# previous version leaked both file objects), and the explicit encoding keeps
# the result independent of the platform's default locale.
with open('labelledData/labelled_data.json', encoding='utf-8') as f:
    data = json.load(f)
with open('labelledData/NERdata.json', encoding='utf-8') as f:
    ner = json.load(f)

In [7]:
class dataset(object):
    """Build a relation-classification dataset from labelled documents.

    Parameters
    ----------
    data : list of dict
        Relation-labelled documents. Each one is read through the keys
        ``'documentName'``, ``'document'``, ``'tokens'`` (items with
        ``'start'`` and ``'text'``) and ``'relations'`` (items with
        ``'relationLabel'``, ``'head'`` and ``'child'``).
    ner : list of dict
        NER-labelled documents, read through ``'documentName'`` and
        ``'annotation'`` (items with ``'text'``).
    list_relations : list of str
        Relation labels to keep; every other label is ignored.
    list_rel_binary : list of str
        Relation labels for which a pair is also accepted in the reverse
        order (see ``mapped_rel``).
    """

    # (pattern, replacement) pairs applied in order by ``contractions``.
    # "'s" is deliberately not expanded: it could mean possession.
    _CONTRACTIONS = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
        # Word boundaries keep these from firing inside longer words
        # (e.g. "wonton").
        (r"\bwont\b", "will not"),
        (r"\bdont\b", "do not"),
        (r"\bwerent\b", "were not"),
    )

    def __init__(self, data, ner, list_relations, list_rel_binary):
        self.data = data
        self.list_relations = list_relations
        self.list_rel_binary = list_rel_binary
        entities_by_doc = {}
        for doc in ner:
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so the entity order (and therefore the dataset order) does not
            # depend on string hashing the way list(set(...)) does.
            entities_by_doc[doc['documentName']] = list(
                dict.fromkeys(ann['text'] for ann in doc['annotation'])
            )
        self.ner = entities_by_doc  # documentName -> unique labelled entity texts

    def rel(self, d):
        """List the kept relations of document ``d``.

        Each relation is a dict with the relation ``'type'``, the ``'head'``
        and ``'child'`` entity texts, the ``'documentName'``, ``'head_first'``
        (whether the head offset is <= the child offset; very important for
        asymmetric relations) and ``'sub_seq'``, the slice of the document
        running from the first of the two entities to the end of the second.

        Raises ``KeyError`` if a relation points at an offset that has no
        token in ``d['tokens']``.
        """
        text_by_start = {str(tok['start']): tok['text'] for tok in d['tokens']}
        found = []
        for r in d['relations']:
            if r['relationLabel'] not in self.list_relations:
                continue
            head = text_by_start[str(r['head'])]
            child = text_by_start[str(r['child'])]
            head_first = r['head'] <= r['child']
            if head_first:
                sub_seq = d['document'][r['head']:r['child'] + len(child)]
            else:
                sub_seq = d['document'][r['child']:r['head'] + len(head)]
            found.append({
                'type': r['relationLabel'],
                'head': head,
                'child': child,
                'documentName': d['documentName'],
                'head_first': head_first,
                'sub_seq': sub_seq,
            })
        return found

    def seq(self, d):
        """Split the document text into sentences (line by line, then NLTK)."""
        return [
            sentence
            for line in d['document'].split('\n')
            for sentence in nltk.sent_tokenize(line)
        ]

    def map_fullseq(self):
        """Attach to every relation the sentence that contains its ``sub_seq``.

        Relations whose ``sub_seq`` is not contained in any single sentence
        are dropped. If several sentences contain it, the last one wins.
        """
        mapped = []
        for d in self.data:
            relations = self.rel(d)
            for sentence in self.seq(d):
                for r in relations:
                    if r['sub_seq'] in sentence:
                        r['full_seq'] = sentence
            mapped.extend(r for r in relations if 'full_seq' in r)
        return mapped

    # preprocessing functions

    def contractions(self, phrase):
        """Expand common English contractions in ``phrase`` and return it."""
        for pattern, replacement in self._CONTRACTIONS:
            phrase = re.sub(pattern, replacement, phrase)
        return phrase

    def cleanfunc(self, t):
        """Normalise ``t`` with ``cleantext.clean``.

        URLs, e-mail addresses and phone numbers are replaced by placeholder
        tokens; case, numbers, digits, currency symbols and punctuation are
        left as they are.
        """
        return clean(t,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=False,  #if YES lowercase targets            # lowercase text
        no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_phone_numbers=True,         # replace all phone numbers with a special token
        no_numbers=False,               # replace all numbers with a special token
        no_digits=False,                # replace all digits with a special token
        no_currency_symbols=False,      # replace all currency symbols with a special token
        no_punct=False,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"
    )

    def get_ents(self, d):
        """Return the text of every token of a relation-labelled document."""
        return [tok['text'] for tok in d['tokens']]

    def get_ner_ents(self, d):
        """Return the text of every annotation of a NER-labelled document."""
        return [ann['text'] for ann in d['annotation']]

    def combine(self, l):
        """Return every ordered pair ``[l[i], l[j]]`` with ``i != j``."""
        return [
            [first, second]
            for i, first in enumerate(l)
            for j, second in enumerate(l)
            if i != j
        ]

    def remove_subEntities(self, ner):
        """Drop every entity that is a substring of a different entity.

        The list is filtered in place and also returned.
        """
        nested = {e for e in ner for other in ner if e != other and e in other}
        ner[:] = [e for e in ner if e not in nested]
        return ner

    def search_idxs(self, e, s):
        """Return the ``(start, end)`` span of the first occurrence of ``e`` in ``s``.

        The entity is matched literally, so characters such as ``$``, ``(``
        or ``.`` need no special treatment and the span always refers to the
        unmodified sentence.

        Raises ``ValueError`` if ``e`` does not occur in ``s``.
        """
        start = s.find(e)
        if start < 0:
            raise ValueError("entity %r not found in sequence %r" % (e, s))
        return (start, start + len(e))

    def _occurrences(self, e, s):
        """Return the spans of all non-overlapping occurrences of ``e`` in ``s``."""
        spans = []
        if not e:  # an empty entity would match everywhere; ignore it
            return spans
        start = s.find(e)
        while start != -1:
            end = start + len(e)
            spans.append((start, end))
            start = s.find(e, end)
        return spans

    def mapped_rel(self):
        """Build the labelled entity-pair candidates.

        For every relation returned by ``map_fullseq``, all ordered pairs of
        NER entities found in its sentence are emitted as dicts with keys
        ``'sequence'``, ``'ent1'``, ``'ent2'``, ``'relation'`` and
        ``'head_first'``. A pair gets the relation type when it matches the
        head/child of the relation in sentence order, or, for relation types
        in ``list_rel_binary``, when it matches them in either order.
        Every other pair is labelled ``'0'``. Exact duplicates are removed.
        """
        candidates = []
        for s in self.map_fullseq():
            sentence = s['full_seq']
            present = [e for e in self.ner[s['documentName']] if e in sentence]
            present = self.remove_subEntities(present)

            # Spans are absolute offsets into the sentence, including those of
            # repeated occurrences of the same entity.
            spans = []
            for e in present:
                spans.extend(self._occurrences(e, sentence))

            is_binary = s['type'] in self.list_rel_binary
            for first, second in self.combine(spans):
                h = sentence[first[0]:first[1]]
                t = sentence[second[0]:second[1]]
                relation, head_first = '0', None

                # Parenthesised on purpose: wrapping these tests in list
                # literals made the condition always true.
                if is_binary and (
                    (h in s['head'] and t in s['child'])
                    or (h in s['child'] and t in s['head'])
                ):
                    relation = s['type']

                # An exact match in sentence order takes precedence and also
                # records the direction.
                if s['head_first'] and h == s['head'] and t == s['child']:
                    relation, head_first = s['type'], True
                elif not s['head_first'] and h == s['child'] and t == s['head']:
                    relation, head_first = s['type'], False

                candidates.append({
                    'sequence': sentence,
                    'ent1': h,
                    'ent2': t,
                    'relation': relation,
                    'head_first': head_first,
                })

        # NOTE(review): when a sentence carries several relations, a pair can
        # be emitted as '0' while processing one relation and with a real
        # label while processing another. Both rows are kept here - confirm
        # whether such negatives should be dropped.

        # Remove exact duplicates, keeping the last occurrence of each.
        seen = set()
        unique = []
        for c in reversed(candidates):
            key = (c['sequence'], c['ent1'], c['ent2'], c['relation'], c['head_first'])
            if key not in seen:
                seen.add(key)
                unique.append(c)
        unique.reverse()
        return unique

    def _mark_entities(self, example):
        """Return the cleaned sentence with ``ent1``/``ent2`` wrapped in E1/E2 markers.

        Each marker stays on its own entity whichever comes first in the
        sentence, so the sentence text itself is never altered.
        """
        sentence = example['sequence']
        span1 = self.search_idxs(example['ent1'], sentence)
        span2 = self.search_idxs(example['ent2'], sentence)
        tagged1 = (span1, "[E1] " + example['ent1'] + " [/E1] ")
        tagged2 = (span2, "[E2] " + example['ent2'] + " [/E2] ")
        if span1[0] < span2[0]:
            (a_span, a_tag), (b_span, b_tag) = tagged1, tagged2
        else:
            (a_span, a_tag), (b_span, b_tag) = tagged2, tagged1
        marked = (
            sentence[:a_span[0]] + a_tag
            + sentence[a_span[1]:b_span[0]] + b_tag
            + sentence[b_span[1]:]
        )
        return self.cleanfunc(self.contractions(marked))

    def _to_records(self, examples):
        """Turn candidate dicts into the records stored under ``'data'``."""
        records = []
        for index, example in enumerate(examples):
            records.append({
                'index': index,
                'inputs': self._mark_entities(example).split(" "),
                'text': self.cleanfunc(self.contractions(example['sequence'])),
                'label': example['relation'],
                'head_first': example['head_first'],
            })
        return records

    def build_labelled_dataset(self, proportion_test, seed=None):
        """Split the candidates into train and test sets.

        Parameters
        ----------
        proportion_test : float
            Fraction of the candidates drawn (without replacement) for the
            test set; ``round(n * proportion_test)`` examples are selected.
        seed : optional
            When given, the split is drawn from a dedicated
            ``random.Random(seed)`` and is reproducible. When ``None`` the
            module-level ``random`` state is used.

        Returns
        -------
        (datatrain, datatest) : tuple of dict
            Each is ``{'data': [record, ...]}`` where a record has the keys
            ``'index'``, ``'inputs'`` (marked sentence split on spaces),
            ``'text'``, ``'label'`` and ``'head_first'``.
        """
        examples = self.mapped_rel()
        n = len(examples)
        rng = random if seed is None else random.Random(seed)
        # range(n), not range(1, n): the first example must be eligible too.
        test_indices = set(rng.sample(range(n), round(n * proportion_test)))
        train_data = [ex for i, ex in enumerate(examples) if i not in test_indices]
        test_data = [ex for i, ex in enumerate(examples) if i in test_indices]
        datatrain = {'data': self._to_records(train_data)}
        datatest = {'data': self._to_records(test_data)}
        return datatrain, datatest





In [8]:
# Relation labels kept when building the dataset; any other label is ignored.
RELATION_LABELS = [
    'PARTNERSHIP', 'RESEARCH_PROJECT', 'SUBSIDIARY', 'PURCHASE', 'FINANCING',
    'RECRUITMENT', 'LAUNCH_PRODUCT-SERVICE', 'HAS_PRODUCT-SERVICE',
    'OPERATES_IN_MARKET', 'BASED_IN', 'WORKS_IN',
]
# Labels passed as `list_rel_binary` (pairs accepted in either order).
BINARY_RELATION_LABELS = ['PARTNERSHIP']
TEST_PROPORTION = 0.2
SPLIT_SEED = 42

d = dataset(data, ner, RELATION_LABELS, BINARY_RELATION_LABELS)
mr = d.mapped_rel()

# Seed the module-level generator used for the train/test draw so that the
# same indices are selected on every run of this cell.
random.seed(SPLIT_SEED)
datatrain, datatest = d.build_labelled_dataset(TEST_PROPORTION)

In [20]:
# Persist the train and test splits as JSON files next to the notebook.
for path, payload in (("reldatatrain.json", datatrain), ("reldatatest.json", datatest)):
    with open(path, "w") as final:
        json.dump(payload, final)