In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
data = pd.read_csv('../Additional_eurecom_code/Event_extraction/data/joined_train.csv')

In [3]:
data = data.drop(data.columns[[0, 1]], axis=1)

In [None]:
data = data[data['tag'].str.contains('effect')].reset_index(drop=True) #Only keep entries that have an effect and thus a relation
data.head(20)

In [5]:
data.iloc[18]['text']

'The United Nations has imposed a travel ban on North Korea in an effort to prevent the country from acquiring weapons of mass destruction.'

In [23]:
"""
assumtpions:
- One subject and one object
- Every line has an effect
"""

data = data[data['tag'].str.contains('effect')] #Only keep entries that have an effect and thus a relation
triplets = []
labels = []
context = []
skipped = 0

for index, row in data.iterrows():

    sub_obj = {}

    tokenize_text = word_tokenize(row['text'])
    split_tags = row['tag'].split()

    if len(tokenize_text) != len(split_tags): #This means the tags and tokens do not overlap correctly, hence results are wrong.
        skipped += 1
        continue

    for word, tag in zip(tokenize_text, split_tags):

        if tag != '0': #an entity is detected

            if tag in sub_obj: #This means that the entity consists of 2 words
                sub_obj[tag] += f' {word}'
            else:
                sub_obj[tag] = word

    for key, value in sub_obj.items():
        if key != 'effect': #all other relations are the subjects
            subject = value
        else: #Effect is always the object
            object = value

    triplets.append(f"<triplet> {subject} <subj> {object} <obj> {row['label']}")
    labels.append(row['label'])
    context.append(row['text'])

final_data = pd.DataFrame(list(zip(range(len(triplets)), range(len(triplets)), context, triplets)),
                          columns = ['id', 'title', 'context', 'triplets'])

final_data.to_csv('Data/rebel_format.csv', index=False)
print(f"Number of skipped: {skipped}")

Number of skipped: 245


In [21]:
final_data

Unnamed: 0,id,title,context,triplets
0,0,0,The government passed a law to increase access...,<triplet> increase <subj> to <obj> intend
1,1,1,The United Nations has imposed an arms embargo...,<triplet> arms embargo <subj> terrorist activi...
2,2,2,The European Union has imposed economic sancti...,<triplet> sanctions <subj> interfering <obj> p...
3,3,3,It was a surprise move that could clear the wa...,<triplet> move <subj> run <obj> enable
4,4,4,The government introduced a new law to protect...,<triplet> introduced <subj> recognize <obj> in...
...,...,...,...,...
1572,1572,1572,The government passed a law to increase access...,<triplet> passed <subj> reduce <obj> intend
1573,1573,1573,The United States Congress passed a law to inc...,<triplet> passed <subj> improve <obj> intend
1574,1574,1574,The company has acknowledged a software glitch...,<triplet> causing <subj> acknowledged <obj> cause
1575,1575,1575,The government passed a law to increase access...,<triplet> increase <subj> to <obj> intend


In [27]:
# to get some statistics
num_zero = 0
num_no_effect = 0
index = 0
for line, label in zip(data['tag'], data['label']):
    flag = False

    if label == '0':
        num_zero +=1
        continue

    for tag in line.split():
        if tag == 'effect':
            flag = True
            break

    if flag == False:
        num_no_effect +=1
        print(f"{index} no effect")

    index +=1
print(f"number of no relations: {num_zero}")
print(f"number of no effects: {num_no_effect}")

25 no effect
122 no effect
123 no effect
255 no effect
353 no effect
461 no effect
519 no effect
544 no effect
646 no effect
650 no effect
736 no effect
800 no effect
880 no effect
935 no effect
1023 no effect
1097 no effect
1203 no effect
1227 no effect
1272 no effect
1347 no effect
1387 no effect
1704 no effect
1785 no effect
1816 no effect
number of no relations: 96
number of no effects: 24


In [130]:
data = pd.read_csv('Data/rebel/relation_data_updated.csv')
data= data.drop(data.columns[[0, 1, 2, 6, 8]], axis=1)
data = data[data.label != str(0)]

data['triplets'] = '<triplets> ' + data['trigger1'] + ' <subj> ' + data['trigger2'] + ' <obj> ' + data['label'] #Add the suitable format

In [35]:
data

Unnamed: 0,sentence,trigger1,trigger2,label,triplets
0,The government has implemented a series of law...,Laws,Unfair Labor Practices,prevent,<triplets> Laws <subj> Unfair Labor Practices ...
1,The government has implemented a series of law...,Laws,Pollution,prevent,<triplets> Laws <subj> Pollution <obj> prevent
2,The government has implemented a series of law...,Laws,Dangerous Products,prevent,<triplets> Laws <subj> Dangerous Products <obj...
3,The government has implemented a series of law...,Laws,Fraudulent Financial Practices,prevent,<triplets> Laws <subj> Fraudulent Financial Pr...
4,The government has implemented a series of mea...,Measures,Trafficking,prevent,<triplets> Measures <subj> Trafficking <obj> p...
...,...,...,...,...,...
2092,"DPC, an investor group led by New York-based C...",,,intend,
2093,But they failed to sell these stocks to client...,,,cause,
2094,The country has lifted nearly all virus-relate...,,,enable,
2096,"The episode, a ""distributed denial-of-service""...",,,prevent,


In [106]:
#This piece is used to extract the triples again
import re

sample_strings = ['<s><triplet> old <subj> unreliable <obj> cause</s><pad>', '<s><triplet> died <subj> storm <obj> cause</s><pad>']
pattern = r'>([^<]+)'
matches = [re.findall(pattern, instance) for instance in sample_strings]
df = pd.DataFrame(matches, columns =['Subject', 'Object', 'Relation'])
del matches

Unnamed: 0,Subject,Object,Relation
0,old,unreliable,cause
1,died,storm,cause


In [114]:
#Todo:
#Encode it using adjusted labelencoder
#Compute precision, recall

from sklearn import preprocessing
le_subject= preprocessing.LabelEncoder()

le_subject.fit(df['Subject'])
le_dict = dict(zip(le_subject.classes_, le_subject.transform(le_subject.classes_)))

df['Subject_encoding'] = df['Subject'].apply(lambda x: le_dict.get(x, 999999999)) #to run it over a column, high value otherwise to remain a number

In [141]:
import evaluate
f1_metric = evaluate.load("f1")
results = f1_metric.compute(predictions=df['Subject_encoding'].to_list(), references=df['Subject_encoding'].to_list())
print(results)

{'f1': 1.0}
