In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the event and time-expression annotation tables for the Gold and
# Platinum sets (blank CSV lines are skipped).
goldevents, platinumevents, goldtime, platinumtime = (
    pd.read_csv(csv_path, skip_blank_lines=True)
    for csv_path in (
        "../../data/TempEval3-data/TE3-Gold_events.csv",
        "../../data/TempEval3-data/TE3-Platinum_events.csv",
        "../../data/TempEval3-data/TE3-Gold_time.csv",
        "../../data/TempEval3-data/TE3-Platinum_time.csv",
    )
)

In [3]:
# Read the per-token tables of both sets (blank CSV lines are skipped).
goldtokens, platinumtokens = (
    pd.read_csv(csv_path, skip_blank_lines=True)
    for csv_path in (
        "../../data/TempEval3-data/TE3-Gold_tokens.csv",
        "../../data/TempEval3-data/TE3-Platinum_tokens.csv",
    )
)

In [4]:
# function to retrieve the intersection of two lists
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    The order and the duplicates of ``lst1`` are preserved.

    Parameters
    ----------
    lst1 : iterable
        Values to keep or drop.
    lst2 : iterable
        Values to test membership against.

    Returns
    -------
    list
        Every element of ``lst1`` (in order) that is contained in ``lst2``.
    """
    items = list(lst1)
    candidates = list(lst2)
    try:
        # A set gives constant-time membership tests, so the whole call is
        # linear instead of len(lst1) * len(lst2) comparisons.
        lookup = set(candidates)
        return [value for value in items if value in lookup]
    except TypeError:
        # Some value is unhashable: fall back to the plain list scan.
        return [value for value in items if value in candidates]

In [5]:
# Token level: for each annotation table, split the "Token POS" strings on
# spaces and collect the distinct units, in order of first appearance.
goldeventslist, platinumeventslist, goldtimelist, platinumtimelist = (
    list(table["Token POS"].str.split(' ', expand=True).stack().unique())
    for table in (goldevents, platinumevents, goldtime, platinumtime)
)

In [6]:
# Entries of goldeventslist that also occur in goldtimelist.
gold_event_gold_time_overlap = intersection(goldeventslist, goldtimelist)
print(gold_event_gold_time_overlap)

['at-IN', 'coming-VB', 'election-NN', 'end-NN', 'flat-JJ', 'following-VB', 'good-JJ', 'half-NN', 'in-IN', 'march-NN', 'part-NN', 'trading-NN', 'up-RB']


In [7]:
# Entries of goldeventslist that also occur in platinumtimelist.
gold_event_platinum_time_overlap = intersection(goldeventslist, platinumtimelist)
print(gold_event_platinum_time_overlap)

['at-IN', 'end-NN', 'following-VB', 'tenure-NN']


In [8]:
# Entries of platinumeventslist that also occur in goldtimelist.
platinum_event_gold_time_overlap = intersection(platinumeventslist, goldtimelist)
print(platinum_event_gold_time_overlap)

['end-NN', 'time-NN', '90-CD', '40-CD', 'good-JJ']


In [9]:
# Entries of platinumeventslist that also occur in platinumtimelist.
platinum_event_platinum_time_overlap = intersection(platinumeventslist, platinumtimelist)
print(platinum_event_platinum_time_overlap)

['season-NN', 'end-NN', 'time-NN', '90-CD']


In [10]:
# Lemma level: same construction as for tokens, but on the "Lemma POS" column.
# The four list names are rebound here, so the cells below now compare lemmas.
goldeventslist, platinumeventslist, goldtimelist, platinumtimelist = (
    list(table["Lemma POS"].str.split(' ', expand=True).stack().unique())
    for table in (goldevents, platinumevents, goldtime, platinumtime)
)

In [11]:
# Entries of goldeventslist that also occur in goldtimelist.
gold_event_gold_time_overlap_lemma = intersection(goldeventslist, goldtimelist)
print(gold_event_gold_time_overlap_lemma)

['at-IN', 'beginning-NN', 'come-VB', 'election-NN', 'end-NN', 'flat-JJ', 'follow-VB', 'good-JJ', 'half-NN', 'in-IN', 'march-NN', 'part-NN', 'period-NN', 'trading-NN', 'up-RB']


In [12]:
# Entries of goldeventslist that also occur in platinumtimelist.
gold_event_platinum_time_overlap_lemma = intersection(goldeventslist, platinumtimelist)
print(gold_event_platinum_time_overlap_lemma)

['at-IN', 'end-NN', 'follow-VB', 'tenure-NN']


In [13]:
# Entries of platinumeventslist that also occur in goldtimelist.
platinum_event_gold_time_overlap_lemma = intersection(platinumeventslist, goldtimelist)
print(platinum_event_gold_time_overlap_lemma)

['end-NN', 'come-VB', 'time-NN', '90-CD', '40-CD', 'good-JJ']


In [14]:
# Entries of platinumeventslist that also occur in platinumtimelist.
platinum_event_platinum_time_overlap_lemma = intersection(platinumeventslist, platinumtimelist)
print(platinum_event_platinum_time_overlap_lemma)

['season-NN', 'end-NN', 'time-NN', '90-CD']


## Check whether there are tokens or lemmas that are annotated as events in one dataset but not in the other dataset

In [16]:
# Distinct annotated strings per table. A single annotation may consist of
# several space-separated units, so entries can contain spaces.
p_event_tokens = list(set(platinumevents["Token POS"]))
g_event_tokens = list(set(goldevents["Token POS"]))
p_event_lemmas = list(set(platinumevents["Lemma POS"]))
g_event_lemmas = list(set(goldevents["Lemma POS"]))

p_time_tokens = list(set(platinumtime["Token POS"]))
g_time_tokens = list(set(goldtime["Token POS"]))
p_time_lemmas = list(set(platinumtime["Lemma POS"]))
g_time_lemmas = list(set(goldtime["Lemma POS"]))

# Split the time annotations into single-unit (no space) and multi-unit ones.
# These are built as real lists rather than `filter(...)` objects: on Python 3
# `filter` returns a one-shot iterator, and each of these names is read more
# than once by the diff cells further down, which would then see it empty.
p_time_tokens_single = [x for x in p_time_tokens if ' ' not in x]
g_time_tokens_single = [x for x in g_time_tokens if ' ' not in x]
p_time_lemmas_single = [x for x in p_time_lemmas if ' ' not in x]
g_time_lemmas_single = [x for x in g_time_lemmas if ' ' not in x]

p_time_tokens_multi = [x for x in p_time_tokens if ' ' in x]
g_time_tokens_multi = [x for x in g_time_tokens if ' ' in x]
p_time_lemmas_multi = [x for x in p_time_lemmas if ' ' in x]
g_time_lemmas_multi = [x for x in g_time_lemmas if ' ' in x]

# unique tokens
p_tokens = list(set(platinumtokens["Token POS"]))
g_tokens = list(set(goldtokens["Token POS"]))
p_lemmas = list(set(platinumtokens["Lemma POS"]))
g_lemmas = list(set(goldtokens["Lemma POS"]))

# all tokens (one entry per row, duplicates kept)
all_p_tokens = list(platinumtokens["Token POS"])
all_g_tokens = list(goldtokens["Token POS"])
all_p_lemmas = list(platinumtokens["Lemma POS"])
all_g_lemmas = list(goldtokens["Lemma POS"])

# all tokens that are not times: rows whose "Is Time Expression" flag is 0.
# The row filter is computed once per table and reused below.
_p_rows_not_marked = platinumtokens[platinumtokens["Is Time Expression"] == 0]
_g_rows_not_marked = goldtokens[goldtokens["Is Time Expression"] == 0]

all_p_tokens_not_marked = list(_p_rows_not_marked["Token POS"])
all_g_tokens_not_marked = list(_g_rows_not_marked["Token POS"])
all_p_lemmas_not_marked = list(_p_rows_not_marked["Lemma POS"])
all_g_lemmas_not_marked = list(_g_rows_not_marked["Lemma POS"])

# distinct values of the same unmarked rows
unique_p_tokens_not_marked = list(set(_p_rows_not_marked["Token POS"]))
unique_g_tokens_not_marked = list(set(_g_rows_not_marked["Token POS"]))
unique_p_lemmas_not_marked = list(set(_p_rows_not_marked["Lemma POS"]))
unique_g_lemmas_not_marked = list(set(_g_rows_not_marked["Lemma POS"]))

In [17]:
def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order and the duplicates of ``first`` are kept; ``second`` is turned
    into a set for the membership tests.
    """
    excluded = set(second)
    remaining = []
    for candidate in first:
        if candidate in excluded:
            continue
        remaining.append(candidate)
    return remaining

In [18]:
# Event annotations found in only one of the two sets, in both directions,
# first for the token strings and then for the lemma strings.
event_tokens_in_p_not_in_g, event_tokens_in_g_not_in_p = (
    diff(p_event_tokens, g_event_tokens),
    diff(g_event_tokens, p_event_tokens),
)
event_lemmas_in_p_not_in_g, event_lemmas_in_g_not_in_p = (
    diff(p_event_lemmas, g_event_lemmas),
    diff(g_event_lemmas, p_event_lemmas),
)

In [19]:
# NOTE: `intersection` is also defined in an earlier cell; this later
# definition is the one in effect from here on.
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    The order and the duplicates of ``lst1`` are preserved.

    Parameters
    ----------
    lst1 : iterable
        Values to keep or drop.
    lst2 : iterable
        Values to test membership against.

    Returns
    -------
    list
        Every element of ``lst1`` (in order) that is contained in ``lst2``.
    """
    items = list(lst1)
    candidates = list(lst2)
    try:
        # A set gives constant-time membership tests, so the whole call is
        # linear instead of len(lst1) * len(lst2) comparisons.
        lookup = set(candidates)
        return [value for value in items if value in lookup]
    except TypeError:
        # Some value is unhashable: fall back to the plain list scan.
        return [value for value in items if value in candidates]

In [20]:
# Platinum-only event tokens that nevertheless occur among the Gold tokens.
event_tokens_in_p_not_in_g_events_but_in_g_tokens = intersection(event_tokens_in_p_not_in_g, g_tokens)
print(len(event_tokens_in_p_not_in_g_events_but_in_g_tokens))
# Count every Gold token row carrying one of those values. Membership is
# tested against a set so the pass over all rows stays linear.
_wanted = set(event_tokens_in_p_not_in_g_events_but_in_g_tokens)
total_occ = sum(1 for tok in all_g_tokens if tok in _wanted)
print(total_occ)

30
188


In [21]:
# Platinum-only event lemmas that nevertheless occur among the Gold lemmas.
event_lemmas_in_p_not_in_g_events_but_in_g_lemmas = intersection(event_lemmas_in_p_not_in_g, g_lemmas)
print(len(event_lemmas_in_p_not_in_g_events_but_in_g_lemmas))
# Count every Gold lemma row carrying one of those values. Membership is
# tested against a set so the pass over all rows stays linear.
_wanted = set(event_lemmas_in_p_not_in_g_events_but_in_g_lemmas)
total_occ = sum(1 for lem in all_g_lemmas if lem in _wanted)
print(total_occ)

28
230


In [22]:
# Gold-only event tokens that nevertheless occur among the Platinum tokens.
event_tokens_in_g_not_in_p_events_but_in_p_tokens = intersection(event_tokens_in_g_not_in_p, p_tokens)
print(len(event_tokens_in_g_not_in_p_events_but_in_p_tokens))
# Count every Platinum token row carrying one of those values. Membership is
# tested against a set so the pass over all rows stays linear.
_wanted = set(event_tokens_in_g_not_in_p_events_but_in_p_tokens)
total_occ = sum(1 for tok in all_p_tokens if tok in _wanted)
print(total_occ)

162
678


In [23]:
# Gold-only event lemmas that nevertheless occur among the Platinum lemmas.
event_lemmas_in_g_not_in_p_events_but_in_p_lemmas = intersection(event_lemmas_in_g_not_in_p, p_lemmas)
print(len(event_lemmas_in_g_not_in_p_events_but_in_p_lemmas))
# Count every Platinum lemma row carrying one of those values. Membership is
# tested against a set so the pass over all rows stays linear.
_wanted = set(event_lemmas_in_g_not_in_p_events_but_in_p_lemmas)
total_occ = sum(1 for lem in all_p_lemmas if lem in _wanted)
print(total_occ)

140
552


In [24]:
# Each single-unit collection is read twice below (once per direction). If it
# is a lazy iterator (e.g. a Python 3 `filter` object) the first read would
# exhaust it and the second diff of each pair would come back empty, so the
# inputs are materialised first; for inputs that are already lists this is
# just a copy.
p_time_tokens_single = list(p_time_tokens_single)
g_time_tokens_single = list(g_time_tokens_single)
p_time_lemmas_single = list(p_time_lemmas_single)
g_time_lemmas_single = list(g_time_lemmas_single)

# Single-unit time annotations found in only one of the two sets.
time_tokens_in_p_not_in_g = diff(p_time_tokens_single, g_time_tokens_single)
time_tokens_in_g_not_in_p = diff(g_time_tokens_single, p_time_tokens_single)
time_lemmas_in_p_not_in_g = diff(p_time_lemmas_single, g_time_lemmas_single)
time_lemmas_in_g_not_in_p = diff(g_time_lemmas_single, p_time_lemmas_single)

In [25]:
# Platinum-only time tokens that occur among the Gold tokens not marked as time.
time_tokens_in_p_not_in_g_time_but_in_g_tokens = intersection(time_tokens_in_p_not_in_g, unique_g_tokens_not_marked)
print(len(time_tokens_in_p_not_in_g_time_but_in_g_tokens))
# Count every unmarked Gold token row carrying one of those values. Membership
# is tested against a set so the pass over all rows stays linear.
_wanted = set(time_tokens_in_p_not_in_g_time_but_in_g_tokens)
total_occ = sum(1 for tok in all_g_tokens_not_marked if tok in _wanted)
print(total_occ)
print(time_tokens_in_p_not_in_g_time_but_in_g_tokens)

4
41
['tenure-NN', 'minutes-NN', '12-CD', '58-CD']


In [26]:
# Platinum-only time lemmas that occur among the Gold lemmas not marked as time.
time_lemmas_in_p_not_in_g_time_but_in_g_lemmas = intersection(time_lemmas_in_p_not_in_g, unique_g_lemmas_not_marked)
print(len(time_lemmas_in_p_not_in_g_time_but_in_g_lemmas))
# Count every unmarked Gold lemma row carrying one of those values. Membership
# is tested against a set so the pass over all rows stays linear.
_wanted = set(time_lemmas_in_p_not_in_g_time_but_in_g_lemmas)
total_occ = sum(1 for lem in all_g_lemmas_not_marked if lem in _wanted)
print(total_occ)
print(time_lemmas_in_p_not_in_g_time_but_in_g_lemmas)

4
40
['tenure-NN', '12-CD', '58-CD', 'minute-NN']


In [27]:
# Gold-only time tokens that occur among the Platinum tokens not marked as time.
time_tokens_in_g_not_in_p_time_but_in_p_tokens = intersection(time_tokens_in_g_not_in_p, unique_p_tokens_not_marked)
print(len(time_tokens_in_g_not_in_p_time_but_in_p_tokens))
# Count every unmarked Platinum token row carrying one of those values.
# Membership is tested against a set so the pass over all rows stays linear.
_wanted = set(time_tokens_in_g_not_in_p_time_but_in_p_tokens)
total_occ = sum(1 for tok in all_p_tokens_not_marked if tok in _wanted)
print(total_occ)
print(time_tokens_in_g_not_in_p_time_but_in_p_tokens)

13
29
['already-RB', 'current-JJ', 'months-NN', 'previously-RB', 'day-NN', 'summer-NN', 'second-JJ', 'recently-RB', 'then-RB', 'two-CD', 'yet-RB', 'soon-RB', 'last-JJ']


In [28]:
# Gold-only time lemmas that occur among the Platinum lemmas not marked as time.
time_lemmas_in_g_not_in_p_time_but_in_p_lemmas = intersection(time_lemmas_in_g_not_in_p, unique_p_lemmas_not_marked)
print(len(time_lemmas_in_g_not_in_p_time_but_in_p_lemmas))
# Count every unmarked Platinum lemma row carrying one of those values.
# Membership is tested against a set so the pass over all rows stays linear.
_wanted = set(time_lemmas_in_g_not_in_p_time_but_in_p_lemmas)
total_occ = sum(1 for lem in all_p_lemmas_not_marked if lem in _wanted)
print(total_occ)
print(time_lemmas_in_g_not_in_p_time_but_in_p_lemmas)

12
28
['already-RB', 'current-JJ', 'previously-RB', 'summer-NN', 'second-JJ', 'recently-RB', 'then-RB', 'two-CD', 'yet-RB', 'soon-RB', 'last-JJ', 'month-NN']


In [29]:
# Each multi-unit collection is read twice below (once per direction). If it
# is a lazy iterator (e.g. a Python 3 `filter` object) the first read would
# exhaust it and the second diff of each pair would come back empty, so the
# inputs are materialised first; for inputs that are already lists this is
# just a copy.
p_time_tokens_multi = list(p_time_tokens_multi)
g_time_tokens_multi = list(g_time_tokens_multi)
p_time_lemmas_multi = list(p_time_lemmas_multi)
g_time_lemmas_multi = list(g_time_lemmas_multi)

# Multi-unit time annotations found in only one of the two sets. These rebind
# the names previously used for the single-unit results.
time_tokens_in_p_not_in_g = diff(p_time_tokens_multi, g_time_tokens_multi)
time_tokens_in_g_not_in_p = diff(g_time_tokens_multi, p_time_tokens_multi)
time_lemmas_in_p_not_in_g = diff(p_time_lemmas_multi, g_time_lemmas_multi)
time_lemmas_in_g_not_in_p = diff(g_time_lemmas_multi, p_time_lemmas_multi)

In [30]:
# Re-read the token tables (Gold now comes from the TBAQ-cleaned file) and keep
# the rows whose "Is Time Expression" flag is 0.
# NOTE(review): the "../TempEval3-data/" prefix differs from the
# "../../data/TempEval3-data/" prefix used by the earlier loads, and
# `goldtokens` is rebound to a different file here - confirm both are intended.
goldtokens, platinumtokens = (
    pd.read_csv(csv_path, skip_blank_lines=True)
    for csv_path in (
        "../TempEval3-data/TBAQ-cleaned_tokens.csv",
        "../TempEval3-data/TE3-Platinum_tokens.csv",
    )
)

gold_is_unmarked = goldtokens["Is Time Expression"] == 0
platinum_is_unmarked = platinumtokens["Is Time Expression"] == 0
goldtokens_not_marked = goldtokens.loc[gold_is_unmarked]
platinumtokens_not_marked = platinumtokens.loc[platinum_is_unmarked]

In [31]:
try:
    from itertools import izip as zip
except ImportError: # will be 3.x series
    pass
from itertools import product

def find_continuous_items(data):
    """Yield every run of consecutive positions across the lists in ``data``.

    A run is a tuple ``(p0, p1, ..., pk)`` that takes one value from each list
    of ``data`` (in order) such that every value is exactly one larger than
    the value before it.

    Instead of enumerating the full Cartesian product of the lists (whose size
    is the product of their lengths), each value of the first list is tried as
    a start and the required followers are looked up directly. The tuples are
    produced in the same order, and with the same multiplicity, as the
    product-based enumeration.

    Parameters
    ----------
    data : sequence of iterables of int
        One collection of integer positions per element of the expression.
        Values are assumed to be integers (they are used as lookup keys).

    Yields
    ------
    tuple
        One tuple per run found; a single empty tuple when ``data`` is empty.
    """
    from collections import Counter

    columns = [list(column) for column in data]
    if not columns:
        # product() of nothing yields one empty tuple, which is trivially a run.
        yield ()
        return

    width = len(columns)
    # How often each position occurs in every list after the first one.
    follower_counts = [Counter(column) for column in columns[1:]]

    for start in columns[0]:
        # Number of ways to pick the followers start+1, start+2, ...
        repeats = 1
        for offset, counts in enumerate(follower_counts, start=1):
            repeats *= counts.get(start + offset, 0)
            if not repeats:
                break
        for _ in range(repeats):
            yield tuple(start + offset for offset in range(width))

In [32]:
# For every expression in time_tokens_in_p_not_in_g, look for it as a run of
# consecutive index labels among the Gold token rows not marked as time.
# `distinct` counts the expressions found at least once, `total` all matches.
distinct = 0
total = 0

for item in time_tokens_in_p_not_in_g:

    indexes = []

    for elem in item.split(" "):
        # Units look like "<token>-<POS>" (see the printed lists): strip only
        # the final "-<POS>" so a token that itself contains a hyphen is looked
        # up whole instead of being cut at its first hyphen.
        word = elem.rsplit("-", 1)[0]
        indexes.append(goldtokens_not_marked.index[goldtokens_not_marked['Lowercase Token'] == word].tolist())

    occurrences = list(find_continuous_items(indexes))

    if occurrences:
        distinct += 1
        print(item)

    total += len(occurrences)

print(distinct)
print(total)

at-IN the-DT time-NN
an-DT hour-NN
2
3


In [33]:
# For every expression in time_tokens_in_g_not_in_p, look for it as a run of
# consecutive index labels among the Platinum token rows not marked as time.
# `distinct` counts the expressions found at least once, `total` all matches.
distinct = 0
total = 0

for item in time_tokens_in_g_not_in_p:

    indexes = []

    for elem in item.split(" "):
        # Units look like "<token>-<POS>" (see the printed lists): strip only
        # the final "-<POS>" so a token that itself contains a hyphen is looked
        # up whole instead of being cut at its first hyphen.
        word = elem.rsplit("-", 1)[0]
        indexes.append(platinumtokens_not_marked.index[platinumtokens_not_marked['Lowercase Token'] == word].tolist())

    occurrences = list(find_continuous_items(indexes))

    if occurrences:
        distinct += 1
        print(item)

    total += len(occurrences)

print(distinct)
print(total)

the-DT time-NN
1
2


In [34]:
# For every expression in time_lemmas_in_p_not_in_g, look for it as a run of
# consecutive index labels among the Gold rows not marked as time, matching on
# the lemma column. `distinct` counts the expressions found at least once,
# `total` all matches.
distinct = 0
total = 0

for item in time_lemmas_in_p_not_in_g:

    indexes = []

    for elem in item.split(" "):
        # Units look like "<lemma>-<POS>" (see the printed lists): strip only
        # the final "-<POS>" so a lemma that itself contains a hyphen is looked
        # up whole instead of being cut at its first hyphen.
        word = elem.rsplit("-", 1)[0]
        indexes.append(goldtokens_not_marked.index[goldtokens_not_marked['Lowercase Lemma Stanford'] == word].tolist())

    occurrences = list(find_continuous_items(indexes))

    if occurrences:
        distinct += 1
        print(item)

    total += len(occurrences)

print(distinct)
print(total)

at-IN the-DT time-NN
a-DT hour-NN
2
3


In [35]:
# For every expression in time_lemmas_in_g_not_in_p, look for it as a run of
# consecutive index labels among the Platinum rows not marked as time, matching
# on the lemma column. `distinct` counts the expressions found at least once,
# `total` all matches.
distinct = 0
total = 0

for item in time_lemmas_in_g_not_in_p:

    indexes = []

    for elem in item.split(" "):
        # Units look like "<lemma>-<POS>" (see the printed lists): strip only
        # the final "-<POS>" so a lemma that itself contains a hyphen is looked
        # up whole instead of being cut at its first hyphen.
        word = elem.rsplit("-", 1)[0]
        indexes.append(platinumtokens_not_marked.index[platinumtokens_not_marked['Lowercase Lemma Stanford'] == word].tolist())

    occurrences = list(find_continuous_items(indexes))

    if occurrences:
        distinct += 1
        print(item)

    total += len(occurrences)

print(distinct)
print(total)

the-DT time-NN
1
2
