# Transform the data to work with Snorkel: Part 2 - Event Role

Here we will do most of the work creating a labeling model that assigns labels to argument roles in event mentions.
We need to create a row for each pair of trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [1]:
import sys
# Put the parent directory on the import path; the `wsee` imports below
# presumably resolve from there when the notebook runs in its own folder.
sys.path.append("../")
import warnings
import pickle
from pathlib import Path
from wsee.utils import utils
from wsee.data import pipeline

# Print each matching warning only the first time it occurs.
warnings.filterwarnings(action='once')

# Root folder of the corpus; the pickle caches and the exported JSONL below are written here too.
DATA_DIR = '../data/daystream_corpus'  # replace path to corpus

### SD4M Relation/ Event Arguments

| Number | Code       | Description                                                                 |
|--------|------------|-----------------------------------------------------------------------------|
| -1     | ABSTAIN    | No vote, for Labeling Functions                                             |
| 0      | location   | Required argument for all events denoting the location.                     |
| 1      | delay      | Optional argument denoting the delay associated with the event.             |
| 2      | direction  | Optional argument denoting the direction associated with the event.         |
| 3      | start_loc  | Optional argument denoting the starting location associated with the event. |
| 4      | end_loc    | Optional argument denoting the ending location associated with the event.   |
| 5      | start_date | Optional argument denoting the start date associated with the event.        |
| 6      | end_date   | Optional argument denoting the end date associated with the event.          |
| 7      | cause      | Optional argument (trigger) denoting the cause associated with the event.   |
| 8      | jam_length | Optional argument denoting the jam length of a traffic jam event.           |
| 9      | route      | Optional argument denoting the route affected by a canceled stop event.     |
| 10     | no_arg     | No argument relation with the specified trigger.                            |

In [2]:
# Read all corpus splits in one call, then bind each split to its own name.
loaded_data = pipeline.load_data(DATA_DIR)
sd_train, sd_dev, sd_test, daystream = (
    loaded_data[split_name]
    for split_name in ('train', 'dev', 'test', 'daystream')
)

INFO:root:Reading train data from: ../data/daystream_corpus/train/train_with_events_and_defaults.jsonl
INFO:root:Reading dev data from: ../data/daystream_corpus/dev/dev_with_events_and_defaults.jsonl
INFO:root:Reading test data from: ../data/daystream_corpus/test/test_with_events_and_defaults.jsonl
INFO:root:Reading daystream data from: ../data/daystream_corpus/daystream.jsonl


In [3]:
# Preview the first rows of the training split to see which columns are available.
sd_train.head()

Unnamed: 0,id,text,tokens,docType,pos_tags,ner_tags,entities,sentence_spans,event_triggers,event_roles
0,648762527074095104,"Update: 1 Toter, 1 Schwerverletzter bei Unfall...","[Update, :, 1, Toter, ,, 1, Schwerverletzter, ...",TWITTER_JSON,"[NN, $., CARD, NN, $,, CARD, NE, APPR, NN, APP...","[O, O, B-NUMBER, O, O, B-NUMBER, O, O, B-TRIGG...",[{'id': 'c/017598cd-23d0-4d7e-9d7a-c8c8dd08a07...,"[{'start': 0, 'end': 12, 'char_start': 0, 'cha...",[{'id': 'c/2cfe523f-1270-453b-bbb5-9614ca3c396...,[{'trigger': 'c/2cfe523f-1270-453b-bbb5-9614ca...
1,629705386166255617,JETZT die #Abendschau u.a. mit #Imtech-Pleite ...,"[JETZT, die, #Abendschau, u.a., mit, #Imtech, ...",TWITTER_JSON,"[ADV, ART, NN, ADV, APPR, NN, $[, NN, NE, $,, ...","[O, O, O, O, O, B-ORGANIZATION_COMPANY, O, B-T...",[{'id': 'c/03190ebe-54b6-4dee-9736-74664745640...,"[{'start': 0, 'end': 15, 'char_start': 0, 'cha...",[{'id': 'c/57653502-9eea-4080-8e27-d8669de4eaf...,[{'trigger': 'c/57653502-9eea-4080-8e27-d8669d...
2,http://verkehrsmeldungen.polizei-bw.de//TICRss...,Karlsruhe - Basel zwischen Offenburg und Lahr ...,"[Karlsruhe, -, Basel, zwischen, Offenburg, und...",RSS_XML,"[NE, $[, NE, APPR, NE, KON, NE, APPR, PIDAT, N...","[B-LOCATION_CITY, O, B-LOCATION_CITY, O, B-LOC...",[{'id': 'c/1b7e7f99-c4a9-4648-a254-3ed13eb1d26...,"[{'start': 0, 'end': 19, 'char_start': 0, 'cha...",[],[]
3,728566410813771776,"Streik in GR ,Schiff kommt mit 4 Std. Verspätu...","[Streik, in, GR, ,, Schiff, kommt, mit, 4, Std...",TWITTER_JSON,"[NN, APPR, NN, $,, NN, VVFIN, APPR, CARD, NN, ...","[O, O, B-LOCATION, O, O, O, O, B-DURATION, I-D...",[{'id': 'c/c7c14427-2f9e-4669-a252-b0c71e87c51...,"[{'start': 0, 'end': 10, 'char_start': 0, 'cha...",[],[]
4,743822956195876864,Es ist kalt :((\nIch muss auf die Bahn warten ...,"[Es, ist, kalt, :(, (, Ich, muss, auf, die, Ba...",TWITTER_JSON,"[PPER, VAFIN, ADJD, TRUNC, TRUNC, PPER, VMFIN,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],"[{'start': 0, 'end': 22, 'char_start': 0, 'cha...",[],[]


## Step 1: Create one row for each trigger-entity pair (event role)

In [4]:
# Cache file for the SD4M training role examples and their labels.
dataframe_file = DATA_DIR + '/pickled_sd_train_role_examples'
pickled_dataframe_file = Path(f'{dataframe_file}.pkl')

df_sd_train = Y_sd_train = None

if not pickled_dataframe_file.exists():
    # Cache miss: build one row per trigger/entity pair and store the result.
    df_sd_train, Y_sd_train = pipeline.build_event_role_examples(sd_train)
    with pickled_dataframe_file.open('wb') as cache_out:
        pickle.dump((df_sd_train, Y_sd_train), cache_out)
else:
    # Cache hit. Unpickling can execute arbitrary code, so only load
    # cache files that were written by this notebook.
    with pickled_dataframe_file.open('rb') as cache_in:
        df_sd_train, Y_sd_train = pickle.load(cache_in)

INFO:root:Building event role examples
INFO:root:DataFrame has 1273 rows
INFO:root:Adding the following attributes to each document: entity_type_freqs, somajo_doc, mixed_ner, mixed_ner_spans
1273it [00:12, 103.50it/s]
INFO:root:Adding the following attributes to each role example: not_an_event, arg_type_event_type_match, between_distance, is_multiple_same_event_type
INFO:root:Number of event roles: 2001
INFO:root:Number of event role examples: 7285


In [5]:
# Cache file for the SD4M development role examples and their labels.
dataframe_file = DATA_DIR + '/pickled_sd_dev_role_examples'
pickled_dataframe_file = Path(f'{dataframe_file}.pkl')

df_sd_dev = Y_sd_dev = None

if not pickled_dataframe_file.exists():
    # Cache miss: build the examples from the dev split and store them.
    df_sd_dev, Y_sd_dev = pipeline.build_event_role_examples(sd_dev)
    with pickled_dataframe_file.open('wb') as cache_out:
        pickle.dump((df_sd_dev, Y_sd_dev), cache_out)
else:
    # Cache hit. Unpickling can execute arbitrary code, so only load
    # cache files that were written by this notebook.
    with pickled_dataframe_file.open('rb') as cache_in:
        df_sd_dev, Y_sd_dev = pickle.load(cache_in)

INFO:root:Building event role examples
INFO:root:DataFrame has 147 rows
INFO:root:Adding the following attributes to each document: entity_type_freqs, somajo_doc, mixed_ner, mixed_ner_spans
147it [00:00, 186.29it/s]
INFO:root:Adding the following attributes to each role example: not_an_event, arg_type_event_type_match, between_distance, is_multiple_same_event_type
INFO:root:Number of event roles: 195
INFO:root:Number of event role examples: 491


In [6]:
from wsee import ROLE_LABELS
# Show the role names; a name's list position is its numeric code in the table above.
print(ROLE_LABELS)

['location', 'delay', 'direction', 'start_loc', 'end_loc', 'start_date', 'end_date', 'cause', 'jam_length', 'route', 'no_arg']


## Step 2: Explore the data

In [7]:
# NOTE(review): wildcard import. The visible cells only use `pre_between_tokens`
# and `pre_between_distance` from it; consider importing those names explicitly
# once it is confirmed that no later cell relies on other names.
from wsee.preprocessors.preprocessors import *
from wsee.data import explore

We can apply all our preprocessors on our data and see if we can find something interesting for our labeling functions. Let's first sample the SD4M training data, which is labeled.

In [8]:
# Cache file for the exploration frame (training examples plus labels and extra columns).
dataframe_file = DATA_DIR + '/pickled_labeled_sd4m_role_examples'
pickled_dataframe_file = Path(f'{dataframe_file}.pkl')

labeled_sd4m_roles = None

if not pickled_dataframe_file.exists():
    # Cache miss: attach the labels, run the two "between" preprocessors,
    # then add the event type and event argument role columns.
    labeled_sd4m_roles = explore.apply_preprocessors(
        explore.add_labels(df_sd_train, Y_sd_train),
        [pre_between_tokens, pre_between_distance],
    )
    for add_columns in (explore.add_event_types, explore.add_event_arg_roles):
        labeled_sd4m_roles = add_columns(labeled_sd4m_roles)
    with pickled_dataframe_file.open('wb') as cache_out:
        pickle.dump(labeled_sd4m_roles, cache_out)
else:
    # Cache hit. Unpickling can execute arbitrary code, so only load
    # cache files that were written by this notebook.
    with pickled_dataframe_file.open('rb') as cache_in:
        labeled_sd4m_roles = pickle.load(cache_in)

100%|██████████| 2/2 [00:29<00:00, 14.94s/it]


Let's first take a look at the trigger and argument text, and the entity types!

In [9]:
import pandas as pd

# Show full cell contents instead of truncating long strings.
# `None` is the supported way to disable the width limit since pandas 1.0;
# the old sentinel -1 is deprecated there and rejected by pandas 2.x.
# Fall back to -1 only for pandas versions that do not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [10]:
# explore.sample_data(labeled_sd4m_roles[labeled_sd4m_roles['label']==6], columns=['text', 'between_tokens', 'trigger', 'argument', 'between_distance', 'label', 'event_types', 'event_arg_roles'])

Now we can collect the most frequent trigger-argument pairs per class.

In [11]:
# `n`, `filtered_sd4m_roles` and `class_pairs` are not read in this cell;
# they are kept because other cells of the notebook may use them.
n = 100
filtered_sd4m_roles = labeled_sd4m_roles.loc[labeled_sd4m_roles['label'].ne(10)]
class_pairs = dict()

# Report the total number of role examples, then the count per role class.
print("Number of event-roles: {}\n".format(len(labeled_sd4m_roles)))
for idx in range(len(ROLE_LABELS)):
    class_name = ROLE_LABELS[idx]
    class_sd4m_roles = labeled_sd4m_roles.loc[labeled_sd4m_roles['label'].eq(idx)]
    print("{}: {} instances".format(class_name, len(class_sd4m_roles)))

Number of event-roles: 7285

location: 571 instances
delay: 87 instances
direction: 277 instances
start_loc: 377 instances
end_loc: 352 instances
start_date: 35 instances
end_date: 41 instances
cause: 103 instances
jam_length: 135 instances
route: 23 instances
no_arg: 5284 instances


Only checking the argument text probably does not give us much, but it shall serve as an example.

## Step 3: Evaluate the labeling functions on the SD4M training data

In [12]:
from wsee.labeling import event_argument_role_lfs as role_lfs

### Apply the labeling functions

In [13]:
from snorkel.labeling import PandasLFApplier
from wsee.data.pipeline import get_role_list_lfs

# The project's list of role labeling functions; their order defines the
# column order of every label matrix produced by `applier` below.
lfs = get_role_list_lfs()

# Applier that runs every labeling function in `lfs` over the rows of a pandas DataFrame.
applier = PandasLFApplier(lfs)

In [14]:
# Label matrix for the SD4M training examples: one row per example, one column per labeling function.
L_sd_train = applier.apply(df_sd_train)

100%|██████████| 7285/7285 [01:04<00:00, 112.72it/s]


In [15]:
from snorkel.labeling import LFAnalysis

# Per-LF statistics on the training label matrix; passing the labels adds
# the Correct / Incorrect / Emp. Acc. columns to the summary.
sd_train_lf_analysis = LFAnalysis(L=L_sd_train, lfs=lfs)
sd_train_lf_analysis.lf_summary(Y=Y_sd_train)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_location_adjacent_markers,0,[0],0.007412,0.003844,0.0,47,7,0.87037
lf_location_adjacent_trigger_verb,1,[0],0.003157,0.002471,0.000549,22,1,0.956522
lf_location_beginning_street_stop_route,2,[0],0.027454,0.027316,0.0,190,10,0.95
lf_location_first_sentence_street_stop_route,3,[0],0.05779,0.05779,0.000412,388,33,0.921615
lf_location_first_sentence_priorities,4,[0],0.063555,0.059437,0.000412,411,52,0.887689
lf_delay_event_sentence,5,[1],0.013315,0.006314,0.001098,83,14,0.85567
lf_delay_preceding_arg,6,[1],0.002608,0.002608,0.0,19,0,1.0
lf_delay_preceding_trigger,7,[1],0.002745,0.002745,0.000137,20,0,1.0
lf_direction_markers,8,[2],0.040082,0.038161,0.000549,249,43,0.85274
lf_direction_markers_order,9,[2],0.036925,0.036925,0.000549,238,31,0.884758


## Step 4: Error Analysis

In [16]:
from wsee.labeling import error_analysis

In [17]:
# Rows on which the first labeling function (column 0 of the label matrix)
# voted 0, i.e. "location". This assumes `labeled_sd4m_roles` has the same
# row order as `L_sd_train` - verify if the cached frame might be stale.
lf0_voted_location = L_sd_train[:, 0] == 0
relevant_rows = labeled_sd4m_roles.iloc[lf0_voted_location]
print(len(relevant_rows))

# Display one randomly drawn row with the columns needed for inspection.
inspect_columns = ['text', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']
relevant_rows.sample()[inspect_columns]

54


Unnamed: 0,text,trigger,argument,label,event_types,event_arg_roles
5603,https://t.co/rJmMnlRvrb Vorangegangene Streckensperrung: BOB 86833 (München Hbf ab 18:05 Uhr) fällt zwischen Schliersee und Bayrischzell a…\n,"{'id': 'c/e7f7cb77-fdff-4043-a571-4384ee60f850', 'text': 'Streckensperrung', 'entity_type': 'trigger', 'start': 2, 'end': 3, 'char_start': 39, 'char_end': 55}","{'id': 'c/46383635-4def-46e3-b84e-d7c72daefcff', 'text': 'BOB 86833', 'entity_type': 'location_route', 'start': 4, 'end': 6, 'char_start': 57, 'char_end': 66}",0,"[(Streckensperrung, (39, 55), 1)]","[((Streckensperrung, (39, 55), 1), (BOB 86833, location_route, (57, 66)), 0), ((Streckensperrung, (39, 55), 1), (18:05, time, (83, 88)), 5), ((Streckensperrung, (39, 55), 1), (Schliersee, location_stop, (109, 119)), 3), ((Streckensperrung, (39, 55), 1), (Bayrischzell, location_stop, (124, 136)), 4)]"


In [18]:
# Sample (per the helper's name) false positives of the labeling function at
# index 36 for label 10 ("no_arg"); confirm the exact semantics in
# wsee.labeling.error_analysis.
fp_columns = ['between_tokens', 'trigger', 'argument', 'somajo_doc', 'label', 'event_types', 'event_arg_roles']
fp_sample = error_analysis.sample_fp(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_sd_train,
    lf_index=36,
    label_of_interest=10,
    sample_size=12,
)
fp_sample[fp_columns]

Unnamed: 0,between_tokens,trigger,argument,somajo_doc,label,event_types,event_arg_roles
5084,"[(, 17:48, ), ohne, Halt]","{'id': 'c/0e68656e-bb16-44ba-899b-d96194262c7b', 'text': 'umgeleitet.Mit', 'entity_type': 'trigger', 'start': 15, 'end': 16, 'char_start': 76, 'char_end': 90}","{'id': 'c/d876c401-979b-40c2-bdb1-c31d7d0506b8', 'text': '#Mönchengladbach', 'entity_type': 'location_stop', 'start': 9, 'end': 10, 'char_start': 41, 'char_end': 57}","{'doc': [[#RE4, wird, ab, #Düsseldorf, Hbf, (, 17:26, ), bis, #Mönchengladbach, (, 17:48, ), ohne, Halt, umgeleitet, .], [Mit, 20, Minuten, Verspätung, muss, in, MG, gerechnet, werden]], 'tokens': ['#RE4', 'wird', 'ab', '#Düsseldorf', 'Hbf', '(', '17:26', ')', 'bis', '#Mönchengladbach', '(', '17:48', ')', 'ohne', 'Halt', 'umgeleitet', '.', 'Mit', '20', 'Minuten', 'Verspätung', 'muss', 'in', 'MG', 'gerechnet', 'werden'], 'sentences': [{'text': '#RE4 wird ab #Düsseldorf Hbf (17:26) bis #Mönchengladbach (17:48) ohne Halt umgeleitet.', 'start': 0, 'end': 17, 'char_start': 0, 'char_end': 87}, {'text': 'Mit 20 Minuten Verspätung muss in MG gerechnet werden', 'start': 17, 'end': 26, 'char_start': 87, 'char_end': 140}]}",4,"[(umgeleitet.Mit, (76, 90), 1)]","[((umgeleitet.Mit, (76, 90), 1), (#RE4, location_route, (0, 4)), 0), ((umgeleitet.Mit, (76, 90), 1), (#Düsseldorf Hbf, location_stop, (13, 28)), 3), ((umgeleitet.Mit, (76, 90), 1), (#Mönchengladbach, location_stop, (41, 57)), 4)]"
3511,"[in, Düsseldorf, Flugh, ., Terminal, ist, die, Strecke, der, #S11]","{'id': 'c/60aa18dc-ba83-4e47-ad21-bc9b9f10f97e', 'text': 'gesperrt', 'entity_type': 'trigger', 'start': 13, 'end': 14, 'char_start': 89, 'char_end': 97}","{'id': 'c/b2aebcd4-58a2-453e-b51e-59bdc453a9b2', 'text': 'polizeilicher Ermittlung', 'entity_type': 'trigger', 'start': 1, 'end': 3, 'char_start': 9, 'char_end': 33}","{'doc': [[Aufgrund, polizeilicher, Ermittlung, in, Düsseldorf, Flugh, .], [Terminal, ist, die, Strecke, der, #S11, gesperrt, ., https://t.co/txnhoB02BN, #bahn, #NW]], 'tokens': ['Aufgrund', 'polizeilicher', 'Ermittlung', 'in', 'Düsseldorf', 'Flugh', '.', 'Terminal', 'ist', 'die', 'Strecke', 'der', '#S11', 'gesperrt', '.', 'https://t.co/txnhoB02BN', '#bahn', '#NW'], 'sentences': [{'text': 'Aufgrund polizeilicher Ermittlung in Düsseldorf Flugh.', 'start': 0, 'end': 7, 'char_start': 0, 'char_end': 54}, {'text': 'Terminal ist die Strecke der #S11 gesperrt. https://t.co/txnhoB02BN #bahn #NW', 'start': 7, 'end': 18, 'char_start': 55, 'char_end': 132}]}",7,"[(polizeilicher Ermittlung, (9, 33), 7), (gesperrt, (89, 97), 1)]","[((gesperrt, (89, 97), 1), (polizeilicher Ermittlung, trigger, (9, 33)), 7), ((gesperrt, (89, 97), 1), (#S11, location_route, (84, 88)), 0)]"
6947,"[Zeitverlust, von, bis, zu]","{'id': 'c/effa4b65-ca94-4b3e-bb1b-3d37ed3868c1', 'text': 'Stau.', 'entity_type': 'trigger', 'start': 13, 'end': 15, 'char_start': 92, 'char_end': 97}","{'id': 'c/cf591883-1a6e-40af-892a-f7b2a0f1847c', 'text': 'ein-einhalb Stunden.', 'entity_type': 'duration', 'start': 19, 'end': 24, 'char_start': 121, 'char_end': 141}","{'doc': [[A6, Nürnberg, Richtung, Heilbronn, zwischen, Herrieden, und, Kreuz, Feuchtwangen, /, Crailsheim, 18, km, Stau, .], [Zeitverlust, von, bis, zu, ein-einhalb, Stunden, .]], 'tokens': ['A6', 'Nürnberg', 'Richtung', 'Heilbronn', 'zwischen', 'Herrieden', 'und', 'Kreuz', 'Feuchtwangen', '/', 'Crailsheim', '18', 'km', 'Stau', '.', 'Zeitverlust', 'von', 'bis', 'zu', 'ein-einhalb', 'Stunden', '.'], 'sentences': [{'text': 'A6 Nürnberg Richtung Heilbronn zwischen Herrieden und Kreuz Feuchtwangen / Crailsheim 18 km Stau.', 'start': 0, 'end': 15, 'char_start': 0, 'char_end': 97}, {'text': 'Zeitverlust von bis zu ein-einhalb Stunden.', 'start': 15, 'end': 22, 'char_start': 98, 'char_end': 141}]}",1,"[(Stau., (92, 97), 6)]","[((Stau., (92, 97), 6), (A6, location_street, (0, 2)), 0), ((Stau., (92, 97), 6), (Heilbronn, location_city, (21, 30)), 2), ((Stau., (92, 97), 6), (Herrieden, location, (40, 49)), 3), ((Stau., (92, 97), 6), (Kreuz Feuchtwangen / Crailsheim, location, (54, 85)), 4), ((Stau., (92, 97), 6), (18 km, distance, (86, 91)), 8), ((Stau., (92, 97), 6), (ein-einhalb Stunden., duration, (121, 141)), 1)]"
5651,"[., Dort, wird, der, Verkehr, über, die, Parallelfahrbahn, geleitet, ., (, Zeitverlust, :, etwa, eine]","{'id': 'c/ab4a9a89-3720-4d18-b27c-f10a42829f31', 'text': 'Stau', 'entity_type': 'trigger', 'start': 15, 'end': 16, 'char_start': 96, 'char_end': 100}","{'id': 'c/9cafe0b7-fa7d-43ac-b247-8e1b3e9e551d', 'text': 'halbe Stunde', 'entity_type': 'duration', 'start': 31, 'end': 33, 'char_start': 184, 'char_end': 196}","{'doc': [[A43, Wuppertal, Richtung, Recklinghausen, zwischen, Witten-Herbede, und, Bochum-Querenburg, Unfall, 5, km, Stau, .], [Dort, wird, der, Verkehr, über, die, Parallelfahrbahn, geleitet, .], [(, Zeitverlust, :, etwa, eine, halbe, Stunde, )]], 'tokens': ['A43', 'Wuppertal', 'Richtung', 'Recklinghausen', 'zwischen', 'Witten-Herbede', 'und', 'Bochum-Querenburg', 'Unfall', '5', 'km', 'Stau', '.', 'Dort', 'wird', 'der', 'Verkehr', 'über', 'die', 'Parallelfahrbahn', 'geleitet', '.', '(', 'Zeitverlust', ':', 'etwa', 'eine', 'halbe', 'Stunde', ')'], 'sentences': [{'text': 'A43 Wuppertal Richtung Recklinghausen zwischen Witten-Herbede und Bochum-Querenburg Unfall 5 km Stau.', 'start': 0, 'end': 13, 'char_start': 0, 'char_end': 101}, {'text': 'Dort wird der Verkehr über die Parallelfahrbahn geleitet.', 'start': 13, 'end': 22, 'char_start': 102, 'char_end': 159}, {'text': '(Zeitverlust: etwa eine halbe Stunde)', 'start': 22, 'end': 30, 'char_start': 160, 'char_end': 197}]}",1,"[(Unfall, (84, 90), 0), (Stau, (96, 100), 6)]","[((Unfall, (84, 90), 0), (A43, location_street, (0, 3)), 0), ((Unfall, (84, 90), 0), (Recklinghausen, location_city, (23, 37)), 2), ((Unfall, (84, 90), 0), (Witten-Herbede, location, (47, 61)), 3), ((Unfall, (84, 90), 0), (Bochum-Querenburg, location, (66, 83)), 4), ((Unfall, (84, 90), 0), (halbe Stunde, duration, (184, 196)), 1), ((Stau, (96, 100), 6), (A43, location_street, (0, 3)), 0), ((Stau, (96, 100), 6), (Recklinghausen, location_city, (23, 37)), 2), ((Stau, (96, 100), 6), (Witten-Herbede, location, (47, 61)), 3), ((Stau, (96, 
100), 6), (Bochum-Querenburg, location, (66, 83)), 4), ((Stau, (96, 100), 6), (5 km, distance, (91, 95)), 8), ((Stau, (96, 100), 6), (halbe Stunde, duration, (184, 196)), 1)]"
1628,"[und, Affoltern]","{'id': 'c/0a52410a-7bf3-4d57-bddd-ff64c01840e6', 'text': 'stockender Verkehr', 'entity_type': 'trigger', 'start': 16, 'end': 18, 'char_start': 70, 'char_end': 88}","{'id': 'c/f4a67c97-227c-4dbd-a052-3258fbac80f8', 'text': 'Verzw. Zürich-Nord', 'entity_type': 'location', 'start': 9, 'end': 14, 'char_start': 36, 'char_end': 54}","{'doc': [[#A1, -, St., Gallen, ->, Bern, -, Zwischen, Verzw, .], [Zürich-Nord, und, Affoltern, stockender, Verkehr, ,, Überlastung]], 'tokens': ['#A1', '-', 'St.', 'Gallen', '->', 'Bern', '-', 'Zwischen', 'Verzw', '.', 'Zürich-Nord', 'und', 'Affoltern', 'stockender', 'Verkehr', ',', 'Überlastung'], 'sentences': [{'text': '#A1 - St. Gallen -> Bern - Zwischen Verzw.', 'start': 0, 'end': 10, 'char_start': 0, 'char_end': 42}, {'text': 'Zürich-Nord und Affoltern stockender Verkehr, Überlastung', 'start': 10, 'end': 17, 'char_start': 43, 'char_end': 101}]}",3,"[(stockender Verkehr, (70, 88), 6)]","[((stockender Verkehr, (70, 88), 6), (#A1, location_street, (0, 3)), 0), ((stockender Verkehr, (70, 88), 6), (St. Gallen -> Bern, location_route, (6, 24)), 0), ((stockender Verkehr, (70, 88), 6), (Bern, location_city, (20, 24)), 2), ((stockender Verkehr, (70, 88), 6), (Verzw. Zürich-Nord, location, (36, 54)), 3), ((stockender Verkehr, (70, 88), 6), (Affoltern, location, (60, 69)), 4)]"
1626,"[-, Zwischen, Verzw, ., Zürich, -, Nord, und, Affoltern]","{'id': 'c/0a52410a-7bf3-4d57-bddd-ff64c01840e6', 'text': 'stockender Verkehr', 'entity_type': 'trigger', 'start': 16, 'end': 18, 'char_start': 70, 'char_end': 88}","{'id': 'c/e37cfd47-f82c-468c-9f8b-ffa28b3a5e6a', 'text': 'St. Gallen -> Bern', 'entity_type': 'location_route', 'start': 2, 'end': 7, 'char_start': 6, 'char_end': 24}","{'doc': [[#A1, -, St., Gallen, ->, Bern, -, Zwischen, Verzw, .], [Zürich-Nord, und, Affoltern, stockender, Verkehr, ,, Überlastung]], 'tokens': ['#A1', '-', 'St.', 'Gallen', '->', 'Bern', '-', 'Zwischen', 'Verzw', '.', 'Zürich-Nord', 'und', 'Affoltern', 'stockender', 'Verkehr', ',', 'Überlastung'], 'sentences': [{'text': '#A1 - St. Gallen -> Bern - Zwischen Verzw.', 'start': 0, 'end': 10, 'char_start': 0, 'char_end': 42}, {'text': 'Zürich-Nord und Affoltern stockender Verkehr, Überlastung', 'start': 10, 'end': 17, 'char_start': 43, 'char_end': 101}]}",0,"[(stockender Verkehr, (70, 88), 6)]","[((stockender Verkehr, (70, 88), 6), (#A1, location_street, (0, 3)), 0), ((stockender Verkehr, (70, 88), 6), (St. Gallen -> Bern, location_route, (6, 24)), 0), ((stockender Verkehr, (70, 88), 6), (Bern, location_city, (20, 24)), 2), ((stockender Verkehr, (70, 88), 6), (Verzw. Zürich-Nord, location, (36, 54)), 3), ((stockender Verkehr, (70, 88), 6), (Affoltern, location, (60, 69)), 4)]"
1627,"[-, Zwischen, Verzw, ., Zürich, -, Nord, und, Affoltern]","{'id': 'c/0a52410a-7bf3-4d57-bddd-ff64c01840e6', 'text': 'stockender Verkehr', 'entity_type': 'trigger', 'start': 16, 'end': 18, 'char_start': 70, 'char_end': 88}","{'id': 'c/b8564bf2-f611-4364-964a-c02f5cc38549', 'text': 'Bern', 'entity_type': 'location_city', 'start': 6, 'end': 7, 'char_start': 20, 'char_end': 24}","{'doc': [[#A1, -, St., Gallen, ->, Bern, -, Zwischen, Verzw, .], [Zürich-Nord, und, Affoltern, stockender, Verkehr, ,, Überlastung]], 'tokens': ['#A1', '-', 'St.', 'Gallen', '->', 'Bern', '-', 'Zwischen', 'Verzw', '.', 'Zürich-Nord', 'und', 'Affoltern', 'stockender', 'Verkehr', ',', 'Überlastung'], 'sentences': [{'text': '#A1 - St. Gallen -> Bern - Zwischen Verzw.', 'start': 0, 'end': 10, 'char_start': 0, 'char_end': 42}, {'text': 'Zürich-Nord und Affoltern stockender Verkehr, Überlastung', 'start': 10, 'end': 17, 'char_start': 43, 'char_end': 101}]}",2,"[(stockender Verkehr, (70, 88), 6)]","[((stockender Verkehr, (70, 88), 6), (#A1, location_street, (0, 3)), 0), ((stockender Verkehr, (70, 88), 6), (St. Gallen -> Bern, location_route, (6, 24)), 0), ((stockender Verkehr, (70, 88), 6), (Bern, location_city, (20, 24)), 2), ((stockender Verkehr, (70, 88), 6), (Verzw. Zürich-Nord, location, (36, 54)), 3), ((stockender Verkehr, (70, 88), 6), (Affoltern, location, (60, 69)), 4)]"
5081,"[wird, ab, #Düsseldorf, Hbf, (, 17:26, ), bis, #Mönchengladbach, (, 17:48, ), ohne, Halt]","{'id': 'c/0e68656e-bb16-44ba-899b-d96194262c7b', 'text': 'umgeleitet.Mit', 'entity_type': 'trigger', 'start': 15, 'end': 16, 'char_start': 76, 'char_end': 90}","{'id': 'c/bbe22213-66ff-4eff-b49a-3fa70707b087', 'text': '#RE4', 'entity_type': 'location_route', 'start': 0, 'end': 1, 'char_start': 0, 'char_end': 4}","{'doc': [[#RE4, wird, ab, #Düsseldorf, Hbf, (, 17:26, ), bis, #Mönchengladbach, (, 17:48, ), ohne, Halt, umgeleitet, .], [Mit, 20, Minuten, Verspätung, muss, in, MG, gerechnet, werden]], 'tokens': ['#RE4', 'wird', 'ab', '#Düsseldorf', 'Hbf', '(', '17:26', ')', 'bis', '#Mönchengladbach', '(', '17:48', ')', 'ohne', 'Halt', 'umgeleitet', '.', 'Mit', '20', 'Minuten', 'Verspätung', 'muss', 'in', 'MG', 'gerechnet', 'werden'], 'sentences': [{'text': '#RE4 wird ab #Düsseldorf Hbf (17:26) bis #Mönchengladbach (17:48) ohne Halt umgeleitet.', 'start': 0, 'end': 17, 'char_start': 0, 'char_end': 87}, {'text': 'Mit 20 Minuten Verspätung muss in MG gerechnet werden', 'start': 17, 'end': 26, 'char_start': 87, 'char_end': 140}]}",0,"[(umgeleitet.Mit, (76, 90), 1)]","[((umgeleitet.Mit, (76, 90), 1), (#RE4, location_route, (0, 4)), 0), ((umgeleitet.Mit, (76, 90), 1), (#Düsseldorf Hbf, location_stop, (13, 28)), 3), ((umgeleitet.Mit, (76, 90), 1), (#Mönchengladbach, location_stop, (41, 57)), 4)]"
5082,"[(, 17:26, ), bis, #Mönchengladbach, (, 17:48, ), ohne, Halt]","{'id': 'c/0e68656e-bb16-44ba-899b-d96194262c7b', 'text': 'umgeleitet.Mit', 'entity_type': 'trigger', 'start': 15, 'end': 16, 'char_start': 76, 'char_end': 90}","{'id': 'c/429278f7-d156-4c4f-a85c-d525edf3d936', 'text': '#Düsseldorf Hbf', 'entity_type': 'location_stop', 'start': 3, 'end': 5, 'char_start': 13, 'char_end': 28}","{'doc': [[#RE4, wird, ab, #Düsseldorf, Hbf, (, 17:26, ), bis, #Mönchengladbach, (, 17:48, ), ohne, Halt, umgeleitet, .], [Mit, 20, Minuten, Verspätung, muss, in, MG, gerechnet, werden]], 'tokens': ['#RE4', 'wird', 'ab', '#Düsseldorf', 'Hbf', '(', '17:26', ')', 'bis', '#Mönchengladbach', '(', '17:48', ')', 'ohne', 'Halt', 'umgeleitet', '.', 'Mit', '20', 'Minuten', 'Verspätung', 'muss', 'in', 'MG', 'gerechnet', 'werden'], 'sentences': [{'text': '#RE4 wird ab #Düsseldorf Hbf (17:26) bis #Mönchengladbach (17:48) ohne Halt umgeleitet.', 'start': 0, 'end': 17, 'char_start': 0, 'char_end': 87}, {'text': 'Mit 20 Minuten Verspätung muss in MG gerechnet werden', 'start': 17, 'end': 26, 'char_start': 87, 'char_end': 140}]}",3,"[(umgeleitet.Mit, (76, 90), 1)]","[((umgeleitet.Mit, (76, 90), 1), (#RE4, location_route, (0, 4)), 0), ((umgeleitet.Mit, (76, 90), 1), (#Düsseldorf Hbf, location_stop, (13, 28)), 3), ((umgeleitet.Mit, (76, 90), 1), (#Mönchengladbach, location_stop, (41, 57)), 4)]"
1624,"[-, St., Gallen, -, >, Bern, -, Zwischen, Verzw, ., Zürich, -, Nord, und, Affoltern]","{'id': 'c/0a52410a-7bf3-4d57-bddd-ff64c01840e6', 'text': 'stockender Verkehr', 'entity_type': 'trigger', 'start': 16, 'end': 18, 'char_start': 70, 'char_end': 88}","{'id': 'c/b1586a7c-9fe8-434f-b43d-5440754af709', 'text': '#A1', 'entity_type': 'location_street', 'start': 0, 'end': 1, 'char_start': 0, 'char_end': 3}","{'doc': [[#A1, -, St., Gallen, ->, Bern, -, Zwischen, Verzw, .], [Zürich-Nord, und, Affoltern, stockender, Verkehr, ,, Überlastung]], 'tokens': ['#A1', '-', 'St.', 'Gallen', '->', 'Bern', '-', 'Zwischen', 'Verzw', '.', 'Zürich-Nord', 'und', 'Affoltern', 'stockender', 'Verkehr', ',', 'Überlastung'], 'sentences': [{'text': '#A1 - St. Gallen -> Bern - Zwischen Verzw.', 'start': 0, 'end': 10, 'char_start': 0, 'char_end': 42}, {'text': 'Zürich-Nord und Affoltern stockender Verkehr, Überlastung', 'start': 10, 'end': 17, 'char_start': 43, 'char_end': 101}]}",0,"[(stockender Verkehr, (70, 88), 6)]","[((stockender Verkehr, (70, 88), 6), (#A1, location_street, (0, 3)), 0), ((stockender Verkehr, (70, 88), 6), (St. Gallen -> Bern, location_route, (6, 24)), 0), ((stockender Verkehr, (70, 88), 6), (Bern, location_city, (20, 24)), 2), ((stockender Verkehr, (70, 88), 6), (Verzw. Zürich-Nord, location, (36, 54)), 3), ((stockender Verkehr, (70, 88), 6), (Affoltern, location, (60, 69)), 4)]"


In [19]:
# Sample (per the helper's name) one instance with label 5 ("start_date") on
# which the labeling function at index 19 abstained; confirm the exact
# semantics in wsee.labeling.error_analysis.
abstained_start_date_columns = ['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']
abstained_start_date_sample = error_analysis.sample_abstained_instances(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_sd_train,
    lf_index=19,
    label_of_interest=5,
    sample_size=1,
)
abstained_start_date_sample[abstained_start_date_columns]

Unnamed: 0,text,between_tokens,trigger,argument,label,event_types,event_arg_roles
386,"rhein_sieg #529\n#529 / Hennef Linie 529, 17:48 Uhr ab Hennef hat auf Grund von Polizeieinsatz ca. 20 Minuten Verspätung. …\n","[Uhr, ab, Hennef, hat, auf, Grund, von, Polizeieinsatz, ca., 20, Minuten]","{'id': 'c/627f5d0b-087c-4d81-875f-542203f1755f', 'text': 'Verspätung', 'entity_type': 'trigger', 'start': 24, 'end': 25, 'char_start': 109, 'char_end': 119}","{'id': 'c/59588547-e7ac-42e4-9920-bff8c87960f0', 'text': '17:48', 'entity_type': 'time', 'start': 12, 'end': 13, 'char_start': 41, 'char_end': 46}",5,"[(Polizeieinsatz, (79, 93), 7), (Verspätung, (109, 119), 3)]","[((Verspätung, (109, 119), 3), (529, location_route, (36, 39)), 0), ((Verspätung, (109, 119), 3), (17:48, time, (41, 46)), 5), ((Verspätung, (109, 119), 3), (Hennef, location_stop, (54, 60)), 3), ((Verspätung, (109, 119), 3), (Polizeieinsatz, trigger, (79, 93)), 7), ((Verspätung, (109, 119), 3), (20 Minuten, duration, (98, 108)), 1)]"


In [20]:
# Sample (per the helper's name) one instance with label 0 ("location") on
# which the labeling function at index 0 abstained; confirm the exact
# semantics in wsee.labeling.error_analysis.
abstained_location_columns = ['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types']
abstained_location_sample = error_analysis.sample_abstained_instances(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_sd_train,
    lf_index=0,
    label_of_interest=0,
    sample_size=1,
)
abstained_location_sample[abstained_location_columns]

Unnamed: 0,text,between_tokens,trigger,argument,label,event_types
2341,A99 Ostumfahrung München Richtung Nürnberg zwischen Aschheim / Ismaning und Kreuz München-Nord 6 km stockender Verkehr wegen einer Baustelle\n,"[Ostumfahrung, München, Richtung, Nürnberg, zwischen, Aschheim, /, Ismaning, und, Kreuz, München, -, Nord, 6, km]","{'id': 'c/9a4fb67f-d649-4a5e-97b8-a5bba884fde5', 'text': 'stockender Verkehr', 'entity_type': 'trigger', 'start': 16, 'end': 18, 'char_start': 100, 'char_end': 118}","{'id': 'c/9bd6264c-fea5-4390-b0f9-83682478bc5a', 'text': 'A99', 'entity_type': 'location_street', 'start': 0, 'end': 1, 'char_start': 0, 'char_end': 3}",0,"[(stockender Verkehr, (100, 118), 6), (Baustelle, (131, 140), 7)]"


## Step 5: Train the Labeling model and label the data

In [21]:
# Cache file for the daystream role examples.
dataframe_file = DATA_DIR + '/pickled_daystream_role_examples'
pickled_dataframe_file = Path(f'{dataframe_file}.pkl')

df_daystream = Y_daystream = None

if not pickled_dataframe_file.exists():
    # Cache miss: build the examples from the daystream split and store them.
    df_daystream, Y_daystream = pipeline.build_event_role_examples(daystream)
    with pickled_dataframe_file.open('wb') as cache_out:
        pickle.dump((df_daystream, Y_daystream), cache_out)
else:
    # Cache hit. Unpickling can execute arbitrary code, so only load
    # cache files that were written by this notebook.
    with pickled_dataframe_file.open('rb') as cache_in:
        df_daystream, Y_daystream = pickle.load(cache_in)

# Remove the `event_roles` column when present. This runs after the cache is
# written, so the pickled frame keeps the column and it is dropped on every run.
if 'event_roles' in df_daystream.columns:
    df_daystream.drop(columns='event_roles', inplace=True)

INFO:root:Building event role examples
INFO:root:DataFrame has 1955 rows
INFO:root:Adding the following attributes to each document: entity_type_freqs, somajo_doc, mixed_ner, mixed_ner_spans
1955it [01:18, 25.05it/s]
INFO:root:Adding the following attributes to each role example: not_an_event, arg_type_event_type_match, between_distance, is_multiple_same_event_type
INFO:root:Number of event roles: 0
INFO:root:Number of event role examples: 47376


In [22]:
# Label matrix for the daystream examples: one row per example, one column per labeling function.
L_daystream = applier.apply(df_daystream)

100%|██████████| 47376/47376 [05:28<00:00, 144.27it/s]


In [23]:
from snorkel.labeling import LFAnalysis

# No labels are passed here, so the summary only reports polarity,
# coverage, overlaps and conflicts for each labeling function.
daystream_lf_analysis = LFAnalysis(L=L_daystream, lfs=lfs)
daystream_lf_analysis.lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_location_adjacent_markers,0,[0],0.00496,0.004264,2.1e-05
lf_location_adjacent_trigger_verb,1,[0],0.000697,0.000443,0.000127
lf_location_beginning_street_stop_route,2,[0],0.003441,0.003441,2.1e-05
lf_location_first_sentence_street_stop_route,3,[0],0.028643,0.028643,6.3e-05
lf_location_first_sentence_priorities,4,[0],0.030902,0.028981,6.3e-05
lf_delay_event_sentence,5,[1],0.004053,0.002132,0.0
lf_delay_preceding_arg,6,[1],0.000887,0.000887,0.0
lf_delay_preceding_trigger,7,[1],0.001245,0.001245,0.0
lf_direction_markers,8,[2],0.00553,0.004116,0.00019
lf_direction_markers_order,9,[2],0.003567,0.003567,0.000127


In [24]:
from snorkel.labeling import LabelModel

# 11 classes: one per entry of the role table (codes 0-10, including no_arg).
daystream_model = LabelModel(verbose=True, cardinality=11)

# Fit on the daystream label matrix. The SD4M training labels are passed as
# Y_dev, which Snorkel uses to estimate the class balance - see the
# LabelModel.fit documentation.
fit_options = dict(n_epochs=5000, log_freq=500, seed=12345)
daystream_model.fit(L_daystream, Y_dev=Y_sd_train, **fit_options)

INFO:root:Computing O...
INFO:root:Estimating \mu...
INFO:root:[0 epochs]: TRAIN:[loss=0.259]
INFO:root:[500 epochs]: TRAIN:[loss=0.004]
INFO:root:[1000 epochs]: TRAIN:[loss=0.002]
INFO:root:[1500 epochs]: TRAIN:[loss=0.002]
INFO:root:[2000 epochs]: TRAIN:[loss=0.001]
INFO:root:[2500 epochs]: TRAIN:[loss=0.001]
INFO:root:[3000 epochs]: TRAIN:[loss=0.001]
INFO:root:[3500 epochs]: TRAIN:[loss=0.001]
INFO:root:[4000 epochs]: TRAIN:[loss=0.001]
INFO:root:[4500 epochs]: TRAIN:[loss=0.001]
INFO:root:Finished Training


Here we evaluate the LabelModel against a Majority Label Voter baseline on the SD4M training data.

In [25]:
# Metrics reported for every model evaluation in this notebook.
metrics = ["accuracy", "f1_micro", "f1_macro"]

# Score the label model against the SD4M training labels; ties are broken randomly.
daystream_model_metrics = daystream_model.score(
    L=L_sd_train,
    Y=Y_sd_train,
    tie_break_policy="random",
    metrics=metrics,
)
for metric_name in metrics:
    percent = daystream_model_metrics[metric_name] * 100
    print(f'Label Model {metric_name}: {percent:.1f}%')

Label Model accuracy: 90.7%
Label Model f1_micro: 90.7%
Label Model f1_macro: 82.8%


In [26]:
from snorkel.labeling import MajorityLabelVoter

# Baseline: majority vote over the labeling function outputs, scored on the same
# SD4M training label matrix and with the same metrics as the label model.
daystream_mlv = MajorityLabelVoter(verbose=True, cardinality=11)
daystream_mlv_metrics = daystream_mlv.score(
    L=L_sd_train,
    Y=Y_sd_train,
    metrics=metrics,
    tie_break_policy="random",
)
for metric in metrics:
    metric_percent = daystream_mlv_metrics[metric] * 100
    print('Majority Label Voter {0}: {1:.1f}%'.format(metric, metric_percent))

Majority Label Voter accuracy: 79.8%
Majority Label Voter f1_micro: 79.8%
Majority Label Voter f1_macro: 58.8%


We repeat the evaluation on the SD4M development data. Because we used the SD4M training data to develop our labeling functions, our model is likely overfitted to the SD4M training data.

In [33]:
# Build the label matrix for the SD4M development set with the same applier.
L_sd_dev = applier.apply(df_sd_dev)

# Label model on the development set.
daystream_model_metrics = daystream_model.score(
    L=L_sd_dev,
    Y=Y_sd_dev,
    metrics=metrics,
    tie_break_policy="random",
)
for metric in metrics:
    metric_percent = daystream_model_metrics[metric] * 100
    print('Label Model {0}: {1:.1f}%'.format(metric, metric_percent))

# Majority vote baseline on the development set.
daystream_mlv_metrics = daystream_mlv.score(
    L=L_sd_dev,
    Y=Y_sd_dev,
    metrics=metrics,
    tie_break_policy="random",
)
for metric in metrics:
    metric_percent = daystream_mlv_metrics[metric] * 100
    print('Majority Label Voter {0}: {1:.1f}%'.format(metric, metric_percent))

100%|██████████| 491/491 [00:04<00:00, 107.71it/s]

Label Model accuracy: 83.1%
Label Model f1_micro: 83.1%
Label Model f1_macro: 72.6%
Majority Label Voter accuracy: 69.5%
Majority Label Voter f1_micro: 69.5%
Majority Label Voter f1_macro: 55.7%



  'precision', 'predicted', average, warn_for)


In [27]:
# Probabilistic role labels from the trained label model, one row per Daystream example.
daystream_probs = daystream_model.predict_proba(L_daystream)

In [28]:
# Presumably zeroes the probabilities of rows where all labeling functions abstained
# -- verify against wsee.utils.utils.zero_out_abstains.
daystream_probs_zeroed = utils.zero_out_abstains(daystream_probs, L_daystream)

# Merge the event role examples (with their probabilities) and export them as JSON lines.
labeled_daystream_with_abstains = pipeline.merge_event_role_examples(
    df_daystream,
    daystream_probs_zeroed,
)
labeled_daystream_with_abstains.reset_index(level=0).to_json(
    DATA_DIR + "/save_daystreamv6_roles_with_abstains.jsonl",
    orient='records',
    lines=True,
    force_ascii=False,
)

INFO:root:Merging event role examples that belong to the same document


## Step 7: Daystream Snorkel Labeling Check

To inspect the Daystream labeling, it is best to first remove the examples on which all labeling functions abstained.

In [29]:
from snorkel.labeling import filter_unlabeled_dataframe

# Keep only the Daystream examples (and their probabilities) that received
# at least one non-abstain vote in the label matrix.
df_daystream_filtered, probs_daystream_filtered = filter_unlabeled_dataframe(
    X=df_daystream,
    L=L_daystream,
    y=daystream_probs,
)

In [30]:
# Setting columns in place on the filtered frame raised pandas' SettingWithCopyWarning
# (the frame is derived from df_daystream). `assign` returns a new frame instead, which
# avoids the warning and keeps this cell safe to re-run.
#   role_probs          -- full probability vector per example
#   most_probable_class -- name of the role with the highest probability
#   max_class_prob      -- that highest probability, formatted with two decimals
df_daystream_filtered = df_daystream_filtered.assign(
    role_probs=list(probs_daystream_filtered),
    most_probable_class=[ROLE_LABELS[label_idx] for label_idx in probs_daystream_filtered.argmax(axis=1)],
    max_class_prob=["{:.2f}".format(class_prob) for class_prob in probs_daystream_filtered.max(axis=1)],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
# Report how many filtered examples were assigned to each role class.
for role_class in ROLE_LABELS:
    has_role = df_daystream_filtered['most_probable_class'] == role_class
    num_instances = int(has_role.sum())
    print(f"{role_class}: {num_instances} instances")

location: 1573 instances
delay: 192 instances
direction: 259 instances
start_loc: 620 instances
end_loc: 374 instances
start_date: 578 instances
end_date: 98 instances
cause: 194 instances
jam_length: 22 instances
route: 40 instances
no_arg: 35199 instances


In [32]:
# Show one random example that was labeled as 'route', restricted to the columns of interest.
route_mask = df_daystream_filtered['most_probable_class'] == 'route'
inspect_columns = ['text', 'trigger', 'argument', 'most_probable_class', 'max_class_prob', 'role_probs']
df_daystream_filtered.loc[route_mask, inspect_columns].sample(1)

Unnamed: 0,text,trigger,argument,most_probable_class,max_class_prob,role_probs
24133,Update! #RE2 Bahnhof #FrankfurtFlughafenRegionalbf jetzt doch komplett gesperrt. Es kann zu kurzfristigen Umleitungen über den Fernbahnhof kommen. Bitte Reiseverbindung vor Abfahrt prüfen.,"{'id': 'c/4f507898-f2d1-43ec-8848-9e02a6649c7e', 'text': 'gesperrt', 'entity_type': 'trigger', 'start': 8, 'end': 9, 'char_start': 71, 'char_end': 79}","{'id': 'c/d3619f13-e1b7-4793-bbb6-1f61a708b1bb', 'text': '#RE2', 'entity_type': 'location_route', 'start': 2, 'end': 3, 'char_start': 8, 'char_end': 12}",route,1.0,"[2.4448004486308372e-05, 1.383903928250819e-07, 4.3938042657787925e-07, 5.6005805894121545e-11, 1.2557914136490533e-10, 3.3197433240426747e-09, 1.0169574821978974e-08, 2.2496540019897624e-07, 5.10450336571369e-07, 0.999974215211592, 9.926462407221225e-09]"
