# Transform the data to work with Snorkel: Part 2 - Event Role

Here we will do most of the work creating a labeling model that assigns labels to argument roles in event mentions.
We need to create a row for each pair of trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [1]:
import sys
sys.path.append("../")
import warnings
import pickle
from pathlib import Path
from wsee.utils import utils
from wsee.data import pipeline

warnings.filterwarnings(action='once')

DATA_DIR = '../data/daystream_corpus'  # replace path to corpus

### SD4M Relation/ Event Arguments

| Number | Code       | Description                                                                 |
|--------|------------|-----------------------------------------------------------------------------|
| -1     | ABSTAIN    | No vote, for Labeling Functions                                             |
| 0      | location   | Required argument for all events denoting the location.                     |
| 1      | delay      | Optional argument denoting the delay associated with the event.             |
| 2      | direction  | Optional argument denoting the direction associated with the event.         |
| 3      | start_loc  | Optional argument denoting the starting location associated with the event. |
| 4      | end_loc    | Optional argument denoting the ending location associated with the event.   |
| 5      | start_date | Optional argument denoting the start date associated with the event.        |
| 6      | end_date   | Optional argument denoting the end date associated with the event.          |
| 7      | cause      | Optional argument (trigger) denoting the cause associated with the event.   |
| 8      | jam_length | Optional argument denoting the jam length of a traffic jam event.           |
| 9      | route      | Optional argument denoting the route affected by a canceled stop event.     |
| 10     | no_arg     | No argument relation with the specified trigger.                            |

In [2]:
loaded_data = pipeline.load_data(DATA_DIR)
sd_train = loaded_data['train']
sd_dev = loaded_data['dev']
sd_test = loaded_data['test']

daystream = loaded_data['daystream']

INFO:wsee:Reading train data from: ../data/daystream_corpus/train/train_with_events_and_defaults.jsonl
INFO:wsee:Reading dev data from: ../data/daystream_corpus/dev/dev_with_events_and_defaults.jsonl
INFO:wsee:Reading test data from: ../data/daystream_corpus/test/test_with_events_and_defaults.jsonl
INFO:wsee:Reading daystream data from: ../data/daystream_corpus/daystream.jsonl


## Step 1: Create one row for each trigger-entity pair (event role)

In [3]:
dataframe_file = DATA_DIR + '/pickled_sd_train_role_examples'
pickled_dataframe_file = Path(dataframe_file + '.pkl')

df_sd_train = None
Y_sd_train = None

if pickled_dataframe_file.exists():
    with open(pickled_dataframe_file, 'rb') as pickled_dataframe:
        df_sd_train, Y_sd_train = pickle.load(pickled_dataframe)
else:
    df_sd_train, Y_sd_train = pipeline.build_event_role_examples(sd_train)
    with open(pickled_dataframe_file, 'wb') as pickled_dataframe:
        pickle.dump((df_sd_train, Y_sd_train), pickled_dataframe)

In [4]:
dataframe_file = DATA_DIR + '/pickled_sd_dev_role_examples'
pickled_dataframe_file = Path(dataframe_file + '.pkl')

df_sd_dev = None
Y_sd_dev = None

if pickled_dataframe_file.exists():
    with open(pickled_dataframe_file, 'rb') as pickled_dataframe:
        df_sd_dev, Y_sd_dev = pickle.load(pickled_dataframe)
else:
    df_sd_dev, Y_sd_dev = pipeline.build_event_role_examples(sd_dev)
    with open(pickled_dataframe_file, 'wb') as pickled_dataframe:
        pickle.dump((df_sd_dev, Y_sd_dev), pickled_dataframe)

In [5]:
from wsee import ROLE_LABELS
print(ROLE_LABELS)

['location', 'delay', 'direction', 'start_loc', 'end_loc', 'start_date', 'end_date', 'cause', 'jam_length', 'route', 'no_arg']


## Step 2: Explore the data

In [6]:
from wsee.preprocessors.preprocessors import *
from wsee.data import explore

We can apply all our preprocessors on our data and see if we can find something interesting for our labeling functions. Let's first sample the SD4M training data, which is labeled.

In [7]:
dataframe_file = DATA_DIR + '/pickled_labeled_sd4m_role_examples'
pickled_dataframe_file = Path(dataframe_file + '.pkl')

labeled_sd4m_roles = None

if pickled_dataframe_file.exists():
    with open(pickled_dataframe_file, 'rb') as pickled_dataframe:
        labeled_sd4m_roles = pickle.load(pickled_dataframe)
else:
    labeled_sd4m_roles = explore.add_labels(df_sd_train, Y_sd_train)
    labeled_sd4m_roles = explore.apply_preprocessors(labeled_sd4m_roles, [pre_between_tokens, pre_between_distance])
    labeled_sd4m_roles = explore.add_event_types(labeled_sd4m_roles)
    labeled_sd4m_roles = explore.add_event_arg_roles(labeled_sd4m_roles)
    with open(pickled_dataframe_file, 'wb') as pickled_dataframe:
        pickle.dump(labeled_sd4m_roles, pickled_dataframe)

Let's first take a look at the trigger and argument text, and the entity types!

In [8]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

In [9]:
explore.sample_data(labeled_sd4m_roles[labeled_sd4m_roles['label']==6], sample_size=2, columns=['text', 'between_tokens', 'trigger', 'argument', 'between_distance', 'label', 'event_types', 'event_arg_roles'])

Unnamed: 0,text,between_tokens,trigger,argument,between_distance,label,event_types,event_arg_roles
3132,"#RB35,#Duisburg(10:44)>#Wesel(11:14) & #Wesel(11:44)>#Duisburg(12:18) fallen leider aus. Grund: Verzögerungen im Betriebsablauf.\n","[), &, #Wesel, (, 11:44, ), >, #Duisburg, (, 12:18, )]","{'id': 'c/5dcd5643-5218-4245-b36b-b0271a18e90d', 'text': 'fallen', 'entity_type': 'trigger', 'start': 21, 'end': 22, 'char_start': 70, 'char_end': 76}","{'id': 'c/d5826ba6-a5d8-4b2d-ba7c-632c0f3f744e', 'text': '11:14', 'entity_type': 'time', 'start': 9, 'end': 10, 'char_start': 30, 'char_end': 35}",11,6,"[(fallen, (70, 76), 1), (aus, (84, 87), 7)]","[((fallen, (70, 76), 1), (#RB35, location_route, (0, 5)), 0), ((fallen, (70, 76), 1), (#Duisburg, location, (6, 15)), 3), ((fallen, (70, 76), 1), (10:44, time, (16, 21)), 5), ((fallen, (70, 76), 1), (#Wesel, location, (23, 29)), 4), ((fallen, (70, 76), 1), (11:14, time, (30, 35)), 6), ((fallen, (70, 76), 1), (#Wesel, location, (39, 45)), 3), ((fallen, (70, 76), 1), (11:44, time, (46, 51)), 5), ((fallen, (70, 76), 1), (#Duisburg, location, (53, 62)), 4), ((fallen, (70, 76), 1), (12:18, time, (63, 68)), 6)]"
2833,"von Donnerstag, 9. Juni, 8.15 Uhr bis Mittwoch, 20. Juli, 18.45 Uhr\nMeldung:\n ICE 777 nach Stuttgart Hbf (planmäßig 8.18 Uhr ab Mannheim Hbf) fällt montags bis freitags von Mannheim Hbf bis Stuttgart Hbf aus.\n ICE 577 von Hamburg-Altona (planmäßige Ankunft 16.35 Uhr in Stuttgart Hbf) endet vom 16. bis 20.07. in Frankfurt (M) Flughafen Fernbf und fällt von Frankfurt (M) Flughafen Fernbf bis Stuttgart Hbf aus.\n ICE 770 nach Hamburg-Altona (planmäßig 9.25 Uhr ab Stuttgart Hbf) beginnt montags bis freitags in Mannheim Hbf und fällt von Stuttgart Hbf bis Mannheim Hbf aus.\n ICE 572 nach Hamburg-Altona (planmäßig 17.25 Uhr ab Stuttgart Hbf) beginnt vom 16. bis 20.07. in Frankfurt (M) Flughafen Fernbf und fällt von Stuttgart Hbf bis Frankfurt (M) Flughafen Fernbf aus.\nGrund:\nArbeiten an der Leit- und Sicherungstechnik Mettertal – Lußhardt\nLink zur detaillierten Meldung: \nLink zum kompletten PDF-Dokument: \n(188 kB)\n--------\n","[in, Frankfurt, (, M, ), Flughafen, Fernbf, und]","{'id': 'c/11681595-80da-45bf-badf-8c4b28937cb5', 'text': 'fällt', 'entity_type': 'trigger', 'start': 148, 'end': 149, 'char_start': 707, 'char_end': 712}","{'id': 'c/f5db94c7-7373-4711-8052-d512d5e81282', 'text': '20.07.', 'entity_type': 'date', 'start': 138, 'end': 140, 'char_start': 662, 'char_end': 668}",8,6,"[(fällt, (142, 147), 1), (aus, (204, 207), 7), (fällt, (348, 353), 1), (aus, (407, 410), 7), (fällt, (528, 533), 1), (aus, (569, 572), 7), (fällt, (707, 712), 1), (aus, (766, 769), 7)]","[((fällt, (142, 147), 1), (ICE 777, location_route, (78, 85)), 0), ((fällt, (142, 147), 1), (Stuttgart Hbf, location_stop, (91, 104)), 2), ((fällt, (142, 147), 1), (Mannheim Hbf, location_stop, (173, 185)), 3), ((fällt, (142, 147), 1), (Stuttgart Hbf, location_stop, (190, 203)), 4), ((fällt, (348, 353), 1), (ICE 577, location_route, (210, 217)), 0), ((fällt, (348, 353), 1), (16., date, (295, 298)), 5), ((fällt, (348, 353), 1), (20.07., date, (303, 309)), 6), ((fällt, (348, 353), 1), (Frankfurt (M) Flughafen Fernbf, location_stop, (358, 388)), 3), ((fällt, (348, 353), 1), (Stuttgart Hbf, location_stop, (393, 406)), 4), ((fällt, (528, 533), 1), (ICE 770, location_route, (413, 420)), 0), ((fällt, (528, 533), 1), (Hamburg-Altona, location_stop, (426, 440)), 2), ((fällt, (528, 533), 1), (Stuttgart Hbf, location_stop, (538, 551)), 3), ((fällt, (528, 533), 1), (Mannheim Hbf, location_stop, (556, 568)), 4), ((fällt, (707, 712), 1), (ICE 572, location_route, (575, 582)), 0), ((fällt, (707, 712), 1), (Hamburg-Altona, location_stop, (588, 602)), 2), ((fällt, (707, 712), 1), (16., date, (654, 657)), 5), ((fällt, (707, 712), 1), (20.07., date, (662, 668)), 6), ((fällt, (707, 712), 1), (Stuttgart Hbf, location_stop, (717, 730)), 3), ((fällt, (707, 712), 1), (Frankfurt (M) Flughafen Fernbf, location_stop, (735, 765)), 4)]"


Now we can collect the most frequent trigger-argument pairs per class.

In [10]:
n = 100
filtered_sd4m_roles = labeled_sd4m_roles[labeled_sd4m_roles['label'] != 10]
class_pairs = {}
print(f"Number of event-roles: {len(labeled_sd4m_roles)}\n")
for idx, class_name in enumerate(ROLE_LABELS):
    class_sd4m_roles = labeled_sd4m_roles[labeled_sd4m_roles['label'] == idx]
    print(f"{class_name}: {len(class_sd4m_roles)} instances")

Number of event-roles: 7285

location: 571 instances
delay: 87 instances
direction: 277 instances
start_loc: 377 instances
end_loc: 352 instances
start_date: 35 instances
end_date: 41 instances
cause: 103 instances
jam_length: 135 instances
route: 23 instances
no_arg: 5284 instances


## Step 3: Evaluate the labeling functions on the SD4M training data

In [11]:
from wsee.labeling import event_argument_role_lfs as role_lfs

### Apply the labeling functions

In [12]:
from snorkel.labeling import PandasLFApplier
from wsee.data.pipeline import get_role_list_lfs

lfs = get_role_list_lfs()

applier = PandasLFApplier(lfs)

In [13]:
L_sd_train = applier.apply(df_sd_train)

100%|██████████| 7285/7285 [01:03<00:00, 114.96it/s]


In [14]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L_sd_train, lfs).lf_summary(Y_sd_train)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_location_adjacent_markers,0,[0],0.007412,0.003844,0.0,47,7,0.87037
lf_location_adjacent_trigger_verb,1,[0],0.003157,0.002471,0.000549,22,1,0.956522
lf_location_beginning_street_stop_route,2,[0],0.027454,0.027316,0.0,190,10,0.95
lf_location_first_sentence_street_stop_route,3,[0],0.05779,0.05779,0.000412,388,33,0.921615
lf_location_first_sentence_priorities,4,[0],0.063555,0.059437,0.000412,411,52,0.887689
lf_delay_event_sentence,5,[1],0.013315,0.006314,0.001098,83,14,0.85567
lf_delay_preceding_arg,6,[1],0.002608,0.002608,0.0,19,0,1.0
lf_delay_preceding_trigger,7,[1],0.002745,0.002745,0.000137,20,0,1.0
lf_direction_markers,8,[2],0.040082,0.038161,0.000549,249,43,0.85274
lf_direction_markers_order,9,[2],0.036925,0.036925,0.000549,238,31,0.884758


## Step 4: Error Analysis

In [15]:
from wsee.labeling import error_analysis

In [16]:
relevant_rows = labeled_sd4m_roles.iloc[L_sd_train[:, 0] == 0]
print(len(relevant_rows))
relevant_rows.sample()[['text', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']]

54


Unnamed: 0,text,trigger,argument,label,event_types,event_arg_roles
1266,"am Freitag, 26. und Samstag, 27. Februar, jeweils 13.55 – 18.15 Uhr<br />\n<br />\nMeldung:<br />\nIC 2083 von Hamburg-Altona (planmäßige Ankunft 16.55 Uhr in Berchtesgaden Hbf) wird umgeleitet und hält nicht in München Ost. Aufgrund der Umleitung verspätet sich der Zug bis Freilassing um bis zu 20 Min. Von Freilassing bis Berchtesgaden Hbf wird der Zug durch einen Bus ersetzt.<br />\n<br />Grund:<br />\nOberleitungsarbeiten München – Rosenheim<br />\n<br />Link zur detaillierten Meldung: <br />\n<a href= / ><br />\nLink zum kompletten PDF-Dokument: <br />\n<a href=target=_blank>(91 kB)<br /><br />------------------<br /><br />\n","{'id': 'c/dee71cac-cc5a-43c7-bb30-7eb0e5ae9c95', 'text': 'nicht', 'entity_type': 'trigger', 'start': 41, 'end': 42, 'char_start': 200, 'char_end': 205}","{'id': 'c/6761e1e2-e942-4d5f-85c7-d9e9b4d5b309', 'text': 'München Ost', 'entity_type': 'location_stop', 'start': 43, 'end': 45, 'char_start': 209, 'char_end': 220}",10,"[(umgeleitet, (180, 190), 4), (hält, (195, 199), 2), (nicht, (200, 205), 7), (Umleitung, (235, 244), 7), (verspätet, (245, 254), 3), (Bus ersetzt, (365, 376), 5)]","[((umgeleitet, (180, 190), 4), (IC 2083, location_route, (96, 103)), 0), ((hält, (195, 199), 2), (IC 2083, location_route, (96, 103)), 9), ((hält, (195, 199), 2), (München Ost, location_stop, (209, 220)), 0), ((verspätet, (245, 254), 3), (Umleitung, trigger, (235, 244)), 7), ((verspätet, (245, 254), 3), (Zug, location_route, (264, 267)), 0), ((verspätet, (245, 254), 3), (Freilassing, location_stop, (272, 283)), 4), ((verspätet, (245, 254), 3), (bis zu 20 Min, duration, (287, 300)), 1), ((Bus ersetzt, (365, 376), 5), (Freilassing, location_stop, (306, 317)), 3), ((Bus ersetzt, (365, 376), 5), (Berchtesgaden Hbf, location_stop, (322, 339)), 4), ((Bus ersetzt, (365, 376), 5), (Zug, location_route, (349, 352)), 0)]"


In [17]:
error_analysis.sample_fp(labeled_df=labeled_sd4m_roles, lf_outputs=L_sd_train, lf_index=36, label_of_interest=10, sample_size=1)[['between_tokens', 'trigger', 'argument', 'somajo_doc', 'label', 'event_types', 'event_arg_roles']]

Unnamed: 0,between_tokens,trigger,argument,somajo_doc,label,event_types,event_arg_roles
5643,"[5, km, Stau, ., Dort, wird, der, Verkehr, über, die, Parallelfahrbahn, geleitet, ., (, Zeitverlust, :, etwa, eine]","{'id': 'c/251f0a53-696b-4a15-a94a-89ce1446b92c', 'text': 'Unfall', 'entity_type': 'trigger', 'start': 12, 'end': 13, 'char_start': 84, 'char_end': 90}","{'id': 'c/9cafe0b7-fa7d-43ac-b247-8e1b3e9e551d', 'text': 'halbe Stunde', 'entity_type': 'duration', 'start': 31, 'end': 33, 'char_start': 184, 'char_end': 196}","{'doc': [[A43, Wuppertal, Richtung, Recklinghausen, zwischen, Witten-Herbede, und, Bochum-Querenburg, Unfall, 5, km, Stau, .], [Dort, wird, der, Verkehr, über, die, Parallelfahrbahn, geleitet, .], [(, Zeitverlust, :, etwa, eine, halbe, Stunde, )]], 'tokens': ['A43', 'Wuppertal', 'Richtung', 'Recklinghausen', 'zwischen', 'Witten-Herbede', 'und', 'Bochum-Querenburg', 'Unfall', '5', 'km', 'Stau', '.', 'Dort', 'wird', 'der', 'Verkehr', 'über', 'die', 'Parallelfahrbahn', 'geleitet', '.', '(', 'Zeitverlust', ':', 'etwa', 'eine', 'halbe', 'Stunde', ')'], 'sentences': [{'text': 'A43 Wuppertal Richtung Recklinghausen zwischen Witten-Herbede und Bochum-Querenburg Unfall 5 km Stau.', 'start': 0, 'end': 13, 'char_start': 0, 'char_end': 101}, {'text': 'Dort wird der Verkehr über die Parallelfahrbahn geleitet.', 'start': 13, 'end': 22, 'char_start': 102, 'char_end': 159}, {'text': '(Zeitverlust: etwa eine halbe Stunde)', 'start': 22, 'end': 30, 'char_start': 160, 'char_end': 197}]}",1,"[(Unfall, (84, 90), 0), (Stau, (96, 100), 6)]","[((Unfall, (84, 90), 0), (A43, location_street, (0, 3)), 0), ((Unfall, (84, 90), 0), (Recklinghausen, location_city, (23, 37)), 2), ((Unfall, (84, 90), 0), (Witten-Herbede, location, (47, 61)), 3), ((Unfall, (84, 90), 0), (Bochum-Querenburg, location, (66, 83)), 4), ((Unfall, (84, 90), 0), (halbe Stunde, duration, (184, 196)), 1), ((Stau, (96, 100), 6), (A43, location_street, (0, 3)), 0), ((Stau, (96, 100), 6), (Recklinghausen, location_city, (23, 37)), 2), ((Stau, (96, 100), 6), (Witten-Herbede, location, (47, 61)), 3), ((Stau, (96, 100), 6), (Bochum-Querenburg, location, (66, 83)), 4), ((Stau, (96, 100), 6), (5 km, distance, (91, 95)), 8), ((Stau, (96, 100), 6), (halbe Stunde, duration, (184, 196)), 1)]"


In [18]:
error_analysis.sample_abstained_instances(labeled_df=labeled_sd4m_roles, lf_outputs=L_sd_train, lf_index=19, label_of_interest=5, sample_size=1)[['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']]

Unnamed: 0,text,between_tokens,trigger,argument,label,event_types,event_arg_roles
3130,"#RB35,#Duisburg(10:44)>#Wesel(11:14) & #Wesel(11:44)>#Duisburg(12:18) fallen leider aus. Grund: Verzögerungen im Betriebsablauf.\n","[), >, #Wesel, (, 11:14, ), &, #Wesel, (, 11:44, ), >, #Duisburg, (, 12:18, )]","{'id': 'c/5dcd5643-5218-4245-b36b-b0271a18e90d', 'text': 'fallen', 'entity_type': 'trigger', 'start': 21, 'end': 22, 'char_start': 70, 'char_end': 76}","{'id': 'c/40530ff2-de36-4c24-8d8d-a96a757d3702', 'text': '10:44', 'entity_type': 'time', 'start': 4, 'end': 5, 'char_start': 16, 'char_end': 21}",5,"[(fallen, (70, 76), 1), (aus, (84, 87), 7)]","[((fallen, (70, 76), 1), (#RB35, location_route, (0, 5)), 0), ((fallen, (70, 76), 1), (#Duisburg, location, (6, 15)), 3), ((fallen, (70, 76), 1), (10:44, time, (16, 21)), 5), ((fallen, (70, 76), 1), (#Wesel, location, (23, 29)), 4), ((fallen, (70, 76), 1), (11:14, time, (30, 35)), 6), ((fallen, (70, 76), 1), (#Wesel, location, (39, 45)), 3), ((fallen, (70, 76), 1), (11:44, time, (46, 51)), 5), ((fallen, (70, 76), 1), (#Duisburg, location, (53, 62)), 4), ((fallen, (70, 76), 1), (12:18, time, (63, 68)), 6)]"


In [19]:
error_analysis.sample_abstained_instances(labeled_df=labeled_sd4m_roles, lf_outputs=L_sd_train, lf_index=0, label_of_interest=0, sample_size=1)[['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types']]

Unnamed: 0,text,between_tokens,trigger,argument,label,event_types
3474,"RT @earlybird445: Wegen Weichenstörung in #Zuffenhausen kommt es bei S4, S5 und S6 / S60 zu großen Verspätungen und Ausfällen. Keine Infos be…\n","[zu, großen]","{'id': 'c/d863de9d-fb50-4c94-afdf-f42797f286ef', 'text': 'Verspätungen', 'entity_type': 'trigger', 'start': 19, 'end': 20, 'char_start': 99, 'char_end': 111}","{'id': 'c/aa63f167-1cd8-4fd4-9eff-6e652244bfac', 'text': 'S60', 'entity_type': 'location_route', 'start': 16, 'end': 17, 'char_start': 85, 'char_end': 88}",0,"[(Weichenstörung, (24, 38), 7), (Verspätungen, (99, 111), 3), (Ausfällen, (116, 125), 1)]"


## Step 5: Train the Label model and label the data

### Train the label model

In [20]:
dataframe_file = DATA_DIR + '/pickled_daystream_role_examples'
pickled_dataframe_file = Path(dataframe_file + '.pkl')

df_daystream = None
Y_daystream = None

if pickled_dataframe_file.exists():
    with open(pickled_dataframe_file, 'rb') as pickled_dataframe:
        df_daystream, Y_daystream = pickle.load(pickled_dataframe)
else:
    df_daystream, Y_daystream = pipeline.build_event_role_examples(daystream)
    with open(pickled_dataframe_file, 'wb') as pickled_dataframe:
        pickle.dump((df_daystream, Y_daystream), pickled_dataframe)
if 'event_roles' in df_daystream:
    df_daystream.drop('event_roles', axis=1, inplace=True)

In [21]:
L_daystream = applier.apply(df_daystream)

100%|██████████| 47376/47376 [05:14<00:00, 150.71it/s]


In [22]:
from snorkel.labeling import LFAnalysis
LFAnalysis(L_daystream, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_location_adjacent_markers,0,[0],0.00496,0.004264,2.1e-05
lf_location_adjacent_trigger_verb,1,[0],0.000697,0.000443,0.000127
lf_location_beginning_street_stop_route,2,[0],0.003441,0.003441,2.1e-05
lf_location_first_sentence_street_stop_route,3,[0],0.028643,0.028643,6.3e-05
lf_location_first_sentence_priorities,4,[0],0.030902,0.028981,6.3e-05
lf_delay_event_sentence,5,[1],0.004053,0.002132,0.0
lf_delay_preceding_arg,6,[1],0.000887,0.000887,0.0
lf_delay_preceding_trigger,7,[1],0.001245,0.001245,0.0
lf_direction_markers,8,[2],0.00553,0.004116,0.00019
lf_direction_markers_order,9,[2],0.003567,0.003567,0.000127


In [23]:
from snorkel.labeling import LabelModel

daystream_model = LabelModel(cardinality=11, verbose=True)
daystream_model.fit(L_train=L_daystream, n_epochs=5000, log_freq=500, seed=12345, Y_dev=Y_sd_train)

INFO:root:Computing O...
INFO:root:Estimating \mu...
INFO:root:[0 epochs]: TRAIN:[loss=0.259]
INFO:root:[500 epochs]: TRAIN:[loss=0.004]
INFO:root:[1000 epochs]: TRAIN:[loss=0.002]
INFO:root:[1500 epochs]: TRAIN:[loss=0.002]
INFO:root:[2000 epochs]: TRAIN:[loss=0.001]
INFO:root:[2500 epochs]: TRAIN:[loss=0.001]
INFO:root:[3000 epochs]: TRAIN:[loss=0.001]
INFO:root:[3500 epochs]: TRAIN:[loss=0.001]
INFO:root:[4000 epochs]: TRAIN:[loss=0.001]
INFO:root:[4500 epochs]: TRAIN:[loss=0.001]
INFO:root:Finished Training


### Look at label model performance

Here we evaluate the LabelModel on the SD4M development data, because we used the SD4M training data to develop our labeling functions our model is likely overfitted on the SD4M training data. The included `score` function from Snorkel is limited and more easily applicable in a binary classification setting. We will instead use the predictions and sklearn metrics ourselves.
For each model we will first report the metrics for all classes and then the metrics without the majority negative class.

In [24]:
from wsee.utils.scorer import score_model

positive_event_role_indices = [idx for idx, _ in enumerate(ROLE_LABELS)][:-1]

We create a MajorityLabelVoter and a LabelModel version that does not use the SD4M training data to infer a class balance prior for comparison.

In [25]:
from snorkel.labeling import MajorityLabelVoter

daystream_mlv = MajorityLabelVoter(cardinality=11, verbose=True)
daystream_without_sd4m_cb = LabelModel(cardinality=11, verbose=True)
daystream_without_sd4m_cb.fit(L_train=L_daystream,n_epochs=5000, log_freq=500, seed=12345)

INFO:root:Computing O...
INFO:root:Estimating \mu...
INFO:root:[0 epochs]: TRAIN:[loss=0.644]
INFO:root:[500 epochs]: TRAIN:[loss=0.002]
INFO:root:[1000 epochs]: TRAIN:[loss=0.001]
INFO:root:[1500 epochs]: TRAIN:[loss=0.001]
INFO:root:[2000 epochs]: TRAIN:[loss=0.001]
INFO:root:[2500 epochs]: TRAIN:[loss=0.001]
INFO:root:[3000 epochs]: TRAIN:[loss=0.001]
INFO:root:[3500 epochs]: TRAIN:[loss=0.001]
INFO:root:[4000 epochs]: TRAIN:[loss=0.000]
INFO:root:[4500 epochs]: TRAIN:[loss=0.000]
INFO:root:Finished Training


In [26]:
L_sd_dev = applier.apply(df_sd_dev)

100%|██████████| 491/491 [00:04<00:00, 106.25it/s]


#### With tie_break_policy set to "random"
Sometimes there might be instances where all the labeling functions abstain or where we might encounter a tie between the labeling functions.
Here we use the tie break policy "random", where the label models randomly choose among tied option using deterministic hash.
(When all labeling functions abstain all options/classes are tied.)
Note that coverage is still calculated as normal, i.e. as the ratio of labeled data points and all data points.

**Label Model**

In [27]:
score_model(model=daystream_model, L=L_sd_dev, Y=Y_sd_dev, tie_break_policy="random")

  'precision', 'predicted', average, warn_for)


Unnamed: 0,Metric,Micro Average,Macro Average
0,precision,0.830957,0.737816
1,recall,0.830957,0.75477
2,f1,0.830957,0.725519
3,accuracy,0.830957,0.830957
4,coverage,1.0,1.0


In [28]:
score_model(model=daystream_model, L=L_sd_dev, Y=Y_sd_dev, tie_break_policy="random", labels=positive_event_role_indices)

Unnamed: 0,Metric,Micro Average,Macro Average
0,precision,0.789474,0.725883
1,recall,0.769231,0.743085
2,f1,0.779221,0.711639
3,accuracy,0.830957,0.830957
4,coverage,1.0,1.0


**Label model without a class balance prior inferred from SD4M training set**

In [29]:
score_model(model=daystream_without_sd4m_cb, L=L_sd_dev, Y=Y_sd_dev, tie_break_policy="random")

Unnamed: 0,Metric,Micro Average,Macro Average
0,precision,0.688391,0.469263
1,recall,0.688391,0.742119
2,f1,0.688391,0.536639
3,accuracy,0.688391,0.688391
4,coverage,0.759674,0.759674


In [31]:
score_model(model=daystream_without_sd4m_cb, L=L_sd_dev, Y=Y_sd_dev, tie_break_policy="random", labels=positive_event_role_indices)

Unnamed: 0,Metric,Micro Average,Macro Average
0,precision,0.518395,0.420876
1,recall,0.794872,0.754507
2,f1,0.62753,0.515303
3,accuracy,0.688391,0.688391
4,coverage,0.759674,0.759674


**Majority Label Voter**

In [32]:
score_model(model=daystream_mlv, L=L_sd_dev, Y=Y_sd_dev, tie_break_policy="random")

Unnamed: 0,Metric,Micro Average,Macro Average
0,precision,0.694501,0.482654
1,recall,0.694501,0.833643
2,f1,0.694501,0.557398
3,accuracy,0.694501,0.694501
4,coverage,0.763747,0.763747


In [33]:
score_model(model=daystream_mlv, L=L_sd_dev, Y=Y_sd_dev, tie_break_policy="random", labels=positive_event_role_indices)

Unnamed: 0,Metric,Micro Average,Macro Average
0,precision,0.525253,0.435558
1,recall,0.8,0.854507
2,f1,0.634146,0.537628
3,accuracy,0.694501,0.694501
4,coverage,0.763747,0.763747


### Do predictions on the daystream data

In [34]:
daystream_probs = daystream_model.predict_proba(L=L_daystream)

In the proposed workflow one would filter out all the datapoints that were not labeled by any of the labeling functions.
In the actual pipeline we would multiply the probabilities of abstains with zero so that they look like padding instances, when fed into the end model.
We propose this workaround since examples that are filtered out here are treated as negative examples per default in the end model.
We also cannot afford to filter out the whole document if just one trigger/role example was not labeled and potentially loose valuable training examples.

In [35]:
labeled_daystream_with_abstains = pipeline.merge_event_role_examples(df_daystream, utils.zero_out_abstains(daystream_probs, L_daystream))
labeled_daystream_with_abstains.reset_index(level=0).to_json(DATA_DIR + "/save_daystreamv6_roles_with_abstains.jsonl", orient='records', lines=True, force_ascii=False)

INFO:wsee:Merging event role examples that belong to the same document


## Step 7: Daystream Snorkel Labeling Check

To look at the daystream labeling it would be best to remove the abstains.

In [36]:
from snorkel.labeling import filter_unlabeled_dataframe

df_daystream_filtered, probs_daystream_filtered = filter_unlabeled_dataframe(
    X=df_daystream, y=daystream_probs, L=L_daystream
)

In [37]:
df_daystream_filtered['role_probs'] = list(probs_daystream_filtered)
df_daystream_filtered['most_probable_class'] = [ROLE_LABELS[label_idx] for label_idx in probs_daystream_filtered.argmax(axis=1)]
df_daystream_filtered['max_class_prob'] = ["{:.2f}".format(class_prob) for class_prob in probs_daystream_filtered.max(axis=1)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
for role_class in ROLE_LABELS:
    print(f"{role_class}: {len(df_daystream_filtered[df_daystream_filtered['most_probable_class'] == role_class])} instances")

location: 1573 instances
delay: 192 instances
direction: 259 instances
start_loc: 620 instances
end_loc: 374 instances
start_date: 578 instances
end_date: 98 instances
cause: 194 instances
jam_length: 22 instances
route: 40 instances
no_arg: 35199 instances


In [39]:
df_daystream_filtered[df_daystream_filtered['most_probable_class'] == 'route'].sample(1)[['text', 'trigger', 'argument', 'most_probable_class', 'max_class_prob', 'role_probs']]

Unnamed: 0,text,trigger,argument,most_probable_class,max_class_prob,role_probs
24133,Update! #RE2 Bahnhof #FrankfurtFlughafenRegionalbf jetzt doch komplett gesperrt. Es kann zu kurzfristigen Umleitungen über den Fernbahnhof kommen. Bitte Reiseverbindung vor Abfahrt prüfen.,"{'id': 'c/4f507898-f2d1-43ec-8848-9e02a6649c7e', 'text': 'gesperrt', 'entity_type': 'trigger', 'start': 8, 'end': 9, 'char_start': 71, 'char_end': 79}","{'id': 'c/d3619f13-e1b7-4793-bbb6-1f61a708b1bb', 'text': '#RE2', 'entity_type': 'location_route', 'start': 2, 'end': 3, 'char_start': 8, 'char_end': 12}",route,1.0,"[2.4448004486308372e-05, 1.383903928250819e-07, 4.3938042657787925e-07, 5.6005805894121545e-11, 1.2557914136490533e-10, 3.3197433240426747e-09, 1.0169574821978974e-08, 2.2496540019897624e-07, 5.10450336571369e-07, 0.999974215211592, 9.926462407221225e-09]"
