# Transform the data to work with Snorkel: Part 1 - Event Type

Essentially we will have to create two labeling models.
One assigns labels to event types and the other assigns labels to argument roles in event mentions.

In any case we need to create a row for each event (trigger) to do event type labeling.

For this we need 1 additional column:
- trigger_id

One numpy array containing the:
- event_type

We will probably focus on keyword lists and some heuristics to create our labeling functions.

In [1]:
import sys
sys.path.append("../")
import os
from tqdm import tqdm
import pandas as pd
from wsee.utils import utils

DATA_DIR = '/Users/phuc/data/snorkel-daystreamv5'  # replace path to corpus

In [2]:
sd_train_path = os.path.join(DATA_DIR, 'train/train_with_events.jsonl')
sd_dev_path = os.path.join(DATA_DIR, 'dev/dev_with_events.jsonl')
sd_test_path = os.path.join(DATA_DIR, 'test/test_with_events.jsonl')

daystream_path = os.path.join(DATA_DIR, 'daystream.jsonl')

In [3]:
sd_train = pd.read_json(sd_train_path, lines=True)
sd_dev = pd.read_json(sd_dev_path, lines=True)
sd_test = pd.read_json(sd_test_path, lines=True)

daystream = pd.read_json(daystream_path, lines=True)

In [4]:
sd_train.head()

Unnamed: 0,id,text,tokens,ner_tags,entities,event_triggers,event_roles
0,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...
1,http://www.deutschlandradio.de/#17@2016-04-04T...,Vorsicht auf der A7 Ulm Richtung Füssen zwisch...,"[Vorsicht, auf, der, A7, Ulm, Richtung, Füssen...","[O, O, O, B-LOCATION_STREET, B-LOCATION_CITY, ...",[{'id': 'c/2db85836-812f-4ced-90d3-46df9495782...,[],[]
2,667383197769048064,"Genau in dem Bus sitzen, der im Stau steht. Fü...","[Genau, in, dem, Bus, sitzen, ,, der, im, Stau...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",[],[],[]
3,603844236484550658,Große Carsharing-Übernahme: Der französische C...,"[Große, Carsharing, -, Übernahme, :, Der, fran...","[O, O, O, O, O, O, B-LOCATION, O, O, O, B-ORGA...",[{'id': 'c/f0fdb663-677e-4353-9159-8a9530f9777...,[],[]
4,http://bauarbeiten.bahn.de/fernverkehr/Linie/I...,"an mehreren Terminen\n an den Freitagen, 3. un...","[an, mehreren, Terminen, an, den, Freitagen, ,...","[O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE...",[{'id': 'c/f46384bf-20c6-47f5-a019-2a11fc52079...,[{'id': 'c/f84a50a1-b58f-4077-a68c-ae95a4f81e3...,[{'trigger': 'c/f84a50a1-b58f-4077-a68c-ae95a4...


In [5]:
utils.pretty_print_json(sd_train[['id', 'text', 'entities', 'event_triggers', 'event_roles']].iloc[6])

{
  "id": "754201930264633344",
  "text": "■ #A1 #Bremen Richtung #Hamburg zwischen Horster Dreieck und #Stillhorn 9 km #Stau.  Dort ist wegen #Bauarbeiten nur eine Spur frei.\n",
  "entities": [
    {
      "id": "c/82bf4c32-861d-4e09-b8d1-bf7adc488f2b",
      "text": "#A1",
      "entity_type": "LOCATION_STREET",
      "start": 1,
      "end": 2
    },
    {
      "id": "c/7c844525-065a-498e-8b14-d4998cfe6fe7",
      "text": "#Bremen",
      "entity_type": "LOCATION_CITY",
      "start": 2,
      "end": 3
    },
    {
      "id": "c/2cc0660c-c343-4712-bb96-1f3f35282cf4",
      "text": "#Hamburg",
      "entity_type": "LOCATION_CITY",
      "start": 4,
      "end": 5
    },
    {
      "id": "c/631f4cdb-1331-4a84-8c0d-ddc37ab2f8c1",
      "text": "Horster Dreieck",
      "entity_type": "LOCATION",
      "start": 6,
      "end": 8
    },
    {
      "id": "c/5512c1fc-4ded-4b91-a562-a7dae5d066ee",
      "text": "#Stillhorn",
      "entity_type": "LOCATION_CITY",
      "start": 9,
      

## Step 1: Create one row for every event trigger

In [6]:
sd_train.iloc[1].event_triggers

[]

In [7]:
import pandas as pd 
import numpy as np

event_type_rows = []
event_type_rows_y = []

event_count = 0

print(f"DataFrame has {len(sd_train.index)} rows")
for index, row in sd_train.iterrows():
    """if i > 4:
        break"""
    for event_trigger in tqdm(row.event_triggers):
        augmented_row = utils.get_deep_copy(row)
        augmented_row['trigger_id'] = event_trigger['id']
        event_type_rows.append(augmented_row)
        event_type_num = np.asarray(event_trigger['event_type_probs']).argmax()
        event_type_rows_y.append(event_type_num)
        if event_type_num != 7:
            event_count += 1
    
print("Number of events:", event_count)

100%|██████████| 2/2 [00:00<00:00, 577.85it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 427.95it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 556.13it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 573.46it/s]
100%|██████████| 2/2 [00:00<00:00, 493.07it/s]
100%|██████████| 1/1 [00:00<00:00, 561.41it/s]
100%|██████████| 1/1 [00:00<00:00, 459.35it/s]
100%|██████████| 1/1 [00:00<00:00, 592.00it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 18.16it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 498.97it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 527.02it/s]
100%|██████████| 1/1 [00:00<00:00, 453.73it/s]
100%|██████████| 1/1 [00:00<00:00, 494.84it/s]
100%|██████████| 1/1 [00:00<00:00, 433.12it/s]
100%|██████████| 2/2 [00:00<00:00, 420.36it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00,

DataFrame has 1307 rows


0it [00:00, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 486.49it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 339.24it/s]
100%|██████████| 2/2 [00:00<00:00, 472.94it/s]
100%|██████████| 1/1 [00:00<00:00, 423.58it/s]
0it [00:00, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 483.24it/s]
100%|██████████| 1/1 [00:00<00:00, 544.64it/s]
100%|██████████| 1/1 [00:00<00:00, 432.18it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 474.79it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 467.49it/s]
100%|██████████| 2/2 [00:00<00:00, 547.63it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 475.76it/s]
0it [00:

Number of events: 496





In [8]:
event_type_rows = pd.DataFrame(event_type_rows)
event_type_rows.head()

Unnamed: 0,id,text,tokens,ner_tags,entities,event_triggers,event_roles,trigger_id
0,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/e6ad8c7f-24a4-4742-a52d-90207de04f08
0,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/677ad2f7-6d6e-4143-a3e8-935b1d531230
4,http://bauarbeiten.bahn.de/fernverkehr/Linie/I...,"an mehreren Terminen\n an den Freitagen, 3. un...","[an, mehreren, Terminen, an, den, Freitagen, ,...","[O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE...",[{'id': 'c/f46384bf-20c6-47f5-a019-2a11fc52079...,[{'id': 'c/f84a50a1-b58f-4077-a68c-ae95a4f81e3...,[{'trigger': 'c/f84a50a1-b58f-4077-a68c-ae95a4...,c/f84a50a1-b58f-4077-a68c-ae95a4f81e3f
4,http://bauarbeiten.bahn.de/fernverkehr/Linie/I...,"an mehreren Terminen\n an den Freitagen, 3. un...","[an, mehreren, Terminen, an, den, Freitagen, ,...","[O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE...",[{'id': 'c/f46384bf-20c6-47f5-a019-2a11fc52079...,[{'id': 'c/f84a50a1-b58f-4077-a68c-ae95a4f81e3...,[{'trigger': 'c/f84a50a1-b58f-4077-a68c-ae95a4...,c/bdd0f916-b081-4d34-a52c-68ee059b7a22
4,http://bauarbeiten.bahn.de/fernverkehr/Linie/I...,"an mehreren Terminen\n an den Freitagen, 3. un...","[an, mehreren, Terminen, an, den, Freitagen, ,...","[O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE...",[{'id': 'c/f46384bf-20c6-47f5-a019-2a11fc52079...,[{'id': 'c/f84a50a1-b58f-4077-a68c-ae95a4f81e3...,[{'trigger': 'c/f84a50a1-b58f-4077-a68c-ae95a4...,c/1caf174c-37a4-448e-9b85-b052942e9504


In [9]:
import numpy as np

event_type_rows_y = np.asarray(event_type_rows_y)

In [10]:
event_type_rows_y.shape

(756,)

## Step 2: Configure Snorkel

In [11]:
from wsee import SD4M_RELATION_TYPES
print(SD4M_RELATION_TYPES)

['Accident', 'CanceledRoute', 'CanceledStop', 'Delay', 'Obstruction', 'RailReplacementService', 'TrafficJam', 'O']


In [12]:
from wsee.labeling.event_trigger_lfs import lf_accident_cat, lf_canceledroute_cat, lf_delay_cat, \
    lf_obstruction_cat, lf_railreplacementservice_cat, lf_trafficjam_cat

In [13]:
from snorkel.labeling import PandasLFApplier

lfs = [
    lf_accident_cat,
    lf_canceledroute_cat,
    # lf_canceledstop_cat
    lf_delay_cat,
    lf_obstruction_cat,
    lf_railreplacementservice_cat,
    lf_trafficjam_cat
]

applier = PandasLFApplier(lfs)
L_valid = applier.apply(event_type_rows)

100%|██████████| 756/756 [00:05<00:00, 136.12it/s]


In [14]:
from snorkel.labeling import LFAnalysis

Y_valid = event_type_rows_y
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_accident_cat,0,[0],0.111111,0.0,0.0,53,31,0.630952
lf_canceledroute_cat,1,[1],0.002646,0.0,0.0,2,0,1.0
lf_delay_cat,2,[3],0.087302,0.0,0.0,59,7,0.893939
lf_obstruction_cat,3,[4],0.164021,0.0,0.0,92,32,0.741935
lf_railreplacementservice_cat,4,[5],0.033069,0.0,0.0,20,5,0.8
lf_trafficjam_cat,5,[6],0.208995,0.0,0.0,154,4,0.974684
