# Transform the data to work with Snorkel: Part 2 - Event Role

Here we do most of the work of creating a labeling model that assigns an argument role label to each trigger-entity pair in an event mention.
To do that, we need to create one row for each pair of trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [1]:
import sys
sys.path.append("../")
import os
from tqdm import tqdm
import pandas as pd
from wsee.utils import utils

# Corpus root. Set the SNORKEL_DAYSTREAM_DIR environment variable to point at your copy
# instead of editing this cell; the literal below is only the fallback.
DATA_DIR = os.environ.get('SNORKEL_DAYSTREAM_DIR', '/Users/phuc/data/snorkel-daystreamv5')  # replace path to corpus

In [2]:
# Each labeled split lives in its own sub-directory as "<split>/<split>_with_events.jsonl".
sd_train_path, sd_dev_path, sd_test_path = (
    os.path.join(DATA_DIR, f'{split}/{split}_with_events.jsonl')
    for split in ('train', 'dev', 'test')
)

# The daystream file sits directly in the corpus root.
daystream_path = os.path.join(DATA_DIR, 'daystream.jsonl')

In [3]:
# The files are JSON Lines (one JSON document per line), hence lines=True.
sd_train, sd_dev, sd_test = (
    pd.read_json(split_path, lines=True)
    for split_path in (sd_train_path, sd_dev_path, sd_test_path)
)

daystream = pd.read_json(daystream_path, lines=True)

In [4]:
# Peek at the first five training documents to check the loaded columns.
sd_train.head(n=5)

Unnamed: 0,id,text,tokens,ner_tags,entities,event_triggers,event_roles
0,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...
1,http://www.deutschlandradio.de/#17@2016-04-04T...,Vorsicht auf der A7 Ulm Richtung Füssen zwisch...,"[Vorsicht, auf, der, A7, Ulm, Richtung, Füssen...","[O, O, O, B-LOCATION_STREET, B-LOCATION_CITY, ...",[{'id': 'c/2db85836-812f-4ced-90d3-46df9495782...,[],[]
2,667383197769048064,"Genau in dem Bus sitzen, der im Stau steht. Fü...","[Genau, in, dem, Bus, sitzen, ,, der, im, Stau...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",[],[],[]
3,603844236484550658,Große Carsharing-Übernahme: Der französische C...,"[Große, Carsharing, -, Übernahme, :, Der, fran...","[O, O, O, O, O, O, B-LOCATION, O, O, O, B-ORGA...",[{'id': 'c/f0fdb663-677e-4353-9159-8a9530f9777...,[],[]
4,http://bauarbeiten.bahn.de/fernverkehr/Linie/I...,"an mehreren Terminen\n an den Freitagen, 3. un...","[an, mehreren, Terminen, an, den, Freitagen, ,...","[O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE...",[{'id': 'c/f46384bf-20c6-47f5-a019-2a11fc52079...,[{'id': 'c/f84a50a1-b58f-4077-a68c-ae95a4f81e3...,[{'trigger': 'c/f84a50a1-b58f-4077-a68c-ae95a4...


In [5]:
# Show one training document (positional row 6), restricted to the fields that matter
# for event roles.
preview_columns = ['id', 'text', 'entities', 'event_triggers', 'event_roles']
utils.pretty_print_json(sd_train[preview_columns].iloc[6])

{
  "id": "754201930264633344",
  "text": "■ #A1 #Bremen Richtung #Hamburg zwischen Horster Dreieck und #Stillhorn 9 km #Stau.  Dort ist wegen #Bauarbeiten nur eine Spur frei.\n",
  "entities": [
    {
      "id": "c/82bf4c32-861d-4e09-b8d1-bf7adc488f2b",
      "text": "#A1",
      "entity_type": "LOCATION_STREET",
      "start": 1,
      "end": 2
    },
    {
      "id": "c/7c844525-065a-498e-8b14-d4998cfe6fe7",
      "text": "#Bremen",
      "entity_type": "LOCATION_CITY",
      "start": 2,
      "end": 3
    },
    {
      "id": "c/2cc0660c-c343-4712-bb96-1f3f35282cf4",
      "text": "#Hamburg",
      "entity_type": "LOCATION_CITY",
      "start": 4,
      "end": 5
    },
    {
      "id": "c/631f4cdb-1331-4a84-8c0d-ddc37ab2f8c1",
      "text": "Horster Dreieck",
      "entity_type": "LOCATION",
      "start": 6,
      "end": 8
    },
    {
      "id": "c/5512c1fc-4ded-4b91-a562-a7dae5d066ee",
      "text": "#Stillhorn",
      "entity_type": "LOCATION_CITY",
      "start": 9,
      

## Step 1: Create one row for each trigger-entity pair

In [6]:
import pandas as pd
import numpy as np

# Class index that means "this entity is not an argument of this trigger".
# 10 is the position of 'no_arg' in wsee.ROLE_LABELS.
# NOTE(review): assumes event_argument_probs is ordered like ROLE_LABELS — confirm.
NO_ARG_LABEL = 10

event_role_rows_list = []  # one copy of the document row per event_roles entry
event_role_rows_y = []     # argmax class index per entry, aligned with the list above

event_count = 0  # entries whose class is an actual role, i.e. not NO_ARG_LABEL

print(f"DataFrame has {len(sd_train.index)} rows")
# iterrows() is a generator without a length, so pass total to get percentage/ETA.
for index, row in tqdm(sd_train.iterrows(), total=len(sd_train.index)):
    for event_role in row.event_roles:
        # Copy first so each pair gets its own row object before the id fields are set.
        augmented_row = utils.get_deep_copy(row)
        augmented_row['trigger_id'] = event_role['trigger']
        augmented_row['argument_id'] = event_role['argument']
        event_role_rows_list.append(augmented_row)
        # The label is the position of the largest entry in the probability vector.
        event_role_num = np.asarray(event_role['event_argument_probs']).argmax()
        event_role_rows_y.append(event_role_num)
        if event_role_num != NO_ARG_LABEL:
            event_count += 1

print("Number of event roles:", event_count)

5it [00:00, 14.29it/s]

DataFrame has 1307 rows


1307it [00:16, 80.86it/s] 

Number of event roles: 1997





In [7]:
# Stack the collected per-pair rows into one frame, then replace whatever index the
# constructor produced with a fresh 0..n-1 range.
stacked_pair_rows = pd.DataFrame(event_role_rows_list)
event_role_rows = stacked_pair_rows.reset_index(drop=True)
event_role_rows.head(n=5)

Unnamed: 0,id,text,tokens,ner_tags,entities,event_triggers,event_roles,trigger_id,argument_id
0,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/e6ad8c7f-24a4-4742-a52d-90207de04f08,c/cb2c9ebb-dce9-42f3-816c-d09f16935a5d
1,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/e6ad8c7f-24a4-4742-a52d-90207de04f08,c/1b88cd4b-f6da-4ef1-8bfc-d6452607b6d4
2,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/e6ad8c7f-24a4-4742-a52d-90207de04f08,c/77a0f4fa-c206-4c9c-be92-ffcf87be1b29
3,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/e6ad8c7f-24a4-4742-a52d-90207de04f08,c/76fb3fc6-24b7-45a7-973a-292c72f3b5d1
4,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...,c/e6ad8c7f-24a4-4742-a52d-90207de04f08,c/677ad2f7-6d6e-4143-a3e8-935b1d531230


In [8]:
import numpy as np

# Turn the collected per-pair class indices into a NumPy array. The name is rebound,
# so from this cell on event_role_rows_y is an array rather than a Python list.
event_role_rows_y = np.asarray(event_role_rows_y)

In [9]:
# Sanity check: number of collected labels (one per appended trigger-entity row).
event_role_rows_y.shape

(7198,)

## Step 2: Configure Snorkel

In [10]:
from wsee import ROLE_LABELS
# Show the role label names. 'no_arg' sits at index 10, the value the row-building
# loop compares against. NOTE(review): presumably list position == integer class — confirm.
print(ROLE_LABELS)


['location', 'delay', 'direction', 'start_loc', 'end_loc', 'start_date', 'end_date', 'cause', 'jam_length', 'route', 'no_arg']


In [11]:
# This import needs a spaCy model that resolves under the name 'de'; without it the
# import fails with OSError [E050]. NOTE(review): on spaCy 2.x the shortcut is usually
# created with `python -m spacy download de` — confirm for this environment.
from wsee.labeling.event_argument_role_lfs import lf_date_type

OSError: [E050] Can't find model 'de'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

### Apply the labeling functions

In [None]:
from snorkel.labeling import PandasLFApplier

# Labeling functions to run; only the date-type LF so far.
lfs = [lf_date_type]

# Run every LF over every trigger-entity row.
# NOTE(review): the result is called L_valid although event_role_rows was built from
# sd_train — confirm that naming is intended.
applier = PandasLFApplier(lfs=lfs)
L_valid = applier.apply(df=event_role_rows)

In [None]:
from snorkel.labeling import LFAnalysis

# Gold labels collected in the same loop as the rows, so they line up with L_valid.
Y_valid = event_role_rows_y
lf_analysis = LFAnalysis(L=L_valid, lfs=lfs)
lf_analysis.lf_summary(Y=Y_valid)