# Transform the data to work with Snorkel: Part 2 - Event Role

In this notebook we do most of the work towards a labeling model that assigns argument-role labels within event mentions.
For that, we need one row for each pair of event trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [3]:
import sys
# Add the parent directory to the module search path before importing `wsee`.
# Presumably the local `wsee` package lives one level above this notebook — TODO confirm.
sys.path.append("../")
from wsee.utils import utils
# Data loading and example-building helpers used in the cells below.
from wsee.data.pipeline import load_data, build_event_trigger_examples

import os

# Corpus location. Set the DAYSTREAM_DATA_DIR environment variable to point the
# notebook at your own copy of the corpus without editing this cell; the
# original hard-coded path is kept as the fallback so existing setups still work.
DATA_DIR = os.environ.get('DAYSTREAM_DATA_DIR', '/Users/phuc/data/snorkel-daystreamv5')

# Selects which file-name suffix is used: the variant with defaults or the plain one.
use_defaults = True
suffix = '_with_events_and_defaults.jsonl' if use_defaults else '_with_events.jsonl'

In [4]:
# Load everything found under DATA_DIR; the result is indexed by split name.
loaded_data = load_data(DATA_DIR)

# Pull out the train / dev / test splits in one go.
sd_train, sd_dev, sd_test = (
    loaded_data[split_name] for split_name in ('train', 'dev', 'test')
)

# The 'daystream' entry is kept under its own name.
daystream = loaded_data['daystream']

In [5]:
# Preview the first five rows of the training split.
sd_train.head(n=5)

Unnamed: 0,id,text,tokens,pos_tags,ner_tags,entities,event_triggers,event_roles
0,http://www.viz-info.de/LMS-BR_r_LMS-BR_60517@2...,Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab...,"[Unfall, Abschnitt, :, Marzahn, (, Berlin, ), ...","[NN, NN, $., NE, TRUNC, NE, TRUNC, NN, PTKVZ, ...","[B-TRIGGER, O, O, B-LOCATION, O, B-LOCATION_CI...",[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'id': 'c/e6ad8c7f-24a4-4742-a52d-90207de04f0...,[{'trigger': 'c/e6ad8c7f-24a4-4742-a52d-90207d...
1,http://www.deutschlandradio.de/#17@2016-04-04T...,Vorsicht auf der A7 Ulm Richtung Füssen zwisch...,"[Vorsicht, auf, der, A7, Ulm, Richtung, Füssen...","[NN, APPR, ART, NE, NE, NN, NN, APPR, NN, NE, ...","[O, O, O, B-LOCATION_STREET, B-LOCATION_CITY, ...",[{'id': 'c/2db85836-812f-4ced-90d3-46df9495782...,[],[]
2,667383197769048064,"Genau in dem Bus sitzen, der im Stau steht. Fü...","[Genau, in, dem, Bus, sitzen, ,, der, im, Stau...","[ADV, APPR, ART, NN, VVFIN, $,, PRELS, APPRART...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",[],[],[]
3,603844236484550658,Große Carsharing-Übernahme: Der französische C...,"[Große, Carsharing, -, Übernahme, :, Der, fran...","[ADJA, NN, $[, NN, $., ART, ADJA, NN, $[, NN, ...","[O, O, O, O, O, O, B-LOCATION, O, O, O, B-ORGA...",[{'id': 'c/f0fdb663-677e-4353-9159-8a9530f9777...,[],[]
4,http://bauarbeiten.bahn.de/fernverkehr/Linie/I...,"an mehreren Terminen\n an den Freitagen, 3. un...","[an, mehreren, Terminen, an, den, Freitagen, ,...","[APPR, PIAT, NN, APPR, ART, NN, $,, CARD, $., ...","[O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-DATE...",[{'id': 'c/f46384bf-20c6-47f5-a019-2a11fc52079...,[{'id': 'c/f84a50a1-b58f-4077-a68c-ae95a4f81e3...,[{'trigger': 'c/f84a50a1-b58f-4077-a68c-ae95a4...


Example record from a `.jsonl` file (pretty-printed here; `...` marks omitted content):
```json
{
  "id": "754201930264633344",
  "text": "■ #A1 #Bremen Richtung #Hamburg zwischen Horster Dreieck und #Stillhorn 9 km #Stau.  Dort ist wegen #Bauarbeiten nur eine Spur frei.\n",
  "entities": [
    {
      "id": "c/82bf4c32-861d-4e09-b8d1-bf7adc488f2b",
      "text": "#A1",
      "entity_type": "location_street",
      "start": 1,
      "end": 2,
      "char_start": 2,
      "char_end": 5
    },
    ...
  ],
  "event_triggers": [
    {
      "id": "c/3958da47-7b47-414f-8210-5b2c487de9df",
      "event_type_probs": [ 0.0, ..., 1.0, 0.0 ]
    }
  ],
  "event_roles": [
    {
      "trigger": "c/3958da47-7b47-414f-8210-5b2c487de9df",
      "argument": "c/82bf4c32-861d-4e09-b8d1-bf7adc488f2b",
      "event_argument_probs": [ 1.0, 0.0, ..., 0.0 ]
    },
    ...
  ]
}
```

In [8]:
# Build the example rows and their label vector from the training split.
# NOTE(review): this notebook is about event *roles* and the results are named
# event_role_*, yet the *trigger* example builder is called here — confirm
# whether a role-specific builder from wsee.data.pipeline was intended.
event_role_rows, event_role_rows_y = build_event_trigger_examples(sd_train)

103it [00:00, 1021.42it/s]

DataFrame has 1273 rows


1273it [00:01, 1060.32it/s]


Number of events: 487


In [9]:
# Inspect the shape of the label vector returned alongside the example rows.
event_role_rows_y.shape

(817,)

## Step 2: Configure Snorkel

In [None]:
# Show the label set exported by the project package as ROLE_LABELS.
from wsee import ROLE_LABELS
print(ROLE_LABELS)

In [None]:
# Labeling functions for event argument roles, one name per line for readability.
from wsee.labeling.event_argument_role_lfs import (
    lf_event_patterns,
    lf_event_patterns_general_location,
    lf_date_type,
    lf_stanford_separate_sentence,
    lf_spacy_separate_sentence,
)

### Apply the labeling functions

In [None]:
from snorkel.labeling import PandasLFApplier

# Labeling functions to run, in the order their columns will appear in the label matrix.
lfs = [
    lf_event_patterns, lf_event_patterns_general_location,
    lf_date_type,
    lf_stanford_separate_sentence, lf_spacy_separate_sentence,
]

# Run every labeling function over every example row.
# NOTE(review): the result is called L_valid, but event_role_rows was built
# from sd_train — confirm which split is meant here.
applier = PandasLFApplier(lfs=lfs)
L_valid = applier.apply(df=event_role_rows)

In [None]:
from snorkel.labeling import LFAnalysis

# Labels passed to the summary alongside the labeling-function outputs.
Y_valid = event_role_rows_y

# Per-labeling-function summary table, computed from the label matrix and Y_valid.
LFAnalysis(L=L_valid, lfs=lfs).lf_summary(Y=Y_valid)