# Transform the data to work with Snorkel: Part 1 - Event Type

Essentially we will have to create two labeling models.
One assigns labels to event types and the other assigns labels to argument roles in event mentions.

In any case we need to create a row for each event (trigger) to do event type labeling.

For this we need one additional column:
- trigger_id

And one numpy array containing the label for each row:
- event_type

We will probably focus on keyword lists and some heuristics to create our labeling functions.

In [None]:
import sys
sys.path.append("../")
import os
from tqdm import tqdm
import pandas as pd
from wsee.utils import utils

# Root directory of the corpus. Set the WSEE_DATA_DIR environment variable to point at
# your local copy instead of editing this cell; the literal below is only the fallback.
DATA_DIR = os.environ.get('WSEE_DATA_DIR', '/Users/phuc/data/snorkel-daystreamv5')

In [None]:
# Paths of the train / dev / test files (with events), relative to the corpus root.
sd_train_path, sd_dev_path, sd_test_path = (
    os.path.join(DATA_DIR, relative_path)
    for relative_path in (
        'train/train_with_events.jsonl',
        'dev/dev_with_events.jsonl',
        'test/test_with_events.jsonl',
    )
)

# Path of the daystream file, located directly in the corpus root.
daystream_path = os.path.join(DATA_DIR, 'daystream.jsonl')

In [None]:
# Every file is read as line-delimited JSON into its own DataFrame.
sd_train, sd_dev, sd_test = (
    pd.read_json(split_path, lines=True)
    for split_path in (sd_train_path, sd_dev_path, sd_test_path)
)

daystream = pd.read_json(daystream_path, lines=True)

In [None]:
# Preview the first rows of the training split.
sd_train.head()

In [None]:
# Pretty-print a single training row (position 6), restricted to the id, text,
# entities, event_triggers and event_roles columns.
utils.pretty_print_json(sd_train[['id', 'text', 'entities', 'event_triggers', 'event_roles']].iloc[6])

## Step 1: Create one row for every event trigger

In [None]:
# Look at the event triggers stored on the second training row.
sd_train.iloc[1].event_triggers

In [None]:
import numpy as np

# Label index that is NOT counted as an event in the statistic below.
# NOTE(review): presumably the index of the negative / "no event" class within
# `event_type_probs` — confirm against the label definition.
NON_EVENT_LABEL = 7

event_type_rows = []    # one copy of the source row per event trigger
event_type_rows_y = []  # argmax label per trigger, aligned with event_type_rows

event_count = 0

print(f"DataFrame has {len(sd_train.index)} rows")
# A single progress bar over the rows; wrapping the inner trigger list instead
# would draw a fresh bar for every row.
for index, row in tqdm(sd_train.iterrows(), total=len(sd_train.index)):
    for event_trigger in row.event_triggers:
        # Each trigger gets its own copy of the row, tagged with the trigger id.
        augmented_row = utils.get_deep_copy(row)
        augmented_row['trigger_id'] = event_trigger['id']
        event_type_rows.append(augmented_row)
        # The label is the position of the highest value in event_type_probs.
        event_type_num = np.asarray(event_trigger['event_type_probs']).argmax()
        event_type_rows_y.append(event_type_num)
        if event_type_num != NON_EVENT_LABEL:
            event_count += 1

print("Number of events:", event_count)

In [None]:
# Turn the collected per-trigger rows into a DataFrame and preview it.
event_type_rows = pd.DataFrame(event_type_rows)
event_type_rows.head()

In [None]:
import numpy as np

# Turn the collected per-trigger labels into a numpy array.
event_type_rows_y = np.asarray(event_type_rows_y)

In [None]:
# Check the shape of the label array.
event_type_rows_y.shape

## Step 2: Configure Snorkel

In [None]:
from wsee import SD4M_RELATION_TYPES
# Print the relation type constant exported by the wsee package.
print(SD4M_RELATION_TYPES)

In [None]:
from wsee.labeling.event_trigger_lfs import lf_accident_cat, lf_canceledroute_cat, lf_delay_cat, \
    lf_obstruction_cat, lf_railreplacementservice_cat, lf_trafficjam_cat

In [None]:
from snorkel.labeling import PandasLFApplier

# Labeling functions to apply, imported from wsee.labeling.event_trigger_lfs.
lfs = [
    lf_accident_cat,
    lf_canceledroute_cat,
    # lf_canceledstop_cat  (disabled; it is not among the imported labeling functions)
    lf_delay_cat,
    lf_obstruction_cat,
    lf_railreplacementservice_cat,
    lf_trafficjam_cat
]

# Apply the labeling functions to the per-trigger rows.
# NOTE(review): the result is named L_valid, but event_type_rows is built from sd_train.
applier = PandasLFApplier(lfs)
L_valid = applier.apply(event_type_rows)

In [None]:
from snorkel.labeling import LFAnalysis

# Use the per-trigger label array as the reference labels for the summary.
Y_valid = event_type_rows_y
# Summarize the labeling functions' output against those reference labels.
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)