# Transform the data to work with Snorkel: Part 2 - Event Role

Here we do most of the work of creating a labeling model that assigns argument-role labels within event mentions.
To do so, we need one row for each pair of event trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [None]:
import sys
sys.path.append("../")
import os
from tqdm import tqdm
import pandas as pd
from wsee.utils import utils

# Root directory of the corpus. Set the WSEE_DATA_DIR environment variable to
# point at your local copy; the literal below is only the fallback default.
DATA_DIR = os.environ.get('WSEE_DATA_DIR', '/Users/phuc/data/snorkel-daystreamv5')

In [None]:
# Locations of the three annotated splits, resolved relative to DATA_DIR.
sd_train_path, sd_dev_path, sd_test_path = (
    os.path.join(DATA_DIR, relative_path)
    for relative_path in (
        'train/train_with_events.jsonl',
        'dev/dev_with_events.jsonl',
        'test/test_with_events.jsonl',
    )
)

# Separate corpus file that sits directly under DATA_DIR.
daystream_path = os.path.join(DATA_DIR, 'daystream.jsonl')

In [None]:
# Read each JSON-lines file (lines=True: one JSON record per line) into its own DataFrame.
sd_train, sd_dev, sd_test = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (sd_train_path, sd_dev_path, sd_test_path)
)

daystream = pd.read_json(daystream_path, lines=True)

In [None]:
# Preview the first rows of the training split.
sd_train.head()

In [None]:
# Example record: print selected fields of the training row at position 6 (0-based)
# using the project's pretty_print_json helper.
utils.pretty_print_json(sd_train[['id', 'text', 'entities', 'event_triggers', 'event_roles']].iloc[6])

## Step 1: Create one row for each trigger-entity pair

In [None]:
import pandas as pd 
import numpy as np

# Class index that is excluded from the "event roles" count printed below.
# NOTE(review): presumably the negative / "no argument" class in ROLE_LABELS — confirm.
NO_ROLE_CLASS_INDEX = 10

# One entry per (trigger, argument) pair found in `event_roles`:
#   event_role_rows_list -> copy of the source row plus trigger_id / argument_id
#   event_role_rows_y    -> argmax class index of that pair's `event_argument_probs`
event_role_rows_list = []
event_role_rows_y = []

# Number of pairs whose argmax class differs from NO_ROLE_CLASS_INDEX.
event_count = 0

print(f"DataFrame has {len(sd_train.index)} rows")
# iterrows() is a generator without a length, so `total` is passed explicitly;
# otherwise tqdm can only show a running counter without percentage or ETA.
for index, row in tqdm(sd_train.iterrows(), total=len(sd_train.index)):
    for event_role in row.event_roles:
        # Each pair gets its own copy of the document row (via the project helper),
        # extended with the ids that identify the pair.
        augmented_row = utils.get_deep_copy(row)
        augmented_row['trigger_id'] = event_role['trigger']
        augmented_row['argument_id'] = event_role['argument']
        event_role_rows_list.append(augmented_row)
        # The label is the position of the highest value in the probability vector.
        event_role_num = np.asarray(event_role['event_argument_probs']).argmax()
        event_role_rows_y.append(event_role_num)
        if event_role_num != NO_ROLE_CLASS_INDEX:
            event_count += 1

print("Number of event roles:", event_count)

In [None]:
# Assemble the collected per-pair rows into one frame, discarding the
# original (repeated) row labels in favour of a fresh default index.
event_role_rows = pd.DataFrame(event_role_rows_list)
event_role_rows = event_role_rows.reset_index(drop=True)
event_role_rows.head()

In [None]:
import numpy as np

# Convert the collected label indices into a NumPy array (rebinds the same name).
event_role_rows_y = np.asarray(event_role_rows_y)

In [None]:
# Display the shape of the label array as a quick sanity check.
event_role_rows_y.shape

## Step 2: Configure Snorkel

In [None]:
from wsee import ROLE_LABELS
# Show the role label set defined by the wsee package.
print(ROLE_LABELS)


In [None]:
from wsee.labeling.event_argument_role_lfs import lf_event_patterns, lf_event_patterns_general_location

### Apply the labeling functions

In [None]:
from snorkel.labeling import PandasLFApplier

# Labeling functions to run over every trigger-argument row.
lfs = [lf_event_patterns, lf_event_patterns_general_location]

# NOTE(review): despite the `_valid` suffix, event_role_rows is built from sd_train.
applier = PandasLFApplier(lfs=lfs)
L_valid = applier.apply(df=event_role_rows)

In [None]:
from snorkel.labeling import LFAnalysis

# Labels collected in the same loop iteration as the rows behind L_valid,
# so the two are aligned row-for-row.
Y_valid = event_role_rows_y
LFAnalysis(L=L_valid, lfs=lfs).lf_summary(Y=Y_valid)