# Transform the data to work with Snorkel: Part 2 - Event Role

Here we will do most of the work creating a labeling model that assigns labels to argument roles in event mentions.
We need to create a row for each pair of trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [None]:
import sys
sys.path.append("../")
import warnings
from wsee.utils import utils
from wsee.data import pipeline

# Report each matching warning only once to keep the notebook output readable.
warnings.filterwarnings(action='once')

# Corpus directory handed to pipeline.load_data in the loading cell.
DATA_DIR = '/Users/phuc/data/snorkel-daystreamv6-end_loc_fix'  # replace path to corpus

### SD4M Relation / Event Arguments

| Number | Code       | Description                                                                 |
|--------|------------|-----------------------------------------------------------------------------|
| -1     | ABSTAIN    | No vote, for Labeling Functions                                             |
| 0      | location   | Required argument for all events denoting the location.                     |
| 1      | delay      | Optional argument denoting the delay associated with the event.             |
| 2      | direction  | Optional argument denoting the direction associated with the event.         |
| 3      | start_loc  | Optional argument denoting the starting location associated with the event. |
| 4      | end_loc    | Optional argument denoting the ending location associated with the event.   |
| 5      | start_date | Optional argument denoting the start date associated with the event.        |
| 6      | end_date   | Optional argument denoting the end date associated with the event.          |
| 7      | cause      | Optional argument (trigger) denoting the cause associated with the event.   |
| 8      | jam_length | Optional argument denoting the jam length of a traffic jam event.           |
| 9      | route      | Optional argument denoting the route affected by a canceled stop event.     |
| 10     | no_arg     | No argument relation with the specified trigger.                            |

In [None]:
# Load the whole corpus from DATA_DIR once, then pull the individual splits
# out of the returned mapping by key.
loaded_data = pipeline.load_data(DATA_DIR)
sd_train, sd_dev, sd_test = (
    loaded_data[split_name] for split_name in ('train', 'dev', 'test')
)

# The Daystream split is handled separately; it is only used from Step 6 on.
daystream = loaded_data['daystream']

In [None]:
# Peek at the first rows of the training split (displayed as the cell output).
sd_train.head()

## Step 1: Create one row for each trigger-entity pair (event role)

In [None]:
# Debug switch: when True, the two example-building cells that follow use a
# fixed random subset (150 rows of sd_train, 10 rows of sd_dev) instead of the
# full splits.
SAMPLE = False

In [None]:
# Build one example per trigger-argument pair from the SD4M training split.
# NOTE: the results are named df_dev / Y_dev although they are built from sd_train.
# With SAMPLE set, only a reproducible random subset of 150 rows is used.
dev_source = sd_train.sample(n=150, random_state=42) if SAMPLE else sd_train
df_dev, Y_dev = pipeline.build_event_role_examples(dev_source)

In [None]:
# Build one example per trigger-argument pair from the SD4M dev split.
# NOTE: the results are named df_test / Y_test although they are built from sd_dev.
# With SAMPLE set, only a reproducible random subset of 10 rows is used.
test_source = sd_dev.sample(n=10, random_state=42) if SAMPLE else sd_dev
df_test, Y_test = pipeline.build_event_role_examples(test_source)

In [None]:
from wsee import ROLE_LABELS
# Show the role label names; the per-class count cell in Step 2 uses each
# name's position in this sequence as the numeric label id.
print(ROLE_LABELS)

## Step 2: Explore the data

In [None]:
from wsee.preprocessors.preprocessors import *
from wsee.data import explore

We can apply all our preprocessors on our data and see if we can find something interesting for our labeling functions. Let's first sample the SD4M training data, which is labeled.

In [None]:
# Assemble the exploration frame: attach the gold labels, run the two
# "between" preprocessors, then add the event type and event argument role
# columns, in the same order as before.
role_preprocessors = [pre_between_tokens, pre_between_distance]
labeled_sd4m_roles = explore.apply_preprocessors(
    explore.add_labels(df_dev, Y_dev),
    role_preprocessors,
)
labeled_sd4m_roles = explore.add_event_arg_roles(
    explore.add_event_types(labeled_sd4m_roles)
)

Let's first take a look at the trigger and argument text, and the entity types!

In [None]:
import pandas as pd

# Never truncate cell contents when displaying DataFrames, so full texts stay
# readable. ``None`` is the supported "no limit" value; the former ``-1`` was
# deprecated in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# explore.sample_data(labeled_sd4m_roles[labeled_sd4m_roles['label']==6], columns=['text', 'between_tokens', 'trigger', 'argument', 'between_distance', 'label', 'event_types', 'event_arg_roles'])

Now we can count the instances per class, as a first step towards collecting the most frequent trigger-argument pairs per class.

In [None]:
# NOTE: n, filtered_sd4m_roles and class_pairs are set up here but not used
# by the rest of this cell.
n = 100
label_column = labeled_sd4m_roles['label']
# Every event role except class 10 (no_arg in the label table).
filtered_sd4m_roles = labeled_sd4m_roles[label_column != 10]
class_pairs = {}

total_roles = len(labeled_sd4m_roles)
print(f"Number of event-roles: {total_roles}\n")

# Report how many rows carry each label id; the id is the position of the
# class name in ROLE_LABELS.
for idx, class_name in enumerate(ROLE_LABELS):
    class_sd4m_roles = labeled_sd4m_roles[label_column == idx]
    class_size = len(class_sd4m_roles)
    print(f"{class_name}: {class_size} instances")

Only checking the argument text probably does not give us much, but it shall serve as an example.

## Step 3: Evaluate the labeling functions on the SD4M training data

In [None]:
from wsee.labeling.event_argument_role_lfs import *

### Apply the labeling functions

In [None]:
from snorkel.labeling import PandasLFApplier

# Labeling functions to apply to every trigger-argument row.
# NOTE: the order of this list matters. Later cells address individual
# labeling functions by position (the `lf_index` arguments in the error
# analysis and the column index into the label matrix in Step 7), so
# inserting or reordering entries silently changes what those cells inspect.
lfs = [
    lf_location_same_sentence_is_event,
    lf_location_same_sentence_nearest_is_event,
    lf_location_chained,
    lf_location_adjacent_markers,
    lf_location_beginning_street_stop_route,
    lf_location_first_sentence,
    lf_location_first_sentence_nearest,
    lf_location_first_sentence_street_stop_route,
    lf_location_first_sentence_priorities,
    lf_delay_event_sentence,
    lf_delay_event_sentence_check,
    lf_direction_type,
    lf_direction_order,
    lf_start_location_type,
    lf_start_location_nearest,
    lf_end_location_type,
    lf_end_location_nearest,
    lf_start_date_type,
    lf_start_date_first,
    lf_start_date_adjacent,
    lf_end_date_type,
    lf_cause_type,
    lf_cause_order,
    lf_cause_gaz_file,
    lf_distance_type,
    lf_distance_nearest,
    lf_route_type,
    lf_route_type_order,
    lf_not_an_event,
    lf_somajo_separate_sentence,
    lf_overlapping,
    lf_too_far_40,
    lf_multiple_same_event_type,
    lf_event_patterns,
    lf_event_patterns_general_location
]

# One applier is shared by the SD4M cells and the Daystream cells.
applier = PandasLFApplier(lfs)

In [None]:
# Run all labeling functions over both SD4M example frames, df_dev first and
# df_test second, to get their label matrices.
L_dev, L_test = (applier.apply(frame) for frame in (df_dev, df_test))

In [None]:
from snorkel.labeling import LFAnalysis

# Per-labeling-function summary on the SD4M examples; passing the gold labels
# Y_dev lets the summary compare each labeling function's votes with them.
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

## Step 4: Error Analysis

In [None]:
from wsee.labeling import error_analysis

In [None]:
# error_analysis.sample_fp(labeled_df=labeled_sd4m_roles, lf_outputs=L_dev, lf_index=7, label_of_interest=3)[['between_tokens', 'trigger', 'argument', 'somajo_doc', 'label', 'event_types', 'event_arg_roles']]
# Show every false positive of the labeling function at position 11 of `lfs`
# for label 3, with the row limit lifted for this one display only.
fp_columns = ['text', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']
with pd.option_context('display.max_rows', None):  # more options can be specified also
    false_positives = error_analysis.get_false_positives(
        labeled_df=labeled_sd4m_roles,
        lf_outputs=L_dev,
        lf_index=11,
        label_of_interest=3,
    )
    display(false_positives[fp_columns])

In [None]:
# error_analysis.sample_abstained_instances(labeled_df=labeled_sd4m_roles, lf_outputs=L_dev, lf_index=10, label_of_interest=4)[['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']]
# Rows with label 5 on which the labeling function at position 19 of `lfs`
# abstained, reduced to the columns useful for inspection.
abstained_columns = ['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types', 'event_arg_roles']
abstained_rows = error_analysis.get_abstained_instances(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_dev,
    lf_index=19,
    label_of_interest=5,
)
abstained_rows[abstained_columns]

In [None]:
# Sample of rows with label 0 on which the labeling function at position 0 of
# `lfs` abstained.
abstained_sample_columns = ['text', 'between_tokens', 'trigger', 'argument', 'label', 'event_types']
abstained_sample = error_analysis.sample_abstained_instances(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_dev,
    lf_index=0,
    label_of_interest=0,
)
abstained_sample[abstained_sample_columns]

## Step 5: Train the label model and label the data

In [None]:
from snorkel.labeling import LabelModel

# Class prior handed to the label model, one entry per class id 0..10 in the
# order of the label table (location first, no_arg last).
# NOTE(review): presumably estimated from the labeled SD4M data - confirm.
sd4m_class_balance = [
    0.07511483382869495,
    0.010537692515536342,
    0.037017022426371254,
    0.04998649013780059,
    0.0466090245879492,
    0.0045933531477978925,
    0.0054039448797622265,
    0.013915158065387734,
    0.018238313969197513,
    0.0031072683058632803,
    0.735476898135639,
]

# Fit an 11-class label model on the labeling function votes for df_dev.
label_model = LabelModel(cardinality=11, verbose=True)
label_model.fit(
    L_train=L_dev,
    class_balance=sd4m_class_balance,
    n_epochs=500,
    log_freq=100,
    seed=123,
)

In [None]:
# Score the SD4M label model against the gold labels of the test examples,
# breaking ties between classes at random, and report the accuracy.
sd4m_scores = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")
label_model_acc = sd4m_scores["accuracy"]
accuracy_percent = label_model_acc * 100
print(f"{'Label Model Accuracy:':<25} {accuracy_percent:.1f}%")

In [None]:
# Probabilistic labels from the fitted label model for the label matrix L_dev.
probs_train = label_model.predict_proba(L=L_dev)

In [None]:
# Sanity check of the result's dimensions (displayed as the cell output).
probs_train.shape

In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

# Filter the example rows and their probabilistic labels using the label
# matrix L_dev (Snorkel drops rows on which every labeling function abstained).
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_dev, y=probs_train, L=L_dev
)

In [None]:
import os

# Combine the filtered trigger-argument rows with their probabilistic labels.
# NOTE(review): presumably this folds the per-pair rows back into one record
# per document - confirm against pipeline.merge_event_role_examples.
labeled_sd_train = pipeline.merge_event_role_examples(df_train_filtered, probs_train_filtered)

# Write the result as JSON Lines next to the corpus. The path is derived from
# DATA_DIR instead of repeating the directory, so replacing the corpus path at
# the top of the notebook also redirects this output.
sd_roles_path = os.path.join(DATA_DIR, "save_sd_roles.jsonl")
labeled_sd_train.reset_index(level=0).to_json(sd_roles_path, orient='records', lines=True, force_ascii=False)

## Step 6: Label the Daystream data

In [None]:
# Build one example per trigger-argument pair from the Daystream data.
# Y_train is not used by the remaining cells of this notebook.
df_train, Y_train = pipeline.build_event_role_examples(daystream)

In [None]:
# Apply the same labeling functions (same applier) to the Daystream examples.
L_train = applier.apply(df_train)

In [None]:
from snorkel.labeling import LFAnalysis
# Labeling function summary on Daystream; no gold labels are passed here, in
# contrast to the SD4M summary in Step 3.
LFAnalysis(L_train, lfs).lf_summary()

In [None]:
from snorkel.labeling import LabelModel

# Class prior handed to the label model, one entry per class id 0..10 in the
# order of the label table (location first, no_arg last). The values are the
# same ones used for the SD4M label model in Step 5.
daystream_class_balance = [
    0.07511483382869495,
    0.010537692515536342,
    0.037017022426371254,
    0.04998649013780059,
    0.0466090245879492,
    0.0045933531477978925,
    0.0054039448797622265,
    0.013915158065387734,
    0.018238313969197513,
    0.0031072683058632803,
    0.735476898135639,
]

# Fit a separate 11-class label model on the labeling function votes for the
# Daystream examples.
daystream_model = LabelModel(cardinality=11, verbose=True)
daystream_model.fit(
    L_train=L_train,
    class_balance=daystream_class_balance,
    n_epochs=500,
    log_freq=100,
    seed=123,
)

In [None]:
# Score the Daystream-trained label model on the SD4M test examples
# (L_test / Y_test), breaking ties between classes at random.
daystream_scores = daystream_model.score(L=L_test, Y=Y_test, tie_break_policy="random")
daystream_model_acc = daystream_scores["accuracy"]
daystream_accuracy_percent = daystream_model_acc * 100
print(f"{'Label Model Accuracy:':<25} {daystream_accuracy_percent:.1f}%")

In [None]:
# Probabilistic labels from the Daystream label model for the Daystream label matrix.
daystream_probs = daystream_model.predict_proba(L=L_train)

In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

# Filter the Daystream rows and their probabilistic labels using L_train.
# NOTE: this rebinds df_train_filtered / probs_train_filtered, which held the
# SD4M results from Step 5.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=daystream_probs, L=L_train
)

In [None]:
import os

# Combine the filtered Daystream trigger-argument rows with their
# probabilistic labels.
# NOTE(review): presumably this folds the per-pair rows back into one record
# per document - confirm against pipeline.merge_event_role_examples.
labeled_daystream = pipeline.merge_event_role_examples(df_train_filtered, probs_train_filtered)

# Write the result as JSON Lines next to the corpus. The path is derived from
# DATA_DIR instead of repeating the directory, so replacing the corpus path at
# the top of the notebook also redirects this output.
daystream_roles_path = os.path.join(DATA_DIR, "save_daystreamv6_roles.jsonl")
labeled_daystream.reset_index(level=0).to_json(daystream_roles_path, orient='records', lines=True, force_ascii=False)

## Step 7: Daystream Snorkel Labeling Check

In [None]:
# Spot check: 10 reproducibly sampled Daystream rows for which column 3 of the
# label matrix equals `delay`.
# NOTE(review): column 3 should correspond to lfs[3], which is
# lf_location_adjacent_markers in the list given to the applier, not a delay
# labeling function. `delay` is not defined in this notebook either; presumably
# it comes from one of the star imports. Confirm the column index is the
# intended one, since sample(10) fails when fewer than 10 rows match.
df_train.iloc[L_train[:, 3] == delay].sample(10, random_state=42)[['text', 'trigger', 'argument']]