# Transform the data to work with Snorkel: Part 2 - Event Role

Here we will do most of the work creating a labeling model that assigns labels to argument roles in event mentions.
We need to create a row for each pair of trigger and entity mention.

For this we need to create 2 additional columns:
- trigger_id
- argument_id

Everything else we can pull from the other columns using Snorkel preprocessor functions.

In [None]:
import sys
sys.path.append("../")
from wsee.utils import utils
from wsee.data.pipeline import load_data, build_event_role_examples

# Location of the corpus on disk; replace with the path on your machine.
DATA_DIR = '/Users/phuc/data/snorkel-daystreamv5'

# Pick the file-name suffix: the "_and_defaults" variant when use_defaults is
# switched on, the plain "_with_events" variant otherwise.
use_defaults = True
if use_defaults:
    suffix = '_with_events_and_defaults.jsonl'
else:
    suffix = '_with_events.jsonl'

In [None]:
# load_data returns a mapping that is indexed by split name below.
loaded_data = load_data(DATA_DIR)

# Bind the three splits to their own names for the following cells.
sd_train, sd_dev, sd_test = (
    loaded_data[split] for split in ('train', 'dev', 'test')
)

# The 'daystream' entry is kept under its own name as well.
daystream = loaded_data['daystream']

In [None]:
# Preview the first rows of the training split as a quick sanity check.
sd_train.head()

Example .jsonl file
```json
{
  "id": "754201930264633344",
  "text": "■ #A1 #Bremen Richtung #Hamburg zwischen Horster Dreieck und #Stillhorn 9 km #Stau.  Dort ist wegen #Bauarbeiten nur eine Spur frei.\n",
  "entities": [
    {
      "id": "c/82bf4c32-861d-4e09-b8d1-bf7adc488f2b",
      "text": "#A1",
      "entity_type": "location_street",
      "start": 1,
      "end": 2,
      "char_start": 2,
      "char_end": 5
    },
    ...
  ],
  "event_triggers": [
    {
      "id": "c/3958da47-7b47-414f-8210-5b2c487de9df",
      "event_type_probs": [ 0.0, ..., 1.0, 0.0 ]
    }
  ],
  "event_roles": [
    {
      "trigger": "c/3958da47-7b47-414f-8210-5b2c487de9df",
      "argument": "c/82bf4c32-861d-4e09-b8d1-bf7adc488f2b",
      "event_argument_probs": [ 1.0, 0.0, ..., 0.0 ]
    },
    ...
  ]
}
```

## Step 1: Create one row for each trigger-entity pair (event role)

In [None]:
# Build the event-role examples and their labels from the training split.
# NOTE(review): per the heading above, presumably one row per trigger-entity pair — confirm in build_event_role_examples.
event_role_rows, event_role_rows_y = build_event_role_examples(sd_train)

In [None]:
# Inspect the dimensions of the training labels.
event_role_rows_y.shape

In [None]:
# Build the evaluation examples and their labels.
# NOTE(review): these "test" variables are built from the dev split (sd_dev), not from sd_test — confirm this is intended.
df_test, Y_test = build_event_role_examples(sd_dev)

In [None]:
from wsee import ROLE_LABELS
# Show the role class names; the exploration loop in Step 2 matches a row's integer label to the index in this list.
print(ROLE_LABELS)

## Step 2: Explore the data

In [None]:
from wsee.preprocessors.preprocessors import *
from wsee.data import explore, pipeline

We can apply all our preprocessors to our data and look for patterns that are useful for our labeling functions. Let's first sample the SD4M training data, which is labeled.

In [None]:
# Combine the training rows with their labels; later cells read the result's 'label' column.
labeled_sd4m_roles = explore.add_labels(event_role_rows, event_role_rows_y)

In [None]:
# Run the trigger, argument and NER preprocessors over the labeled rows so
# that their outputs can be inspected in the following cells.
labeled_sd4m_roles = explore.apply_preprocessors(
    labeled_sd4m_roles,
    [
        get_trigger,
        get_trigger_text,
        get_argument,
        get_argument_text,
        get_mixed_ner,
    ],
)

Let's first take a look at the trigger and argument text, and the entity types!

In [None]:
# Show 10 randomly drawn rows whose label is not 10, restricted to the text
# columns and the label.
# NOTE(review): label 10 is presumably the "no role" class — confirm against ROLE_LABELS.
(
    labeled_sd4m_roles[labeled_sd4m_roles['label'] != 10]
    .sample(10)
    [['text', 'trigger_text', 'argument_text', 'label']]
)

Now we can collect the most frequent trigger-argument pairs per class.

In [None]:
from collections import Counter

# Number of most frequent (trigger text, argument text) pairs to keep per class.
n = 100
# Rows whose label is not 10.
filtered_sd4m_roles = labeled_sd4m_roles[labeled_sd4m_roles['label'] != 10]
# class name -> (trigger_text column, argument_text column) of that class's rows.
class_pairs = {}
# class name -> up to n entries of ((trigger_text, argument_text), count),
# ordered from most to least frequent.
# NOTE(review): assumes both text columns hold hashable values (strings) — confirm.
most_frequent_pairs = {}
print(f"Number of event-roles: {len(labeled_sd4m_roles)}\n")
for idx, class_name in enumerate(ROLE_LABELS):
    # A row belongs to a class when its label equals the class's index in ROLE_LABELS.
    class_sd4m_roles = labeled_sd4m_roles[labeled_sd4m_roles['label'] == idx]
    print(f"{class_name}: {len(class_sd4m_roles)} instances")
    class_pairs[class_name] = (class_sd4m_roles['trigger_text'], class_sd4m_roles['argument_text'])
    # Count identical trigger/argument text combinations within this class.
    pair_counts = Counter(
        zip(class_sd4m_roles['trigger_text'], class_sd4m_roles['argument_text'])
    )
    most_frequent_pairs[class_name] = pair_counts.most_common(n)

## Step 3: Evaluate the labeling functions on the SD4M training data

In [None]:
from wsee.labeling.event_argument_role_lfs import lf_event_patterns, lf_event_patterns_general_location, lf_date_type, lf_stanford_separate_sentence, lf_spacy_separate_sentence

### Apply the labeling functions

In [None]:
from snorkel.labeling import PandasLFApplier

# Labeling functions that are active for this run; the commented-out entries
# are imported in the cell above but currently disabled.
lfs = [
    lf_event_patterns,
    lf_event_patterns_general_location,
    #lf_date_type,
    #lf_stanford_separate_sentence,
    #lf_spacy_separate_sentence
]

# Apply every active labeling function to each training row.
# NOTE: despite its name, L_valid holds the labeling-function output for the training rows (event_role_rows).
applier = PandasLFApplier(lfs)
L_valid = applier.apply(event_role_rows)

In [None]:
# Apply the same labeling functions to the evaluation rows (df_test).
L_test = applier.apply(df_test)

In [None]:
from snorkel.labeling import LFAnalysis

# Y_valid is an alias for the training labels, i.e. the labels of the rows L_valid was computed from.
Y_valid = event_role_rows_y
# Summarize the labeling functions on the training rows; the labels are passed so the summary can be related to them.
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

In [None]:
# Same labeling-function summary, this time for the evaluation rows and labels.
LFAnalysis(L_test, lfs).lf_summary(Y_test)

## Step 4: Train the label model and label the data

In [None]:
from snorkel.labeling import LabelModel

# One class per entry in ROLE_LABELS. The labels in this notebook are indices
# into ROLE_LABELS and include the value 10 (rows are filtered with
# `label != 10` during exploration), so a hard-coded cardinality of 10 would
# leave the highest label out of range.
label_model = LabelModel(cardinality=len(ROLE_LABELS), verbose=True)
# Fit on the labeling-function output for the training rows with a fixed seed.
label_model.fit(L_train=L_valid, n_epochs=500, log_freq=100, seed=123)

In [None]:
# Score the label model on the evaluation rows and keep only the accuracy entry.
label_model_acc = label_model.score(
    L=L_test,
    Y=Y_test,
    tie_break_policy="random",
)["accuracy"]
# Report the accuracy as a percentage with one decimal place.
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

In [None]:
# Per-class probabilities for the training rows, predicted by the fitted label model.
probs_train = label_model.predict_proba(L=L_valid)

In [None]:
# Inspect the dimensions of the predicted probabilities.
probs_train.shape

In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

# Drop the training rows that received no labeling-function vote, together with
# their predicted probabilities. X has to be the dataframe that L_valid was
# computed from (event_role_rows) so that rows, probabilities and votes stay
# aligned.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=event_role_rows, y=probs_train, L=L_valid
)

In [None]:
# Display the training rows that remain after filtering.
df_train_filtered