In [None]:
%matplotlib inline

In [None]:
import sys
sys.path.append("../../")

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.metrics

from wsee.predictors.predictor_utils import load_predictor
from wsee.models.model_utils import batched_predict_json
from wsee.utils import scorer, evaluate
from wsee import SD4M_RELATION_TYPES, ROLE_LABELS

In [None]:
# Device passed to load_predictor. -1 is used when no GPU is available;
# presumably a non-negative value selects that GPU index — confirm against load_predictor.
CUDA_DEVICE = -1

In [None]:
# Change paths if your data lives somewhere else.
# The dataset path is relative to the notebook directory (the repository root is two levels up,
# matching the sys.path entry added at the top), so the notebook runs on any checkout
# instead of only on one developer's machine.
DATASET_PATH = "../../data/daystream_corpus/test/test_sd4m_with_events.jsonl"
# Name under which the predictor is looked up by load_predictor.
PREDICTOR_NAME = "snorkel-eventx-predictor"

## Load the test data, models and do predictions

Documents that do not contain any entity of type trigger are filtered out, as they are not relevant for the event extraction task.

In [None]:
# Load the test documents; they are used both as model input and as the gold reference below.
# NOTE(review): the text above says documents without trigger entities are filtered out —
# presumably load_test_data does that; confirm in wsee.utils.evaluate.
test_docs = evaluate.load_test_data(DATASET_PATH)

In [None]:
# Directory that holds one sub-directory per trained model, relative to the notebook
# directory so that the notebook does not depend on one developer's home directory.
# Kept as a string with a trailing slash: the prediction cell appends the model name to it.
model_base_path = "../../data/runs/"
# Names of the model sub-directories to evaluate.
model_names = [
    "snorkel_bert_gold",
    "snorkel_bert_daystream",
    "snorkel_bert_merged"
]

In [None]:
# Run every trained model on the test documents and keep its predictions keyed by model name.
predicted_docs = {}
for model_name in model_names:
    # Build the model directory with pathlib instead of string concatenation: plain `+`
    # silently produces a wrong path when the base path lacks a trailing slash and raises a
    # TypeError once model_base_path has been converted to a Path (a later cell does that),
    # which made this cell fail when re-run.
    model_dir = str(Path(model_base_path) / model_name)
    predictor = load_predictor(model_dir, PREDICTOR_NAME, CUDA_DEVICE)
    predicted_docs[model_name] = batched_predict_json(predictor=predictor, examples=test_docs)

## Conventional evaluation using sklearn toolkit
To be able to use the sklearn toolkit, we simply compare the trigger and role label sequences. When evaluating the argument roles, the correctness of the corresponding trigger label is disregarded.

In [None]:
# Flat label sequences for the sklearn metrics: the predictions are stored per model,
# while the gold sequences are rebound on every pass from the same test documents.
trigger_y_pred = {}
arg_y_pred = {}
for model_name in model_names:
    label_arrays = evaluate.get_label_arrays(test_docs, predicted_docs[model_name])
    trigger_y_true = label_arrays["trigger_y_true"]
    trigger_y_pred[model_name] = label_arrays["trigger_y_pred"]
    arg_y_true = label_arrays["arg_y_true"]
    arg_y_pred[model_name] = label_arrays["arg_y_pred"]

### Trigger classification

#### Model trained with SD4M gold training data

In [None]:
# Per-class trigger report for the model trained on the gold data.
# `labels` is passed by keyword: current scikit-learn accepts only y_true and y_pred
# positionally. The last entry of SD4M_RELATION_TYPES is left out of the report.
print(sklearn.metrics.classification_report(
    trigger_y_true,
    trigger_y_pred["snorkel_bert_gold"],
    labels=SD4M_RELATION_TYPES[:-1],
))

In [None]:
# Per-class trigger report for the "snorkel_bert_daystream" model.
# `labels` is passed by keyword: current scikit-learn accepts only y_true and y_pred
# positionally. The last entry of SD4M_RELATION_TYPES is left out of the report.
print(sklearn.metrics.classification_report(
    trigger_y_true,
    trigger_y_pred["snorkel_bert_daystream"],
    labels=SD4M_RELATION_TYPES[:-1],
))

In [None]:
# Per-class trigger report for the "snorkel_bert_merged" model.
# `labels` is passed by keyword: current scikit-learn accepts only y_true and y_pred
# positionally. The last entry of SD4M_RELATION_TYPES is left out of the report.
print(sklearn.metrics.classification_report(
    trigger_y_true,
    trigger_y_pred["snorkel_bert_merged"],
    labels=SD4M_RELATION_TYPES[:-1],
))

In [None]:
# One row-normalized confusion-matrix heatmap per model, drawn side by side.
fig, axes = plt.subplots(1, 3, figsize=(30, 7))
fig.suptitle('Normalized Confusion Matrices - Trigger classification')
for idx, model_name in enumerate(model_names):
    ax = axes[idx]
    counts = sklearn.metrics.confusion_matrix(
        trigger_y_true,
        trigger_y_pred[model_name],
        labels=SD4M_RELATION_TYPES,
    )
    # Divide every row by its own total (rows without any instance divide by zero).
    row_totals = counts.sum(axis=1, keepdims=True)
    normalized = counts.astype('float') / row_totals
    heatmap_frame = pd.DataFrame(normalized, index=SD4M_RELATION_TYPES, columns=SD4M_RELATION_TYPES)
    sns.heatmap(heatmap_frame, ax=ax, annot=True)
    ax.set_title(model_name)

### Argument role classification

In [None]:
# Per-class argument role report for the "snorkel_bert_gold" model.
# `labels` is passed by keyword: current scikit-learn accepts only y_true and y_pred
# positionally. The last entry of ROLE_LABELS is left out of the report.
print(sklearn.metrics.classification_report(
    arg_y_true,
    arg_y_pred["snorkel_bert_gold"],
    labels=ROLE_LABELS[:-1],
))

In [None]:
# Per-class argument role report for the "snorkel_bert_daystream" model.
# `labels` is passed by keyword: current scikit-learn accepts only y_true and y_pred
# positionally. The last entry of ROLE_LABELS is left out of the report.
print(sklearn.metrics.classification_report(
    arg_y_true,
    arg_y_pred["snorkel_bert_daystream"],
    labels=ROLE_LABELS[:-1],
))

In [None]:
# Per-class argument role report for the "snorkel_bert_merged" model.
# `labels` is passed by keyword: current scikit-learn accepts only y_true and y_pred
# positionally. The last entry of ROLE_LABELS is left out of the report.
print(sklearn.metrics.classification_report(
    arg_y_true,
    arg_y_pred["snorkel_bert_merged"],
    labels=ROLE_LABELS[:-1],
))

In [None]:
# One row-normalized confusion-matrix heatmap per model, drawn side by side.
fig, axes = plt.subplots(1, 3, figsize=(30, 7))
fig.suptitle('Normalized Confusion Matrices - Argument classification')
for idx, model_name in enumerate(model_names):
    ax = axes[idx]
    counts = sklearn.metrics.confusion_matrix(
        arg_y_true,
        arg_y_pred[model_name],
        labels=ROLE_LABELS,
    )
    # Divide every row by its own total (rows without any instance divide by zero).
    row_totals = counts.sum(axis=1, keepdims=True)
    normalized = counts.astype('float') / row_totals
    heatmap_frame = pd.DataFrame(normalized, index=ROLE_LABELS, columns=ROLE_LABELS)
    sns.heatmap(heatmap_frame, ax=ax, annot=True)
    ax.set_title(model_name)

## Event extraction evaluation using correctness criteria defined by Ji, Heng and Grishman, Ralph 2008

Ji, Heng and Grishman, Ralph (2008). Refining event extraction through cross-document inference.
> - A trigger is correctly labeled if its event type and offsets match a reference trigger.
> - An argument is correctly identified if its event type and offsets match any of the reference argument mentions.
> - An argument is correctly identified and classified if its event type, offsets, and role match any of the reference argument mentions.

The main difference to the previous evaluation method lies in the correctness criteria for the arguments. Here we additionally consider the correctness of the event type for the argument.

Caution:
Using the following methods to retrieve the triggers and arguments from the gold data might result in duplicate gold triggers & arguments.
This is due to different events possibly sharing the same trigger.
The model is not able to distinguish such events and instead fuses them all together, which results in lower recall.
If we remove duplicates from the gold triggers and gold arguments, recall and consequently f1 should be higher.

In [None]:
# Whether duplicate gold triggers/arguments are removed before scoring.
# Change to False if you want to keep the duplicates from the gold data that are caused
# by events sharing the same trigger.
REMOVE_DUPLICATES = True

In [None]:
# Gold reference: triggers and arguments extracted from the unmodified test documents.
gold_triggers = scorer.get_triggers(test_docs)
gold_arguments = scorer.get_arguments(test_docs)

In [None]:
if REMOVE_DUPLICATES:
    # dict.fromkeys drops repeated entries like set() does, but keeps the first-seen order,
    # so the resulting lists are identical across kernel sessions instead of depending on
    # hash ordering.
    gold_triggers = list(dict.fromkeys(gold_triggers))
    gold_arguments = list(dict.fromkeys(gold_arguments))

In [None]:
# Predicted triggers and arguments per model, extracted the same way as the gold ones.
pred_triggers = {
    name: scorer.get_triggers(predicted_docs[name])
    for name in model_names
}
pred_arguments = {
    name: scorer.get_arguments(predicted_docs[name])
    for name in model_names
}

In [None]:
# Trigger identification metrics for every model, each preceded by the model name.
# The result is rebound on every iteration, so only the last model's value stays in
# trigger_id_metrics afterwards.
# NOTE(review): output_string=True presumably makes the scorer emit a formatted report —
# confirm in wsee.utils.scorer.
for model_name in model_names:
    print(model_name)
    trigger_id_metrics = scorer.get_trigger_identification_metrics(gold_triggers, pred_triggers[model_name], output_string=True)
    print('\n')

In [None]:
# Trigger classification metrics for every model, each preceded by the model name.
# The result is rebound on every iteration, so only the last model's value stays in
# trigger_class_metrics afterwards.
# NOTE(review): output_string=True presumably makes the scorer emit a formatted report —
# confirm in wsee.utils.scorer.
for model_name in model_names:
    print(model_name)
    trigger_class_metrics = scorer.get_trigger_classification_metrics(gold_triggers, pred_triggers[model_name], output_string=True)
    print('\n')

In [None]:
# Argument identification metrics for every model, each preceded by the model name.
# The result is rebound on every iteration, so only the last model's value stays in
# argument_id_metrics afterwards.
# NOTE(review): output_string=True presumably makes the scorer emit a formatted report —
# confirm in wsee.utils.scorer.
for model_name in model_names:
    print(model_name)
    argument_id_metrics = scorer.get_argument_identification_metrics(gold_arguments, pred_arguments[model_name], output_string=True)
    print('\n')

In [None]:
# Argument classification metrics for every model, each preceded by the model name.
# The result is rebound on every iteration, so only the last model's value stays in
# argument_class_metrics afterwards.
# NOTE(review): output_string=True presumably makes the scorer emit a formatted report —
# confirm in wsee.utils.scorer.
for model_name in model_names:
    print(model_name)
    argument_class_metrics = scorer.get_argument_classification_metrics(gold_arguments, pred_arguments[model_name], output_string=True)
    print('\n')

### 5 Random Repeats
5 random repeats for each configuration with random seeds for the snorkel label models and eventx model.
Metrics are given as median & standard deviation.

In [None]:
from pathlib import Path

In [None]:
# Summarize the repeated runs of every model configuration.
model_base_path = Path(model_base_path)
runs = 5
trigger_metrics = {}
argument_metrics = {}
for model_name in model_names:
    # The repeats are stored in run01 ... run0<runs> below the base path, one model directory each.
    model_paths = [
        model_base_path / f'run0{run_number}' / model_name
        for run_number in range(1, runs + 1)
    ]
    summary = evaluate.summize_multiple_runs(model_paths, test_docs)
    trigger_metrics[model_name], argument_metrics[model_name] = summary
    print(model_name)
    print(pd.DataFrame(trigger_metrics[model_name]))
    print(pd.DataFrame(argument_metrics[model_name]))
    print('\n')

### Scalability of snorkel labeled data
Compare model performance with increasing amount of snorkel labeled data

In [None]:
# Model directories of the scalability series: one name per suffix 50, 60, ..., 100,
# followed by the un-suffixed daystream model. This rebinds model_names for the cells below.
model_names = [
    *(f'snorkel_bert_daystream{percentage}' for percentage in range(50, 101, 10)),
    'snorkel_bert_daystream',
]

In [None]:
# Predict with every model of the scalability series and collect the scorer metrics per model.
pred_triggers = {}
pred_arguments = {}
trigger_id_metrics = {}
trigger_class_metrics = {}
argument_id_metrics = {}
argument_class_metrics = {}
for model_name in model_names:
    # Wrapping the base path in Path() makes this cell work whether model_base_path is still
    # the plain string from the configuration cell or has already been converted to a Path
    # by the multiple-runs cell.
    model_dir = Path(model_base_path).joinpath(model_name)
    predictor = load_predictor(model_dir, PREDICTOR_NAME, CUDA_DEVICE)
    predicted_docs[model_name] = batched_predict_json(predictor=predictor, examples=test_docs)
    pred_triggers[model_name] = scorer.get_triggers(predicted_docs[model_name])
    pred_arguments[model_name] = scorer.get_arguments(predicted_docs[model_name])
    print(model_name)
    # output_string=False: the returned metrics are stored per model for the progression table.
    trigger_id_metrics[model_name] = scorer.get_trigger_identification_metrics(gold_triggers, pred_triggers[model_name], output_string=False)
    trigger_class_metrics[model_name] = scorer.get_trigger_classification_metrics(gold_triggers, pred_triggers[model_name], output_string=False)
    argument_id_metrics[model_name] = scorer.get_argument_identification_metrics(gold_arguments, pred_arguments[model_name], output_string=False)
    argument_class_metrics[model_name] = scorer.get_argument_classification_metrics(gold_arguments, pred_arguments[model_name], output_string=False)
    print('\n')

In [None]:
# Show the collected trigger identification metrics to inspect their structure.
trigger_id_metrics

In [None]:
def get_infos(metrics, metric_name):
    """Collect the F1 score of one metric for every model into a single-column table.

    Args:
        metrics: Mapping from model name to that model's metrics; every value must
            provide ``value[metric_name]['f1-score']``.
        metric_name: Key of the metric to extract; also used as the column name.

    Returns:
        pandas.DataFrame indexed by ``'Model'`` with one column named ``metric_name``.
        An empty ``metrics`` mapping yields an empty frame with the same layout.

    Raises:
        KeyError: If a model's (dict-like) metrics lack ``metric_name`` or its
            ``'f1-score'`` entry.
    """
    rows = [
        {'Model': model, metric_name: model_metrics[metric_name]['f1-score']}
        for model, model_metrics in metrics.items()
    ]
    # Naming the columns explicitly keeps set_index from failing when there are no rows.
    return pd.DataFrame(rows, columns=['Model', metric_name]).set_index('Model')

In [None]:
# Start the progression table with the trigger identification F1 score, indexed by model name.
progression_table = get_infos(trigger_id_metrics, 'Trigger identification')

In [None]:
# Add the trigger classification F1 score as a further column (joined on the model name).
# Dropping a previously merged column first keeps the cell idempotent: without it, running
# the cell a second time would leave suffixed duplicates ("..._x" / "..._y") in the table.
progression_table = (
    progression_table
    .drop(columns='Trigger classification', errors='ignore')
    .merge(
        get_infos(trigger_class_metrics, 'Trigger classification'),
        left_index=True,
        right_index=True,
    )
)

In [None]:
# Add the argument identification F1 score as a further column (joined on the model name).
# Dropping a previously merged column first keeps the cell idempotent: without it, running
# the cell a second time would leave suffixed duplicates ("..._x" / "..._y") in the table.
progression_table = (
    progression_table
    .drop(columns='Argument identification', errors='ignore')
    .merge(
        get_infos(argument_id_metrics, 'Argument identification'),
        left_index=True,
        right_index=True,
    )
)

In [None]:
# Add the argument classification F1 score as a further column (joined on the model name).
# Dropping a previously merged column first keeps the cell idempotent: without it, running
# the cell a second time would leave suffixed duplicates ("..._x" / "..._y") in the table.
progression_table = (
    progression_table
    .drop(columns='Argument classification', errors='ignore')
    .merge(
        get_infos(argument_class_metrics, 'Argument classification'),
        left_index=True,
        right_index=True,
    )
)

In [None]:
# Line plot with one line per metric column and the models along the x-axis.
# The title and y-axis label let the figure stand on its own; the values are the F1 scores
# collected by get_infos. The trailing semicolon suppresses the text repr of the label.
ax = progression_table.plot(title='F1 score per model trained with increasing amount of snorkel labeled data')
ax.set_ylabel('F1 score');