In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Add the parent directory to the module search path
# (presumably so the `tacrev` package imported below resolves -- confirm repo layout).
sys.path.append("../")

In [2]:
from typing import List, Optional

from collections import Counter

from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
# Notebook display options: show every column, keep wide frames on a single
# line, and never truncate cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` is the supported way to disable truncation; the former `-1` sentinel
# was deprecated in pandas 1.0 and is rejected by current releases. The fully
# qualified option name avoids relying on option-name pattern matching.
pd.set_option('display.max_colwidth', None)

from tacrev.analysis import true_pred_labels_from_dataframe
from tacrev.readers.tacred import load_tacred
from tacrev.readers.evaluation_results import load_evaluation_results
from tacrev.writers.writer_utils import results_as_dataframe, documents_as_dataframe

  # No script entry point is needed here: the notebook is executed cell by cell.


In [3]:
def strip_end(text, suffixes):
    """Cut trailing suffixes off ``text``.

    Each entry of ``suffixes`` is tried exactly once, in the order given, against
    the current (possibly already shortened) text; every entry the text ends
    with is removed. Returns the resulting string.
    """
    for ending in suffixes:
        if text.endswith(ending):
            # Slice by explicit length so an empty suffix leaves the text intact.
            text = text[:len(text) - len(ending)]
    return text


def ensemble_p_r_f1_score_as_df(dataframe, true_label_col, ignore_models: Optional[List[str]] = None, ignore_label="no_relation", average="micro", sample_weight=None):
    """Score a majority-vote ensemble of the per-example model predictions.

    For every row, the predictions stored in the ``model_pred`` mapping
    (model name -> predicted label) are counted and the most frequent label
    becomes the ensemble prediction. Ties go to the label encountered first
    in the mapping.

    Args:
        dataframe: frame with a ``model_pred`` column of dicts and the column
            named by ``true_label_col``.
        true_label_col: name of the column holding the reference labels.
        ignore_models: substrings; any model whose lower-cased name contains
            one of them (compared case-insensitively) is left out of the vote.
        ignore_label: label removed from the label set handed to the scorer.
        average: forwarded to ``precision_recall_fscore_support``.
        sample_weight: forwarded to ``precision_recall_fscore_support``.

    Returns:
        Tuple ``(precision, recall, f1)``.

    Raises:
        IndexError: if a row has no prediction left after filtering.
    """
    # Model names are lower-cased before matching, so lower-case the filters
    # too; otherwise an entry such as "TRE" could never match.
    ignored = [s.lower() for s in (ignore_models or [])]

    true_labels = dataframe[true_label_col]

    pred_labels = []
    for _, row in dataframe.iterrows():
        votes = Counter(
            pred
            for model_name, pred in row["model_pred"].items()
            if not any(s in model_name.lower() for s in ignored)
        )
        pred_labels.append(votes.most_common(1)[0][0])

    unique_labels = list(set(true_labels) | set(pred_labels))

    if ignore_label and ignore_label in unique_labels:
        unique_labels.remove(ignore_label)

    prec, rec, f1, _ = precision_recall_fscore_support(true_labels,
                                                       pred_labels,
                                                       labels=unique_labels,
                                                       average=average,
                                                       sample_weight=sample_weight)

    return prec, rec, f1


def p_r_f1_scores_as_df(dataframe, models, true_label_col, ignore_label="no_relation", average="micro", sample_weight=None):
    """Compute precision, recall and F1 for each model in ``models``.

    Label extraction is delegated to ``true_pred_labels_from_dataframe``, whose
    result is unpacked as (true labels, predicted labels, label set); these
    are scored with ``precision_recall_fscore_support`` using the given
    ``average`` and ``sample_weight``.

    Returns:
        Three lists ``(precisions, recalls, f1s)``, each in the order of
        ``models``.
    """
    per_model = []
    for name in models:
        gold, predicted, label_set = true_pred_labels_from_dataframe(dataframe,
                                                                     ignore_label,
                                                                     true_label_col,
                                                                     name)
        scores = precision_recall_fscore_support(gold,
                                                 predicted,
                                                 labels=label_set,
                                                 average=average,
                                                 sample_weight=sample_weight)
        p, r, f, _support = scores
        per_model.append((p, r, f))

    precisions = [entry[0] for entry in per_model]
    recalls = [entry[1] for entry in per_model]
    f1s = [entry[2] for entry in per_model]
    return precisions, recalls, f1s


def p_r_f1_orig_enhanced_weighted_as_df(dataframe,
                                        models,
                                        split,
                                        model_name_map=None,
                                        to_percentage: bool=True,
                                        average="micro"):
    """Tabulate P/R/F1 per model on original, revised and weighted labels.

    Each example gets a weight equal to the fraction of considered models
    whose prediction differs from ``true_label_reannotated``. Models whose
    lower-cased name contains "tre", "spanbert" or "knowbert" are not
    considered for the weight.

    Args:
        dataframe: frame with ``model_pred`` (dict of model name -> label) and
            ``true_label_reannotated`` columns; it is also passed on to
            ``p_r_f1_scores_as_df``.
        models: model names without the split suffix.
        split: suffix appended as ``"<model>_<split>"`` before scoring.
        model_name_map: optional mapping from model name to display name.
        to_percentage: multiply all scores by 100 when true.
        average: averaging mode forwarded to ``p_r_f1_scores_as_df``.

    Returns:
        DataFrame indexed by "Model" with MultiIndex columns
        (Original | Revised | Weighted) x (P | R | F1).
    """
    if not model_name_map:
        model_name_map = {}

    excluded_tags = ("tre", "spanbert", "knowbert")

    def _error_rate(row):
        # Predictions of every model that is not excluded from the weighting.
        considered = [pred for name, pred in row["model_pred"].items()
                      if not any(tag in name.lower() for tag in excluded_tags)]
        wrong = sum(1 for pred in considered if pred != row["true_label_reannotated"])
        return wrong / float(len(considered))

    sample_weight = [_error_rate(row) for _, row in dataframe.iterrows()]

    suffixed_models = ["{}_{}".format(m, split) for m in models]

    # (reference label column, per-example weights) for each column group.
    settings = [("true_label", None),
                ("true_label_reannotated", None),
                ("true_label_reannotated", sample_weight)]

    score_rows = []
    for label_col, weights in settings:
        score_rows.extend(p_r_f1_scores_as_df(dataframe, suffixed_models,
                                              true_label_col=label_col,
                                              ignore_label="no_relation",
                                              average=average,
                                              sample_weight=weights))

    table = pd.DataFrame(score_rows).T
    table["Model"] = [model_name_map.get(m, m) for m in models]
    table = table.set_index("Model")
    table.columns = pd.MultiIndex.from_product([["Original", "Revised", "Weighted"], ["P", "R", "F1"]])

    return table * 100 if to_percentage else table


def p_r_f1_orig_enhanced_as_df(dataframe,
                               models,
                               split,
                               model_name_map=None,
                               to_percentage: bool=True,
                               average="micro"):
    """Tabulate P/R/F1 per model on the original and the revised labels.

    Scores come from ``p_r_f1_scores_as_df``, called once with the
    ``true_label`` column and once with ``true_label_reannotated``.

    No per-example sample weights are computed here: this table has no
    weighted columns, so the function does not read ``model_pred`` and cannot
    fail on rows that contain no weightable model.

    Args:
        dataframe: frame passed on to ``p_r_f1_scores_as_df``.
        models: model names without the split suffix.
        split: suffix appended as ``"<model>_<split>"`` before scoring.
        model_name_map: optional mapping from model name to display name;
            unmapped names are shown as-is.
        to_percentage: multiply all scores by 100 when true.
        average: averaging mode forwarded to ``p_r_f1_scores_as_df``.

    Returns:
        DataFrame indexed by "Model" with MultiIndex columns
        (Original | Revised) x (P | R | F1).
    """
    model_name_map = model_name_map or {}

    models_w_split = [f"{m}_{split}" for m in models]

    prec_orig, rec_orig, f1_orig = p_r_f1_scores_as_df(dataframe, models_w_split,
                                                       true_label_col="true_label",
                                                       ignore_label="no_relation",
                                                       average=average)

    prec_enh, rec_enh, f1_enh = p_r_f1_scores_as_df(dataframe, models_w_split,
                                                    true_label_col="true_label_reannotated",
                                                    ignore_label="no_relation",
                                                    average=average)

    # One row per metric, transposed so that each model becomes a row.
    df = pd.DataFrame([prec_orig, rec_orig, f1_orig, prec_enh, rec_enh, f1_enh]).T
    df["Model"] = [model_name_map.get(m, m) for m in models]
    df = df.set_index("Model")
    df.columns = pd.MultiIndex.from_product([["Original", "Revised"], ["P", "R", "F1"]])

    if to_percentage:
        df = df * 100

    return df


def f1_orig_enhanced_weighted_as_df(dataframe,
                                    models,
                                    split,
                                    model_name_map=None,
                                    to_percentage: bool=True,
                                    average="micro"):
    """Tabulate per-model F1 on original, revised and weighted labels.

    The per-example weight is the share of considered models whose prediction
    differs from ``true_label_reannotated``. Models whose lower-cased name
    contains "tre", "spanbert" or "knowbert" are left out of that share.

    Args:
        dataframe: frame with ``model_pred`` (dict of model name -> label) and
            ``true_label_reannotated`` columns; it is also passed on to
            ``p_r_f1_scores_as_df``.
        models: model names without the split suffix.
        split: suffix appended as ``"<model>_<split>"`` before scoring.
        model_name_map: optional mapping from model name to display name.
        to_percentage: multiply all scores by 100 when true.
        average: averaging mode forwarded to ``p_r_f1_scores_as_df``.

    Returns:
        DataFrame indexed by "Model" with MultiIndex columns
        (Original | Revised | Weighted) x (F1).
    """
    display_names = model_name_map if model_name_map else {}
    skipped_tags = ("tre", "spanbert", "knowbert")

    sample_weight = []
    for _, row in dataframe.iterrows():
        considered = {name: pred
                      for name, pred in row["model_pred"].items()
                      if not any(tag in name.lower() for tag in skipped_tags)}
        mistaken = [name for name, pred in considered.items()
                    if pred != row["true_label_reannotated"]]
        sample_weight.append(len(mistaken) / float(len(considered)))

    suffixed_models = [f"{m}_{split}" for m in models]

    def _scores(label_col, weights=None):
        # Returns the (precisions, recalls, f1s) triple for one label column.
        return p_r_f1_scores_as_df(dataframe, suffixed_models,
                                   true_label_col=label_col,
                                   ignore_label="no_relation",
                                   average=average,
                                   sample_weight=weights)

    _, _, f1_orig = _scores("true_label")
    _, _, f1_enh = _scores("true_label_reannotated")
    _, _, f1_weight = _scores("true_label_reannotated", sample_weight)

    table = pd.DataFrame([f1_orig, f1_enh, f1_weight]).T
    table["Model"] = [display_names.get(m, m) for m in models]
    table = table.set_index("Model")
    table.columns = pd.MultiIndex.from_product([["Original", "Revised", "Weighted"], ["F1"]])

    if to_percentage:
        table = table.mul(100)

    return table

In [4]:
# Dataset file handed to `load_tacred` (presumably the revised TACRED test split, judging by the file name).
DATASET_PATH = "../dataset/test_rev.json"
# Location handed to `load_evaluation_results` to read the stored model results.
MODEL_RESULTS_PATH = "../results/test_results/"

In [5]:
# Models reported in the original / revised / weighted tables below.
# Names carry no split suffix; the table helpers append "_<split>" themselves.
MODELS_UNDER_INVESTIGATION = ["cnn_wo_discr_masked",
                              "tre_tacred",
                              "spanbert_tacred",
                              "knowbert_wordnet_wiki_tacred"]

# Display names for the table index: model identifier -> label shown in the output.
# Identifiers missing from the map are displayed unchanged by the table helpers.
MODEL_NAME_MAP = {
    "cnn_wo_discr_unmasked": "CNN",
    "cnn_wo_discr_masked": "CNN, masked",
    "self_att_wo_discr_unmasked": "S-Att.",
    "self_att_wo_discr_masked": "S-Att., masked",
    "tre_tacred": "TRE",
    "spanbert_tacred": "SpanBERT",
    "knowbert_wordnet_wiki_tacred": "KnowBert-W+W"
}

# Display names for the 49 models listed in the per-model table further down:
# model identifier -> label shown in the output. The keys are also used (in this
# order) as the list of models to evaluate.
# NOTE(review): "cnn" maps to "CNN w/ synt/sem" while the analogous "lstm" and "gcn"
# entries read "..., masked w/ synt/sem" -- possibly a missing "masked"; confirm.
ENSEMBLE_MODEL_NAME_MAP = {
    "bag_of_embeddings": "BoE",

    "cnn_wo_discr_unmasked": "CNN",
    "cnn_wo_discr_masked": "CNN, masked",
    "cnn": "CNN w/ synt/sem",
    
    "cnn_elmo_wo_discr_unmasked": "CNN + ELMo",
    "cnn_elmo_wo_discr_masked": "CNN + ELMo, masked",
    "cnn_elmo": "CNN + ELMo, masked w/ synt/sem",
    
    "cnn_bert_uncased_unmasked": "CNN + BERT uncased",
    "cnn_bert_uncased": "CNN + BERT uncased, masked",
    "cnn_bert_cased_unmasked": "CNN + BERT cased",
    "cnn_bert_cased": "CNN + BERT cased, masked",

    
    "lstm_wo_discr_unmasked": "LSTM",
    "lstm_wo_discr_masked": "LSTM, masked",
    "lstm": "LSTM, masked w/ synt/sem",
    
    "lstm_elmo_wo_discr_unmasked": "LSTM + ELMo",
    "lstm_elmo_wo_discr_masked": "LSTM + ELMo, masked",
    "lstm_elmo": "LSTM + ELMo, masked w/ synt/sem",
    
    "lstm_bert_uncased_unmasked": "LSTM + BERT uncased",
    "lstm_bert_uncased": "LSTM + BERT uncased, masked",
    "lstm_bert_cased_unmasked": "LSTM + BERT cased",
    "lstm_bert_cased": "LSTM + BERT cased, masked",

    
    "bilstm_wo_discr_unmasked": "Bi-LSTM",
    "bilstm_wo_discr_masked": "Bi-LSTM, masked",
    
    "bilstm_elmo_wo_discr_unmasked": "Bi-LSTM + ELMo",
    "bilstm_elmo_wo_discr_masked": "Bi-LSTM + ELMo, masked",
    "bilstm_elmo_unmasked": "Bi-LSTM + ELMo w/ synt/sem",
    "bilstm_elmo": "Bi-LSTM + ELMo, masked w/ synt/sem",
    
    "bilstm_bert_uncased_unmasked": "Bi-LSTM + BERT uncased",
    "bilstm_bert_uncased": "Bi-LSTM + BERT uncased, masked",
    "bilstm_bert_cased_unmasked": "Bi-LSTM + BERT cased",
    "bilstm_bert_cased": "Bi-LSTM + BERT cased, masked",
    
    
    "gcn_wo_discr_unmasked": "GCN",
    "gcn_wo_discr_masked": "GCN, masked",
    "gcn": "GCN, masked w/ synt/sem",
    
    "gcn_elmo_wo_discr_unmasked": "GCN + ELMo",
    "gcn_elmo_wo_discr_masked": "GCN + ELMo, masked",
    "gcn_elmo": "GCN + ELMo, masked w/ synt/sem",
    
    "gcn_bert_uncased_unmasked": "GCN + BERT uncased",
    "gcn_bert_uncased": "GCN + BERT uncased, masked",
    "gcn_bert_cased_unmasked": "GCN + BERT cased",
    "gcn_bert_cased": "GCN + BERT cased, masked",

    
    "self_att_wo_discr_unmasked": "S-Att.",
    "self_att_wo_discr_masked": "S-Att., masked",
    
    "self_att_elmo_wo_discr_unmasked": "S-Att. + ELMo",
    "self_att_elmo_wo_discr_masked": "S-Att. + ELMo, masked",
    
    "self_att_bert_uncased_unmasked": "S-Att. + BERT uncased",
    "self_att_bert_uncased": "S-Att. + BERT uncased, masked",
    "self_att_bert_cased_unmasked": "S-Att. + BERT cased",
    "self_att_bert_cased": "S-Att. + BERT cased, masked",
}

In [6]:
# Sanity check: keys present in ENSEMBLE_MODEL_NAME_MAP but absent from the short-name map.
# NOTE(review): ENSEMBLE_MODEL_NAME_MAP_SHORT is not defined in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel -- restore its definition
# or drop this cell. TODO confirm.
ENSEMBLE_MODEL_NAME_MAP.keys() - ENSEMBLE_MODEL_NAME_MAP_SHORT.keys()

{'bilstm_elmo_unmasked'}

In [7]:
# Load the dataset documents and convert them to a DataFrame.
documents = load_tacred(DATASET_PATH)
documents_df = documents_as_dataframe(documents, mark_arguments=True)
# Load the stored model results for those documents, also as a DataFrame.
evaluation_results = results_as_dataframe(load_evaluation_results(MODEL_RESULTS_PATH, documents))
# Join documents and results on their index
# (assumes both frames share the same example index -- TODO confirm).
combined_df = pd.merge(documents_df, evaluation_results, left_index=True, right_index=True)

# F1 scores on original TACRED, revised TACRED, and difficulty-weighted revised TACRED

In [8]:
# Emit the original / revised / weighted P, R, F1 table as LaTeX, rounded to one decimal.
print(p_r_f1_orig_enhanced_weighted_as_df(combined_df,
                                          split="test",
                                          models=MODELS_UNDER_INVESTIGATION,
                                          model_name_map=MODEL_NAME_MAP).to_latex(float_format="{:0.1f}".format, multicolumn_format="c"))

\begin{tabular}{lrrrrrrrrr}
\toprule
{} & \multicolumn{3}{c}{Original} & \multicolumn{3}{c}{Revised} & \multicolumn{3}{c}{Weighted} \\
{} &        P &    R &   F1 &       P &    R &   F1 &        P &    R &   F1 \\
Model        &          &      &      &         &      &      &          &      &      \\
\midrule
CNN, masked  & 67.2 & 53.5 & 59.5 & 72.5 & 61.4 & 66.5 & 47.5 & 27.5 & 34.8 \\
TRE          & 70.1 & 65.0 & 67.4 & 75.8 & 74.9 & 75.3 & 54.8 & 43.9 & 48.8 \\
SpanBERT     & 70.8 & 70.9 & 70.8 & 75.6 & 80.6 & 78.0 & 65.0 & 59.0 & 61.9 \\
KnowBert-W+W & 71.4 & 71.6 & 71.5 & 76.8 & 82.0 & 79.3 & 61.6 & 55.9 & 58.7 \\
\bottomrule
\end{tabular}



In [9]:
# Same table as the LaTeX cell, displayed as a DataFrame with unrounded values.
p_r_f1_orig_enhanced_weighted_as_df(combined_df,
                                    split="test",
                                    models=MODELS_UNDER_INVESTIGATION,
                                    model_name_map=MODEL_NAME_MAP,
                                    average="micro")

Unnamed: 0_level_0,Original,Original,Original,Revised,Revised,Revised,Weighted,Weighted,Weighted
Unnamed: 0_level_1,P,R,F1,P,R,F1,P,R,F1
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
"CNN, masked",67.170382,53.473684,59.544541,72.459388,61.415306,66.481802,47.478745,27.480009,34.811581
TRE,70.061628,64.962406,67.41573,75.835225,74.863913,75.346439,54.842468,43.934927,48.786462
SpanBERT,70.793269,70.857143,70.825192,75.600962,80.563561,78.00341,65.006424,59.019662,61.868552
KnowBert-W+W,71.437144,71.609023,71.52298,76.837684,82.004483,79.337051,61.647422,55.939933,58.655161


In [10]:
# F1-only variant of the original / revised / weighted table, emitted as LaTeX.
print(f1_orig_enhanced_weighted_as_df(combined_df,
                                    split="test",
                                    models=MODELS_UNDER_INVESTIGATION,
                                    model_name_map=MODEL_NAME_MAP,
                                    average="micro").to_latex(float_format="{:0.1f}".format, multicolumn_format="c"))

\begin{tabular}{lrrr}
\toprule
{} & Original & Revised & Weighted \\
{} &       F1 &      F1 &       F1 \\
Model        &          &         &          \\
\midrule
CNN, masked  & 59.5 & 66.5 & 34.8 \\
TRE          & 67.4 & 75.3 & 48.8 \\
SpanBERT     & 70.8 & 78.0 & 61.9 \\
KnowBert-W+W & 71.5 & 79.3 & 58.7 \\
\bottomrule
\end{tabular}



# Ensemble F1 score for TACRED (original)

In [11]:
# Majority-vote ensemble scored against the original labels ("true_label");
# models whose name contains "tre", "spanbert" or "knowbert" do not vote.
ensemble_p_r_f1_score_as_df(combined_df,
                            true_label_col="true_label",
                            ignore_models=["tre", "spanbert", "knowbert"],
                            ignore_label="no_relation",
                            average="micro",
                            sample_weight=None)

(0.7583819241982507, 0.6258646616541353, 0.6857801944307135)

# Ensemble F1 score for TACRED (revised)

In [12]:
# Same ensemble, scored against the revised labels ("true_label_reannotated").
ensemble_p_r_f1_score_as_df(combined_df,
                            true_label_col="true_label_reannotated",
                            ignore_models=["tre", "spanbert", "knowbert"],
                            ignore_label="no_relation",
                            average="micro",
                            sample_weight=None)

(0.8567784256559767, 0.7528017931476145, 0.8014317368331345)

In [14]:
# Original vs. revised P, R, F1 for every model in ENSEMBLE_MODEL_NAME_MAP, emitted as LaTeX
# (escape=False leaves characters in the display names unescaped).
print(p_r_f1_orig_enhanced_as_df(combined_df,
                                 split="test",
                                 models=list(ENSEMBLE_MODEL_NAME_MAP.keys()),
                                 model_name_map=ENSEMBLE_MODEL_NAME_MAP).to_latex(float_format="{:0.1f}".format, multicolumn_format="c", escape=False))

\begin{tabular}{lrrrrrr}
\toprule
{} & \multicolumn{3}{c}{Original} & \multicolumn{3}{c}{Revised} \\
{} &        P &    R &   F1 &       P &    R &   F1 \\
Model                              &          &      &      &         &      &      \\
\midrule
BoE                                & 50.0 & 32.6 & 39.4 & 51.8 & 35.9 & 42.4 \\
CNN                                & 72.3 & 45.5 & 55.9 & 79.8 & 53.5 & 64.1 \\
CNN, masked                        & 67.2 & 53.5 & 59.5 & 72.5 & 61.4 & 66.5 \\
CNN w/ synt/sem                    & 72.2 & 54.7 & 62.2 & 79.7 & 64.3 & 71.2 \\
CNN + ELMo                         & 73.8 & 48.8 & 58.8 & 82.1 & 57.9 & 67.9 \\
CNN + ELMo, masked                 & 72.3 & 53.8 & 61.7 & 79.8 & 63.2 & 70.5 \\
CNN + ELMo, masked w/ synt/sem     & 69.2 & 59.0 & 63.7 & 76.0 & 69.1 & 72.4 \\
CNN + BERT uncased                 & 71.9 & 51.1 & 59.7 & 79.5 & 60.2 & 68.5 \\
CNN + BERT uncased, masked         & 69.0 & 62.0 & 65.3 & 74.9 & 71.7 & 73.2 \\
CNN + BERT cased            

In [15]:
# Summary statistics (count, mean, std, quartiles) of the per-model scores across all models.
p_r_f1_orig_enhanced_as_df(combined_df,
                                 split="test",
                                 models=list(ENSEMBLE_MODEL_NAME_MAP.keys()),
                                 model_name_map=ENSEMBLE_MODEL_NAME_MAP).describe()

Unnamed: 0_level_0,Original,Original,Original,Revised,Revised,Revised
Unnamed: 0_level_1,P,R,F1,P,R,F1
count,49.0,49.0,49.0,49.0,49.0,49.0
mean,65.646538,59.476446,62.136173,71.76734,69.190404,70.146273
std,4.583137,6.943285,4.645253,5.405378,7.880614,5.195892
min,49.976927,32.571429,39.439184,51.77665,35.926993,42.41966
25%,63.913558,56.842105,61.035156,68.916936,67.114954,70.258621
50%,65.352697,60.240602,62.944162,72.428627,70.79731,71.240962
75%,68.642906,64.721805,65.055874,74.470944,74.67179,72.688525
max,73.751135,69.744361,66.863446,82.061762,79.987192,74.963913
