In [14]:
import os

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import r2_score

# This notebook evaluates the results of the proposed framework using XLM-RoBERTa against empirical moral association scores from the Small World of Words (SWoW) dataset.

In [15]:
# Directory containing the evaluation CSV files read by the evaluation loop.
# The trailing slash is required: file names are appended with plain string
# concatenation (`+`), not with os.path.join.
test_results_path = "./data/SWOW_prediction/eval/"

In [16]:
# Experiment configuration. Every value below is interpolated into the names
# of the result files that the evaluation loop reads, so changing one of them
# selects a different set of CSV files.

# Properties that are evaluated, and the encoder whose results are loaded.
properties = ["previous_link", "polarity"]
model_name = "xlm-roberta-large"

# `reduce` settings of the non-baseline runs; the evaluation loop reports
# "forward" as "GCN" and "both" as "GCN + BERT".
reduces = ["forward", "both"]

# Corpus name and the indices of the training sections to aggregate over.
data_name = "coha"
train_sections = np.arange(5)

# Remaining file-name components of a run.
loss_function_name = "likelihood_loss"
graph_strategy = "ppmi"
token_strategy = "frequency"
graph_version = 2
swow_version = 1
fill = "add"
add_self_loops = True

# Evaluation splits, and whether the baseline ("basic" files, reported as
# "BERT") and/or the non-baseline runs are evaluated.
sections = ["dev", "test"]
baselines = [True, False]

In [17]:
# Filled by the evaluation loop with every baseline result frame that is read
# for the "polarity" property.
wanted = []

In [18]:
# Score every (property, model, evaluation section) combination: the per-word
# predictions of all available training sections are averaged and compared
# with the per-word targets (Pearson r and R^2). One row per combination is
# collected in `list_rows` and turned into the `results` frame at the end.
list_rows = []
# Re-created here, like `list_rows`, so that re-running this cell on its own
# does not append the same frames to `wanted` a second time.
wanted = []

# Display name of each non-baseline `reduce` setting; any other setting maps
# to None.
reduce_labels = {"both": "GCN + BERT", "forward": "GCN"}

# NOTE: the loop variables keep their original names (`property` shadows the
# builtin) because they stay visible at notebook level after this cell runs.
for property in properties:
    for baseline in baselines:
        # Baseline file names do not contain `reduce`, so iterating over
        # `reduces` would only re-read the same files, emit duplicate rows and
        # append every frame to `wanted` several times.
        reduce_options = [None] if baseline else reduces

        for reduce in reduce_options:
            # The only file-name part that differs between baseline and
            # non-baseline runs.
            variant = f"basic_{model_name}" if baseline else f"{model_name}_{reduce}"

            for section in sections:
                frames = []

                for train_section in train_sections:
                    df_dir = (
                        test_results_path
                        + f"{property}_{variant}_{data_name}_{train_section}"
                        + f"_{loss_function_name}_graph_{graph_strategy}"
                        + f"_graph_version_{graph_version}_swow_version_{swow_version}"
                        + f"_fill_{fill}_add_self_loops_{add_self_loops}"
                        + f"_token_strategy_{token_strategy}_{section}.csv"
                    )

                    # Runs without a result file are skipped; the number of
                    # files actually used is recorded in `n_train_sections`.
                    if not os.path.exists(df_dir):
                        continue

                    df = pd.read_csv(df_dir)
                    if baseline and property == "polarity":
                        wanted.append(df)
                    frames.append(df)

                if not frames:
                    continue

                # Concatenate once per combination instead of growing a frame
                # from an empty `pd.DataFrame()` inside the loop.
                total_df = pd.concat(frames, ignore_index=True)
                if len(total_df) == 0:
                    continue

                # Average target and prediction of each word over all loaded
                # training sections before scoring.
                df = (
                    total_df.groupby(["words"])[["targets", "outputs"]]
                    .mean()
                    .reset_index()
                )

                r, p = pearsonr(df.targets, df.outputs)

                row = {
                    "model": "BERT" if baseline else reduce_labels.get(reduce),
                    "r": float(r),
                    "p": p,
                    "r2": r2_score(df.targets, df.outputs),
                    # Last training section iterated, not a per-row fold: the
                    # metrics aggregate every loaded section. Kept unchanged
                    # so the existing column keeps its values.
                    "train_section": train_section,
                    "test_section": section,
                    "property": property,
                    "n": len(df),
                    # Number of training-section files that were found and
                    # aggregated for this row.
                    "n_train_sections": len(frames),
                }
                list_rows.append(row)

results = pd.DataFrame(list_rows)

In [19]:
# Builds the `results` frame from the collected rows. The cell above already
# ends with this same statement, so this cell only repeats it.
results = pd.DataFrame(list_rows)

In [20]:
# Summary for the "test" split: average r, r2, n and p per (model, property),
# ordered by property and then by the metric columns.
test_rows = results.loc[results.test_section == "test"]
metric_means = test_rows.groupby(["model", "property"])[["r", "r2", "n", "p"]].mean()
summary = metric_means.reset_index()
summary = summary.sort_values(by=["property", "r", "r2", "n", "p"])
print(summary)

        model       property         r        r2      n              p
0        BERT       polarity  0.462771 -0.070929  843.0   5.767353e-46
2         GCN       polarity  0.594130  0.330941  843.0   1.420819e-81
4  GCN + BERT       polarity  0.605670  0.364185  843.0   1.562124e-85
1        BERT  previous_link  0.623960  0.387193  936.0  3.923608e-102
3         GCN  previous_link  0.638555  0.399316  936.0  2.344228e-108
5  GCN + BERT  previous_link  0.643423  0.412175  936.0  1.651430e-110
