# Result analysis

We load the results (the saved model predictions) and compute the metrics reported in the paper. The LaTeX tables are generated with the script `generate_latex_tables.py`.

### Load results

In [39]:
import pickle

# load saved probabilities: one pickle file per n_samples value
# resulting layout: n_samples -> seed -> model -> symptom -> probabilities
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load result files that you generated yourself or otherwise trust.
results = {}
for n_samples in [100, 187, 350, 654, 1223, 2287, 4278, 8000]:
    filename = f'results/ie_text_tab_probabilities_n_samples_{n_samples}.p'
    with open(filename, 'rb') as file:
        results[n_samples] = pickle.load(file)

In [40]:
# model names stored for n_samples=100 under second-level key 212 (the second level is the seed)
list(results[100][212].keys())

['bn_realistic',
 'gt_bn',
 'binary_classifiers',
 'binary_classifiers_data_shift',
 'tabular_text_binary',
 'weighted_consistency',
 'weighted_consistency_data_shift',
 'weighted_consistency_ground_truth',
 'virtual',
 'virtual_data_shift',
 'virtual_ground_truth',
 'weighted_consistency_virtual',
 'weighted_consistency_virtual_data_shift',
 'weighted_consistency_virtual_ground_truth']

### Preprocessing

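Re-aggregate the probabilities (moving the seed level innermost, so that all seeds of one model and symptom are grouped together) and clean the results so that every value is a valid probability (necessary for the Brier score computation).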

In [6]:
from collections import defaultdict
from run_experiments import to_dict, factory

def reaggregate_probabilities(results):
    """Re-key the nested results so that the seed becomes the innermost level.

    Input layout:  n_samples -> seed -> model -> symptom -> probabilities
    Output layout: n_samples -> model -> symptom -> seed -> probabilities

    The probability objects are re-keyed, not copied. The nested defaultdict
    is passed through `to_dict` before being returned (presumably to turn it
    into plain dicts; see run_experiments).
    """
    # Walk the four nesting levels lazily, yielding one flat record per leaf.
    flattened = (
        (sample_size, seed, model_name, symptom, probs)
        for sample_size, per_seed in results.items()
        for seed, per_model in per_seed.items()
        for model_name, per_symptom in per_model.items()
        for symptom, probs in per_symptom.items()
    )
    regrouped = defaultdict(factory(3))
    for sample_size, seed, model_name, symptom, probs in flattened:
        regrouped[sample_size][model_name][symptom][seed] = probs
    return to_dict(regrouped)

# Re-key the loaded results from
#   n_samples -> seed -> model -> symptom -> probabilities
# to
#   n_samples -> model -> symptom -> seed -> probabilities
# NOTE: this rebinds `results`; re-running this cell without reloading the
# pickles would re-aggregate data that is already in the second layout.
results = reaggregate_probabilities(results)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import numpy as np

def _repair_fever_rows(probabilities):
    """Repair multi-class ('fever') rows in place: flip negatives, then rescale."""
    # Pass 1: any negative entry is replaced by its absolute value.
    for label, row in probabilities.items():
        if any(value < 0 for value in row):
            probabilities.loc[label] = np.array([value if value >= 0 else -value for value in row])
    # Pass 2: rows whose total exceeds 1 are divided by that total.
    # Rows summing to less than 1 are left untouched.
    for label, row in probabilities.items():
        total = sum(row)
        if total > 1:
            probabilities.loc[label] = np.array([value / total for value in row])


def _clamp_binary(probabilities):
    """Clamp scalar probabilities in place to the interval [0, 1]."""
    for label, prob in probabilities.items():
        if prob < 0:
            probabilities.loc[label] = 0.
        elif prob > 1:
            probabilities.loc[label] = 1.


def clean_results(results):
    """Make every stored value a valid probability, modifying `results` in place.

    Expected layout: n_samples -> model -> symptom -> seed -> probabilities.

    For the symptom 'fever' each entry is a row of class probabilities:
    negative entries are replaced by their absolute value, and rows that sum
    to more than 1 are rescaled to sum to 1. For every other symptom each
    entry is a single probability, which is clamped to [0, 1].

    Returns the same `results` object that was passed in.
    """
    for per_model in results.values():
        for per_symptom in per_model.values():
            for symptom, per_seed in per_symptom.items():
                repair = _repair_fever_rows if symptom == 'fever' else _clamp_binary
                for probabilities in per_seed.values():
                    repair(probabilities)
    return results

# normalization (necessary for brier_score_loss)
# clean_results edits the probability objects in place and returns the same dict
results = clean_results(results)

### Split results 

Split into the main results, the ground-truth BN results, and the data-shift results.

In [None]:
# split results into three (partly overlapping) model groups

main_result_models = ['bn_realistic', 'binary_classifiers', 'tabular_text_binary', 'weighted_consistency', 'virtual', 'weighted_consistency_virtual']
ground_truth_models = ['gt_bn', 'binary_classifiers', 'tabular_text_binary', 'weighted_consistency_ground_truth', 'virtual_ground_truth', 'weighted_consistency_virtual_ground_truth']
data_shift_models = ['binary_classifiers_data_shift', 'weighted_consistency_data_shift', 'virtual_data_shift', 'weighted_consistency_virtual_data_shift']

main_results = {}
ground_truth_results = {}
data_shift_results = {}
for n_samples, n_samples_data in results.items():
    # A model may belong to more than one group (e.g. 'binary_classifiers'),
    # so every group is filtered independently from the full model dict.
    for target, wanted in (
        (main_results, main_result_models),
        (ground_truth_results, ground_truth_models),
        (data_shift_results, data_shift_models),
    ):
        target[n_samples] = {
            model: model_data
            for model, model_data in n_samples_data.items()
            if model in wanted
        }

### Metric computation

Compute average precision and Brier scores.

In [9]:
from run_experiments import load_simsum

# `df` is read as a module-level global by probabilities_to_metrics and
# probabilities_by_subsets, which look up rows via df.loc[<probability index>].
df = load_simsum()

In [22]:
from sklearn.metrics import average_precision_score, brier_score_loss
from sklearn.preprocessing import label_binarize
from scipy.stats import entropy

def normalized_entropy(probs: np.ndarray) -> float:
    """
    Compute the mean normalized entropy of a batch of categorical distributions.

    The entropy of each row (natural log) is divided by log(num_classes),
    i.e. the entropy of the uniform distribution, and the per-row values are
    averaged into one number.

    Args:
        probs (np.ndarray): shape (batch_size, num_classes), rows expected to
            sum to 1. Values are clipped to [1e-12, 1]; scipy's `entropy`
            renormalizes each row if it does not sum to 1.

    Returns:
        float: mean normalized entropy over the batch, in [0, 1]. Returns 0.0
        when there is only a single class, where normalization by log(1) = 0
        would otherwise yield NaN.
    """
    eps = 1e-12
    # asarray lets callers pass nested lists as well as arrays.
    probs = np.clip(np.asarray(probs, dtype=np.float64), eps, 1.0)  # avoid log(0)
    num_classes = probs.shape[1]
    if num_classes < 2:
        # A one-class distribution carries no uncertainty.
        return 0.0
    ent = entropy(probs, axis=1)  # default is base e
    max_ent = np.log(num_classes)
    return np.mean(ent / max_ent)

def probabilities_to_metrics(probabilities_dict, use_metrics=('average_precision', 'brier'), models='all'):
    """Compute per-seed metrics from stored prediction probabilities.

    Ground-truth labels are taken from the module-level DataFrame `df`, using
    the index of each probabilities object to select the matching rows.

    Args:
        probabilities_dict: nested dict laid out as
            n_samples -> model -> symptom -> seed -> probabilities.
        use_metrics: collection of metric names to compute; recognised names
            are 'average_precision', 'brier' and 'confidence'.
        models: 'all' to process every model, otherwise a collection of model
            names to keep.

    Returns:
        Nested result laid out as metric -> symptom -> n_samples -> seed -> model,
        passed through `to_dict` before being returned.

    Notes:
        For the symptom 'fever' the probabilities are rows of class
        probabilities; the code treats their columns as ordered
        ('none', 'low', 'high') — TODO confirm against the code that wrote
        the predictions. All other symptoms are scored as binary with
        positive label 'yes'.
    """
    metrics = defaultdict(factory(4))
    # n_samples -> model -> symptom -> seed -> probabilities
    for n_samples, n_samples_data in probabilities_dict.items():
        for model, model_data in n_samples_data.items():
            if models != 'all' and model not in models:
                continue
            for symptom, symptom_data in model_data.items():
                is_fever = symptom == 'fever'
                for seed, probabilities in symptom_data.items():
                    idx = probabilities.index.to_list()
                    subset = df.loc[idx]
                    y_true = subset[symptom]
                    if is_fever:
                        # one row per sample, one column per fever class
                        fever_probs = np.stack(probabilities.to_list())
                    # metric -> symptom -> n_samples -> seed -> model
                    # average precision
                    if 'average_precision' in use_metrics:
                        if is_fever:
                            true_fever = label_binarize(y_true, classes=['none', 'low', 'high'])
                            average_precision = average_precision_score(true_fever, fever_probs)
                        else:
                            average_precision = average_precision_score(y_true, probabilities, pos_label='yes')
                        metrics['average_precision'][symptom][n_samples][seed][model] = average_precision
                    # brier
                    if 'brier' in use_metrics:
                        if is_fever:
                            # The labels are given as ('high', 'low', 'none'), so the
                            # columns are reversed to line up with that order.
                            # NOTE(review): multi-class brier_score_loss with `labels`
                            # needs a scikit-learn release that supports it — confirm
                            # the pinned version.
                            brier_probs = np.flip(fever_probs, axis=1)
                            brier = brier_score_loss(y_true, brier_probs, labels=['high', 'low', 'none'])
                        else:
                            brier = brier_score_loss(y_true, probabilities, pos_label='yes')
                        metrics['brier'][symptom][n_samples][seed][model] = brier
                    # confidence (entropy-based)
                    if 'confidence' in use_metrics:
                        if is_fever:
                            probs = fever_probs
                        else:
                            # expand P(yes) into the two-class distribution (1 - p, p)
                            positive = probabilities.to_numpy()
                            probs = np.stack((1 - positive, positive), axis=1)
                        entr = normalized_entropy(probs.astype(np.float64))
                        confidence = 1 - entr
                        metrics['confidence'][symptom][n_samples][seed][model] = confidence
    return to_dict(metrics)


Compute the metrics and save them in the `results/` directory:

In [11]:
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# not only those raised while computing the metrics below.
warnings.filterwarnings("ignore")

# metric -> symptom -> n_samples -> seed -> model, with the default metrics
# (average precision and Brier score)
main_results_metrics = probabilities_to_metrics(main_results)

In [12]:
# persist the main-result metrics for later use
with open('results/main_results.p', 'wb') as file:
    pickle.dump(main_results_metrics, file)

In [13]:
# default metrics for the models listed in ground_truth_models, saved to their own pickle
gt_bn_metrics = probabilities_to_metrics(ground_truth_results)
with open('results/ground_truth_results.p', 'wb') as file:
    pickle.dump(gt_bn_metrics, file)

In [14]:
# default metrics for the models listed in data_shift_models, saved to their own pickle
data_shift_metrics = probabilities_to_metrics(data_shift_results)
with open('results/data_shift_results.p', 'wb') as file:
    pickle.dump(data_shift_metrics, file)

### Subset analysis

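Further split `main_results` into four subsets, according to whether the symptom is present (its label is positive) and whether it is mentioned (the label `[symptom]_mentioned`): present and mentioned, present but not mentioned, mentioned but not present, and neither present nor mentioned.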

In [16]:
def isolate_test_subset(test_set, symptom, present, mentioned):
    """Return the rows of `test_set` matching a presence / mention combination.

    Args:
        test_set: DataFrame with a `symptom` column and a
            `<symptom>_mentioned` column.
        symptom: name of the symptom column.
        present: if True keep rows labelled 'yes', 'low' or 'high';
            otherwise keep rows labelled 'no' or 'none'.
        mentioned: value the `<symptom>_mentioned` column must equal.

    Returns:
        The matching rows, selected by index label in their original order.
    """
    wanted_labels = ['yes', 'low', 'high'] if present else ['no', 'none']
    mention_col = symptom + '_mentioned'
    selected = [
        label
        for label, record in test_set.iterrows()
        if record[symptom] in wanted_labels and record[mention_col] == mentioned
    ]
    return test_set.loc[selected]

def isolate_subsets(df, probabilities, symptom, present, mentioned):
    """Restrict both the data and the predictions to one presence / mention subset.

    The rows of `df` that share index labels with `probabilities` are
    filtered with `isolate_test_subset`; the predictions are then cut down
    to the surviving index labels.

    Returns:
        (test_subset, probabilities_subset), both aligned on the same index
        labels.
    """
    # rows of df for which a prediction exists
    predicted_rows = df.loc[probabilities.index.to_list()]
    test_subset = isolate_test_subset(predicted_rows, symptom, present, mentioned)
    kept_labels = test_subset.index.to_list()
    return test_subset, probabilities.loc[kept_labels]

def probabilities_by_subsets(results):
    """Split every probabilities object into the four presence / mention subsets.

    Args:
        results: nested dict laid out as
            n_samples -> model -> symptom -> seed -> probabilities.

    Returns:
        Dict keyed by subset name ('present but not mentioned',
        'mentioned but not present', 'present and mentioned',
        'not mentioned and not present'); each value has the same layout as
        `results` but holds only the predictions belonging to that subset.
        Ground-truth rows come from the module-level DataFrame `df`.
    """
    # (subset name, present, mentioned)
    modes = (
        ('present but not mentioned', True, False),
        ('mentioned but not present', False, True),
        ('present and mentioned', True, True),
        ('not mentioned and not present', False, False),
    )
    results_by_mode = {}
    for mode, present, mentioned in modes:
        subset_results = defaultdict(factory(4))
        leaves = (
            (n_samples, model, symptom, seed, probabilities)
            for n_samples, n_samples_data in results.items()
            for model, model_data in n_samples_data.items()
            for symptom, symptom_data in model_data.items()
            for seed, probabilities in symptom_data.items()
        )
        for n_samples, model, symptom, seed, probabilities in leaves:
            # only the filtered predictions are kept; the data subset is discarded
            subset_results[n_samples][model][symptom][seed] = isolate_subsets(
                df, probabilities, symptom, present, mentioned
            )[1]
        results_by_mode[mode] = to_dict(subset_results)
    return results_by_mode

In [17]:
# split into present / mentioned subsets
# pbs: subset name (e.g. 'present and mentioned') -> n_samples -> model -> symptom -> seed -> probabilities
pbs = probabilities_by_subsets(main_results)

In [23]:
# subset brier scores
# Only the Brier score is computed here, and only for the models in subset_models.
# Suffixes: pnm = present but not mentioned, pm = present and mentioned,
#           npm = mentioned but not present, npnm = not mentioned and not present.
subset_models = ['binary_classifiers', 'weighted_consistency', 'virtual', 'weighted_consistency_virtual']
pbs_metrics_pnm = probabilities_to_metrics(pbs['present but not mentioned'], use_metrics=['brier'], models=subset_models)
pbs_metrics_pm = probabilities_to_metrics(pbs['present and mentioned'], use_metrics=['brier'], models=subset_models)
pbs_metrics_npm = probabilities_to_metrics(pbs['mentioned but not present'], use_metrics=['brier'], models=subset_models)
pbs_metrics_npnm = probabilities_to_metrics(pbs['not mentioned and not present'], use_metrics=['brier'], models=subset_models)

In [24]:
# save subset brier scores
# one pickle per subset; the file-name prefix matches the variable suffix
# (pm, pnm, npm, npnm)
with open('results/pm_results.p', 'wb') as file:
    pickle.dump(pbs_metrics_pm, file)
with open('results/pnm_results.p', 'wb') as file:
    pickle.dump(pbs_metrics_pnm, file)
with open('results/npm_results.p', 'wb') as file:
    pickle.dump(pbs_metrics_npm, file)
with open('results/npnm_results.p', 'wb') as file:
    pickle.dump(pbs_metrics_npnm, file)

Compare the **BN-only** and **text-only** models on the *present, not mentioned* subset.

In [25]:
# Brier scores on the 'present but not mentioned' subset, restricted to the
# models 'bn_realistic' and 'binary_classifiers'
# (presumably the BN-only and the text-only model, respectively — confirm)
pnm_bn_text = probabilities_to_metrics(pbs['present but not mentioned'], use_metrics=['brier'], models=['bn_realistic', 'binary_classifiers'])
with open('results/pnm_bn_text.p', 'wb') as file:
    pickle.dump(pnm_bn_text, file)

In [20]:
# nn results
# keep only the 'binary_classifiers' model for every n_samples value
nn_results = {
    n_samples: {
        model: model_data
        for model, model_data in n_samples_data.items()
        if model == 'binary_classifiers'
    }
    for n_samples, n_samples_data in results.items()
}

### Model confidence

Compute the confidence of the **text-only** classifiers.

In [21]:
# nn confidence
# 'confidence' is computed in probabilities_to_metrics as
# 1 - normalized_entropy(...), i.e. one minus the mean normalized entropy
nn_confidence = probabilities_to_metrics(nn_results, use_metrics=['confidence'])
with open('results/text_only_confidence.p', 'wb') as file:
    pickle.dump(nn_confidence, file)