# Setup

In [1]:
import json, os, sys, re
import pandas as pd
from collections import defaultdict
import numpy as np
import torch
from scipy import stats
from scipy.stats import entropy
from datasets import load_dataset, Dataset
import itertools
import torch
from pathlib import Path
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from scipy.special import softmax
import pickle
from joblib import Parallel, delayed
import language_tool_python
from itertools import combinations
# add src folder to path
sys.path.append('..')

from dev.ProbLM import JointLM, ConditionalLM
from exp_3_set_proba.prepare_data import correct_grammar, few_shot_examples 
from exp_3_set_proba.analyze import calculate_ranking, calculate_instance_probability # was calculate_p_t_V2

from exp_3_set_proba.utils import hist_of_all_p_t_values, classify, stacked_p_t_plot, hist_of_all_p_t_values, evaluate_classifier, boxplots, scatterplots, calculate_macro_avg, plot_roc_curve, plot_coverage_risk_curve_2, calculate_entropies, get_data_permutations
from exp_3_set_proba.utils import get_data, save_plot, combine_stats_dfs, df_stats_columns, _permutation_stats, subject_overview, convert_permutations_for_plotting, convert_for_pdf, rename_metrics
from dev.ProbLM import JointLM, ConditionalLM


from data_utils import get_wiki_summary
%load_ext autoreload
%autoreload 2

HOME_PATH = os.path.expanduser("~/")

BASE_PATH = Path(f"{HOME_PATH}/Desktop/exp_3_set_proba_V4/")

stat_metrics  = ['n_objs', 'n_subjs', 'n_para', 'n_instances',
       'dataset', 'model', 'run_name']
metrics_global = ['coverage_abs', 'coverage_rel','precision_global', 'recall_global', 'f1_global',
       'accuracy_global', 'fpr_global', 
       'precision_argmax_global', 'recall_argmax_global', 'f1_argmax_global',
       'accuracy_argmax_global',  'fpr_argmax_global',
       ] # others: 'tp_global', 'tn_global', 'fp_global', 'fn_global', 'tp_argmax_global', 'tn_argmax_global', 'fp_argmax_global', 'fn_argmax_global',
metrics_selective = ['precision_selective']
metrics_global_0_thershold = ['auc_global', 'fpr_by_threshold_global',
       'tpr_by_threshold_global', 'roc_thresholds_global', 'fpr_by_threshold_argmax_global', 'tpr_by_threshold_argmax_global',
       'roc_thresholds_argmax_global', 'auc_argmax_global']
metrics_per_paraphrase = ['precision_argmax_pp', 'recall_argmax_pp', 'f1_argmax_pp',
       'accuracy_argmax_pp', 'fpr_argmax_pp'] # others: 'tp_argmax_pp', 'tn_argmax_pp', 'fp_argmax_pp','fn_argmax_pp',
metrics_per_paraphrase_0_threshold = ['fpr_by_threshold_argmax_pp',
       'tpr_by_threshold_argmax_pp', 'roc_thresholds_argmax_pp', 'auc_argmax_pp']


BASE_PATH

PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4')

# Selective Prediction

In [2]:
run_names = ['hypernymy_2000_50_mistral7B', 'trex_test_2000_50_mistral7B', 'PopQA_test_2000_50_mistral7B', 'hypernymy_2000_50', 'trex_test_2000_50', 'PopQA_test_2000_50']
dataset_per_run = ['hypernymy', 'trex', 'PopQA', 'hypernymy', 'trex', 'PopQA']
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B', 'gpt-l', 'gpt-l', 'gpt-l']

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH)
df_all_stats

Unnamed: 0,relation,r_s_id,max_paraphrases,max_o,threshold,coverage_abs_global,coverage_rel_global,precision_selective_global,recall_selective_global,f1_selective_global,...,fpr_argmax_overall_global,total_argmax_overall_global,n_objs,n_subjs,n_para,n_instances,dataset,model,run_name,run_attributes
0,all_relations,,0,30,0.00,17310,1.000000,0.033333,1.0,0.064516,...,0.018168,17310,30,577,1,17310,hypernymy,mistral-7B,hypernymy_2000_50_mistral7B,
1,all_relations,,0,30,0.01,5196,0.300173,0.091609,1.0,0.167842,...,0.018168,17310,30,577,1,17310,hypernymy,mistral-7B,hypernymy_2000_50_mistral7B,
2,all_relations,,0,30,0.02,3692,0.213287,0.124594,1.0,0.221580,...,0.018168,17310,30,577,1,17310,hypernymy,mistral-7B,hypernymy_2000_50_mistral7B,
3,all_relations,,0,30,0.03,3003,0.173484,0.147186,1.0,0.256604,...,0.018168,17310,30,577,1,17310,hypernymy,mistral-7B,hypernymy_2000_50_mistral7B,
4,all_relations,,0,30,0.04,2547,0.147140,0.168826,1.0,0.288881,...,0.018168,17310,30,577,1,17310,hypernymy,mistral-7B,hypernymy_2000_50_mistral7B,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44935,[X] is the capital of [Y].,729.0,20,50,0.99,0,0.000000,,,,...,,51,51,1,11,561,PopQA,gpt-l,PopQA_test_2000_50,
44936,[X] is the capital of [Y].,730.0,20,50,0.99,0,0.000000,,,,...,0.020000,51,51,1,11,561,PopQA,gpt-l,PopQA_test_2000_50,
44937,[X] is the capital of [Y].,731.0,20,50,0.99,0,0.000000,,,,...,,51,51,1,11,561,PopQA,gpt-l,PopQA_test_2000_50,
44938,[X] is the capital of [Y].,732.0,20,50,0.99,0,0.000000,,,,...,0.020000,51,51,1,11,561,PopQA,gpt-l,PopQA_test_2000_50,


## 2 barplots with different thresholds

In [3]:
# Selective Prediction for Dataset, Model and #Paraphrases
# Graph for 0.5 threshold
threshold = 0.5
df_stats_all_r = df_all_stats[df_all_stats['relation'].str.contains('all_relations')]
df_stats_all_r = df_stats_all_r[(df_stats_all_r['threshold'] == threshold) ]# & (df_stats_all_r['max_paraphrases'] > 0)  & (df_stats_all_r['dataset'].isin(['trex', 'PopQA']))] 
df_stats_all_r = convert_for_pdf(df_stats_all_r)
df_stats_all_r


df_stats_all_r['dataset, model, #p, coverage abs.'] = df_stats_all_r['dataset'] + ', ' + df_stats_all_r['model'] + ', ' + df_stats_all_r['max_paraphrases'].astype(str) + ', ' + df_stats_all_r['coverage_abs_global'].astype(str)

# selective
global_selective_metrics = ['coverage_abs_global', 'coverage_rel_global', 'precision_selective_global']
pp_selective_metrics = ['coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp']
global_argmax_metrics = ['precision_argmax_selective_global', ]
pp_argmax_metrics = ['precision_argmax_selective_pp']


# all (non-selective)
pp_argmax_all_metrics = ['precision_argmax_overall_pp']
global_argmax_all_metrics = ['precision_argmax_overall_global']

# all_metrics = global_selective_metrics + pp_selective_metrics + global_argmax_metrics + pp_argmax_metrics + pp_argmax_all_metrics + global_argmax_all_metrics
metrics_plot = global_selective_metrics.copy() + [ 'precision_argmax_overall_global'] # 'precision_argmax_selective_global',
metrics_plot.remove('coverage_abs_global')

df_bar_plot = rename_metrics(df_stats_all_r.copy())
metrics_plot = [ 'coverage',
 'selective precision',
 'argmax precision']

fig_global = px.bar(df_bar_plot, x='dataset, model, #p, coverage abs.', y=metrics_plot, 
                    color='variable', barmode="group", title=f"Selective Prediction w/ threshold = {threshold} vs. argmax")
fig_global.update_layout( font_family="Serif", font_size=12,
)

fig_global.show()
save_plot(fig_global, BASE_PATH=BASE_PATH, filename='global_overview_knowing')


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / global_overview_knowing.pdf


In [4]:
df_differences = df_stats_all_r[["dataset, model, #p, coverage abs.", "coverage_rel_global", "precision_selective_global", "precision_argmax_overall_global"]]

df_differences.to_csv(BASE_PATH / "global_overview_knowing.csv", index=False)
BASE_PATH

PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4')

In [5]:
# Selective Prediction for Dataset, Model and #Paraphrases
# Graph for 0.5 threshold
threshold = 0.7
df_stats_all_r = df_all_stats[df_all_stats['relation'].str.contains('all_relations')]
df_stats_all_r = df_stats_all_r[(df_stats_all_r['threshold'] == threshold) ]# & (df_stats_all_r['max_paraphrases'] > 0)  & (df_stats_all_r['dataset'].isin(['trex', 'PopQA']))] 
df_stats_all_r['dataset, model, #p, coverage abs.'] = df_stats_all_r['dataset'] + ', ' + df_stats_all_r['model'] + ', ' + df_stats_all_r['max_paraphrases'].astype(str) + ', ' + df_stats_all_r['coverage_abs_global'].astype(str)
print(df_stats_all_r.columns)

df_stats_all_r = convert_for_pdf(df_stats_all_r)


# selective
global_selective_metrics = ['coverage_abs_global', 'coverage_rel_global', 'precision_selective_global']
pp_selective_metrics = ['coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp']
global_argmax_metrics = ['precision_argmax_selective_global', ]
pp_argmax_metrics = ['precision_argmax_selective_pp']


# all (non-selective)
pp_argmax_all_metrics = ['precision_argmax_overall_pp']
global_argmax_all_metrics = ['precision_argmax_overall_global']

# all_metrics = global_selective_metrics + pp_selective_metrics + global_argmax_metrics + pp_argmax_metrics + pp_argmax_all_metrics + global_argmax_all_metrics
metrics_plot = global_selective_metrics.copy() + [ 'precision_argmax_overall_global'] # 'precision_argmax_selective_global',
metrics_plot.remove('coverage_abs_global')
# metrics_plot.remove('coverage_abs_pp')

df_stats_all_r = rename_metrics(df_stats_all_r)
metrics_plot = [ 'coverage',
 'selective precision',
 'argmax precision']

fig_global = px.bar(df_stats_all_r, x='dataset, model, #p, coverage abs.', y=metrics_plot, 
                    color='variable', barmode="group", title=f"Selective Prediction w/ threshold = {threshold} vs. argmax")
fig_global.update_layout( font_family="Serif", font_size=12,
)
fig_global.update_layout( font_family="Serif", font_size=12,
)
fig_global.show()

save_plot(fig_global, BASE_PATH=BASE_PATH, filename='global_overview_knowing_very_sure')

Index(['relation', 'r_s_id', 'max_paraphrases', 'max_o', 'threshold',
       'coverage_abs_global', 'coverage_rel_global',
       'precision_selective_global', 'recall_selective_global',
       'f1_selective_global', 'accuracy_selective_global',
       'tp_selective_global', 'tn_selective_global', 'fp_selective_global',
       'fn_selective_global', 'fpr_selective_global', 'total_selective_global',
       'coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp',
       'recall_selective_pp', 'f1_selective_pp', 'accuracy_selective_pp',
       'tp_selective_pp', 'tn_selective_pp', 'fp_selective_pp',
       'fn_selective_pp', 'fpr_selective_pp', 'total_selective_pp',
       'auc_p_o_global', 'fpr_by_threshold_p_o_global',
       'tpr_by_threshold_p_o_global', 'roc_thresholds_p_o_global',
       'precision_argmax_selective_global', 'recall_argmax_selective_global',
       'f1_argmax_selective_global', 'accuracy_argmax_selective_global',
       'tp_argmax_selective_global', 'tn_argmax

Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / global_overview_knowing_very_sure.pdf


## Detailed Table

In [6]:
run_names = ['hypernymy_2000_50_mistral7B', 'trex_test_2000_50_mistral7B', 'PopQA_test_2000_50_mistral7B', 'hypernymy_2000_50', 'trex_test_2000_50', 'PopQA_test_2000_50']
dataset_per_run = ['hypernymy', 'trex', 'PopQA', 'hypernymy', 'trex', 'PopQA']
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B', 'gpt-l', 'gpt-l', 'gpt-l']

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH)

In [7]:
# Selective Prediction for Dataset, Model and #Paraphrases
# Table for different thresholds
df_stats_all_r = df_all_stats[(df_all_stats['relation'].str.contains('all_relations')) & (df_all_stats['threshold'].isin([0.0, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]))]
df_stats_all_r = convert_for_pdf(df_stats_all_r)


# selective
global_selective_metrics = ['coverage_abs_global', 'coverage_rel_global', 'precision_selective_global']
pp_selective_metrics = ['coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp']
global_argmax_metrics = ['precision_argmax_selective_global', ]
pp_argmax_metrics = ['precision_argmax_selective_pp']

# all (non-selective)
pp_argmax_all_metrics = ['precision_argmax_overall_pp']
global_argmax_all_metrics = ['precision_argmax_overall_global']

all_metrics = global_selective_metrics + pp_selective_metrics + global_argmax_metrics + pp_argmax_metrics + pp_argmax_all_metrics + global_argmax_all_metrics

metrics = ['precision_argmax_overall_global', 'precision_selective_global', 'coverage_abs_global', 'coverage_rel_global']
df_latex = df_stats_all_r.pivot_table(index= ['dataset', 'model' ], columns=['threshold', '#p'], values=metrics)
df_latex = df_latex.transpose()

df_latex.head(5)
# df_stats_all_r

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset,Hypernymy,Hypernymy,PopQA,PopQA,TRex,TRex
Unnamed: 0_level_1,Unnamed: 1_level_1,model,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I
Unnamed: 0_level_2,threshold,#p,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
coverage_abs_global,0.0,0,17310.0,17310.0,37170.0,37170.0,220926.0,220926.0
coverage_abs_global,0.0,20,17310.0,17310.0,37170.0,37170.0,220926.0,220926.0
coverage_abs_global,0.05,0,2107.0,2234.0,2982.0,2381.0,17763.0,12639.0
coverage_abs_global,0.05,20,2325.0,2109.0,3640.0,3506.0,20623.0,16614.0
coverage_abs_global,0.1,0,1170.0,1381.0,1776.0,1558.0,10554.0,8650.0


In [8]:
with open(BASE_PATH / 'global_overview.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper}, 
            caption="Comparision of predictions from P(o, T(r)) $>$ 0.5 vs. top-1 ranked object in settings with and without aggregation over paraphrases.",
            label="tab:global_overview",
            float_format="{:.3f}".format,
            # column_format="|l|l|l|l|",
            escape=True,
            bold_rows=False, # Make the header row bold
            sparsify =True,
            position='H',
        ),
    )

BASE_PATH / 'global_overview.tex'

PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4/global_overview.tex')

## Risk / Coverage Curves for Datasets

In [9]:
run_names = ['hypernymy_2000_50_mistral7B', 'trex_test_2000_50_mistral7B', 'PopQA_test_2000_50_mistral7B', 'hypernymy_2000_50', 'trex_test_2000_50', 'PopQA_test_2000_50']
dataset_per_run = ['hypernymy', 'trex', 'PopQA', 'hypernymy', 'trex', 'PopQA']
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B', 'gpt-l', 'gpt-l', 'gpt-l']

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH)

In [10]:
# Risk Coverage Curve for Dataset, Model and Paraphrases
# RISK / COVERAGE CURVE for GLOBAL Classification with moving threshold
# metrics for 3 full datastes = all relations, all paraphrases, full o_neg set
df_stats_overall = df_all_stats[df_all_stats['relation'].str.contains('all_relations')]
df_stats_overall = df_stats_overall[ (df_stats_overall['dataset'].isin(['trex', 'PopQA', 'hypernymy']))  ] # & (df_stats_overall['max_paraphrases'] > 0) & (df_stats_overall['max_o'] == 30)
# df_stats_overall = df_stats_overall[['dataset',  'model', 'run_name', 'max_paraphrases', 'threshold', 'coverage_abs', 'relation', 'precision', 'recall', 'f1', 'accuracy', 'P(T)_fraction_above_threshold', 'coverage_abs', 'P(T)_fraction_above_threshold|pos', 'P(T)_total|pos', 'precision_argmax', 'recall_argmax', 'f1_argmax', 'accuracy_argmax']] 

df_stats_overall = convert_for_pdf(df_stats_overall)

models = df_stats_overall['model'].unique()
risk_names = ['precision_selective_global']

all_data = []
for model in models:
    df_stats_overall_model = df_stats_overall[df_stats_overall['model'] == model]
    for risk_name in risk_names:
        fig, data = plot_coverage_risk_curve_2(df_stats_overall_model, risk_name=risk_name, coverage_name='coverage_rel_global', plot_title=f'Selective Prediction Using Model: {model}', out_path=BASE_PATH / f'{model}_{risk_name}_coverage_dataset')
        all_data.extend(data)
        fig.show()    

auc_data = pd.DataFrame(all_data, columns=['dataset', 'model', 'run_attributes', 'relation','max_p', 'risk_name', 'auc'])

Plotting run: hypernymy_2000_50_mistral7B
Plotting run: trex_test_2000_50_mistral7B
Plotting run: PopQA_test_2000_50_mistral7B
/Users/dug/Desktop/exp_3_set_proba_V4/Mistral-7B-I_precision_selective_global_coverage_dataset.pdf


Plotting run: hypernymy_2000_50
Plotting run: trex_test_2000_50
Plotting run: PopQA_test_2000_50
/Users/dug/Desktop/exp_3_set_proba_V4/GPT-2-L_precision_selective_global_coverage_dataset.pdf


In [11]:
# Latex Table for AUC values
df_latex = convert_for_pdf(auc_data.copy())
num_para = df_latex["\#p"].to_list()
# df_latex["\#p"] = ['all' if p >0 else p for p in num_para]
df_latex = df_latex.pivot_table(index= ['dataset', 'model'], columns=["\#p"], values=['auc'])

df_latex["rel. diff."] = (df_latex[('auc', 20)] - df_latex[('auc', 0)]) / df_latex[('auc', 0)]

with open(BASE_PATH / 'auc_per_dataset.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper}, 
            caption="AUC values for selective prediction per dataset, model and number of paraphrase templates.",
            label="tab:auc_per_dataset",
            float_format="{:.3f}".format,
            # column_format="|l|l|l|l|",
            escape=True,
            bold_rows=False, # Make the header row bold
            sparsify =True,
            position='H',
        ),
    )

print(BASE_PATH / 'auc_per_dataset.tex')
df_latex


Df columns: Index(['dataset', 'model', 'run_attributes', 'relation', 'max_p', 'risk_name',
       'auc'],
      dtype='object')
/Users/dug/Desktop/exp_3_set_proba_V4/auc_per_dataset.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,auc,auc,rel. diff.
Unnamed: 0_level_1,\#p,0,20,Unnamed: 4_level_1
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Hypernymy,GPT-2-L,0.103144,0.139618,0.353621
Hypernymy,Mistral-7B-I,0.115485,0.147664,0.278637
PopQA,GPT-2-L,0.044263,0.0454,0.025695
PopQA,Mistral-7B-I,0.059163,0.061441,0.038497
TRex,GPT-2-L,0.091359,0.093575,0.024251
TRex,Mistral-7B-I,0.113217,0.108242,-0.043941


# Selective Prediction per Relation

In [12]:
run_names = ['hypernymy_2000_50_mistral7B', 'trex_test_2000_50_mistral7B', 'PopQA_test_2000_50_mistral7B', 'hypernymy_2000_50', 'trex_test_2000_50', 'PopQA_test_2000_50']
dataset_per_run = ['hypernymy', 'trex', 'PopQA', 'hypernymy', 'trex', 'PopQA']
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B', 'gpt-l', 'gpt-l', 'gpt-l']

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH)

In [13]:
# Risk Coverage Curve for Dataset, Model and Paraphrases
# RISK / COVERAGE CURVE for GLOBAL Classification with moving threshold
# metrics for 3 full datastes = all relations, all paraphrases, full o_neg set
df_stats_overall = df_all_stats[(~df_all_stats['relation'].str.contains('all_relations')) & (df_all_stats['r_s_id'].isna())]
df_stats_overall = df_stats_overall[ (df_stats_overall['dataset'].isin(['trex', 'PopQA'])) ] #& (df_stats_overall['max_o'] == 30)
# df_stats_overall = df_stats_overall[['dataset',  'model', 'run_name', 'max_paraphrases', 'threshold', 'coverage_abs', 'relation', 'precision', 'recall', 'f1', 'accuracy', 'P(T)_fraction_above_threshold', 'coverage_abs', 'P(T)_fraction_above_threshold|pos', 'P(T)_total|pos', 'precision_argmax', 'recall_argmax', 'f1_argmax', 'accuracy_argmax']] 

df_stats_overall = convert_for_pdf(df_stats_overall.copy())
models = df_stats_overall['model'].unique()
risk_names = ['precision_selective_global']

all_data = []
for model in models:    
    df_stats_overall_model = df_stats_overall[df_stats_overall['model'] == model]
    for risk_name in risk_names:
        for dataset in df_stats_overall['dataset'].unique():
            df_stats_overall_model_dataset = df_stats_overall_model[df_stats_overall_model['dataset'] == dataset]
            fig, data = plot_coverage_risk_curve_2(df_stats_overall_model_dataset, plot_all_p_only=True, risk_name=risk_name, coverage_name='coverage_rel_global', mode='by_relation', plot_title=f'Selective Prediction Using Model: {model} on Dataset: {dataset}', out_path=BASE_PATH / f'{model}_{risk_name}_coverage_by_relation_{dataset}')
            all_data.extend(data)
            fig.update_layout( font_family="Serif", font_size=12, yaxis_title_text='selective precision')
            fig.show()    
            
auc_data_relations = pd.DataFrame(all_data, columns=['dataset', 'model', 'run_attributes', 'relation', 'r_s_id', 'max_p', 'risk_name', 'auc'])


Plotting run: trex_test_2000_50_mistral7B
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/Mistral-7B-I_precision_selective_global_coverage_by_relation_TRex.pdf


Plotting run: PopQA_test_2000_50_mistral7B
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/Mistral-7B-I_precision_selective_global_coverage_by_relation_PopQA.pdf


Plotting run: trex_test_2000_50
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/GPT-2-L_precision_selective_global_coverage_by_relation_TRex.pdf


Plotting run: PopQA_test_2000_50
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/GPT-2-L_precision_selective_global_coverage_by_relation_PopQA.pdf


In [14]:
# combine to one df
auc_data_all = pd.concat([auc_data, auc_data_relations], axis=0)

# convert to latex table
df_latex = convert_for_pdf(auc_data_all.copy())
num_para = df_latex["\#p"].to_list()
# df_latex["\#p"] = ['all' if p >0 else p for p in num_para]
df_latex = df_latex.pivot_table(index= ['dataset', 'relation' ], columns=['model', '\#p'], values=['auc'])

df_latex[("rel. diff. 0 vs. all p", "GPT-2-L", "")] = (df_latex[('auc', "GPT-2-L", 20)] - df_latex[('auc', "GPT-2-L", 0)]) / df_latex[('auc', "GPT-2-L", 0)]
df_latex[("rel. diff. 0 vs. all p", "Mistral-7B-I", "")] = (df_latex[('auc', "Mistral-7B-I", 20)] - df_latex[('auc', "Mistral-7B-I", 0)]) / df_latex[('auc', "Mistral-7B-I", 0)]
with open(BASE_PATH / 'auc_per_dataset_and_relation.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper}, 
            caption="AUC values for selective prediction using different models on different datasets. Using all available paraphrases.",
            label="tab:auc_per_dataset_and_relation",
            float_format="{:.3f}".format,
            # column_format="|l|l|l|l|",
            escape=True,
            bold_rows=False, # Make the header row bold
            sparsify =True,
            position='H',
        ),
    )

print(BASE_PATH / 'auc_per_dataset_and_relation.tex')
df_latex

Df columns: Index(['dataset', 'model', 'run_attributes', 'relation', 'max_p', 'risk_name',
       'auc', 'r_s_id'],
      dtype='object')
/Users/dug/Desktop/exp_3_set_proba_V4/auc_per_dataset_and_relation.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,auc,auc,auc,auc,rel. diff. 0 vs. all p,rel. diff. 0 vs. all p
Unnamed: 0_level_1,model,GPT-2-L,GPT-2-L,Mistral-7B-I,Mistral-7B-I,GPT-2-L,Mistral-7B-I
Unnamed: 0_level_2,\#p,0,20,0,20,Unnamed: 6_level_2,Unnamed: 7_level_2
dataset,relation,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
Hypernymy,all_relations,0.103144,0.139618,0.115485,0.147664,0.353621,0.278637
PopQA,O is S's father.,0.04419,0.045565,0.053434,0.070567,0.031127,0.320638
PopQA,O is the author of S.,0.029366,0.033533,0.092269,0.067849,0.141885,-0.264662
PopQA,O is the screenwriter of S,0.036382,0.036445,0.058772,0.055123,0.001741,-0.062083
PopQA,S is located in O.,0.073311,0.073467,0.095651,0.08039,0.002126,-0.159554
PopQA,S is the capital of O.,0.09095,0.079625,0.10061,0.091039,-0.124512,-0.095137
PopQA,S is the mother of O.,0.048096,0.044335,0.075371,0.055679,-0.078208,-0.261276
PopQA,S play O.,0.028183,0.031215,0.02737,0.046023,0.107585,0.681513
PopQA,S was born in O.,0.043943,0.045626,0.071057,0.058007,0.038292,-0.183658
PopQA,S's occupation is O.,0.037789,0.039288,0.027572,0.043768,0.039675,0.587397


# Selective Prediction per Subject and Relation

In [15]:
run_names = ['hypernymy_2000_50_mistral7B', 'trex_test_2000_50_mistral7B', 'PopQA_test_2000_50_mistral7B', 'hypernymy_2000_50', 'trex_test_2000_50', 'PopQA_test_2000_50']
dataset_per_run = ['hypernymy', 'trex', 'PopQA', 'hypernymy', 'trex', 'PopQA']
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B', 'gpt-l', 'gpt-l', 'gpt-l']

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH)
df_all_stats = df_all_stats[(~df_all_stats['relation'].str.contains('all_relations')) & (~df_all_stats['r_s_id'].isna())]
df_all_stats = df_all_stats[df_all_stats['threshold'] == 0.5]

df_all_stats = convert_for_pdf(df_all_stats)
# df_all_stats['total_selective_global'].unique()

fig = px.histogram(df_all_stats, x="total_selective_global", color="model", marginal="box", nbins=100)
fig.show()



In [16]:
# Precision @0.5 threhold for Dataset, Model and Paraphrases and subject:
# RISK / COVERAGE CURVE for GLOBAL Classification with moving threshold

threshold = 0.5

for r, run_name in enumerate(run_names):
    print(run_name)
    df_stats, df_instance_permutations = get_data(run_name, BASE_PATH)
    covered = subject_overview(
        df_stats,
        df_instance_permutations,
        threshold=0.5,
    )

    
    fig = px.treemap(covered, path=['orig_relation_template', "label", 's, o'],  title=f'Sequences with Precision > 0.5 argmax o vs. Ground Truth for {dataset_per_run[r]}, {model_per_run[r]}',
                     color="label", hover_data=['s, o','sequence', 'true_o'] ) # value=...
    fig.show()
    
    
    save_plot(fig, BASE_PATH=BASE_PATH, filename=f'per_s_{run_name}')
    
    
    # scatter_df = stats_best_worst.copy()
    # scatter_df['precision_selective_global'] = scatter_df['precision_selective_global'].astype(str)
    # fig = px.scatter(scatter_df, x='s, o', y='precision_selective_global', color='orig_relation_template', title=f'Precision @0.5 threshold for {dataset_per_run[r]}, {model_per_run[r]}')
    # fig.show()
    

    
# OBJ is permutated
# SUBJ stays

# need to get the OBJ associated with the subject by true label


hypernymy_2000_50_mistral7B


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / per_s_hypernymy_2000_50_mistral7B.pdf
trex_test_2000_50_mistral7B


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / per_s_trex_test_2000_50_mistral7B.pdf
PopQA_test_2000_50_mistral7B


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / per_s_PopQA_test_2000_50_mistral7B.pdf
hypernymy_2000_50


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / per_s_hypernymy_2000_50.pdf
trex_test_2000_50


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / per_s_trex_test_2000_50.pdf
PopQA_test_2000_50


Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / per_s_PopQA_test_2000_50.pdf
