# Setup

In [1]:
import json, os, sys, re
import pandas as pd
from collections import defaultdict
import numpy as np
import torch
from scipy import stats
from scipy.stats import entropy
from datasets import load_dataset, Dataset
import itertools
import torch
from pathlib import Path
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from scipy.special import softmax
import pickle
from joblib import Parallel, delayed
import language_tool_python
from itertools import combinations
# add src folder to path
sys.path.append('..')

from dev.ProbLM import JointLM, ConditionalLM
from exp_3_set_proba.prepare_data import correct_grammar, few_shot_examples 
from exp_3_set_proba.analyze import calculate_ranking, calculate_instance_probability # was calculate_p_t_V2

from exp_3_set_proba.utils import hist_of_all_p_t_values, classify, stacked_p_t_plot, hist_of_all_p_t_values, evaluate_classifier, boxplots, scatterplots, calculate_macro_avg, plot_roc_curve, plot_coverage_risk_curve_2, calculate_entropies, get_data_permutations
from exp_3_set_proba.utils import get_data, save_plot, combine_stats_dfs, subject_overview, convert_for_pdf, convert_permutations_for_plotting, find_differences_2_runs, rename_metrics
from dev.ProbLM import JointLM, ConditionalLM


from data_utils import get_wiki_summary
%load_ext autoreload
%autoreload 2

# Home directory of the current user, as returned by expanduser("~/").
HOME_PATH = os.path.expanduser("~/")

# Experiment directory: handed to the stats loaders and used as the target
# directory for the plots and tables written by this notebook.
BASE_PATH = Path(HOME_PATH + "/Desktop/exp_3_set_proba_V4/")

# Groups of column names used when working with the statistics frames.
stat_metrics = [
    'n_objs',
    'n_subjs',
    'n_para',
    'n_instances',
    'dataset',
    'model',
    'run_name',
]
# Not listed: 'tp_global', 'tn_global', 'fp_global', 'fn_global',
# 'tp_argmax_global', 'tn_argmax_global', 'fp_argmax_global', 'fn_argmax_global'.
metrics_global = [
    'coverage_abs_global',
    'coverage_rel',
    'precision_global',
    'recall_global',
    'f1_global',
    'accuracy_global',
    'fpr_global',
    'precision_argmax_global',
    'recall_argmax_global',
    'f1_argmax_global',
    'accuracy_argmax_global',
    'fpr_argmax_global',
][:0] + [
    'coverage_abs',
    'coverage_rel',
    'precision_global',
    'recall_global',
    'f1_global',
    'accuracy_global',
    'fpr_global',
    'precision_argmax_global',
    'recall_argmax_global',
    'f1_argmax_global',
    'accuracy_argmax_global',
    'fpr_argmax_global',
]
metrics_selective = ['precision_selective']
# The misspelled name ("thershold") is kept as-is because other code may refer to it.
metrics_global_0_thershold = [
    'auc_global',
    'fpr_by_threshold_global',
    'tpr_by_threshold_global',
    'roc_thresholds_global',
    'fpr_by_threshold_argmax_global',
    'tpr_by_threshold_argmax_global',
    'roc_thresholds_argmax_global',
    'auc_argmax_global',
]
# Not listed: 'tp_argmax_pp', 'tn_argmax_pp', 'fp_argmax_pp', 'fn_argmax_pp'.
metrics_per_paraphrase = [
    'precision_argmax_pp',
    'recall_argmax_pp',
    'f1_argmax_pp',
    'accuracy_argmax_pp',
    'fpr_argmax_pp',
]
metrics_per_paraphrase_0_threshold = [
    'fpr_by_threshold_argmax_pp',
    'tpr_by_threshold_argmax_pp',
    'roc_thresholds_argmax_pp',
    'auc_argmax_pp',
]

# Last expression of the cell: displays the resolved experiment directory.
BASE_PATH


PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4')

# n-shot demonstrations


In [2]:
# One row per run: (run name, dataset, model, prompting setup).
_run_table = [
    ('trex_test_2000_50_mistral7B', 'trex', 'mistral-7B', '0-shot'),
    ('trex_test_2000_50_3_shot_all_mistral7B', 'trex', 'mistral-7B', '3-shot'),
    ('trex_test_2000_50', 'trex', 'gpt-l', '0-shot'),
    ('trex_test_2000_50_3_shot_all', 'trex', 'gpt-l', '3-shot'),
    ('hypernymy_2000_50_mistral7B', 'hypernymy', 'mistral-7B', '0-shot'),
    ('hypernymy_2000_50_3_shot_all_mistral7B', 'hypernymy', 'mistral-7B', '3-shot'),
    ('hypernymy_2000_50_3_shot_all_neg_mistral7B', 'hypernymy', 'mistral-7B', '3-shot neg'),
    ('hypernymy_2000_50', 'hypernymy', 'gpt-l', '0-shot'),
    ('hypernymy_2000_50_3_shot_all', 'hypernymy', 'gpt-l', '3-shot'),
    ('hypernymy_2000_50_3_shot_all_neg', 'hypernymy', 'gpt-l', '3-shot neg'),
    ('PopQA_test_2000_50_mistral7B', 'PopQA', 'mistral-7B', '0-shot'),
    ('PopQA_test_2000_50_3_shot_all_mistral7B', 'PopQA', 'mistral-7B', '3-shot'),
    ('PopQA_test_2000_50', 'PopQA', 'gpt-l', '0-shot'),
    ('PopQA_test_2000_50_3_shot_all', 'PopQA', 'gpt-l', '3-shot'),
]

# Split the table into the four parallel lists expected by combine_stats_dfs.
run_names = [row[0] for row in _run_table]
dataset_per_run = [row[1] for row in _run_table]
model_per_run = [row[2] for row in _run_table]
run_attributes = [row[3] for row in _run_table]

df_all_stats = combine_stats_dfs(
    run_names,
    dataset_per_run,
    model_per_run,
    BASE_PATH,
    run_attributes=run_attributes,
)

## 2 barplots with different thresholds

In [3]:
# Grouped bar chart per dataset / prompting setup / model:
# selective prediction at a fixed threshold compared with argmax prediction.
threshold = 0.5

# Rows aggregated over all relations, at the chosen threshold, with max_paraphrases > 0.
df_stats_all_r = df_all_stats.loc[df_all_stats['relation'].str.contains('all_relations')]
at_threshold = df_stats_all_r['threshold'] == threshold
with_paraphrases = df_stats_all_r['max_paraphrases'] > 0
df_stats_all_r = df_stats_all_r.loc[at_threshold & with_paraphrases]

df_stats_all_r = convert_for_pdf(df_stats_all_r)

# x-axis label: "<dataset> <run attributes>, <model>, <absolute coverage>".
run_label = df_stats_all_r['dataset'] + ' ' + df_stats_all_r['run_attributes']
coverage_label = df_stats_all_r['coverage_abs_global'].astype(str)
df_stats_all_r['dataset, model, coverage abs.'] = (
    run_label + ', ' + df_stats_all_r['model'] + ', ' + coverage_label
)

# Metric-name groups (selective prediction).
global_selective_metrics = ['coverage_abs_global', 'coverage_rel_global', 'precision_selective_global']
pp_selective_metrics = ['coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp']
global_argmax_metrics = ['precision_argmax_selective_global']
pp_argmax_metrics = ['precision_argmax_selective_pp']

# Metric-name groups (all predictions, non-selective).
pp_argmax_all_metrics = ['precision_argmax_overall_pp']
global_argmax_all_metrics = ['precision_argmax_overall_global']

# The figure uses the display names produced by rename_metrics.
df_figure = rename_metrics(df_stats_all_r.copy())
metrics_plot = ['coverage', 'selective precision', 'argmax precision']

fig_global = px.bar(
    df_figure,
    x='dataset, model, coverage abs.',
    y=metrics_plot,
    color='variable',
    barmode="group",
    title=f"N-shot Prompting: Selective Prediction w/ threshold = {threshold} vs. argmax",
)
fig_global.update_layout(font_family="Serif", font_size=12)
fig_global.show()

save_plot(fig_global, BASE_PATH=BASE_PATH, filename='global_overview_n_shot')



Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / global_overview_n_shot.pdf


In [4]:
# Export the columns shown in the overview bar chart as a CSV next to the plot.
df_differences = df_stats_all_r.loc[
    :,
    [
        "dataset, model, coverage abs.",
        "coverage_rel_global",
        "precision_selective_global",
        "precision_argmax_overall_global",
    ],
]
df_differences.to_csv(BASE_PATH / "global_overview_n_shot.csv", index=False)

print(BASE_PATH)
df_differences

/Users/dug/Desktop/exp_3_set_proba_V4


Unnamed: 0,"dataset, model, coverage abs.",coverage_rel_global,precision_selective_global,precision_argmax_overall_global
155154,"TRex 0-shot, Mistral-7B-I, 2931",0.013267,0.859775,0.639472
155154,"TRex 3-shot, Mistral-7B-I, 3533",0.015992,0.843476,0.663364
155154,"TRex 0-shot, GPT-2-L, 1743",0.00789,0.854848,0.498252
155154,"TRex 3-shot, GPT-2-L, 2253",0.010198,0.68664,0.459596
17394,"Hypernymy 0-shot, Mistral-7B-I, 260",0.01502,0.980769,0.889081
17394,"Hypernymy 3-shot, Mistral-7B-I, 532",0.030734,0.962406,0.939341
17394,"Hypernymy 3-shot neg, Mistral-7B-I, 403",0.023281,0.952854,0.845754
17394,"Hypernymy 0-shot, GPT-2-L, 212",0.012247,0.886792,0.755633
17394,"Hypernymy 3-shot, GPT-2-L, 220",0.012709,0.336364,0.277296
17394,"Hypernymy 3-shot neg, GPT-2-L, 176",0.010168,0.306818,0.249567


## Detailed Table

In [5]:
# Run configuration for the detailed table, one dict per run.
_detailed_runs = [
    {'run': 'trex_test_2000_50_mistral7B', 'dataset': 'trex', 'model': 'mistral-7B', 'attr': '0-shot'},
    {'run': 'trex_test_2000_50_3_shot_all_mistral7B', 'dataset': 'trex', 'model': 'mistral-7B', 'attr': '3-shot'},
    {'run': 'trex_test_2000_50', 'dataset': 'trex', 'model': 'gpt-l', 'attr': '0-shot'},
    {'run': 'trex_test_2000_50_3_shot_all', 'dataset': 'trex', 'model': 'gpt-l', 'attr': '3-shot'},
    {'run': 'hypernymy_2000_50_mistral7B', 'dataset': 'hypernymy', 'model': 'mistral-7B', 'attr': '0-shot'},
    {'run': 'hypernymy_2000_50_3_shot_all_mistral7B', 'dataset': 'hypernymy', 'model': 'mistral-7B', 'attr': '3-shot'},
    {'run': 'hypernymy_2000_50_3_shot_all_neg_mistral7B', 'dataset': 'hypernymy', 'model': 'mistral-7B', 'attr': '3-shot neg'},
    {'run': 'hypernymy_2000_50', 'dataset': 'hypernymy', 'model': 'gpt-l', 'attr': '0-shot'},
    {'run': 'hypernymy_2000_50_3_shot_all', 'dataset': 'hypernymy', 'model': 'gpt-l', 'attr': '3-shot'},
    {'run': 'hypernymy_2000_50_3_shot_all_neg', 'dataset': 'hypernymy', 'model': 'gpt-l', 'attr': '3-shot neg'},
    {'run': 'PopQA_test_2000_50_mistral7B', 'dataset': 'PopQA', 'model': 'mistral-7B', 'attr': '0-shot'},
    {'run': 'PopQA_test_2000_50_3_shot_all_mistral7B', 'dataset': 'PopQA', 'model': 'mistral-7B', 'attr': '3-shot'},
    {'run': 'PopQA_test_2000_50', 'dataset': 'PopQA', 'model': 'gpt-l', 'attr': '0-shot'},
    {'run': 'PopQA_test_2000_50_3_shot_all', 'dataset': 'PopQA', 'model': 'gpt-l', 'attr': '3-shot'},
]

# Parallel lists in the form combine_stats_dfs takes them.
run_names = [cfg['run'] for cfg in _detailed_runs]
dataset_per_run = [cfg['dataset'] for cfg in _detailed_runs]
model_per_run = [cfg['model'] for cfg in _detailed_runs]
run_attributes = [cfg['attr'] for cfg in _detailed_runs]

# Reload the combined statistics so this section can be run on its own.
df_all_stats = combine_stats_dfs(
    run_names, dataset_per_run, model_per_run, BASE_PATH, run_attributes=run_attributes
)

In [6]:
# Table of selective-prediction metrics per dataset, prompting setup and model
# for a fixed set of thresholds (rows aggregated over all relations only).
table_thresholds = [0.0, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]
is_all_relations = df_all_stats['relation'].str.contains('all_relations')
is_table_threshold = df_all_stats['threshold'].isin(table_thresholds)
df_stats_all_r = convert_for_pdf(df_all_stats[is_all_relations & is_table_threshold])

metrics = [
    'precision_argmax_overall_global',
    'precision_selective_global',
    'coverage_abs_global',
    'coverage_rel_global',
]

# Use a header without an underscore for the run attributes.
df_stats_all_r = df_stats_all_r.rename(columns={'run_attributes': 'run attributes'})

# Pivot to (dataset, run attributes, model) x (metric, threshold, #p), then flip it
# so that the runs become the columns.
df_latex = df_stats_all_r.pivot_table(
    index=['dataset', 'run attributes', 'model'],
    columns=['threshold', '#p'],
    values=metrics,
).T

df_latex.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset,Hypernymy,Hypernymy,Hypernymy,Hypernymy,Hypernymy,Hypernymy,PopQA,PopQA,PopQA,PopQA,TRex,TRex,TRex,TRex
Unnamed: 0_level_1,Unnamed: 1_level_1,run attributes,0-shot,0-shot,3-shot,3-shot,3-shot neg,3-shot neg,0-shot,0-shot,3-shot,3-shot,0-shot,0-shot,3-shot,3-shot
Unnamed: 0_level_2,Unnamed: 1_level_2,model,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I
Unnamed: 0_level_3,threshold,#p,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
coverage_abs_global,0.0,0,17310.0,17310.0,17310.0,17310.0,17310.0,17310.0,37170.0,37170.0,37156.0,37170.0,220926.0,220926.0,220926.0,220926.0
coverage_abs_global,0.0,20,17310.0,17310.0,17310.0,17310.0,17310.0,17310.0,37170.0,37170.0,37170.0,37170.0,220926.0,220926.0,220926.0,220926.0
coverage_abs_global,0.05,0,2107.0,2234.0,1787.0,855.0,2052.0,1394.0,2982.0,2381.0,2720.0,1951.0,17763.0,12639.0,18137.0,11118.0
coverage_abs_global,0.05,20,2325.0,2109.0,2137.0,1113.0,2293.0,1791.0,3640.0,3506.0,3075.0,2874.0,20623.0,16614.0,18749.0,13871.0
coverage_abs_global,0.1,0,1170.0,1381.0,1193.0,738.0,1251.0,998.0,1776.0,1558.0,1688.0,1372.0,10554.0,8650.0,11049.0,8017.0


In [7]:
# Render the threshold table as LaTeX, then write it next to the plots.
latex_table = df_latex.to_latex(
    index=True,
    # NOTE(review): no column labelled "name" is visible in df_latex, so this
    # formatter looks like a no-op — confirm before removing.
    formatters={"name": str.upper},
    caption="Comparision of predictions from P(o, T(r)) $>$ 0.5 vs. top-1 ranked object in settings with and without aggregation over paraphrases.",
    label="tab:global_overview_n_shot",
    float_format="{:.3f}".format,
    escape=True,
    bold_rows=False,  # row labels stay in regular weight
    sparsify=True,
    position='H',
)
with open(BASE_PATH / 'global_overview_n_shot.tex', 'w') as tex_file:
    tex_file.write(latex_table)

# Display where the table was written.
BASE_PATH / 'global_overview_n_shot.tex'

PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4/global_overview_n_shot.tex')

## Risk / Coverage Curves for Datasets

In [8]:
# Run configuration for the risk/coverage curves, grouped by dataset and model.
# Each entry: (dataset, model, [(run name, prompting setup), ...]).
_runs_by_group = [
    ('trex', 'mistral-7B', [
        ('trex_test_2000_50_mistral7B', '0-shot'),
        ('trex_test_2000_50_3_shot_all_mistral7B', '3-shot'),
    ]),
    ('trex', 'gpt-l', [
        ('trex_test_2000_50', '0-shot'),
        ('trex_test_2000_50_3_shot_all', '3-shot'),
    ]),
    ('hypernymy', 'mistral-7B', [
        ('hypernymy_2000_50_mistral7B', '0-shot'),
        ('hypernymy_2000_50_3_shot_all_mistral7B', '3-shot'),
        ('hypernymy_2000_50_3_shot_all_neg_mistral7B', '3-shot neg'),
    ]),
    ('hypernymy', 'gpt-l', [
        ('hypernymy_2000_50', '0-shot'),
        ('hypernymy_2000_50_3_shot_all', '3-shot'),
        ('hypernymy_2000_50_3_shot_all_neg', '3-shot neg'),
    ]),
    ('PopQA', 'mistral-7B', [
        ('PopQA_test_2000_50_mistral7B', '0-shot'),
        ('PopQA_test_2000_50_3_shot_all_mistral7B', '3-shot'),
    ]),
    ('PopQA', 'gpt-l', [
        ('PopQA_test_2000_50', '0-shot'),
        ('PopQA_test_2000_50_3_shot_all', '3-shot'),
    ]),
]

# Flatten the groups into the four parallel lists taken by combine_stats_dfs.
run_names = []
dataset_per_run = []
model_per_run = []
run_attributes = []
for group_dataset, group_model, group_runs in _runs_by_group:
    for run_name, run_attribute in group_runs:
        run_names.append(run_name)
        dataset_per_run.append(group_dataset)
        model_per_run.append(group_model)
        run_attributes.append(run_attribute)

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH, run_attributes=run_attributes)

In [9]:
# Risk/coverage curves on dataset level, one figure per model
# (global classification with moving threshold).
# Only the rows aggregated over all relations of the three datasets are used.
is_all_relations = df_all_stats['relation'].str.contains('all_relations')
df_stats_overall = df_all_stats[is_all_relations]
df_stats_overall = df_stats_overall[df_stats_overall['dataset'].isin(['trex', 'PopQA', 'hypernymy'])]

models = df_stats_overall['model'].unique()
risk_names = ['precision_selective_global']

# Rows returned by the plotting helper; turned into the AUC frame below.
all_data = []
for model in models:
    df_stats_overall_model = convert_for_pdf(df_stats_overall[df_stats_overall['model'] == model])
    for risk_name in risk_names:
        fig, data = plot_coverage_risk_curve_2(
            df_stats_overall_model,
            plot_all_p_only=True,
            risk_name=risk_name,
            coverage_name='coverage_rel_global',
            plot_title=f'RC: Selective Prediction Using Model: {model}',
            out_path=BASE_PATH / f'RC_{model}_{risk_name}_coverage_dataset',
        )
        all_data += data
        fig.update_layout(font_family="Serif", font_size=12, yaxis_title_text='selective precision')
        fig.show()

auc_columns = ['dataset', 'model', 'run_attribute', 'relation', 'max_p', 'risk_name', 'auc']
auc_data = pd.DataFrame(all_data, columns=auc_columns)

Plotting run: trex_test_2000_50_mistral7B
Plotting run: trex_test_2000_50_3_shot_all_mistral7B
Plotting run: hypernymy_2000_50_mistral7B
Plotting run: hypernymy_2000_50_3_shot_all_mistral7B
Plotting run: hypernymy_2000_50_3_shot_all_neg_mistral7B
Plotting run: PopQA_test_2000_50_mistral7B
Plotting run: PopQA_test_2000_50_3_shot_all_mistral7B
/Users/dug/Desktop/exp_3_set_proba_V4/RC_mistral-7B_precision_selective_global_coverage_dataset.pdf


Plotting run: trex_test_2000_50
Plotting run: trex_test_2000_50_3_shot_all
Plotting run: hypernymy_2000_50
Plotting run: hypernymy_2000_50_3_shot_all
Plotting run: hypernymy_2000_50_3_shot_all_neg
Plotting run: PopQA_test_2000_50
Plotting run: PopQA_test_2000_50_3_shot_all
/Users/dug/Desktop/exp_3_set_proba_V4/RC_gpt-l_precision_selective_global_coverage_dataset.pdf


In [10]:
# LaTeX table of the dataset-level AUC values, one column per prompting setup.
df_latex = convert_for_pdf(auc_data.copy())
df_latex = df_latex.rename(columns={'run_attribute': 'run attributes'})
df_latex = df_latex.pivot_table(index=['dataset', 'model'], columns=['run attributes'], values=['auc'])

# Relative AUC change when going from 0-shot to 3-shot.
auc_zero_shot = df_latex[('auc', '0-shot')]
auc_three_shot = df_latex[('auc', '3-shot')]
df_latex[("rel. diff.", "0 vs. 3")] = (auc_three_shot - auc_zero_shot) / auc_zero_shot

latex_table = df_latex.to_latex(
    index=True,
    formatters={"name": str.upper},
    caption="AUC values for selective prediction in 0-shot or 3-shot configuration, and for hypernymy data negative 3-shot.",
    label="tab:n_shot_auc_per_dataset",
    float_format="{:.3f}".format,
    escape=True,
    bold_rows=False,  # row labels stay in regular weight
    sparsify=True,
    position='H',
)
with open(BASE_PATH / 'n_shot_auc_per_dataset.tex', 'w') as tex_file:
    tex_file.write(latex_table)

print(BASE_PATH / 'n_shot_auc_per_dataset.tex')
df_latex  # TODO: maybe transpose and rearrange


Df columns: Index(['dataset', 'model', 'run_attribute', 'relation', 'max_p', 'risk_name',
       'auc'],
      dtype='object')
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_auc_per_dataset.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,auc,auc,auc,rel. diff.
Unnamed: 0_level_1,run attributes,0-shot,3-shot,3-shot neg,0 vs. 3
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Hypernymy,GPT-2-L,0.121381,0.106816,0.097361,-0.119999
Hypernymy,Mistral-7B-I,0.131575,0.214637,0.151612,0.631297
PopQA,GPT-2-L,0.044832,0.03923,,-0.124948
PopQA,Mistral-7B-I,0.060302,0.071467,,0.185154
TRex,GPT-2-L,0.092467,0.076246,,-0.175421
TRex,Mistral-7B-I,0.110729,0.125124,,0.129997


### by relation

In [11]:
# Risk/coverage curves per relation, one figure per model and dataset
# (global classification with moving threshold).
# Uses the per-relation rows (not 'all_relations', no r_s_id) of TRex and PopQA.
is_single_relation = ~df_all_stats['relation'].str.contains('all_relations')
has_no_subject_id = df_all_stats['r_s_id'].isna()
df_stats_overall = df_all_stats[is_single_relation & has_no_subject_id]
df_stats_overall = df_stats_overall[df_stats_overall['dataset'].isin(['trex', 'PopQA'])]
# Names are converted before the loop, so plot titles and file names carry the converted labels.
df_stats_overall = convert_for_pdf(df_stats_overall)

models = df_stats_overall['model'].unique()
risk_names = ['precision_selective_global']
datasets = df_stats_overall['dataset'].unique()

all_data = []
# Same iteration order as nested loops: model, then risk name, then dataset.
for model, risk_name, dataset in itertools.product(models, risk_names, datasets):
    df_stats_overall_model = df_stats_overall[df_stats_overall['model'] == model]
    df_stats_overall_model_dataset = df_stats_overall_model[df_stats_overall_model['dataset'] == dataset]
    fig, data = plot_coverage_risk_curve_2(
        df_stats_overall_model_dataset,
        plot_all_p_only=True,
        mode='by_relation',
        risk_name=risk_name,
        coverage_name='coverage_rel_global',
        plot_title=f'N-shot Prompting: Selective Prediction Using Model: {model}',
        out_path=BASE_PATH / f'n_shot_{model}_{risk_name}_coverage_by_relation_{dataset}',
    )
    all_data += data
    fig.update_layout(font_family="Serif", font_size=12, yaxis_title_text='selective precision')
    fig.show()

auc_data_relations = pd.DataFrame(
    all_data,
    columns=['dataset', 'model', 'run_attributes', 'relation', 'r_s_id', 'max_p', 'risk_name', 'auc'],
)


Plotting run: trex_test_2000_50_mistral7B
Plotting by relation
Plotting run: trex_test_2000_50_3_shot_all_mistral7B
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_Mistral-7B-I_precision_selective_global_coverage_by_relation_TRex.pdf


Plotting run: PopQA_test_2000_50_mistral7B
Plotting by relation
Plotting run: PopQA_test_2000_50_3_shot_all_mistral7B
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_Mistral-7B-I_precision_selective_global_coverage_by_relation_PopQA.pdf


Plotting run: trex_test_2000_50
Plotting by relation
Plotting run: trex_test_2000_50_3_shot_all
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_GPT-2-L_precision_selective_global_coverage_by_relation_TRex.pdf


Plotting run: PopQA_test_2000_50
Plotting by relation
Plotting run: PopQA_test_2000_50_3_shot_all
Plotting by relation
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_GPT-2-L_precision_selective_global_coverage_by_relation_PopQA.pdf


In [12]:
# Combine dataset-level and per-relation AUC rows into one frame.
# NOTE(review): auc_data names its column 'run_attribute' while auc_data_relations
# uses 'run_attributes'. After the concat the dataset-level rows therefore have NaN
# in 'run attributes' and appear to be dropped by pivot_table below — confirm whether
# the dataset-level rows are meant to show up in this table.
auc_data_all = pd.concat([auc_data, auc_data_relations], axis=0)
print(len(auc_data_all))

# Convert to the LaTeX table: rows are (dataset, relation, #p), columns are (model, run attributes).
df_latex = convert_for_pdf(auc_data_all.copy())
df_latex = df_latex.rename(columns={'run_attributes': 'run attributes'})
# Raw string: '\#' is not a valid escape sequence in a normal string literal.
df_latex = df_latex.pivot_table(
    index=['dataset', 'relation', r'\#p'],
    columns=['model', 'run attributes'],
    values=['auc'],
)

# Relative AUC change from 0-shot to 3-shot, per model.
for model_name in ("GPT-2-L", "Mistral-7B-I"):
    auc_zero_shot = df_latex[('auc', model_name, '0-shot')]
    auc_three_shot = df_latex[('auc', model_name, '3-shot')]
    df_latex[("rel. diff. 0 vs. 3-shot", model_name, "")] = (auc_three_shot - auc_zero_shot) / auc_zero_shot

print(len(df_latex))

with open(BASE_PATH / 'n_shot_auc_per_dataset_and_relation.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper},
            caption="AUC values per dataset and relation for selective prediction in 0-shot or 3-shot configuration.",
            label="tab:n_shot_auc_per_dataset_and_relation",
            float_format="{:.3f}".format,
            escape=True,
            bold_rows=False,  # row labels stay in regular weight
            sparsify=True,
            position='H',
        ),
    )

print(BASE_PATH / 'n_shot_auc_per_dataset_and_relation.tex')
df_latex




324
Df columns: Index(['dataset', 'model', 'run_attribute', 'relation', 'max_p', 'risk_name',
       'auc', 'run_attributes', 'r_s_id'],
      dtype='object')
74
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_auc_per_dataset_and_relation.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,auc,auc,auc,auc,rel. diff. 0 vs. 3-shot,rel. diff. 0 vs. 3-shot
Unnamed: 0_level_1,Unnamed: 1_level_1,model,GPT-2-L,GPT-2-L,Mistral-7B-I,Mistral-7B-I,GPT-2-L,Mistral-7B-I
Unnamed: 0_level_2,Unnamed: 1_level_2,run attributes,0-shot,3-shot,0-shot,3-shot,Unnamed: 7_level_2,Unnamed: 8_level_2
dataset,relation,\#p,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
PopQA,O is S's father.,0,0.044190,0.027703,0.053434,0.070009,-0.373087,0.310207
PopQA,O is S's father.,20,0.045565,0.037188,0.070567,0.071357,-0.183845,0.011201
PopQA,O is the author of S.,0,0.029366,0.020212,0.092269,0.110951,-0.311739,0.202473
PopQA,O is the author of S.,20,0.033533,0.029391,0.067849,0.080837,-0.123520,0.191433
PopQA,O is the screenwriter of S,0,0.036382,0.022780,0.058772,0.069722,-0.373861,0.186313
...,...,...,...,...,...,...,...,...
TRex,The native language of S is O.,20,0.113321,0.114104,0.146183,0.127423,0.006912,-0.128332
TRex,The official language of S is O.,0,0.099708,0.060308,0.114588,0.111276,-0.395147,-0.028899
TRex,The official language of S is O.,20,0.087350,0.088217,0.101247,0.101247,0.009916,-0.000006
TRex,The original language of S is O.,0,0.073184,0.037452,0.101077,0.087613,-0.488245,-0.133204


In [13]:
# Inspect the rows with the largest AUC drop for GPT-2-L (most negative relative difference first).
# Despite its name, df_PopQA holds the rows of all datasets; no PopQA filter is applied.
gpt2_rel_diff = ('rel. diff. 0 vs. 3-shot', 'GPT-2-L', '')
df_PopQA = df_latex.copy().sort_values(by=gpt2_rel_diff, ascending=True)

df_PopQA.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,auc,auc,auc,auc,rel. diff. 0 vs. 3-shot,rel. diff. 0 vs. 3-shot
Unnamed: 0_level_1,Unnamed: 1_level_1,model,GPT-2-L,GPT-2-L,Mistral-7B-I,Mistral-7B-I,GPT-2-L,Mistral-7B-I
Unnamed: 0_level_2,Unnamed: 1_level_2,run attributes,0-shot,3-shot,0-shot,3-shot,Unnamed: 7_level_2,Unnamed: 8_level_2
dataset,relation,\#p,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
PopQA,S is the mother of O.,0,0.048096,0.017387,0.075371,0.092285,-0.638504,0.2244
TRex,The original language of S is O.,0,0.073184,0.037452,0.101077,0.087613,-0.488245,-0.133204
TRex,The headquarter of S is in O.,0,0.089606,0.047534,0.129496,0.134964,-0.469518,0.042227
TRex,S is named after O.,0,0.135442,0.07553,0.155436,0.253912,-0.442344,0.633545
TRex,S died in O.,0,0.069871,0.039235,0.080851,0.089223,-0.438468,0.103546
TRex,S is the capital of O.,0,0.151619,0.085206,0.204083,0.222219,-0.438029,0.088862
PopQA,S is the capital of O.,0,0.09095,0.052029,0.10061,0.129793,-0.427941,0.290053
TRex,S was created in O.,0,0.065198,0.037821,0.079243,0.082778,-0.419903,0.044606
TRex,S is owned by O.,0,0.095138,0.055339,0.1618,0.163193,-0.418325,0.008613
TRex,S was written in O.,0,0.089786,0.052774,0.11266,0.152103,-0.412225,0.350097


# per subject

In [14]:
# Pairwise run comparison (context vs. no context).
# Each row: (modified run, reference run, dataset, model, modified attribute, reference attribute).
_run_pairs = [
    ('trex_test_2000_50_3_shot_all_mistral7B', 'trex_test_2000_50_mistral7B', 'trex', 'mistral-7B', '3-shot', '0-shot'),
    ('trex_test_2000_50_3_shot_all', 'trex_test_2000_50', 'trex', 'gpt-l', '3-shot', '0-shot'),
    ('hypernymy_2000_50_3_shot_all_mistral7B', 'hypernymy_2000_50_mistral7B', 'hypernymy', 'mistral-7B', '3-shot', '0-shot'),
    ('hypernymy_2000_50_3_shot_all', 'hypernymy_2000_50', 'hypernymy', 'gpt-l', '3-shot', '0-shot'),
    ('PopQA_test_2000_50_3_shot_all_mistral7B', 'PopQA_test_2000_50_mistral7B', 'PopQA', 'mistral-7B', '3-shot', '0-shot'),
    ('PopQA_test_2000_50_3_shot_all', 'PopQA_test_2000_50', 'PopQA', 'gpt-l', '3-shot', '0-shot'),
    ('hypernymy_2000_50_3_shot_all_neg', 'hypernymy_2000_50_3_shot_all', 'hypernymy', 'gpt-l', '3-shot-neg', '3-shot'),
    ('hypernymy_2000_50_3_shot_all_neg_mistral7B', 'hypernymy_2000_50_3_shot_all_mistral7B', 'hypernymy', 'mistral-7B', '3-shot-neg', '3-shot'),
]

# Parallel lists of two-element lists, one entry per comparison.
all_run_names = [[pair[0], pair[1]] for pair in _run_pairs]
all_datasets_per_run = [[pair[2], pair[2]] for pair in _run_pairs]
all_models_per_run = [[pair[3], pair[3]] for pair in _run_pairs]
all_runs_attributes = [[pair[4], pair[5]] for pair in _run_pairs]

# Sanity check: all four lists must have the same length.
len(all_run_names), len(all_datasets_per_run), len(all_models_per_run), len(all_runs_attributes)

(8, 8, 8, 8)

In [15]:
# Per-subject comparison of two runs (with vs. without n-shot demonstrations):
# selective precision at a fixed threshold, shown as a treemap of the predictions
# that changed between the two runs.
threshold = 0.5

hover_columns = ['sub_label', 'obj_label', 'argmax_o', 'old_argmax_o', 'sequence']

# One entry per run pair, as returned by find_differences_2_runs.
change_data = []
for run_names, dataset_per_run, model_per_run, run_attributes in zip(
    all_run_names, all_datasets_per_run, all_models_per_run, all_runs_attributes
):
    # Every configuration list must describe exactly two runs.
    assert (
        len(run_names) == 2
        and len(dataset_per_run) == 2
        and len(model_per_run) == 2
        and len(run_attributes) == 2
    ), f'Need 2 runs for comparison, got {len(run_names)} runs.'

    stats_s_all, stats_plot, data = find_differences_2_runs(run_names, dataset_per_run, model_per_run, run_attributes, BASE_PATH, threshold, metric='precision_selective_global')
    change_data.append(data)

    # Keep only the rows marked as different between the two runs. Copy the
    # selection so the new column is added to an independent frame, not a slice.
    treemap_df = stats_plot[stats_plot['difference'] == 1].copy()
    treemap_df['new_correct_str'] = np.where(
        treemap_df['new_correct'].eq(True),
        'Incorrect -> Correct',
        'Correct -> Incorrect',
    )

    fig = px.treemap(
        treemap_df,
        path=['orig_relation_template', 'new_correct_str', 's, o'],
        title=f'Precision @{threshold} threshold for {dataset_per_run[0]}, {model_per_run[0]}',
        color='category',
        hover_data=hover_columns,
    )
    fig.show()


        



In [17]:
# Summary table: share of predictions that changed between the two runs of each pair.
change_df = pd.DataFrame(change_data, columns=['run_name', 'dataset', 'model', 'run_attributes', 'incorrect -> correct',  'correct -> incorrect', 'no change', 'total'])

# Express each outcome as a fraction of the total (r,s,o+ combinations).
count_column_for = {
    'to correct': 'incorrect -> correct',
    'to incorrect': 'correct -> incorrect',
    'no changes': 'no change',
}
for share_column, count_column in count_column_for.items():
    change_df[share_column] = change_df[count_column] / change_df['total']

# Only the shares go into the table; the raw counts and the run name are dropped.
change_df = change_df.drop(columns=['incorrect -> correct', 'correct -> incorrect', 'no change', 'total', 'run_name'])

df_latex = convert_for_pdf(change_df.copy())
# The table is written with escape=False, so the header must not contain an underscore.
df_latex = df_latex.rename(columns={'run_attributes': 'run attributes'})
df_latex = df_latex.pivot(index=['dataset', 'model'], columns='run attributes', values=['to correct', 'to incorrect', 'no changes'])

with open(BASE_PATH / 'n_shot_changes.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper},
            caption="Comparison 0-shot vs. the run attribute modification.",
            label="tab:n_shot_changes",
            float_format="{:.3f}".format,
            escape=False,
            bold_rows=False,  # row labels stay in regular weight
            sparsify=True,
            position='H',
        ),
    )

print(BASE_PATH / 'n_shot_changes.tex')
df_latex

Df columns: Index(['dataset', 'model', 'run_attributes', 'to correct', 'to incorrect',
       'no changes'],
      dtype='object')
/Users/dug/Desktop/exp_3_set_proba_V4/n_shot_changes.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,to correct,to correct,to incorrect,to incorrect,no changes,no changes
Unnamed: 0_level_1,run_attributes,3-shot,3-shot-neg,3-shot,3-shot-neg,3-shot,3-shot-neg
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Hypernymy,GPT-2-L,0.053726,0.076256,0.2513,0.110919,0.694974,0.812825
Hypernymy,Mistral-7B-I,0.455806,0.031196,0.010399,0.253033,0.533795,0.715771
PopQA,GPT-2-L,0.039563,,0.043656,,0.91678,
PopQA,Mistral-7B-I,0.121419,,0.028649,,0.849932,
TRex,GPT-2-L,0.089161,,0.078089,,0.832751,
TRex,Mistral-7B-I,0.140637,,0.051282,,0.808081,
