# Setup

In [1]:
import json, os, sys, re
import pandas as pd
from collections import defaultdict
import numpy as np
import torch
from scipy import stats
from scipy.stats import entropy
from datasets import load_dataset, Dataset
import itertools
import torch
from pathlib import Path
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from scipy.special import softmax
import pickle
from joblib import Parallel, delayed
import language_tool_python
from itertools import combinations
# add src folder to path
sys.path.append('..')

from dev.ProbLM import JointLM, ConditionalLM
from exp_3_set_proba.prepare_data import correct_grammar, few_shot_examples 
from exp_3_set_proba.analyze import calculate_ranking, calculate_instance_probability # was calculate_p_t_V2

from exp_3_set_proba.utils import hist_of_all_p_t_values, classify, stacked_p_t_plot, hist_of_all_p_t_values, evaluate_classifier, boxplots, scatterplots, calculate_macro_avg, plot_roc_curve, plot_coverage_risk_curve_2, calculate_entropies, get_data_permutations
from exp_3_set_proba.utils import get_data, save_plot, combine_stats_dfs, subject_overview, convert_for_pdf, convert_permutations_for_plotting, find_differences_2_runs, rename_metrics
from dev.ProbLM import JointLM, ConditionalLM


from data_utils import get_wiki_summary
%load_ext autoreload
%autoreload 2

HOME_PATH = os.path.expanduser("~/")

BASE_PATH = Path(f"{HOME_PATH}/Desktop/exp_3_set_proba_V4/")

stat_metrics  = ['n_objs', 'n_subjs', 'n_para', 'n_instances',
       'dataset', 'model', 'run_name']
metrics_global = ['coverage_abs', 'coverage_rel','precision_global', 'recall_global', 'f1_global',
       'accuracy_global', 'fpr_global', 
       'precision_argmax_global', 'recall_argmax_global', 'f1_argmax_global',
       'accuracy_argmax_global',  'fpr_argmax_global',
       ] # others: 'tp_global', 'tn_global', 'fp_global', 'fn_global', 'tp_argmax_global', 'tn_argmax_global', 'fp_argmax_global', 'fn_argmax_global',
metrics_selective = ['precision_selective']
metrics_global_0_thershold = ['auc_global', 'fpr_by_threshold_global',
       'tpr_by_threshold_global', 'roc_thresholds_global', 'fpr_by_threshold_argmax_global', 'tpr_by_threshold_argmax_global',
       'roc_thresholds_argmax_global', 'auc_argmax_global']
metrics_per_paraphrase = ['precision_argmax_pp', 'recall_argmax_pp', 'f1_argmax_pp',
       'accuracy_argmax_pp', 'fpr_argmax_pp'] # others: 'tp_argmax_pp', 'tn_argmax_pp', 'fp_argmax_pp','fn_argmax_pp',
metrics_per_paraphrase_0_threshold = ['fpr_by_threshold_argmax_pp',
       'tpr_by_threshold_argmax_pp', 'roc_thresholds_argmax_pp', 'auc_argmax_pp']

BASE_PATH


PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4')

# Reading Comprehension
- add contexts about subject

In [2]:
run_names = ['hypernymy_2000_50_s_contexts_mistral7B', 'hypernymy_2000_50_mistral7B', 'hypernymy_2000_50_s_contexts_neg_mistral7B',
             'hypernymy_2000_50_s_contexts',  'hypernymy_2000_50', 'hypernymy_2000_50_s_contexts_neg']
dataset_per_run = ['hypernymy', 'hypernymy', 'hypernymy',
                   'hypernymy', 'hypernymy', 'hypernymy',]
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B',
                 'gpt-l',  'gpt-l', 'gpt-l']
run_attributes = ['+ context', '', '+ neg context', 
                  '+ context', '', '+ neg context' ]

len(run_names), len(dataset_per_run), len(model_per_run), len(run_attributes)
df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH, run_attributes=run_attributes)

## 2 barplots with different thresholds

In [3]:
# Selective Prediction for Dataset, Model and #Paraphrases
# Graph for 0.5 threshold
threshold = 0.5
df_stats_all_r = df_all_stats[df_all_stats['relation'].str.contains('all_relations')]
df_stats_all_r = df_stats_all_r[ (df_stats_all_r['threshold'] == threshold) & (df_stats_all_r['max_paraphrases'] > 0) ] 
df_stats_all_r = convert_for_pdf(df_stats_all_r)
df_stats_all_r['dataset, model, coverage abs.'] = df_stats_all_r['dataset'] + ' ' + df_stats_all_r['run_attributes'] + ', ' + df_stats_all_r['model'] + ', ' + ', ' + df_stats_all_r['coverage_abs_global'].astype(str)

# selective
global_selective_metrics = ['coverage_abs_global', 'coverage_rel_global', 'precision_selective_global']
pp_selective_metrics = ['coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp']
global_argmax_metrics = ['precision_argmax_selective_global', ]
pp_argmax_metrics = ['precision_argmax_selective_pp']

# all (non-selective)
pp_argmax_all_metrics = ['precision_argmax_overall_pp']
global_argmax_all_metrics = ['precision_argmax_overall_global']

# all_metrics = global_selective_metrics + pp_selective_metrics + global_argmax_metrics + pp_argmax_metrics + pp_argmax_all_metrics + global_argmax_all_metrics
metrics_plot = global_selective_metrics.copy() + [ 'precision_argmax_overall_global'] # 'precision_argmax_selective_global',
metrics_plot.remove('coverage_abs_global')
# metrics_plot.remove('coverage_abs_pp')

df_bar_plot = rename_metrics(df_stats_all_r.copy())
metrics_plot = [ 'coverage',
 'selective precision',
 'argmax precision']
fig_global = px.bar(df_bar_plot, x='dataset, model, coverage abs.', y=metrics_plot, 
                    color='variable', barmode="group", title=f"RC: Selective Prediction w/ threshold = {threshold} vs. argmax",
                    #text='',
)


fig_global.show()

save_plot(fig_global, BASE_PATH=BASE_PATH, filename='global_overview_RC')



Saved to /Users/dug/Desktop/exp_3_set_proba_V4 / global_overview_RC.pdf


In [4]:
df_differences = df_stats_all_r[["dataset, model, coverage abs.", "coverage_rel_global", "precision_selective_global", "precision_argmax_overall_global"]]

df_differences.to_csv(BASE_PATH / "global_overview_RC.csv", index=False)
print(BASE_PATH)
df_differences

/Users/dug/Desktop/exp_3_set_proba_V4


Unnamed: 0,"dataset, model, coverage abs.",coverage_rel_global,precision_selective_global,precision_argmax_overall_global
17394,"Hypernymy + context, Mistral-7B-I, , 280",0.016176,0.967857,0.885615
17394,"Hypernymy , Mistral-7B-I, , 260",0.01502,0.980769,0.889081
17394,"Hypernymy + neg context, Mistral-7B-I, , 144",0.008319,0.652778,0.426343
17394,"Hypernymy + context, GPT-2-L, , 397",0.022935,0.906801,0.857886
17394,"Hypernymy , GPT-2-L, , 212",0.012247,0.886792,0.755633
17394,"Hypernymy + neg context, GPT-2-L, , 251",0.0145,0.278884,0.279029


## Detailed Table

In [5]:
run_names = ['hypernymy_2000_50_s_contexts_mistral7B', 'hypernymy_2000_50_mistral7B', 'hypernymy_2000_50_s_contexts_neg_mistral7B',
             'hypernymy_2000_50_s_contexts',  'hypernymy_2000_50', 'hypernymy_2000_50_s_contexts_neg']
dataset_per_run = ['hypernymy', 'hypernymy', 'hypernymy',
                   'hypernymy', 'hypernymy', 'hypernymy',]
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B',
                 'gpt-l',  'gpt-l', 'gpt-l']
run_attributes = ['+ context', '', '+ neg context', 
                  '+ context', '', '+ neg context' ]

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH, run_attributes=run_attributes)

In [5]:
# Selective Prediction for Dataset, Model and #Paraphrases
# Table for different thresholds
df_stats_all_r = df_all_stats[(df_all_stats['relation'].str.contains('all_relations')) & (df_all_stats['threshold'].isin([0.0, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]))]
num_para = df_stats_all_r['max_paraphrases'].to_list()
# # num_para = ['all' if x >0 else '0' for x in num_para]
# df_stats_all_r['\#paraphrases'] = num_para
df_stats_all_r['dataset'] = df_stats_all_r['dataset'] + ' ' + df_stats_all_r['run_attributes']

df_stats_all_r = convert_for_pdf(df_stats_all_r, para_expl=True)

# # selective
# global_selective_metrics = ['coverage_abs_global', 'coverage_rel_global', 'precision_selective_global']
# pp_selective_metrics = ['coverage_abs_pp', 'coverage_rel_pp', 'precision_selective_pp']
# global_argmax_metrics = ['precision_argmax_selective_global', ]
# pp_argmax_metrics = ['precision_argmax_selective_pp']

# # all (non-selective)
# pp_argmax_all_metrics = ['precision_argmax_overall_pp']
# global_argmax_all_metrics = ['precision_argmax_overall_global']

# all_metrics = global_selective_metrics + pp_selective_metrics + global_argmax_metrics + pp_argmax_metrics + pp_argmax_all_metrics + global_argmax_all_metrics

metrics = ['precision_argmax_overall_global', 'precision_selective_global', 'coverage_abs_global', 'coverage_rel_global']
df_latex = df_stats_all_r.pivot_table(index= ['dataset', 'model' ], columns=['threshold', '#p'], values=metrics)
df_latex = df_latex.transpose()

df_latex.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset,Hypernymy,Hypernymy,Hypernymy + context,Hypernymy + context,Hypernymy + neg context,Hypernymy + neg context
Unnamed: 0_level_1,Unnamed: 1_level_1,model,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I,GPT-2-L,Mistral-7B-I
Unnamed: 0_level_2,threshold,#p,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
coverage_abs_global,0.0,0,17310.0,17310.0,17310.0,17310.0,17310.0,17310.0
coverage_abs_global,0.0,20,17310.0,17310.0,17310.0,17310.0,17310.0,17310.0
coverage_abs_global,0.05,0,2107.0,2234.0,2216.0,1345.0,2301.0,1610.0
coverage_abs_global,0.05,20,2325.0,2109.0,1676.0,2323.0,2123.0,2572.0
coverage_abs_global,0.1,0,1170.0,1381.0,1218.0,1008.0,1252.0,1103.0


In [6]:
with open(BASE_PATH / 'global_overview_RC.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper}, 
            caption="Comparision of predictions from P(o, T(r)) $>$ 0.5 vs. top-1 ranked object in settings with and without aggregation over paraphrases.",
            label="tab:global_overview_RC",
            float_format="{:.3f}".format,
            # column_format="|l|l|l|l|",
            escape=True,
            bold_rows=False, # Make the header row bold
            sparsify =True,
            position='H',
        ),
    )

BASE_PATH / 'global_overview_RC.tex'

PosixPath('/Users/dug/Desktop/exp_3_set_proba_V4/global_overview_RC.tex')

## Risk / Coverage Curves for Datasets

In [7]:
run_names = ['hypernymy_2000_50_s_contexts_mistral7B', 'hypernymy_2000_50_mistral7B', 'hypernymy_2000_50_s_contexts_neg_mistral7B',
             'hypernymy_2000_50_s_contexts',  'hypernymy_2000_50', 'hypernymy_2000_50_s_contexts_neg']
dataset_per_run = ['hypernymy', 'hypernymy', 'hypernymy',
                   'hypernymy', 'hypernymy', 'hypernymy',]
model_per_run = ['mistral-7B', 'mistral-7B', 'mistral-7B',
                 'gpt-l',  'gpt-l', 'gpt-l']
run_attributes = ['+ context', '', '+ neg context', 
                  '+ context', '', '+ neg context' ]

df_all_stats = combine_stats_dfs(run_names, dataset_per_run, model_per_run, BASE_PATH, run_attributes=run_attributes)

In [8]:
# Risk Coverage Curve for Dataset, Model and Paraphrases
# RISK / COVERAGE CURVE for GLOBAL Classification with moving threshold
# metrics for 3 full datastes = all relations, all paraphrases, full o_neg set
df_stats_overall = df_all_stats[df_all_stats['relation'].str.contains('all_relations')]
df_stats_overall = df_stats_overall[ (df_stats_overall['dataset'].isin(['trex', 'PopQA', 'hypernymy']))  ] # & (df_stats_overall['max_paraphrases'] > 0) & (df_stats_overall['max_o'] == 30)
# df_stats_overall = df_stats_overall[['dataset',  'model', 'run_name', 'max_paraphrases', 'threshold', 'coverage_abs', 'relation', 'precision', 'recall', 'f1', 'accuracy', 'P(T)_fraction_above_threshold', 'coverage_abs', 'P(T)_fraction_above_threshold|pos', 'P(T)_total|pos', 'precision_argmax', 'recall_argmax', 'f1_argmax', 'accuracy_argmax']] 
df_stats_overall = convert_for_pdf(df_stats_overall)
models = df_stats_overall['model'].unique()
risk_names = ['precision_selective_global']

all_data = []
for model in models:
    df_stats_overall_model = df_stats_overall[df_stats_overall['model'] == model]
    for risk_name in risk_names:
        fig, data = plot_coverage_risk_curve_2(df_stats_overall_model, risk_name=risk_name, coverage_name='coverage_rel_global', plot_title=f'RC: Selective Prediction Using Model: {model}', out_path=BASE_PATH / f'RC_{model}_{risk_name}_coverage_dataset')
        all_data.extend(data)
        fig.update_layout( font_family="Serif", font_size=12, yaxis_title_text='selective precision')
        fig.show()    
        
auc_data = pd.DataFrame(all_data, columns=['dataset', 'model', 'run_attributes', 'relation', 'max_p', 'risk_name', 'auc'])

Plotting run: hypernymy_2000_50_s_contexts_mistral7B
Plotting run: hypernymy_2000_50_mistral7B
Plotting run: hypernymy_2000_50_s_contexts_neg_mistral7B
/Users/dug/Desktop/exp_3_set_proba_V4/RC_Mistral-7B-I_precision_selective_global_coverage_dataset.pdf


Plotting run: hypernymy_2000_50_s_contexts
Plotting run: hypernymy_2000_50
Plotting run: hypernymy_2000_50_s_contexts_neg
/Users/dug/Desktop/exp_3_set_proba_V4/RC_GPT-2-L_precision_selective_global_coverage_dataset.pdf


In [9]:
# Latex Table for AUC values
df_latex = convert_for_pdf(auc_data.copy())
df_latex.rename(columns={'run_attributes': 'run attributes'}, inplace=True)
run_attributes = df_latex['run attributes'].to_list()
df_latex['run attributes'] = ['0-shot' if x == '' else x for x in run_attributes]
df_latex = df_latex.pivot_table(index= ['dataset', 'model' ], columns=['run attributes'], values=['auc'])
df_latex["rel. diff.", "+ context"] = (df_latex[('auc', '+ context')] - df_latex[('auc', '0-shot')]) / df_latex[('auc', '0-shot')]
df_latex["rel. diff.", "+ neg context"] = (df_latex[('auc', '+ neg context')] - df_latex[('auc', '0-shot')]) / df_latex[('auc', '0-shot')]
with open(BASE_PATH / 'RC_auc_per_dataset.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper}, 
            caption="AUC values for selective prediction on the hypernymy data in the standard setting (0-shot), with context information about the subject, and with a random subject provided as control run.",
            label="tab:RC_auc_per_dataset",
            float_format="{:.3f}".format,
            # column_format="|l|l|l|l|",
            escape=True,
            bold_rows=False, # Make the header row bold
            sparsify =True,
            position='H',
        ),
    )

print(BASE_PATH / 'RC_auc_per_dataset.tex')
df_latex


Df columns: Index(['dataset', 'model', 'run_attributes', 'relation', 'max_p', 'risk_name',
       'auc'],
      dtype='object')
/Users/dug/Desktop/exp_3_set_proba_V4/RC_auc_per_dataset.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,auc,auc,auc,rel. diff.,rel. diff.
Unnamed: 0_level_1,run attributes,+ context,+ neg context,0-shot,+ context,+ neg context
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Hypernymy,GPT-2-L,0.142964,0.099229,0.121381,0.177809,-0.182504
Hypernymy,Mistral-7B-I,0.150922,0.119916,0.131575,0.147042,-0.088606


# per subject

In [10]:
# pairwise run comparision (context vs. no context)
all_run_names = [['hypernymy_2000_50_s_contexts_mistral7B', 'hypernymy_2000_50_mistral7B'], ['hypernymy_2000_50_s_contexts',  'hypernymy_2000_50'], ['hypernymy_2000_50_s_contexts_neg_mistral7B', 'hypernymy_2000_50_mistral7B'], ['hypernymy_2000_50_s_contexts_neg', 'hypernymy_2000_50']]
all_datasets_per_run = [['hypernymy', 'hypernymy'], ['hypernymy', 'hypernymy'], ['hypernymy', 'hypernymy'], ['hypernymy', 'hypernymy']]
all_models_per_run = [['mistral-7B', 'mistral-7B'], ['gpt-l',  'gpt-l'], ['mistral-7B', 'mistral-7B'], ['gpt-l',  'gpt-l']]
all_runs_attributes = [['+ context', ''], ['+ context', ''], ['+ neg context', ''], ['+ neg context', '']]

len(all_run_names), len(all_datasets_per_run), len(all_models_per_run), len(all_runs_attributes)

(4, 4, 4, 4)

In [11]:
# Precision @0.5 threshold for Dataset, Model and Paraphrases and subject: compare with and without context
# RISK / COVERAGE CURVE for GLOBAL Classification with moving threshold

threshold = 0.5

change_data = []
for t in range(len(all_run_names)):
    run_names = all_run_names[t]
    dataset_per_run = all_datasets_per_run[t]
    model_per_run = all_models_per_run[t]
    run_attributes = all_runs_attributes[t]    
    
    assert len(run_names) == 2 and len(dataset_per_run) == 2 and len(model_per_run) and len(run_attributes) ==2, f'Need 2 runs for comparison, got {len(run_names)} runs.'

    stats_s_all, stats_plot, data = find_differences_2_runs(run_names, dataset_per_run, model_per_run, run_attributes, BASE_PATH, threshold, metric='precision_selective_global')
    change_data.append(data)

    # plot same and different r,s,o for both runs 
    fig = px.treemap(stats_plot, path=['orig_relation_template', 'difference_str', 's, o'],  title=f'Precision @0.5 threshold for {dataset_per_run[0]}, {model_per_run[0]}',
                        color='category', hover_data=['sub_label', 'obj_label', 'argmax_o', 'old_argmax_o', 'sequence'] ) # value=...
    # fig.show()

    # plot the changes
    treemap_df = stats_plot[stats_plot['difference'] == 1]
    new_correct = treemap_df['new_correct'].to_list()
    treemap_df['new_correct_str'] = ['Incorrect -> Correct' if x == True else 'Correct -> Incorrect' for x in new_correct]

    fig = px.treemap(treemap_df, path=['orig_relation_template', 'new_correct_str', 's, o'],  title=f'Precision @0.5 threshold for {dataset_per_run[0]}, {model_per_run[0]}',
                        color='category', hover_data=['sub_label', 'obj_label', 'argmax_o', 'old_argmax_o', 'sequence'] ) # value=...
    fig.show()


  

In [12]:
      
change_df = pd.DataFrame(change_data, columns=['run_name', 'dataset', 'model', 'run_attributes', 'incorrect -> correct',  'correct -> incorrect', 'no change', 'total'])

change_df[f'to correct'] = change_df['incorrect -> correct'] / change_df['total'] 
change_df[f'to incorrect'] = change_df['correct -> incorrect'] / change_df['total']
change_df[f'no changes'] = change_df['no change'] / change_df['total']  # r,s,o+ combinations

change_df = change_df.astype({'incorrect -> correct': 'int', 'correct -> incorrect': 'int', 'no change': 'int', 'total': 'int'})
change_df.drop(columns=['incorrect -> correct', 'correct -> incorrect', 'no change', 'total', 'run_name'], inplace=True)
df_latex = convert_for_pdf(change_df.copy())
df_latex.rename(columns={'run_attribute': 'run attributes'}, inplace=True)

df_latex = df_latex.pivot(index=['dataset', 'model'], columns='run_attributes', values=['to correct', 'to incorrect', 'no changes'])

with open(BASE_PATH / 'RC_changes.tex', 'w') as f:
    f.write(
        df_latex.to_latex(
            index=True, formatters={"name": str.upper}, 
            caption="Comparison 0-shot vs. the run attibute modification.",
            label="tab:RC_changes",
            float_format="{:.3f}".format,
            # column_format="|l|l|l|l|",
            escape=False,
            bold_rows=False, # Make the header row bold
            sparsify =True,
            position='H',
        ),
    )

print(BASE_PATH / f'RC_changes.tex')
df_latex

Df columns: Index(['dataset', 'model', 'run_attributes', 'to correct', 'to incorrect',
       'no changes'],
      dtype='object')
/Users/dug/Desktop/exp_3_set_proba_V4/RC_changes.tex


Unnamed: 0_level_0,Unnamed: 1_level_0,to correct,to correct,to incorrect,to incorrect,no changes,no changes
Unnamed: 0_level_1,run_attributes,+ context,+ neg context,+ context,+ neg context,+ context,+ neg context
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Hypernymy,GPT-2-L,0.315425,0.02773,0.017331,0.232236,0.667244,0.740035
Hypernymy,Mistral-7B-I,0.17331,0.020797,0.145581,0.299827,0.681109,0.679376
