In [21]:
import json
import pickle

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import torch
import wandb

In [65]:
# Settings for this analysis run (the keys are looked up by the cells below)
params = dict(
    generation_version='opt-125m',
    experiment_id='run_1',
    verbose=True,
)

In [66]:
# Analysis results of each run, keyed by run identifier
overall_result_dict = {}

# One AUROC value per analysed run (appended to by a later cell)
aurocs_across_models = []

# Sequence embeddings of each run, keyed by run identifier
sequence_embeddings_dict = {}

# Identifier of the run to analyse.
# `params` stores it under 'experiment_id'; the previous lookup of 'run_version'
# raised KeyError on a fresh kernel. An explicit 'run_version' entry, if one is
# ever added to `params`, still takes precedence.
run_version = params['run_version'] if 'run_version' in params else params['experiment_id']

In [67]:
# Display the run identifier that the following cells will analyse
run_version

'run_1'

In [68]:
# We are using wandb to track our experiments.
# Resume the run identified by `run_version` so its stored config can be read and
# the metrics computed below are logged to that same run.
# (The previous `id=run_ids_to_analyze` referenced a name that is never defined.)
wandb.init(project='nlg_uncertainty', id=run_version, resume='allow')

# `run_name` is used by the loader functions below to build the ./sequences and ./log paths;
# it was printed here but never assigned.
# NOTE(review): taken from the resumed wandb run - confirm it matches the directory names on disk.
run_name = wandb.run.name
model_name = wandb.config.model
print(run_name)
print(model_name)

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,0.3988
average_neg_llh_most_likely_gen_auroc,0.59308
average_rougeL_among_generations,0.11742
average_rougeL_among_generations_correct,0.13015
average_rougeL_among_generations_incorrect,0.10897
average_rougeL_auroc,0.49073
entropy_over_concepts_auroc,0.62563
ln_predictive_entropy_auroc,0.62683
margin_measure_auroc,0.58221
model_name,opt-350m


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

run_1
opt-125m


In [69]:
def load_similarity_dataframe():
    """
    Read the pickled similarity results of the current run into a DataFrame.

    The file is looked up at ``./ss/<run_version>/<model_name>_generations_similarities.pkl``,
    where ``run_version`` and ``model_name`` are module-level variables.

    Returns:
        DataFrame: One row per top-level key of the unpickled dictionary. The key is
        copied into an 'id' column, 'has_semantically_different_answers' is cast to
        int, and 'rougeL_among_generations' holds the 'rougeL' entry of each row's
        'syntactic_similarities' value.
    """
    pickle_path = f'./ss/{run_version}/{model_name}_generations_similarities.pkl'

    with open(pickle_path, 'rb') as handle:
        raw_similarities = pickle.load(handle)

    # The top-level keys of the dictionary become the row index
    frame = pd.DataFrame.from_dict(raw_similarities, orient='index')

    # Expose the index as a regular column so the frame can be merged on 'id'
    frame['id'] = frame.index

    flag_column = 'has_semantically_different_answers'
    frame[flag_column] = frame[flag_column].astype('int')

    # Pull the 'rougeL' entry out of every per-row similarity mapping
    frame['rougeL_among_generations'] = frame['syntactic_similarities'].map(lambda scores: scores['rougeL'])

    return frame


In [70]:
def load_generations_dataframe(rouge_threshold=0.3):
    """
    Load the generations data from a pickle file and transform it into a DataFrame.

    The file is read from ``./sequences/<run_name>/<model_name>_generations.pkl``,
    where ``run_name`` and ``model_name`` are module-level variables.

    Args:
        rouge_threshold (float): A generation is marked correct when its
            'rougeL_to_target' value is strictly greater than this threshold.
            Defaults to 0.3, the value that was previously hard-coded.

    Returns:
        DataFrame: The generation data plus the derived columns
        'length_of_most_likely_generation', 'length_of_answer',
        'variance_of_length_of_generations' and 'correct' (0/1).
    """

    def count_words(text):
        # Number of single-space-delimited tokens; non-string values are stringified first
        return len(str(text).split(' '))

    # Construct the file path based on run and model names
    file_path = f'./sequences/{run_name}/{model_name}_generations.pkl'

    # NOTE: pickle.load can execute arbitrary code - only load files produced by this project
    with open(file_path, 'rb') as file:
        generation_data = pickle.load(file)

    generation_dataframe = pd.DataFrame(generation_data)

    # Each stored id is a container; keep its first element and store it as a plain object
    generation_dataframe['id'] = generation_dataframe['id'].apply(lambda x: x[0]).astype('object')

    # Unwrap the reference-answer scores, but only for columns without any missing value
    # (the first element of each entry is converted with `.item()`)
    for column in ('semantic_variability_reference_answers', 'rougeL_reference_answers'):
        if not generation_dataframe[column].isnull().values.any():
            generation_dataframe[column] = generation_dataframe[column].apply(lambda x: x[0].item())

    # Word counts of the most likely generation and of the reference answer
    generation_dataframe['length_of_most_likely_generation'] = generation_dataframe['most_likely_generation'].apply(count_words)
    generation_dataframe['length_of_answer'] = generation_dataframe['answer'].apply(count_words)

    # Variance of the word counts across all generated texts of a row
    generation_dataframe['variance_of_length_of_generations'] = generation_dataframe['generated_texts'].apply(
        lambda texts: np.var([count_words(text) for text in texts]))

    # A generation counts as correct when its Rouge-L score against the target exceeds the threshold
    generation_dataframe['correct'] = (generation_dataframe['rougeL_to_target'] > rouge_threshold).astype('int')

    return generation_dataframe

In [71]:
def load_likelihood_dataframe():
    """
    Read the pickled aggregated likelihoods of the current run.

    The file is looked up at
    ``./log/<run_name>/aggregated_likelihoods_<model_name>_generations.pkl``.
    ``run_name``, ``model_name`` and ``num_generations`` are module-level variables
    and must be defined before this function is called.

    Returns:
        DataFrame: The selected likelihood entries, with the 'ids' column renamed to 'id'.
        sequence_embeddings: The 'sequence_embeddings' entry of the unpickled dictionary.
    """
    pickle_path = f'./log/{run_name}/aggregated_likelihoods_{model_name}_generations.pkl'

    with open(pickle_path, 'rb') as handle:
        raw_likelihoods = pickle.load(handle)
        print(raw_likelihoods.keys())

    # Per-subset columns are named '<metric>_on_subset_<i>' for i = 1 .. num_generations
    per_subset_metrics = ['average_predictive_entropy', 'predictive_entropy',
                          'semantic_predictive_entropy', 'number_of_semantic_sets']
    subset_columns = tuple(
        f"{metric}_on_subset_{i}"
        for metric in per_subset_metrics
        for i in range(1, num_generations + 1)
    )

    base_columns = ('ids', 'predictive_entropy', 'mutual_information', 'average_predictive_entropy',
                    'average_pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen',
                    'average_neg_log_likelihood_of_second_most_likely_gen', 'neg_log_likelihood_of_most_likely_gen',
                    'predictive_entropy_over_concepts', 'number_of_semantic_sets', 'unnormalised_entropy_over_concepts')

    def to_cpu_squeezed(value):
        # Tensors are moved to the CPU and stripped of size-1 dimensions; other values pass through
        if isinstance(value, torch.Tensor):
            return torch.squeeze(value.cpu())
        return value

    # A missing key raises KeyError here
    selected = {name: to_cpu_squeezed(raw_likelihoods[name]) for name in base_columns + subset_columns}

    sequence_embeddings = raw_likelihoods['sequence_embeddings']

    likelihood_frame = pd.DataFrame.from_dict(selected).rename(columns={'ids': 'id'})

    return likelihood_frame, sequence_embeddings

In [72]:
# Load data from the respective functions
similarity_dataframe = load_similarity_dataframe()
generation_dataframe = load_generations_dataframe()

# Determine the number of generations based on the 'generated_texts' column
num_generations = len(generation_dataframe['generated_texts'][0])

# Load likelihood data and sequence embeddings
likelihood_dataframe, sequence_embeddings = load_likelihood_dataframe()

# Merge the dataframes based on the 'id' column to create a comprehensive result dataframe
comprehensive_dataframe = generation_dataframe.merge(similarity_dataframe, on='id').merge(likelihood_dataframe, on='id')

# Record the number of samples before any filtering
n_samples_before_filtering = len(comprehensive_dataframe)

# Calculate the length of the most likely generation for each row
comprehensive_dataframe['len_most_likely_generation_length'] = comprehensive_dataframe['most_likely_generation'].apply(lambda x: len(x.split()))

dict_keys(['neg_log_likelihoods', 'average_neg_log_likelihoods', 'sequence_embeddings', 'pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen', 'average_neg_log_likelihood_of_second_most_likely_gen', 'neg_log_likelihood_of_most_likely_gen', 'semantic_set_ids', 'ids', 'mutual_information', 'predictive_entropy', 'predictive_entropy_over_concepts', 'unnormalised_entropy_over_concepts', 'number_of_semantic_sets', 'margin_measures', 'unnormalised_margin_measures', 'average_predictive_entropy', 'average_predictive_entropy_on_subset_1', 'predictive_entropy_on_subset_1', 'semantic_predictive_entropy_on_subset_1', 'number_of_semantic_sets_on_subset_1', 'average_predictive_entropy_on_subset_2', 'predictive_entropy_on_subset_2', 'semantic_predictive_entropy_on_subset_2', 'number_of_semantic_sets_on_subset_2', 'average_predictive_entropy_on_subset_3', 'predictive_entropy_on_subset_3', 'semantic_predictive_entropy_on_subset_3', 'number_of_semantic_sets_on_subset_3', 'averag

In [73]:
# Initialize a dictionary to store analysis results
analysis_results = {}
analysis_results['accuracy'] = comprehensive_dataframe['correct'].mean()

In [74]:
# AUROC of several uncertainty scores.
# The label passed to roc_auc_score is 1 for an incorrect answer and 0 for a correct one.
incorrect_labels = 1 - comprehensive_dataframe['correct']

# 1. Length-normalised predictive entropy
ln_predictive_entropy_auroc = sklearn.metrics.roc_auc_score(
    incorrect_labels, comprehensive_dataframe['average_predictive_entropy'])
analysis_results['ln_predictive_entropy_auroc'] = ln_predictive_entropy_auroc

# 2. Predictive entropy
predictive_entropy_auroc = sklearn.metrics.roc_auc_score(
    incorrect_labels, comprehensive_dataframe['predictive_entropy'])
analysis_results['predictive_entropy_auroc'] = predictive_entropy_auroc

# 3. Entropy over concepts
entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(
    incorrect_labels, comprehensive_dataframe['predictive_entropy_over_concepts'])
analysis_results['entropy_over_concepts_auroc'] = entropy_over_concepts_auroc


In [75]:
# 4. Unnormalized Entropy Over Concepts (if present in the dataframe)
if 'unnormalised_entropy_over_concepts' in comprehensive_dataframe.columns:
    unnormalised_entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(
        1 - comprehensive_dataframe['correct'], comprehensive_dataframe['unnormalised_entropy_over_concepts'])
    analysis_results['unnormalised_entropy_over_concepts_auroc'] = unnormalised_entropy_over_concepts_auroc

# Add the entropy over concepts AUROC to the list for across models comparison
# (re-running this cell appends the value again)
aurocs_across_models.append(entropy_over_concepts_auroc)

# 5. Negative Log Likelihood of Most Likely Generation
neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'],
                                                              comprehensive_dataframe['neg_log_likelihood_of_most_likely_gen'])
analysis_results['neg_llh_most_likely_gen_auroc'] = neg_llh_most_likely_gen_auroc

# 6. Number of Semantic Sets
number_of_semantic_sets_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'],
                                                              comprehensive_dataframe['number_of_semantic_sets'])
analysis_results['number_of_semantic_sets_auroc'] = number_of_semantic_sets_auroc

# Compute average number of semantic sets for correct and incorrect predictions
analysis_results['number_of_semantic_sets_correct'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 1]['number_of_semantic_sets'].mean()
analysis_results['number_of_semantic_sets_incorrect'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 0]['number_of_semantic_sets'].mean()

# Compute average Rouge-L scores for all, correct, and incorrect predictions
analysis_results['average_rougeL_among_generations'] = comprehensive_dataframe['rougeL_among_generations'].mean()
analysis_results['average_rougeL_among_generations_correct'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 1]['rougeL_among_generations'].mean()
analysis_results['average_rougeL_among_generations_incorrect'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 0]['rougeL_among_generations'].mean()

# 7. Rouge-L AUROC
# Unlike the scores above, this one is evaluated against 'correct' itself (not 1 - correct).
# The statement was previously cut off after the first argument, which is a SyntaxError;
# the score column is the same one the verbose printout uses for its Rouge-L AUROC.
analysis_results['average_rougeL_auroc'] = sklearn.metrics.roc_auc_score(comprehensive_dataframe['correct'],
                                                                         comprehensive_dataframe['rougeL_among_generations'])

In [76]:
# Label for the AUROCs below: 1 for an incorrect answer, 0 for a correct one
incorrect_labels = 1 - comprehensive_dataframe['correct']

# 8. Length-averaged negative log likelihood of the most likely generation
average_neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(
    incorrect_labels,
    comprehensive_dataframe['average_neg_log_likelihood_of_most_likely_gen'],
)
analysis_results['average_neg_llh_most_likely_gen_auroc'] = average_neg_llh_most_likely_gen_auroc

# 9. Rouge-L based accuracy (mean of the 0/1 'correct' column)
analysis_results['rougeL_based_accuracy'] = comprehensive_dataframe['correct'].mean()

# 10. Margin measure AUROC: the score is the sum of the averaged negative log
#     likelihoods of the most likely and the second most likely generation
margin_score = (
    comprehensive_dataframe['average_neg_log_likelihood_of_most_likely_gen']
    + comprehensive_dataframe['average_neg_log_likelihood_of_second_most_likely_gen']
)
analysis_results['margin_measure_auroc'] = sklearn.metrics.roc_auc_score(incorrect_labels, margin_score)

In [77]:
# Optional diagnostics, switched on by the 'verbose' entry of `params`.
# Fixes: `args` and `result_df` are not defined in this notebook, and
# 'margin_measure_auroc' is stored in `analysis_results`, not as a dataframe column.
if params['verbose']:
    print('Number of samples:', len(comprehensive_dataframe))
    print(comprehensive_dataframe['predictive_entropy'].mean())
    print(comprehensive_dataframe['average_predictive_entropy'].mean())
    print(comprehensive_dataframe['predictive_entropy_over_concepts'].mean())
    print('ln_predictive_entropy_auroc', ln_predictive_entropy_auroc)
    print('semantci entropy auroc', entropy_over_concepts_auroc)

    # Entropy over concepts minus 3x the Rouge-L agreement among generations, used as a combined score
    # NOTE(review): the weight 3 is a magic number - confirm where it comes from
    combined_entropy_auroc = sklearn.metrics.roc_auc_score(
        1 - comprehensive_dataframe['correct'],
        comprehensive_dataframe['predictive_entropy_over_concepts'] - 3 * comprehensive_dataframe['rougeL_among_generations'])
    print('Semantic entropy +', combined_entropy_auroc)

    # Rouge-L among generations is evaluated against 'correct' itself (not 1 - correct)
    rougeL_auroc = sklearn.metrics.roc_auc_score(comprehensive_dataframe['correct'], comprehensive_dataframe['rougeL_among_generations'])
    print('RougeL among generations auroc', rougeL_auroc)
    print('margin measure auroc:', analysis_results['margin_measure_auroc'])

Number of samples: 7184
3.4509668
0.9861466
4.7006407
ln_predictive_entropy_auroc 0.5700715655231574
semantci entropy auroc 0.57555895225158
Semantic entropy + 0.5857361771617284
RougeL among generations auroc 0.49982581267572435
margin measure auroc: 0.5998732476167101


In [78]:
# Initialize lists to store AUROCs and other metrics for different numbers of generations
ln_aurocs = []
predictive_aurocs = []
semantic_entropy_aurocs = []
avg_semantic_sets = []
avg_semantic_sets_correct = []
avg_semantic_sets_incorrect = []

# Compute metrics for each subset of generations
for i in range(1, num_generations + 1):
    subset_suffix = f"_on_subset_{i}"
    
    # Length Normalized Predictive Entropy AUROC
    ln_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe[f'average_predictive_entropy{subset_suffix}'])
    ln_aurocs.append(ln_auroc)
    
    # Predictive Entropy AUROC
    predictive_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe[f'predictive_entropy{subset_suffix}'])
    predictive_aurocs.append(predictive_auroc)
    
    # Semantic Predictive Entropy AUROC
    semantic_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe[f'semantic_predictive_entropy{subset_suffix}'])
    semantic_entropy_aurocs.append(semantic_auroc)
    
    # Average number of semantic sets for all, correct, and incorrect predictions
    avg_semantic_sets.append(comprehensive_dataframe[f'number_of_semantic_sets{subset_suffix}'].mean())
    avg_semantic_sets_correct.append(comprehensive_dataframe[comprehensive_dataframe['correct'] == 1][f'number_of_semantic_sets{subset_suffix}'].mean())
    avg_semantic_sets_incorrect.append(comprehensive_dataframe[comprehensive_dataframe['correct'] == 0][f'number_of_semantic_sets{subset_suffix}'].mean())

# Update the analysis results dictionary with the computed metrics
analysis_results.update({
    'ln_predictive_entropy_auroc_on_subsets': ln_aurocs,
    'predictive_entropy_auroc_on_subsets': predictive_aurocs,
    'semantic_predictive_entropy_auroc_on_subsets': semantic_entropy_aurocs,
    'average_number_of_semantic_sets_on_subsets': avg_semantic_sets,
    'average_number_of_semantic_sets_on_subsets_correct': avg_semantic_sets_correct,
    'average_number_of_semantic_sets_on_subsets_incorrect': avg_semantic_sets_incorrect,
    'model_name': model_name,
    'run_name': run_version
})

In [79]:
# Log the metrics of this run to wandb.
# They are collected in `analysis_results`; `result_dict` is never defined in this
# notebook and raised NameError on a fresh kernel.
wandb.log(analysis_results)

# Store the analysis results and sequence embeddings for the current run ID
overall_result_dict[run_version] = analysis_results
sequence_embeddings_dict[run_version] = sequence_embeddings

# Close the wandb run so everything logged above is uploaded
wandb.finish()

# Free up GPU memory
torch.cuda.empty_cache()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
average_neg_llh_most_likely_gen_auroc,▁
average_rougeL_among_generations,▁
average_rougeL_among_generations_correct,▁
average_rougeL_among_generations_incorrect,▁
average_rougeL_auroc,▁
entropy_over_concepts_auroc,▁
ln_predictive_entropy_auroc,▁
margin_measure_auroc,▁
neg_llh_most_likely_gen_auroc,▁

0,1
accuracy,0.27492
average_neg_llh_most_likely_gen_auroc,0.56815
average_rougeL_among_generations,0.08512
average_rougeL_among_generations_correct,0.09761
average_rougeL_among_generations_incorrect,0.08039
average_rougeL_auroc,0.49983
entropy_over_concepts_auroc,0.57556
ln_predictive_entropy_auroc,0.57007
margin_measure_auroc,0.59987
model_name,opt-125m


In [80]:
def _to_builtin(value):
    """Fallback for json.dump: convert NumPy / torch scalars and arrays to plain Python values."""
    if isinstance(value, (np.generic, np.ndarray, torch.Tensor)):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Write the per-run metrics as JSON. Means of float32 columns are NumPy scalars that
# the json module rejects, so `default` converts them; values json already
# understands are written exactly as before.
with open('./overall_results.json', 'w') as f:
    json.dump(overall_result_dict, f, default=_to_builtin)

# The embeddings are kept as a pickle
with open('./sequence_embeddings.pkl', 'wb') as f:
    pickle.dump(sequence_embeddings_dict, f)

In [81]:
# Extract relevant columns from the comprehensive dataframe for accuracy verification
accuracy_check_dataframe = comprehensive_dataframe[['most_likely_generation', 'answer', 'correct']]

# Save the accuracy verification data to a CSV file
accuracy_check_dataframe.to_csv('accuracy_verification.csv')