In [13]:

from semantic_uncertainty.uncertainty import utils
import pickle
import torch
import numpy as np
import os
from sentence_transformers import SentenceTransformer, util
import time
import pandas as pd
from uncertainty.uncertainty_measures.semantic_entropy import cluster_assignment_entropy, logsumexp_by_id, predictive_entropy_rao


In [14]:
# Model for computing sentence embeddings. We use one trained for similar questions detection
# model = SentenceTransformer("all-MiniLM-L6-v2")


In [15]:
def restore (filepath) :
    """
    read a pickle file and restore the object
    """
    with open(filepath, 'rb') as f:
        obj = pickle.load(f)
    return obj

In [16]:
run_id = "latest-run" # latest-run for latest run
output_dir = f"../A-Rush-R/uncertainty/wandb/{run_id}/files"

results_path = os.path.join(output_dir, "uncertainty_measures.pkl")
answers_path = os.path.join(output_dir, "train_generations.pkl")


results = restore(results_path)
answers = restore(answers_path)

In [17]:
for key in results.keys():
    print(f"Key: {key}")
    print(f"Value: {len(results[key])}")
    
entropy_dict = {}
for key in results['uncertainty_measures'].keys():
    if key.endswith("entropy"):
        entropy_dict[key] = results['uncertainty_measures'][key]

print(f"Entropy dict keys: {entropy_dict.keys()}")

Key: uncertainty_measures
Value: 1318
Key: semantic_ids
Value: 5
Key: graphs
Value: 5
Key: validation_is_false
Value: 5
Key: validation_unanswerable
Value: 5
Entropy dict keys: dict_keys(['cluster_assignment_entropy', 'regular_entropy', 'semantic_entropy', 'heat_t=0.1_kernel_entropy', 'heatn_t=0.1_kernel_entropy', 'heat_t=0.2_kernel_entropy', 'heatn_t=0.2_kernel_entropy', 'heat_t=0.3_kernel_entropy', 'heatn_t=0.3_kernel_entropy', 'heat_t=0.4_kernel_entropy', 'heatn_t=0.4_kernel_entropy', 'heat_t=0.5_kernel_entropy', 'heatn_t=0.5_kernel_entropy', 'heat_t=0.6_kernel_entropy', 'heatn_t=0.6_kernel_entropy', 'heat_t=0.7_kernel_entropy', 'heatn_t=0.7_kernel_entropy', 'matern_kappa=1.0_nu=1.0_kernel_entropy', 'maternn_kappa=1.0_nu=1.0_kernel_entropy', 'matern_kappa=1.0_nu=2.0_kernel_entropy', 'maternn_kappa=1.0_nu=2.0_kernel_entropy', 'matern_kappa=1.0_nu=3.0_kernel_entropy', 'maternn_kappa=1.0_nu=3.0_kernel_entropy', 'matern_kappa=2.0_nu=1.0_kernel_entropy', 'maternn_kappa=2.0_nu=1.0_kernel_

## ID's analysis

In [19]:
# retrieve the corpus_sentences to the first question
i = 0
question_id = list(answers.keys())[i]
responses = answers[question_id]['responses']

# each question has a list of responses, that have semantic id's
semantic_ids = results['semantic_ids'][i]
corpus_sentences = [x[0] for x in responses]
log_liks = [x[1] for x in responses]
log_liks_agg = [np.mean(log_lik) for log_lik in log_liks]

print(semantic_ids)

[0, 1, 0, 2, 0]


In [20]:
semantic_entropy = entropy_dict['semantic_entropy'][i]
regular_entropy = entropy_dict['regular_entropy'][i]
kernel_entropy = entropy_dict['heat_t=0.1_kernel_entropy'][i]
cluster_entropy = entropy_dict['cluster_assignment_entropy'][i]


### Generate semantic id's using the responses and similarity metric

#### 1. [fast clustering](https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/applications/clustering/fast_clustering.py)

In [21]:
corpus_sentences = list(corpus_sentences)
print("Encode the corpus. This might take a while")

embeds_path = f"corpus_embeds_{i}.pkl"

if os.path.exists(embeds_path): 
    corpus_embeddings = torch.load(embeds_path)
    print("Successfully loaded embeds")
else :
    corpus_embeddings = model.encode(corpus_sentecommunity_detectionnces, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
    torch.save(corpus_embeddings, embeds_path)
    print("Successfully saved embeds")

print("Start clustering")
start_time = time.time()

# Two parameters to tune:
# min_cluster_size: Only consider cluster that have at least 25 elements
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=1, threshold=0.8)

print(f"Clustering done after {time.time() - start_time:.2f} sec")

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i + 1}, #{len(cluster)} Elements ")
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])

new_semantic_ids = np.array([0] * len(responses))

for i, cluster in enumerate(clusters):
    for idx in cluster:
        new_semantic_ids[idx] = i

Encode the corpus. This might take a while
Successfully loaded embeds
Start clustering
Clustering done after 0.09 sec

Cluster 1, #3 Elements 
	 406
	 406
	 406
	 ...
	 406
	 406
	 406

Cluster 2, #1 Elements 
	 132
	 ...
	 132

Cluster 3, #1 Elements 
	 129
	 ...
	 129


  corpus_embeddings = torch.load(embeds_path)


Comparision with semantic id's

In [22]:
# Plot the answer and new semantic id and the semantic id
df = pd.DataFrame({
    'answer': corpus_sentences,
    'semantic_id': semantic_ids,
    'new_semantic_id': new_semantic_ids
})

new_cluster_entropy = cluster_assignment_entropy(df['new_semantic_id'].values)


# Compute semantic entropy.
unique_ids, log_likelihood_per_semantic_id = logsumexp_by_id(semantic_ids, log_liks_agg, agg='sum_normalized')
new_semantic_entropy = predictive_entropy_rao(log_likelihood_per_semantic_id)


In [23]:
print(f"{'Entropy Type':<22} {'Value':<10}")
print("-" * 30)
print(f"{'Cluster Entropy':<22} {cluster_entropy:<10.4f}")
print(f"{'Regular Entropy':<22} {regular_entropy:<10.4f}")
print(f"{'Semantic Entropy':<22} {semantic_entropy:<10.4f}")
print(f"{'Kernel Entropy':<22} {kernel_entropy:<10.4f}")
print(f"{'New Cluster Entropy':<22} {new_cluster_entropy:<10.4f}")
print(f"{'New Semantic Entropy':<22} {new_semantic_entropy:<10.4f}")

df


Entropy Type           Value     
------------------------------
Cluster Entropy        0.9503    
Regular Entropy        0.9261    
Semantic Entropy       0.5483    
Kernel Entropy         1.5855    
New Cluster Entropy    0.9503    
New Semantic Entropy   1.0185    


Unnamed: 0,answer,semantic_id,new_semantic_id
0,406,0,0
1,132,1,1
2,406,0,0
3,406,2,0
4,129,0,2
