In [29]:
import pickle
import torch
import numpy as np
import os
from sentence_transformers import SentenceTransformer, util
import time
import pandas as pd
from uncertainty.uncertainty_measures.semantic_entropy import cluster_assignment_entropy, logsumexp_by_id, predictive_entropy_rao


In [30]:
# Model for computing sentence embeddings. We use one trained for similar questions detection
model = SentenceTransformer("all-MiniLM-L6-v2")


In [31]:
def restore (filepath) :
    """
    read a pickle file and restore the object
    """
    with open(filepath, 'rb') as f:
        obj = pickle.load(f)
    return obj

In [32]:
run_id = "latest-run" # latest-run for latest run
output_dir = f"../root/uncertainty/wandb/{run_id}/files"

results_path = os.path.join(output_dir, "uncertainty_measures.pkl")
answers_path = os.path.join(output_dir, "train_generations.pkl")


results = restore(results_path)
answers = restore(answers_path)

In [33]:
for key in results.keys():
    print(f"Key: {key}")
    print(f"Value: {len(results[key])}")
    
entropy_dict = {}
for key in results['uncertainty_measures'].keys():
    if key.endswith("entropy"):
        entropy_dict[key] = results['uncertainty_measures'][key]

print(f"Entropy dict keys: {entropy_dict.keys()}")

Key: uncertainty_measures
Value: 1318
Key: semantic_ids
Value: 20
Key: graphs
Value: 20
Key: validation_is_false
Value: 20
Key: validation_unanswerable
Value: 20
Entropy dict keys: dict_keys(['cluster_assignment_entropy', 'regular_entropy', 'semantic_entropy', 'heat_t=0.1_kernel_entropy', 'heatn_t=0.1_kernel_entropy', 'heat_t=0.2_kernel_entropy', 'heatn_t=0.2_kernel_entropy', 'heat_t=0.3_kernel_entropy', 'heatn_t=0.3_kernel_entropy', 'heat_t=0.4_kernel_entropy', 'heatn_t=0.4_kernel_entropy', 'heat_t=0.5_kernel_entropy', 'heatn_t=0.5_kernel_entropy', 'heat_t=0.6_kernel_entropy', 'heatn_t=0.6_kernel_entropy', 'heat_t=0.7_kernel_entropy', 'heatn_t=0.7_kernel_entropy', 'matern_kappa=1.0_nu=1.0_kernel_entropy', 'maternn_kappa=1.0_nu=1.0_kernel_entropy', 'matern_kappa=1.0_nu=2.0_kernel_entropy', 'maternn_kappa=1.0_nu=2.0_kernel_entropy', 'matern_kappa=1.0_nu=3.0_kernel_entropy', 'maternn_kappa=1.0_nu=3.0_kernel_entropy', 'matern_kappa=2.0_nu=1.0_kernel_entropy', 'maternn_kappa=2.0_nu=1.0_ker

## ID's analysis

In [34]:
# retrieve the corpus_sentences to the first question
i = 2
question_id = list(answers.keys())[i]
responses = answers[question_id]['responses']

# each question has a list of responses, that have semantic id's
semantic_ids = results['semantic_ids'][i]
corpus_sentences = [x[0] for x in responses]
log_liks = [x[1] for x in responses]
log_liks_agg = [np.mean(log_lik) for log_lik in log_liks]

print(semantic_ids)

[0, 1, 2, 1, 1, 1, 0, 3, 1, 1]


In [35]:
semantic_entropy = entropy_dict['semantic_entropy'][i]
regular_entropy = entropy_dict['regular_entropy'][i]
kernel_entropy = entropy_dict['heat_t=0.1_kernel_entropy'][i]
cluster_entropy = entropy_dict['cluster_assignment_entropy'][i]


### Generate semantic id's using the responses and similarity metric

In [36]:
df = pd.DataFrame({
    'answer': corpus_sentences,
    'semantic_id': semantic_ids
})

Embeddings

In [37]:
corpus_sentences = list(corpus_sentences)
print("Encode the corpus. This might take a while")

embeds_path = f"corpus_embeds_{i}.pt"

if os.path.exists(embeds_path): 
    corpus_embeddings = torch.load(embeds_path)
    print("Successfully loaded embeds")
else :
    corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
    torch.save(corpus_embeddings, embeds_path)
    print("Successfully saved embeds")

Encode the corpus. This might take a while
Successfully loaded embeds


  corpus_embeddings = torch.load(embeds_path)


#### 1. [fast clustering](https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/applications/clustering/fast_clustering.py)

In [38]:
print("Start clustering")
start_time = time.time()

# Two parameters to tune:
# min_cluster_size: Only consider cluster that have at least 25 elements
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=1, threshold=0.8)

print(f"Clustering done after {time.time() - start_time:.2f} sec")

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i + 1}, #{len(cluster)} Elements ")
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])

new_semantic_ids = np.array([0] * len(responses))

for i, cluster in enumerate(clusters):
    for idx in cluster:
        new_semantic_ids[idx] = i
        
df['new_semantic_id'] = new_semantic_ids
new_cluster_entropy = cluster_assignment_entropy(df['new_semantic_id'].values)

# Compute semantic entropy.
unique_ids, log_likelihood_per_semantic_id = logsumexp_by_id(semantic_ids, log_liks_agg, agg='sum_normalized')
new_semantic_entropy = predictive_entropy_rao(log_likelihood_per_semantic_id)

Start clustering
Clustering done after 0.00 sec

Cluster 1, #3 Elements 
	 23
	 26
	 26
	 ...
	 23
	 26
	 26

Cluster 2, #2 Elements 
	 3
	 4
	 ...
	 3
	 4

Cluster 3, #1 Elements 
	 108
	 ...
	 108

Cluster 4, #1 Elements 
	 33
	 ...
	 33

Cluster 5, #1 Elements 
	 858
	 ...
	 858

Cluster 6, #1 Elements 
	 18
	 ...
	 18

Cluster 7, #1 Elements 
	 9
	 ...
	 9


#### 2. [agglomorative clustering](https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/applications/clustering/agglomerative.py)

In [39]:

from sklearn.cluster import AgglomerativeClustering

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=0.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings.cpu().numpy())

semantic_ids = clustering_model.labels_
df['agg_cluster_id'] = semantic_ids

agg_cluster_entropy = cluster_assignment_entropy(semantic_ids)

# Compute semantic entropy.
unique_ids, log_likelihood_per_semantic_id = logsumexp_by_id(semantic_ids, log_liks_agg, agg='sum_normalized')
agg_semantic_entropy = predictive_entropy_rao(log_likelihood_per_semantic_id)

Comparision with semantic id's

In [40]:
print(f"{'Entropy Type':<22} {'Value':<10}")
print("-" * 30)
print(f"{'Cluster Entropy':<22} {cluster_entropy:<10.4f}")
print(f"{'Regular Entropy':<22} {regular_entropy:<10.4f}")
print(f"{'Semantic Entropy':<22} {semantic_entropy:<10.4f}")
print(f"{'Kernel Entropy':<22} {kernel_entropy:<10.4f}")
print(f"{'New Cluster Entropy':<22} {new_cluster_entropy:<10.4f}")
print(f"{'New Semantic Entropy':<22} {new_semantic_entropy:<10.4f}")
print(f"{'Agglomerative Cluster Entropy':<22} {agg_cluster_entropy:<10.4f}")
print(f"{'Agglomerative Semantic Entropy':<22} {agg_semantic_entropy:<10.4f}")

df


Entropy Type           Value     
------------------------------
Cluster Entropy        1.0889    
Regular Entropy        1.1150    
Semantic Entropy       0.6601    
Kernel Entropy         2.2292    
New Cluster Entropy    1.8344    
New Semantic Entropy   0.9231    
Agglomerative Cluster Entropy 2.1640    
Agglomerative Semantic Entropy 2.0936    


Unnamed: 0,answer,semantic_id,new_semantic_id,agg_cluster_id
0,108,0,2,4
1,3,1,1,8
2,23,2,0,7
3,33,1,3,5
4,26,1,0,0
5,26,1,0,0
6,858,0,4,6
7,4,3,1,2
8,18,1,5,3
9,9,1,6,1
