In [101]:
from semantic_uncertainty.uncertainty import utils
import pickle
import numpy as np
import os
from sentence_transformers import SentenceTransformer, util
import time


In [102]:
# Model for computing sentence embeddings. We use one trained for similar questions detection
model = SentenceTransformer("all-MiniLM-L6-v2")


In [103]:
def restore (filepath) :
    """
    read a pickle file and restore the object
    """
    with open(filepath, 'rb') as f:
        obj = pickle.load(f)
    return obj

In [104]:
run_id = "latest-run" # latest-run for latest run
output_dir = f"A-Rush-R/uncertainty/wandb/{run_id}/files"

results_path = os.path.join(output_dir, "uncertainty_measures.pkl")
answers_path = os.path.join(output_dir, "train_generations.pkl")


results = restore(results_path)
answers = restore(answers_path)

### Debugging

In [105]:
for key in results.keys():
    print(f"Key: {key}")
    print(f"Value: {len(results[key])}")
    
print(results["semantic_ids"])

Key: uncertainty_measures
Value: 1318
Key: semantic_ids
Value: 5
Key: graphs
Value: 5
Key: validation_is_false
Value: 5
Key: validation_unanswerable
Value: 5
[[0, 1, 0, 2, 0], [0, 1, 0, 2, 0], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 2, 3]]


In [106]:
results['uncertainty_measures']['semantic_entropy']

[0.5482929330011391,
 0.5792597219026698,
 1.4792809189749923,
 1.582697469347715,
 1.1640771350975672]

In [107]:
for key in sorted(answers.keys()):
    print(key)
    for next_key in sorted(answers[key].keys()):
        print(f"  {next_key}")
        print("     ", type(answers[key][next_key]))
        
        if isinstance(answers[key][next_key], dict):
            for sub_key in sorted(answers[key][next_key].keys()):
                print(f"        {sub_key}")
                print("           ", type(answers[key][next_key][sub_key]))
                if hasattr(answers[key][next_key][sub_key], "shape"):
                    print(f"                  {answers[key][next_key][sub_key].shape}")
                elif isinstance(answers[key][next_key][sub_key], list):
                    print(f"                  {len(answers[key][next_key][sub_key])}")
                print(f"                  {(answers[key][next_key][sub_key])}")
        else:
            if hasattr(answers[key][next_key], "shape"):
                print(f"        {answers[key][next_key].shape}")
            else:
                print(f"        {len(answers[key][next_key])}")
            print(f"        {answers[key][next_key]}")
    print(200 * "-")

chal-173
  context
      <class 'str'>
        86
        Frank had some pieces of candy. If he put them into 26 bags with 33 pieces in each bag
  most_likely_answer
      <class 'dict'>
        accuracy
            <class 'float'>
                  0.0
        embedding
            <class 'torch.Tensor'>
                  torch.Size([1, 2048])
                  tensor([[-0.0683,  2.4701,  0.9007,  ..., -5.6090,  0.0349, -0.8314]])
        response
            <class 'str'>
                  9
        token_log_likelihoods
            <class 'list'>
                  2
                  [0.0, -0.7643496990203857]
  question
      <class 'str'>
        37
        How many pieces of candy did he have?
  reference
      <class 'dict'>
        answers
            <class 'dict'>
                  {'answer_start': [], 'text': ['858']}
        id
            <class 'str'>
                  chal-173
  responses
      <class 'list'>
        5
        [('27', [0.0, -3.358163356781006], tensor([[

## ID's analysis

In [108]:
# retrieve the corpus_sentences to the first question
i = 4
question_id = list(answers.keys())[i]
responses = answers[question_id]['responses']

# each question has a list of responses, that have semantic id's
semantic_ids = results['semantic_ids'][i]
print(semantic_ids)

[0, 1, 2, 2, 3]


In [109]:
print(len(responses))

5


### Generate semantic id's using the responses and similarity metric

#### 1. [fast clustering](https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/applications/clustering/fast_clustering.py)

demo 

In [110]:
corpus_sentences = [x[0] for x in responses]
print(corpus_sentences)

['2', '5', '2', '20', '15']


In [111]:
corpus_sentences = list(corpus_sentences)
print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)


print("Start clustering")
start_time = time.time()

# Two parameters to tune:
# min_cluster_size: Only consider cluster that have at least 25 elements
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=1, threshold=0.8)

print(f"Clustering done after {time.time() - start_time:.2f} sec")

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i + 1}, #{len(cluster)} Elements ")
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])

new_semantic_ids = np.array([0] * len(responses))

for i, cluster in enumerate(clusters):
    for idx in cluster:
        new_semantic_ids[idx] = i

Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec

Cluster 1, #2 Elements 
	 2
	 2
	 ...
	 2
	 2

Cluster 2, #1 Elements 
	 5
	 ...
	 5

Cluster 3, #1 Elements 
	 20
	 ...
	 20

Cluster 4, #1 Elements 
	 15
	 ...
	 15


In [112]:
# print the sentences, their semantic id and the new semantic id in a table

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.set_palette("husl", 8)
sns.set_context("notebook", font_scale=1.5)
sns.set_style("whitegrid")

# Plot the answer and new semantic id and the semantic id
df = pd.DataFrame({
    'answer': corpus_sentences,
    'semantic_id': semantic_ids,
    'new_semantic_id': new_semantic_ids
})
df['semantic_id'] = df['semantic_id'].astype(str)
df['new_semantic_id'] = df['new_semantic_id'].astype(str)


df.head()

Unnamed: 0,answer,semantic_id,new_semantic_id
0,2,0,0
1,5,1,1
2,2,2,0
3,20,2,2
4,15,3,3


<Figure size 1000x600 with 0 Axes>