In [60]:
from functools import partial
import hashlib

import datasets
from dotenv import load_dotenv
import tiktoken
import torch
import vec2text 

load_dotenv()

def compute_cosine_similarity(embeddings1, embeddings2):
    """Return the cosine similarity between two embedding tensors.

    The similarity is reduced along ``dim=1``, so for two ``(batch, dim)``
    inputs the result holds one value per row.

    Args:
        embeddings1: First tensor of embeddings.
        embeddings2: Second tensor of embeddings, compatible with ``embeddings1``.

    Returns:
        Tensor of cosine similarities computed along dimension 1.
    """
    cosine = torch.nn.functional.cosine_similarity
    return cosine(embeddings1, embeddings2, dim=1)


## Setup for error analysis

Let's take the first $n=100$ rows of the precomputed val dataset for MS MARCO (loaded below from the `train` split of `jxm/msmarco__openai_ada2`).

In [55]:
# Number of rows kept for the error analysis.
N_SAMPLES = 100

# Load the precomputed dataset and keep only the first N_SAMPLES rows of its
# "train" split.
dataset = datasets.load_dataset("jxm/msmarco__openai_ada2")["train"].select(range(N_SAMPLES))

In [51]:
# Maximum number of tokens kept per text; read by `truncate_text`.
MAX_LENGTH = 128

# tiktoken encoding used to tokenize and detokenize the texts.
tokenizer = tiktoken.get_encoding("cl100k_base")

def truncate_text(example):
    """Clip every text in a batch to at most ``MAX_LENGTH`` tokens.

    Meant to be used with a batched ``map``: ``example["text"]`` is a list of
    strings. Each string is encoded with the module-level ``tokenizer``, cut
    to the first ``MAX_LENGTH`` tokens and decoded back to text.

    Args:
        example: Batch mapping that contains a ``"text"`` list of strings.

    Returns:
        The same mapping object, with ``"text"`` replaced by the clipped texts.
    """
    clipped_ids = [ids[:MAX_LENGTH] for ids in tokenizer.encode_batch(example["text"])]
    example["text"] = tokenizer.decode_batch(clipped_ids)
    return example

In [56]:
# Clip every text to MAX_LENGTH tokens; `truncate_text` expects batches.
dataset = dataset.map(
    truncate_text,
    batched=True,
    batch_size=1024,
    num_proc=12,
)

Map (num_proc=12): 100%|██████████| 100/100 [00:00<00:00, 806.41 examples/s]


In [57]:
# Works on one example at a time (use with `batched=False`).
def get_text_hash(example):
    """Attach an MD5 hex digest of the example's text as ``"source_id"``.

    Args:
        example: Mapping with a string under ``"text"``.

    Returns:
        The same mapping object, with ``"source_id"`` set to the hex digest
        of the encoded text.
    """
    digest = hashlib.md5(example["text"].encode())
    example["source_id"] = digest.hexdigest()
    return example
    

# Give every row a `source_id`; `get_text_hash` handles one example per call.
dataset = dataset.map(
    get_text_hash,
    batched=False,
    num_proc=12,
)

Map (num_proc=12): 100%|██████████| 100/100 [00:00<00:00, 865.39 examples/s]


In [58]:
# Mark the original rows as step 0 with a similarity of 1.
# The columns are sized from the dataset itself so they always match its row
# count, and `sim` is stored as a float so it has the same type as the cosine
# similarities that `get_trajectory` appends to this column.
dataset = dataset.add_column(name="step", column=[0] * len(dataset))
dataset = dataset.add_column(name="sim", column=[1.0] * len(dataset))

## Generating samples

In [2]:
# Pretrained vec2text corrector for `text-embedding-ada-002` embeddings.
corrector = vec2text.load_pretrained_corrector("text-embedding-ada-002")

# NOTE(review): `DataLoaderConfiguration` is not imported in any cell visible
# here, so this line raises NameError on a fresh kernel. Confirm where the name
# is meant to come from, or drop the line if `dataloader_config` is unused.
# (The original cell repeated this assignment twice; one copy is enough.)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [96]:
# Build the inversion trajectory for every example of a batch.
# Intended for `Dataset.map(..., batched=True)`: `examples` maps each column
# name to a list of values.
def get_trajectory(n_steps, examples, sequence_beam_width=4, device="cuda"):
    """Append the vec2text hypotheses of each embedding to a batch.

    For every embedding in ``examples["embeddings_A"]`` the module-level
    ``corrector`` is run through
    ``vec2text.invert_embeddings_and_return_hypotheses`` and one new row is
    produced per returned hypothesis.

    Args:
        n_steps: Forwarded as ``num_steps`` to the vec2text inversion call.
        examples: Batch mapping of column name to list. The columns ``text``,
            ``embeddings_A``, ``source_id``, ``step`` and ``sim`` are extended;
            any other column present in the batch receives no new values.
        sequence_beam_width: Forwarded to the vec2text inversion call.
            Defaults to 4, the value that used to be hard-coded.
        device: Device the source embedding is moved to before inversion.
            Defaults to ``"cuda"``, matching the previous ``.cuda()`` call.

    Returns:
        A new mapping with the same keys as ``examples`` whose lists are the
        original values followed by the generated rows. For each generated row
        ``step`` counts hypotheses from 1, ``sim`` is the cosine similarity
        between the source embedding and the hypothesis embedding, and
        ``source_id`` repeats the id of the source example.

    Raises:
        ValueError: If the inversion call returns a different number of texts
            and hypothesis embeddings.
    """
    new_examples = {k: [] for k in examples.keys()}

    for source_id, raw_embedding in zip(examples["source_id"], examples["embeddings_A"]):
        # Add a leading batch dimension of size 1 for the inversion call.
        original_embedding = torch.tensor(raw_embedding, dtype=torch.float32).to(device).unsqueeze(0)

        # NOTE(review): assumes the call returns two parallel sequences (decoded
        # texts and hypothesis embeddings); confirm against the vec2text API.
        output_strings, hypothesis_embeddings = vec2text.invert_embeddings_and_return_hypotheses(
            original_embedding,
            corrector,
            num_steps=n_steps,
            sequence_beam_width=sequence_beam_width,
        )

        n_hypotheses = len(hypothesis_embeddings)
        if len(output_strings) != n_hypotheses:
            # Fail here with a clear message instead of building ragged columns.
            raise ValueError(
                f"Got {len(output_strings)} output texts for {n_hypotheses} hypothesis embeddings"
            )

        # Append one row per hypothesis.
        new_examples["source_id"] += [source_id] * n_hypotheses
        new_examples["text"] += [output[0] for output in output_strings]
        new_examples["embeddings_A"] += [emb.squeeze().tolist() for emb in hypothesis_embeddings]
        new_examples["step"] += range(1, n_hypotheses + 1)
        new_examples["sim"] += [
            compute_cosine_similarity(original_embedding, emb).item()
            for emb in hypothesis_embeddings
        ]

    return {k: examples[k] + new_examples[k] for k in examples.keys()}


In [97]:
# Smoke test: run a single-step trajectory on the first row only.
dataset.select(range(1)).map(
    partial(get_trajectory, 1),
    batched=True,
)

Map: 100%|██████████| 1/1 [00:01<00:00,  1.77s/ examples]

{'text': ['The scientific success of the Manhattan Project was as much a matter of communication as of the lives of thousands of scientists and engineers. What was truly meaningless was the utter lack of a doubt that the atomic bombs had a profoundly influential and awe-inspiring impact upon the lives of their fellow scientists.', 'The presence of communication among the scientists and engineers of the Manhattan Project was as important a means to their success as it was a means to their success. What was literally meaningless was the utter oblivion of hundreds of thousands of innocent lives in the atomic cradle; however, the scientific achievement had a profound impact.'], 'embeddings_A': 'Mock', 'source_id': ['8bd034ea81f91372874cf6d90dffbba1', '8bd034ea81f91372874cf6d90dffbba1'], 'step': [1, 2], 'sim': [0.9612681269645691, 0.9714975953102112]}





Dataset({
    features: ['text', 'embeddings_A', 'source_id', 'step', 'sim'],
    num_rows: 3
})