In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import matplotlib.pyplot as plt

if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    print('----------- Float tensor set --------------')
else:
    print('---------------------- No CUDA -----------------')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

  _C._set_default_tensor_type(t)


----------- Float tensor set --------------


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [2]:
valid_token_ids = []
for id in range(tokenizer.vocab_size):
    token = tokenizer.convert_ids_to_tokens(id)
    # Filter for actual words - you may need to adjust these conditions for BGE
    if (
        not token.startswith('[') and  # Skip special tokens
        not token.startswith('##') and  # Skip subword pieces
        not any(c in token for c in '〜་『』«»‰―⟩（') and  # Skip special characters
        # len(token) > 1 and  # Skip single characters
        token.isascii()   # Only keep ASCII tokens
        # not token.isnumeric()
    ):
        valid_token_ids.append(id)

print(f'Number of valid tokens: {len(valid_token_ids)}')

Number of valid tokens: 22748


In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from itertools import product

def find_matching_tokens(model, tokenizer, target_embedding, valid_token_ids, num_tokens=1, batch_size=32, similarity_threshold=0.99999):
    """Find token combinations that produce embeddings matching the target"""
    device = next(model.parameters()).device
    target_embedding = F.normalize(torch.tensor(target_embedding.clone().detach(), device=device).unsqueeze(0), p=2, dim=1)

    def process_batch(token_combos):
        input_ids = torch.tensor([[101] + list(combo) + [102] for combo in token_combos], device=device)
        attention_mask = torch.ones_like(input_ids)
        token_type_ids = torch.zeros_like(input_ids)

        with torch.no_grad():
            embeddings = model(input_ids, attention_mask, token_type_ids).pooler_output
            similarities = torch.mm(F.normalize(embeddings, p=2, dim=1), target_embedding.T).squeeze()

            matches = []
            for idx in torch.where(similarities >= similarity_threshold)[0]:
                token_ids = token_combos[idx]
                matches.append({
                    'tokens': [tokenizer.convert_ids_to_tokens(tid) for tid in token_ids],
                    'token_ids': token_ids,
                    'similarity': similarities[idx].item()
                })
            return matches

    total_combinations = len(valid_token_ids) ** num_tokens
    print(f"Searching through {total_combinations} combinations...")

    matches = []
    current_batch = []

    for combo in tqdm(product(valid_token_ids, repeat=num_tokens), total=total_combinations):
        current_batch.append(combo) # keeps looping until current batch is batch size full
        if len(current_batch) == batch_size:
            matches.extend(process_batch(current_batch))
            current_batch = []

    if current_batch:  # Process any remaining combinations
        matches.extend(process_batch(current_batch))

    return sorted(matches, key=lambda x: x['similarity'], reverse=True)

def printMatches(matchesList):
    print(f'\n\nFound {len(matchesList)} matching tokens')
    for tok in matchesList:
        print(f"Token: {tok['tokens']}, ID: {tok['token_ids']}, Similarity: {tok['similarity']:.6f}")

def processTokens(nTokenMatches):
    return list(map(lambda a: a['token_ids'][0], nTokenMatches))

In [146]:
targetText = 'magic is real'
print(tokenizer(targetText, return_tensors='pt'))
target_embedding = model(**tokenizer(targetText, return_tensors='pt'), output_hidden_states=True).pooler_output[0]
print(target_embedding)

{'input_ids': tensor([[ 101, 3894, 2003, 2613,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
tensor([-0.9450, -0.8097, -0.8362,  ...,  0.4859,  0.9817, -0.8716],
       grad_fn=<SelectBackward0>)


In [147]:
oneTokenMatches = find_matching_tokens(model, tokenizer, target_embedding, valid_token_ids, num_tokens=1, batch_size=32, similarity_threshold=0)
print(f'One token matches: {len(oneTokenMatches)}')

  target_embedding = F.normalize(torch.tensor(target_embedding.clone().detach(), device=device).unsqueeze(0), p=2, dim=1)


Searching through 22748 combinations...


100%|██████████| 22748/22748 [00:20<00:00, 1088.06it/s]

One token matches: 22748





In [148]:
printMatches(oneTokenMatches[:100])



Found 100 matching tokens
Token: ['shared'], ID: (4207,), Similarity: 0.961855
Token: ['wand'], ID: (23967,), Similarity: 0.961072
Token: ['mcgregor'], ID: (23023,), Similarity: 0.960443
Token: ['andy'], ID: (5557,), Similarity: 0.959906
Token: ['called'], ID: (2170,), Similarity: 0.957359
Token: ['apparently'], ID: (4593,), Similarity: 0.957178
Token: ['manifested'], ID: (24906,), Similarity: 0.956952
Token: ['reza'], ID: (26323,), Similarity: 0.956810
Token: ['lenny'], ID: (19065,), Similarity: 0.956553
Token: ['zee'], ID: (23727,), Similarity: 0.956521
Token: ['boss'], ID: (5795,), Similarity: 0.956143
Token: ['magician'], ID: (16669,), Similarity: 0.955991
Token: ['chong'], ID: (24008,), Similarity: 0.955913
Token: ['donkey'], ID: (20325,), Similarity: 0.955821
Token: ['garth'], ID: (21523,), Similarity: 0.955528
Token: ['scrolls'], ID: (23074,), Similarity: 0.954058
Token: ['sheng'], ID: (25981,), Similarity: 0.953799
Token: ['nedra'], ID: (28240,), Similarity: 0.953260
Token: [

In [149]:
processedTokens = processTokens(oneTokenMatches)

In [185]:
idx = processedTokens.index(3894)
print(f"The index of token magic is: {idx} -> {processedTokens[idx]}")

idx = processedTokens.index(2003)
print(f"The index of token is is: {idx} -> {processedTokens[idx]}")

idx = processedTokens.index(2613)
print(f"The index of token real is: {idx} -> {processedTokens[idx]}")

idx = processedTokens.index(16669)
print(f"The index of token magician is: {idx} -> {processedTokens[idx]}")

The index of token magic is: 198 -> 3894
The index of token is is: 9418 -> 2003
The index of token real is: 15094 -> 2613
The index of token magician is: 11 -> 16669


In [151]:
import numpy as np

pooledEmbeddingsOfValidTokens = np.load('/content/embeddingsOfAll.npy')
pooledEmbeddingsOfValidTokens = torch.tensor(pooledEmbeddingsOfValidTokens, device=device)
pooledEmbeddingsOfValidTokens

tensor([[-0.0344, -0.0173, -0.0039,  ..., -0.0270,  0.0351, -0.0341],
        [-0.0367, -0.0316,  0.0099,  ..., -0.0207,  0.0373, -0.0367],
        [-0.0384, -0.0321, -0.0104,  ..., -0.0187,  0.0386, -0.0375],
        ...,
        [-0.0377, -0.0252, -0.0004,  ..., -0.0097,  0.0391, -0.0365],
        [-0.0356, -0.0156, -0.0149,  ..., -0.0068,  0.0382, -0.0358],
        [-0.0389, -0.0378, -0.0303,  ...,  0.0041,  0.0406, -0.0389]])

In [152]:
similarity_threshold = 0.9
last_idx = -1
for idx, tok in enumerate(oneTokenMatches):
    if tok['similarity'] > similarity_threshold:
        last_idx = idx
print(f"The index of the last token with similarity above {similarity_threshold} is {last_idx}")

The index of the last token with similarity above 0.9 is 6281


In [153]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

def get_top_k_similar_tokens(input_token_ids, valid_embeddings, k=5):
    """
    Find top k most cosine similar tokens from valid_token_ids for each input token
    based on the final pooled embeddings from BGE model.

    Args:
        input_token_ids (List[int]): List of input token IDs
        valid_token_ids (List[int]): List of valid token IDs to compare against
        model_name (str): Name of the BGE model to use
        k (int): Number of similar tokens to return for each input token

    Returns:
        List[List[Tuple[int, float]]]: For each input token, returns list of (token_id, similarity_score) tuples
    """
    # Get embeddings for input tokens
    input_embeddings = []
    for token_id in input_token_ids:
        # Create input with single token
        inputs = tokenizer.encode_plus(
            tokenizer.decode([token_id]),
            return_tensors="pt",
            add_special_tokens=True
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # Use pooled output for token representation
            pooled_output = outputs.pooler_output
            input_embeddings.append(pooled_output)

    # Stack all input embeddings
    input_embeddings = torch.cat(input_embeddings, dim=0)

    # Normalize embeddings
    input_embeddings = F.normalize(input_embeddings, p=2, dim=1)

    # Calculate cosine similarity
    similarity_matrix = torch.mm(input_embeddings, valid_embeddings.t())

    # Get top k similar tokens for each input token
    results = []
    top_k_values, top_k_indices = torch.topk(similarity_matrix, k=k, dim=1)

    for i in range(len(input_token_ids)):
        token_results = []
        for j in range(k):
            similar_token_id = valid_token_ids[top_k_indices[i][j].item()]
            similarity_score = top_k_values[i][j].item()
            token_results.append((similar_token_id, similarity_score, tokenizer.decode([similar_token_id])))
        results.append(token_results)

    return results

results = [j for i in get_top_k_similar_tokens(processedTokens[:50], pooledEmbeddingsOfValidTokens, k=10) for j in i]
resultsWithTokenIds = [i[0] for i in results]
for i in results:
    print(i)

(4207, 1.0000001192092896, 'shared')
(8678, 0.976253867149353, 'collaborated')
(2170, 0.9750165939331055, 'called')
(2921, 0.9737001657485962, 'kept')
(3079, 0.9735286235809326, 'owned')
(18011, 0.9720052480697632, 'entrusted')
(14382, 0.9719567894935608, 'volunteered')
(5799, 0.9718286395072937, 'linked')
(3266, 0.9712059497833252, 'managed')
(8617, 0.9709718227386475, 'owns')
(23967, 0.9999997615814209, 'wand')
(19607, 0.9738000631332397, 'orb')
(4880, 0.9729925990104675, 'cape')
(16148, 0.9719747304916382, 'forearm')
(27838, 0.9712066650390625, 'ze')
(28841, 0.9705698490142822, 'diablo')
(16669, 0.9705649018287659, 'magician')
(25989, 0.9702391624450684, 'mundo')
(26834, 0.97014319896698, 'leash')
(8353, 0.9700793027877808, 'capita')
(23023, 1.0000001192092896, 'mcgregor')
(20545, 0.981041431427002, 'conor')
(14093, 0.9763465523719788, 'abdullah')
(5557, 0.974487841129303, 'andy')
(24400, 0.9737304449081421, 'rourke')
(12164, 0.9722175598144531, 'boyd')
(27716, 0.9718471765518188, '

In [187]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(item, target_embedding, num_times=2):
    try:
        # Extract token_id from the tuple (token_id, score, text)
        token_id = item[0]  # Use the first element which is the token ID

        # # Add CLS and SEP tokens like in the first approach
        input_ids = torch.tensor([[101] + [token_id]*num_times + [102]], device=device)
        attention_mask = torch.ones_like(input_ids)
        token_type_ids = torch.zeros_like(input_ids)

        # print(input_ids)
        # print(tokenizer(item[2], return_tensors='pt'))

        with torch.no_grad():
            input_embedding = model(input_ids, attention_mask, token_type_ids).pooler_output
            input_embedding = F.normalize(input_embedding, p=2, dim=1)
            target_norm = F.normalize(target_embedding.unsqueeze(0), p=2, dim=1)
            similarity = torch.mm(input_embedding, target_norm.T).item()
            return similarity
    except Exception as e:
        print(f"Error calculating similarity for item {item}: {e}")
        return -1

def sort_list_by_similarity(data_list, target_embedding, model, tokenizer):
    def calculate_similarity_helper(item):
        return calculate_similarity(item, target_embedding)
    sorted_list = sorted(data_list, key=calculate_similarity_helper, reverse=True)
    return sorted_list

similarTokensToTopMatches = [j for i in get_top_k_similar_tokens(processedTokens[:50], pooledEmbeddingsOfValidTokens, k=10) for j in i]
sortedList = sort_list_by_similarity(similarTokensToTopMatches, target_embedding, model, tokenizer)

seen = {}
for item in sortedList:
    text = item[2]
    if text not in seen:
        seen[text] = calculate_similarity(item, target_embedding)

sortedListWithoutDupes = sorted(seen.items(), key=lambda x: x[1], reverse=True)

# Print in sorted order
for text, score in sortedListWithoutDupes:
    print(f'{text} - [{score}]')

shared - [0.9602134227752686]
andy - [0.9601321220397949]
called - [0.9582576751708984]
mana - [0.9571069478988647]
zee - [0.9563745856285095]
arthur - [0.9555802941322327]
scrolls - [0.9555515050888062]
mcgregor - [0.9553567171096802]
wand - [0.954482913017273]
manifested - [0.9544644355773926]
hunted - [0.9540863633155823]
marco - [0.953511655330658]
orb - [0.9530606269836426]
angela - [0.9527246356010437]
astrid - [0.9524677991867065]
owned - [0.9523314833641052]
garion - [0.9522385597229004]
manny - [0.9521247744560242]
eric - [0.951919436454773]
sleeve - [0.9518042206764221]
nedra - [0.9512354135513306]
trained - [0.9510442018508911]
sleeves - [0.9509919881820679]
jeff - [0.9507693648338318]
intuition - [0.9507241249084473]
abby - [0.9506998658180237]
owner - [0.950381875038147]
mikey - [0.9503069519996643]
lenny - [0.9502915143966675]
ark - [0.9502451419830322]
garth - [0.9501229524612427]
owns - [0.9499742984771729]
inevitably - [0.9499562382698059]
isaac - [0.9499236345291138]


In [163]:
print(sortedListWithoutDupes[list(map(lambda a: a[0], sortedListWithoutDupes)).index('shared')])

('shared', array([0.87078], dtype=float32))


In [60]:
threeTokenMatches = find_matching_tokens(model, tokenizer, target_embedding, resultsWithTokenIds[:100], num_tokens=3, batch_size=10_000, similarity_threshold=0.0)

  target_embedding = F.normalize(torch.tensor(target_embedding.clone().detach(), device=device).unsqueeze(0), p=2, dim=1)


Searching through 1000000 combinations...


100%|██████████| 1000000/1000000 [15:01<00:00, 1109.39it/s]


In [61]:
printMatches(threeTokenMatches[:100])



Found 100 matching tokens
Token: ['magic', 'seems', 'sheng'], ID: (3894, 3849, 25981), Similarity: 0.984808
Token: ['magic', 'magic', 'manifested'], ID: (3894, 3894, 24906), Similarity: 0.984602
Token: ['magic', 'seems', 'manifest'], ID: (3894, 3849, 19676), Similarity: 0.984374
Token: ['magic', 'seems', 'manifested'], ID: (3894, 3849, 24906), Similarity: 0.983557
Token: ['magic', 'nedra', 'sho'], ID: (3894, 28240, 26822), Similarity: 0.983211
Token: ['magic', 'seemingly', 'zee'], ID: (3894, 9428, 23727), Similarity: 0.982724
Token: ['magic', 'seems', 'zee'], ID: (3894, 3849, 23727), Similarity: 0.982524
Token: ['magic', 'seemingly', 'nedra'], ID: (3894, 9428, 28240), Similarity: 0.982422
Token: ['magic', 'seems', 'garion'], ID: (3894, 3849, 12523), Similarity: 0.982170
Token: ['magic', 'seemingly', 'manifest'], ID: (3894, 9428, 19676), Similarity: 0.981844
Token: ['manifest', 'magic', 'manifested'], ID: (19676, 3894, 24906), Similarity: 0.981650
Token: ['magic', 'manifested', 'seems

In [75]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch.nn.functional as F

def cosine_similarity(embedding1, embedding2):
  """
  Calculates the cosine similarity between two embeddings.

  Args:
    embedding1: The first embedding (PyTorch tensor).
    embedding2: The second embedding (PyTorch tensor).

  Returns:
    The cosine similarity between the two embeddings (float).
  """

  # Normalize the embeddings
  embedding1_normalized = F.normalize(embedding1, p=2, dim=0)
  embedding2_normalized = F.normalize(embedding2, p=2, dim=0)

  # Calculate the dot product
  dot_product = torch.dot(embedding1_normalized, embedding2_normalized)

  return dot_product.item()

text1 = 'magic is real'
text2 = 'magic magic magic'
print(tokenizer(text1, return_tensors='pt'))
print(tokenizer(text2, return_tensors='pt'))
cosine_similarity(model(**tokenizer(text1, return_tensors='pt'), output_hidden_states=True).pooler_output[0], model(**tokenizer(text2, return_tensors='pt'), output_hidden_states=True).pooler_output[0])

{'input_ids': tensor([[ 101, 3894, 2003, 2613,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
{'input_ids': tensor([[ 101, 3894, 3894, 3894,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


0.9505640268325806