<a target="_blank" href="https://colab.research.google.com/github/petuch03/graph-rag-research/blob/master/tokenizers/self-attention-visualization.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [3]:
from transformers import GemmaTokenizer, GemmaForCausalLM
from sub_word_tokenization import tokenize_custom
from attention_metrics import *

# from huggingface_hub import notebook_login
# 
# notebook_login()

In [4]:
# configuration = GemmaConfig()
model_version = 'google/gemma-2b-it'

model = GemmaForCausalLM.from_pretrained(model_version, output_attentions=True)
tokenizer = GemmaTokenizer.from_pretrained(model_version)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [5]:
input_text = 'Bob sent Alice a message about apples.'
comparison_pair = tokenize_custom(tokenizer, "tokenizer.json", "Bob sent Alice a message about apples.")
print(comparison_pair.default_tokens)
print(comparison_pair.sub_word_tokens)


def compute_attention_pipeline(tokenizer, input_tokens) -> SimpleNamespace:
    inputs = tokenizer.encode(input_tokens, return_tensors='pt')
    # tokens = tokenizer.convert_ids_to_tokens(inputs[0])
    outputs = model(inputs)
    attention = torch.stack(outputs[-1], dim=0)
    return SimpleNamespace(attention=attention, full_outputs=outputs)


default_model_output = compute_attention_pipeline(tokenizer, comparison_pair.default_tokens)
sub_word_model_output = compute_attention_pipeline(tokenizer, comparison_pair.sub_word_tokens)



['Bob', '▁sent', '▁Alice', '▁a', '▁message', '▁about', '▁apples', '.']
['B', 'o', 'b', '▁s', 'en', 't', '▁Al', 'ice', '▁', 'a', '▁mes', 'sage', '▁ab', 'out', '▁ap', 'ple', 's', '.']


In [6]:
show_all_metrics(default_model_output.attention, threshold=0.02)
show_all_metrics(sub_word_model_output.attention, threshold=0.02)

Threshold-based Noise Metric - Source: tensor([0.8889, 0.7878, 0.7160, 0.6628, 0.6265, 0.5586, 0.5116, 0.5239, 0.4282]), Source mean: 0.6338306069374084
Target: tensor([0.0293, 0.4660, 0.5278, 0.6073, 0.6852, 0.7515, 0.8333, 0.8750, 0.9290])Target mean: 0.6338306069374084
Entropy-based Noise Metric: 2.1522603034973145
Threshold-based Noise Metric - Source: tensor([0.9474, 0.9050, 0.8852, 0.8578, 0.8209, 0.8056, 0.7957, 0.7482, 0.7569,
        0.7515, 0.7109, 0.7105, 0.7105, 0.6860, 0.6996, 0.6784, 0.7061, 0.6849,
        0.6140]), Source mean: 0.7618498802185059
Target: tensor([0.0523, 0.6583, 0.6981, 0.6528, 0.6857, 0.7482, 0.7460, 0.7485, 0.7379,
        0.7778, 0.7865, 0.8527, 0.8198, 0.8706, 0.8757, 0.9097, 0.9243, 0.9565,
        0.9737])Target mean: 0.7618498802185059
Entropy-based Noise Metric: 2.921490430831909


In [6]:
logits = default_model_output.full_outputs.logits
predicted_token_id = logits[:, -1, :].argmax(dim=-1)

# Convert the predicted token ID to a token
predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)
predicted_token

['▁Bob']

In [15]:
def generate_sequence_from_tokens(tokenizer, input_tokens, max_length=50, num_return_sequences=1) -> list[str]:
    inputs = tokenizer.encode(input_tokens, return_tensors='pt')
    generated_sequences = model.generate(inputs, max_length=max_length, num_return_sequences=num_return_sequences)

    generated_text = [tokenizer.decode(generated_sequence, skip_special_tokens=True) for generated_sequence in
                      generated_sequences]

    return generated_text

In [20]:
generate_sequence_from_tokens(tokenizer, comparison_pair.default_tokens)

['Bob sent Alice a message about apples. Bob sent Alice a message about apples, but it was not the same message as the one he sent her. What happened?\n\nBob sent Alice a message about apples, but it was not the same message']

In [22]:
generate_sequence_from_tokens(tokenizer, comparison_pair.sub_word_tokens)

['Bob sent Alice a message about apples.\n\nSure, here\'s the message about apples:\n\n"Apples are a delicious fruit that is enjoyed by people of all ages. They are a good']

In [5]:
def process_batch(batch, tokenizer=tokenizer, tokenizer_config: str = "tokenizer.json", threshold: float = 0.01):
    batch_result = torch.empty(len(batch), 2, 2)
    for idx, input_sequence in enumerate(batch):
        comparison_pair = tokenize_custom(tokenizer, tokenizer_config, input_sequence)

        default_model_attention = compute_attention_pipeline(tokenizer, comparison_pair.default_tokens).attention
        sub_word_model_attention = compute_attention_pipeline(tokenizer, comparison_pair.sub_word_tokens).attention

        default_threshold_metric = threshold_noise_metric(default_model_attention, threshold)
        default_entropy_metric = entropy_based_noise_metric(default_model_attention)
        batch_result[idx][0][0] = default_threshold_metric.source_noise_percentage_over_tokens
        batch_result[idx][0][1] = default_entropy_metric

        sub_word_threshold_metric = threshold_noise_metric(sub_word_model_attention, threshold)
        sub_word_entropy_metric = entropy_based_noise_metric(sub_word_model_attention)
        batch_result[idx][1][0] = sub_word_threshold_metric.source_noise_percentage_over_tokens
        batch_result[idx][1][1] = sub_word_entropy_metric

    return batch_result


In [6]:
input_sequences = ["Bob sent Alice a message about apples.", "Cat didn't cross the street because it was tired."]

In [7]:
batch_result = process_batch(input_sequences, tokenizer, "tokenizer.json", 0.02)

In [8]:
batch_result

tensor([[[0.6338, 2.1523],
         [0.7618, 2.9215]],

        [[0.7086, 2.5323],
         [0.7991, 3.2424]]])

In [9]:
def merge_sentences_from_file(file_path):
    sentences = []
    with open(file_path, 'r') as file:
        for line in file:
            # Strip leading and trailing whitespace, then remove the quotes
            sentence = line.strip().strip('"')
            sentences.append(sentence)
    return sentences


input_sequences = merge_sentences_from_file("test-sequences.txt")

In [10]:
print(input_sequences)

['The cake was untouched because it was too sweet.', 'He missed the train because his alarm failed.', 'The plant wilted not from lack of water, but from too much sun.', "She didn't answer the call, thinking it was a mistake.", 'The dog barked at the shadow, mistaking it for an intruder.', 'The code wouldn’t compile, due to an unnoticed typo.', 'They returned the gift, finding it too practical than thoughtful.', 'The book remained on the shelf, deemed too advanced for beginners.', 'Bob sent Alice a message about apples.', "Cat didn't cross the street because it was tired."]


In [17]:
comparison_pair_sequences = [tokenize_custom(tokenizer, "tokenizer.json", sequence) for sequence in input_sequences]

In [20]:
default_tokens = [comparison_pair.default_tokens for comparison_pair in comparison_pair_sequences]
sub_word_tokens = [comparison_pair.sub_word_tokens for comparison_pair in comparison_pair_sequences]

In [23]:
default_generated_sequences_list = [generate_sequence_from_tokens(tokenizer, default) for default in default_tokens]
sub_word_generated_sequences_list = [generate_sequence_from_tokens(tokenizer, sub_word) for sub_word in sub_word_tokens]

# Print the generated sequences
for i in range(0, len(default_generated_sequences_list)):
    print(f"Input: '{input_sequences[i]}'")
    for sequence in default_generated_sequences_list[i]:
        print(f"Generated: '{sequence}'")
    for sub_words_sequence in sub_word_generated_sequences_list[i]:
        print(f"Sub-word Generated: '{sub_words_sequence}'")
    print("\n===========\n")

Input: 'The cake was untouched because it was too sweet.'
Generated: 'The cake was untouched because it was too sweet.

The context suggests that the cake was too sweet, and that the speaker did not eat it.'
Sub-word Generated: 'The cake was unto untouched because it was too sweet. The caker was so scared that he could not tak the cake out of the oven.'


Input: 'He missed the train because his alarm failed.'
Generated: 'He missed the train because his alarm failed. He had to wait for two hours before the next one arrived.

The passage is about:

A. A person who missed a train.
B. A person who woke up late.
'
Sub-word Generated: 'He missed the train because his alarm failed.

The train was due to arrive at the station at 10:00 AM, but it did not arrive until 11:0'


Input: 'The plant wilted not from lack of water, but from too much sun.'
Generated: 'The plant wilted not from lack of water, but from too much sun.

Which is the cause of the wilting?

A. Lack of water
B. Too much water
C.

In [35]:
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer

model = AutoModelForCausalLM.from_pretrained("BAAI/bge-large-en-v1.5")
tokenizer = GemmaTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")

# comparison_pair_sequences = [tokenize_custom(tokenizer, "tokenizer-BAAI:bge-large-en-v1.5.json", sequence) for sequence in input_sequences]
# default_tokens = [comparison_pair.default_tokens for comparison_pair in comparison_pair_sequences]
# sub_word_tokens = [comparison_pair.sub_word_tokens for comparison_pair in comparison_pair_sequences]
# default_generated_sequences_list = [generate_sequence_from_tokens(tokenizer, default) for default in default_tokens]
# sub_word_generated_sequences_list = [generate_sequence_from_tokens(tokenizer, sub_word) for sub_word in sub_word_tokens]
# 
# # Print the generated sequences
# for i in range(0, len(default_generated_sequences_list)):
#     print(f"Input: '{input_sequences[i]}'")
#     for sequence in default_generated_sequences_list[i]:
#         print(f"Generated: '{sequence}'")
#     for sub_words_sequence in sub_word_generated_sequences_list[i]:
#         print(f"Sub-word Generated: '{sub_words_sequence}'")
#     print("\n===========\n")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at BAAI/bge-large-en-v1.5 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'GemmaTokenizer'.


TypeError: not a string

In [28]:
comparison_pair_sequences = [tokenize_custom(tokenizer, "tokenizer-BAAI:bge-large-en-v1.5.json", sequence) for sequence in input_sequences]
default_tokens = [comparison_pair.default_tokens for comparison_pair in comparison_pair_sequences]
sub_word_tokens = [comparison_pair.sub_word_tokens for comparison_pair in comparison_pair_sequences]

In [32]:
inputs = tokenizer.encode(default_tokens)
generated_sequences = model.generate(inputs, max_length=50, num_return_sequences=1)

generated_text = [tokenizer.decode(generated_sequence, skip_special_tokens=True) for generated_sequence in
                      generated_sequences]
generated_text

TypeError: argument 'token': 'list' object cannot be converted to 'PyString'