Author: Amiri Hayes \
Date Updated: 7/7/25 \
Title: ViewLLM

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import spacy
nlp = spacy.load("en_core_web_sm")
import nltk
nltk.download('punkt_tab')
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel

In [None]:
# POSITIONAL FILTERING PATTERNS:

def next_attention(sentence, tokenizer):
    """Build the reference pattern in which each token attends to its successor.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable invoked as ``tokenizer([sentence], return_tensors="pt")``
            whose result exposes the token ids as ``input_ids[0]``.

    Returns:
        ``(name, matrix)`` where ``matrix`` is a square array with one row and
        one column per token id.
    """
    encoded = tokenizer([sentence], return_tensors="pt")
    n_tokens = len(encoded.input_ids[0])
    pattern = np.zeros((n_tokens, n_tokens))

    # Every interior row (all but the first and last position) points one step right.
    interior = np.arange(1, n_tokens - 1)
    pattern[interior, interior + 1] = 1

    # The first and last positions are both directed at position 0.
    pattern[0, 0] = 1
    pattern[-1, 0] = 1
    return "Next Head Attention Pattern", pattern

def previous_attention(sentence, tokenizer):
    """Build the reference pattern in which each token attends to its predecessor.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable invoked as ``tokenizer([sentence], return_tensors="pt")``
            whose result exposes the token ids as ``input_ids[0]``.

    Returns:
        ``(name, matrix)`` where ``matrix`` is a square array with one row and
        one column per token id.
    """
    encoded = tokenizer([sentence], return_tensors="pt")
    n_tokens = len(encoded.input_ids[0])
    pattern = np.zeros((n_tokens, n_tokens))

    # Interior rows (all but the first and last position) point one step left.
    interior = np.arange(1, n_tokens - 1)
    pattern[interior, interior - 1] = 1

    # The first and last positions are both directed at position 0.
    pattern[0, 0] = 1
    pattern[-1, 0] = 1
    return "Previous Head Attention Pattern", pattern

def same_attention(sentence, tokenizer):
    """Build the reference pattern in which each token attends to itself.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable invoked as ``tokenizer([sentence], return_tensors="pt")``
            whose result exposes the token ids as ``input_ids[0]``.

    Returns:
        ``(name, matrix)`` where ``matrix`` is a square array with one row and
        one column per token id.
    """
    encoded = tokenizer([sentence], return_tensors="pt")
    n_tokens = len(encoded.input_ids[0])

    # Start from the identity, then redirect the last position to position 0
    # instead of to itself.
    pattern = np.eye(n_tokens)
    pattern[-1] = 0
    pattern[-1, 0] = 1
    pattern[0, 0] = 1
    return "Same Head Attention Pattern", pattern

def punctuation_attention(sentence, tokenizer):
    """Build the reference pattern in which tokens attend to upcoming punctuation.

    Each row spreads its weight uniformly over every punctuation token that
    appears later in the sequence; a row with no later punctuation attends to
    itself. The matrix is then smoothed with a small constant and
    row-normalised so every row sums to 1.

    Special tokens reported by the tokenizer (e.g. ``[CLS]``, ``[SEP]``) and
    the ``##`` WordPiece continuation prefix are not treated as punctuation,
    even though they contain punctuation characters.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable invoked as ``tokenizer([sentence], return_tensors="pt")``
            that also provides ``convert_ids_to_tokens``. If it exposes
            ``all_special_tokens``, those tokens are excluded.

    Returns:
        ``(name, matrix)`` where ``matrix`` is a square, row-stochastic array
        with one row and one column per token id.
    """
    toks = tokenizer([sentence], return_tensors="pt")
    token_ids = toks.input_ids[0]
    len_seq = len(token_ids)
    out = np.zeros((len_seq, len_seq))
    words = tokenizer.convert_ids_to_tokens(token_ids)
    punctuation_set = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))

    def is_punctuation(tok):
        if tok in special_tokens:
            return False
        # "##" only marks a sub-word continuation; drop it before inspecting
        # the characters (a bare "##" token is left untouched).
        piece = tok[2:] if tok.startswith("##") and len(tok) > 2 else tok
        return any(ch in punctuation_set for ch in piece)

    punctuation_indices = [i for i, tok in enumerate(words) if is_punctuation(tok)]
    for i in range(len_seq):
        future_punct = [j for j in punctuation_indices if j > i]
        if future_punct:
            out[i, future_punct] = 1.0 / len(future_punct)
        else:
            out[i, i] = 1.0

    # Smooth so that no entry is exactly zero, then renormalise each row.
    out += 1e-4
    out = out / out.sum(axis=1, keepdims=True)
    return "Punctuation Pattern", out

def repeated_attention(sentence, tokenizer):
    """Placeholder pattern: ignores both arguments and returns an empty name with 0."""
    placeholder = ("", 0)
    return placeholder

# LINGUISTIC ROLE ALIGNMENT PATTERNS:

def pos_alignment(sentence, tokenizer):
    """Placeholder pattern: ignores both arguments and returns an empty name with 0."""
    placeholder = ("", 0)
    return placeholder

def dependencies(sentence, tokenizer, check_errors=False):
    """Build the reference pattern linking each word to its dependency-parse neighbours.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``. For
    every parent/child pair in the parse, both directions are marked in the
    matrix. Positions are shifted by one so that index 0 is left free; the
    first and last rows are pointed at position 0. The matrix is then smoothed
    and row-normalised.

    NOTE(review): this assumes spaCy token ``i`` corresponds to tokenizer
    position ``i + 1``. Sub-word tokenizers can split words differently, in
    which case the edges land on the wrong positions -- enable
    ``check_errors`` to see when the lengths disagree.

    Args:
        sentence: Text to analyse.
        tokenizer: Callable invoked as ``tokenizer([sentence], return_tensors="pt")``
            whose result exposes the token ids as ``input_ids[0]``.
        check_errors: When true, print a diagnostic if the parse is empty or
            its length differs from the tokenizer length minus two.

    Returns:
        ``(name, matrix)`` where ``matrix`` is a square, row-stochastic array
        with one row and one column per token id.
    """
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))

    # Normalise whitespace before parsing.
    doc = nlp(" ".join(sentence.split()))

    if check_errors:
        if len(doc) == 0:
            print("problem, doc empty")
        if len(doc) != (len_seq - 2):
            print("problem, doc length mismatch", len(doc), len_seq - 2)

    for stok in doc:
        parent_pos = stok.i + 1
        for child_stok in stok.children:
            child_pos = child_stok.i + 1
            # The parse may contain more tokens than the tokenizer produced;
            # skip edges that would fall outside the matrix.
            if parent_pos >= len_seq or child_pos >= len_seq:
                continue
            out[parent_pos, child_pos] = 1
            out[child_pos, parent_pos] = 1

    out[0, 0] = 1
    out[-1, 0] = 1

    # Smooth so that no entry is exactly zero, then renormalise each row.
    out += 1e-4
    out = out / out.sum(axis=1, keepdims=True)
    return "Dependency Parsing Pattern", out

In [None]:
def repeated_attention(sentence, model, tokenizer, head, layer, output=False):
    """Detect whether a head attends mostly to repeated tokens.

    The sentence is run through ``model`` and the attention matrix of the
    requested head is inspected. Tokens (as produced by the tokenizer, with
    the tokenizer's special tokens ignored) that occur more than once are
    collected; if any occurrence of such a token puts more than half of its
    attention on positions holding repeated tokens, the pattern is considered
    present.

    Args:
        sentence: Text to analyse.
        model: Callable accepting the tokenizer output as keyword arguments
            plus ``output_attentions=True`` and returning an object with an
            ``attentions`` sequence indexed by layer.
        tokenizer: Tokenizer providing ``__call__`` and
            ``convert_ids_to_tokens``; ``all_special_tokens`` is used when
            available.
        head: Index of the attention head. Note that ``head`` precedes
            ``layer`` in this signature.
        layer: Index of the layer.
        output: When true, print the attention rows, the token counts and the
            verdict.

    Returns:
        ``1`` if the repeated-token pattern is detected, otherwise ``0``.
    """
    from collections import Counter

    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        # Request attentions explicitly rather than relying on how the model
        # was constructed.
        outputs = model(**inputs, output_attentions=True)
    attn_matrix = outputs.attentions[layer][0, head]

    if output:
        for i, row in enumerate(attn_matrix):
            probs = [f"{val.item():.2f}" for val in row]
            print(f"Token {i}: {probs}")

    # Count at the tokenizer-token level so that indices line up with the
    # rows and columns of the attention matrix.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))
    token_counts = Counter(tok for tok in tokens if tok not in special_tokens)
    if output:
        print(f"Token counts: {token_counts}")
    repeated_tokens = {tok for tok, count in token_counts.items() if count > 1}

    if not repeated_tokens:
        return 0

    repeated_indices = [i for i, tok in enumerate(tokens) if tok in repeated_tokens]

    for i in repeated_indices:
        attn_to_repeats = attn_matrix[i, repeated_indices].sum().item()
        if attn_to_repeats > 0.5:
            if output:
                print(f"Same token pattern detected")
            return 1

    if output:
        print(f"No repeated attention pattern detected")
    return 0

In [None]:
# GENERATE & VISUALIZE ATTENTION PATTERN SCORES

def js_divergence(p, q):
    """Return the Jensen-Shannon divergence (natural log) between ``p`` and ``q``.

    Both inputs are clipped to ``[1e-12, 1.0]`` and renormalised to sum to 1
    before the divergence is computed; the clipping produces new arrays, so the
    caller's data is left untouched.
    """
    normalised = []
    for dist in (p, q):
        clipped = np.clip(dist, 1e-12, 1.0)
        clipped /= clipped.sum()
        normalised.append(clipped)
    p_hat, q_hat = normalised

    mixture = 0.5 * (p_hat + q_hat)
    kl_p_to_mix = np.sum(p_hat * np.log(p_hat / mixture))
    kl_q_to_mix = np.sum(q_hat * np.log(q_hat / mixture))
    return 0.5 * (kl_p_to_mix + kl_q_to_mix)

def score_prediction(sentence, torch_model, torch_tokenizer, head_loc, pattern, distance="jsd", output=False):
    """Score how closely one attention head matches a reference pattern.

    Args:
        sentence: Text to run through the model.
        torch_model: Model whose ``config.architectures[0]`` names its
            architecture and which returns attentions when called with
            ``output_attentions=True``.
        torch_tokenizer: Tokenizer used both for the model input and for
            building the reference pattern.
        head_loc: ``(layer, head)`` pair selecting the attention head.
        pattern: Callable ``pattern(sentence, tokenizer)`` returning
            ``(name, matrix)``.
        distance: ``"raw"`` for the summed absolute difference between the two
            matrices, or ``"jsd"`` for the mean over rows of the square root
            of the Jensen-Shannon divergence.
        output: When true, plot the actual and reference matrices side by side.

    Returns:
        The distance between the head's attention and the pattern (lower means
        a closer match).

    Raises:
        ValueError: If ``distance`` is not ``"raw"`` or ``"jsd"``.
    """
    if distance not in ("raw", "jsd"):
        raise ValueError(f"Unknown distance {distance!r}; expected 'raw' or 'jsd'")

    layer, head = head_loc
    tokens = torch_tokenizer(sentence, return_tensors="pt")

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        if torch_model.config.architectures[0] == 'T5ForConditionalGeneration':
            # The input ids are reused as decoder inputs; only the encoder
            # attentions are read.
            decoder_input_ids = tokens["input_ids"]
            outputs = torch_model(input_ids=tokens["input_ids"], decoder_input_ids=decoder_input_ids, output_attentions=True)
            att = outputs.encoder_attentions[layer][0, head].detach().numpy()
        else:
            att = torch_model(**tokens, output_attentions=True).attentions[layer][0, head].detach().numpy()

    name, pred_att = pattern(sentence, torch_tokenizer)

    if distance == "raw":
        score = np.abs(att - pred_att).sum()
    else:
        jensonshannon_distances = [
            np.sqrt(js_divergence(row_att, row_out))
            for row_att, row_out in zip(att, pred_att)
        ]
        score = np.mean(jensonshannon_distances)

    if output:
        colors = "inferno"
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        im1 = axes[0].imshow(att, cmap=colors, aspect='auto')
        axes[0].set_title("Actual Head Attention")
        fig.colorbar(im1, ax=axes[0])
        im2 = axes[1].imshow(pred_att, cmap=colors, aspect='auto')
        axes[1].set_title("Optimal Head Attention for Pattern")
        fig.colorbar(im2, ax=axes[1])

        # Append a combining low line to every character to underline the name.
        underlined_name_unicode = "".join([char + '\u0332' for char in name])
        plt.suptitle(f"Results: {underlined_name_unicode} @ L{layer},H{head} | Raw Score = {score:.2f}\n\nSentence: \"{sentence}\"", fontsize=16)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()
    return score

In [None]:
# DOWNLOAD ADDITIONAL TEXT DATA:

# Mount Google Drive and read the CSV whose 'text' column holds the paragraphs.
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/small_text.csv')

# Split every paragraph into sentences, then keep at most the first 10,000.
sentences = []
for paragraph in df['text']:
    sentences += sent_tokenize(paragraph)
del sentences[10_000:]

print("Sentences from Tiny Stories Dataset:")
print(sentences[20:30])

In [None]:
# ANALYZE PATTERN AT LAYER AND HEAD

# Example inputs. The first assignment is immediately overwritten by the second.
sentence = "The quick brown fox jumps over the lazy dog."
sentence = "Hi. How are you? I'm fine! Thanks. Bye, see you tomorrow."

# Candidate checkpoints; the index below selects which one is loaded.
models = ["bert-base-uncased", "openai-community/gpt2", "google-t5/t5-small"]
model_name = models[0]

# Load the tokenizer and model with attention outputs enabled, in eval mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_attentions=True)
model.eval()
layer, head = 0, 10

# NOTE(review): this scores `sentences[0]` (from the loaded dataset), not the
# `sentence` defined above -- confirm which one is intended.
score_prediction(sentences[0], model, tokenizer, (layer, head), next_attention, distance="jsd", output=True)

In [None]:
# ANALYZE ALL HEADS FOR A PATTERN (1 SENTENCE)

def visualize_full_model(sentence, torch_model, torch_tokenizer, pattern, title, bias_towards_best=0.9):
    """Score every head of a model against a pattern for one sentence and plot the grid.

    Args:
        sentence: Text to run through the model.
        torch_model: Model whose ``config`` provides ``num_hidden_layers`` and
            ``num_attention_heads``.
        torch_tokenizer: Tokenizer matching ``torch_model``.
        pattern: Callable ``pattern(sentence, tokenizer)`` returning
            ``(name, matrix)``.
        title: Title for the heat map.
        bias_towards_best: The colour scale uses ``PowerNorm`` with
            ``gamma = 1 - bias_towards_best``.

    Returns:
        ``(scores, top_three)`` where ``scores`` is a ``(layers, heads)`` array
        of distances and ``top_three`` lists the ``(layer, head)`` indices of
        the three lowest scores, best first.
    """
    # Imported here so the function does not depend on a later notebook cell.
    from matplotlib.colors import PowerNorm

    # Read the dimensions from the model that was passed in, not a global.
    num_layers = torch_model.config.num_hidden_layers
    num_heads = torch_model.config.num_attention_heads
    model_viz = np.zeros((num_layers, num_heads))

    for i in range(num_layers):
        for j in range(num_heads):
            model_viz[i, j] = score_prediction(sentence, torch_model, torch_tokenizer, (i, j), pattern, distance="jsd")
    # Scores are distances, so the smallest value is the best match.
    print(f"Best Score: {model_viz.min():.2f}")

    plt.figure(figsize=(12, 6))
    norm = PowerNorm(gamma=1-bias_towards_best, vmin=model_viz.min(), vmax=model_viz.max())
    plt.imshow(model_viz, cmap='bone', aspect='auto', norm=norm)
    plt.colorbar()
    plt.title(title, fontsize=16)
    plt.xlabel("Attention Heads")
    plt.ylabel("Transformer Layers")
    plt.xticks(ticks=np.arange(num_heads), labels=[f'H{h}' for h in range(num_heads)])
    plt.yticks(ticks=np.arange(num_layers), labels=[f'L{l}' for l in range(num_layers)])
    plt.tight_layout()
    plt.show()

    # Flattened argsort, ascending: the first three entries are the lowest scores.
    top_three = list(zip(*np.unravel_index(np.argsort(model_viz, axis=None)[:3], model_viz.shape)))
    for layer, head in top_three:
        score = model_viz[layer, head]
        print(f"Layer {layer}, Head {head} - Score: {score:.2f}")
    return model_viz, top_three

# PowerNorm is used inside visualize_full_model for the colour scale.
from matplotlib.colors import PowerNorm
# NOTE: this rebinds `model_name` from the checkpoint id to a display string.
model_name = f"Model = {model.config.architectures[0]}\n"
sentence = "Hi. How are you? I'm fine! Thanks. Bye, see you tomorrow."
pattern_name = "Punctuation Attention Pattern"
# Append a combining low line to every character to underline the pattern name.
underlined_name_unicode = "".join([char + '\u0332' for char in pattern_name])
# Score every head against the punctuation pattern for this one sentence.
visualize_full_model(sentence, model, tokenizer, punctuation_attention, title=f"Top Heads: {underlined_name_unicode} | {model_name}\nSentence: \"{sentence}\"\n", bias_towards_best=0.7)

In [None]:
# ANALYZE ALL HEADS FOR A PATTERN (MULTIPLE SENTENCES)

def visualize_full_model(sentences, torch_model, torch_tokenizer, pattern, title, bias_towards_best=0.9):
    """Average every head's pattern score over many sentences and plot the grid.

    Args:
        sentences: Sized iterable of texts to run through the model.
        torch_model: Model whose ``config`` provides ``num_hidden_layers`` and
            ``num_attention_heads``.
        torch_tokenizer: Tokenizer matching ``torch_model``.
        pattern: Callable ``pattern(sentence, tokenizer)`` returning
            ``(name, matrix)``.
        title: Title for the heat map.
        bias_towards_best: The colour scale uses ``PowerNorm`` with
            ``gamma = 1 - bias_towards_best``.

    Returns:
        ``(average_score, top_three)`` where ``average_score`` is a
        ``(layers, heads)`` array of mean distances and ``top_three`` lists the
        ``(layer, head)`` indices of the three lowest averages, best first.
    """
    # Imported here so the function does not depend on a later notebook cell.
    from matplotlib.colors import PowerNorm

    num_layers = torch_model.config.num_hidden_layers
    num_heads = torch_model.config.num_attention_heads

    average_score = np.zeros((num_layers, num_heads))
    for sentence in tqdm(sentences):
        model_score = np.zeros((num_layers, num_heads))
        for i in range(num_layers):
            for j in range(num_heads):
                model_score[i, j] = score_prediction(sentence, torch_model, torch_tokenizer, (i, j), pattern)
        average_score += model_score
    average_score /= len(sentences)

    plt.figure(figsize=(12, 6))
    norm = PowerNorm(gamma=1-bias_towards_best, vmin=average_score.min(), vmax=average_score.max())
    plt.imshow(average_score, cmap='bone', aspect='auto', norm=norm)
    plt.colorbar()
    plt.title(title, fontsize=16)
    plt.xlabel("Attention Heads")
    plt.ylabel("Transformer Layers")
    plt.xticks(ticks=np.arange(num_heads), labels=[f'H{h}' for h in range(num_heads)])
    plt.yticks(ticks=np.arange(num_layers), labels=[f'L{l}' for l in range(num_layers)])
    plt.tight_layout()
    plt.show()

    # Scores are distances, so the best heads are the three smallest entries
    # of the ascending argsort (``[:3]``, not ``[3:]``).
    top_three = list(zip(*np.unravel_index(np.argsort(average_score, axis=None)[:3], average_score.shape)))
    for layer, head in top_three:
        score = average_score[layer, head]
        print(f"Layer {layer}, Head {head} - Score: {score:.2f}")
    return average_score, top_three

# PowerNorm is used inside visualize_full_model for the colour scale.
from matplotlib.colors import PowerNorm
# NOTE: this rebinds `model_name` to a display string built from the model config.
model_name = f"Model = {model.config.architectures[0]}\n"
# Average the punctuation-pattern score of every head over all loaded sentences.
visualize_full_model(sentences, model, tokenizer, punctuation_attention, title="Top Heads: Punctuation Attention Pattern [AVERAGE]\n"+model_name, bias_towards_best=0.7)

In [None]:
# ANALYZE HEAD PATTERN ON ALL SENTENCES

def visualize_highest_head(sentences, top_n, torch_model, torch_tokenizer, head_loc, pattern, min_punctuation=4):
    """Show the sentences on which one head matches a pattern most closely.

    Sentences containing fewer than ``min_punctuation`` punctuation characters
    are skipped entirely. The remaining sentences are scored, and the
    ``top_n`` with the lowest distance are printed and plotted. If fewer than
    ``top_n`` sentences qualify, only those are shown.

    Args:
        sentences: Indexable sequence of texts.
        top_n: Maximum number of sentences to display.
        torch_model: Model passed through to ``score_prediction``.
        torch_tokenizer: Tokenizer passed through to ``score_prediction``.
        head_loc: ``(layer, head)`` pair selecting the attention head.
        pattern: Callable ``pattern(sentence, tokenizer)`` returning
            ``(name, matrix)``.
        min_punctuation: Minimum number of punctuation characters a sentence
            needs in order to be scored.

    Returns:
        Always ``0``.
    """
    layer, head = head_loc
    punctuation_set = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

    # Keep (index, score) pairs only for sentences that were actually scored,
    # so skipped sentences can never show up among the best matches.
    scored = []
    for idx, sentence in enumerate(tqdm(sentences)):
        punctuation_count = sum(1 for char in sentence if char in punctuation_set)
        if punctuation_count < min_punctuation:
            continue
        score = score_prediction(sentence, torch_model, torch_tokenizer, (layer, head), pattern, distance="jsd")
        scored.append((idx, score))

    # Lower distance means a closer match, so ascending order puts the best first.
    top_scores = sorted(scored, key=lambda pair: pair[1])[:top_n]

    for idx, score in top_scores:
        print(f"Sentence #{idx} Score: {score}")
        score_prediction(sentences[idx], torch_model, torch_tokenizer, (layer, head), pattern, output=True)
    return 0

# Head to inspect, passed below as the (layer, head) location.
layer, head = 7, 8
# Display the 8 sentences whose attention at this head best matches the punctuation pattern.
visualize_highest_head(sentences, 8, model, tokenizer, (layer, head), punctuation_attention)