In [None]:
import numpy as np

class TinyBERT:
    """Toy NumPy walk-through of BERT-style self-attention on a 4-word vocabulary.

    The methods build the idea up in stages:

    * ``attention``          - one head, raw dot-product similarity (prints every step)
    * ``two_head_attention`` - two heads with freshly drawn random projections
    * ``n_head_attention``   - ``n_heads`` heads using the stored ``head_weights``
    * ``QKV_attention``      - separate query / key / value projections per head
    * ``train_step``         - finite-difference gradient descent on ``head_weights``

    All random parameters come from NumPy's global RNG; call ``np.random.seed``
    before constructing an instance if reproducible numbers are needed.
    """

    # Perturbation used by train_step for its forward finite-difference gradient.
    _GRADIENT_EPSILON = 0.0001

    def __init__(self):
        """Create the fixed vocabulary/embeddings and draw the random parameters."""
        self.word_to_id = {'the': 0, 'cat': 1, 'sits': 2, 'sleeps': 3}

        # One row per vocabulary entry, indexed through word_to_id.
        self.embeddings = np.array([
            [0.2, -0.5, 0.1],   # the
            [-0.3, 0.4, 0.2],   # cat
            [0.1, 0.3, -0.4],   # sits
            [-0.2, -0.1, 0.5]   # sleeps
        ])
        # Width of every vector in the model (3 here); used for weight shapes
        # and for the 1/sqrt(d) score scaling instead of a repeated literal.
        self.embedding_dim = self.embeddings.shape[1]
        dim = self.embedding_dim

        self.max_sequence_length = 10
        # Position rows are added element-wise to word rows, so widths must match.
        self.position_dim = dim
        self.position_embeddings = np.random.randn(self.max_sequence_length, self.position_dim) * 0.1

        self.n_heads = 2
        self.head_weights = np.random.randn(self.n_heads, dim, dim) * 0.1
        self.head_importance = np.array([0.5, 0.5])

        # Per-head projection matrices read by QKV_attention.
        self.q_weights = np.random.randn(self.n_heads, dim, dim) * 0.1
        self.k_weights = np.random.randn(self.n_heads, dim, dim) * 0.1
        self.v_weights = np.random.randn(self.n_heads, dim, dim) * 0.1
        # Earlier attribute names, kept as aliases of the same arrays so code
        # that still reads q_vectors / k_vectors / v_vectors keeps working.
        self.q_vectors = self.q_weights
        self.k_vectors = self.k_weights
        self.v_vectors = self.v_weights

    # ------------------------------------------------------------------
    # Private helpers shared by the attention variants
    # ------------------------------------------------------------------
    def _positional_slice(self, length):
        """Return the first ``length`` position embeddings, or raise ValueError if too long."""
        if length > self.max_sequence_length:
            raise ValueError(
                f"sentence has {length} words but only "
                f"{self.max_sequence_length} position embeddings are available"
            )
        return self.position_embeddings[:length]

    def _embed(self, sentence):
        """Look up each word's embedding and add the matching position embedding.

        Raises KeyError for a word missing from ``word_to_id``.
        """
        word_ids = [self.word_to_id[word] for word in sentence]
        return self.embeddings[word_ids] + self._positional_slice(len(sentence))

    @staticmethod
    def _softmax(scores):
        """Row-wise softmax of a 2-D score matrix."""
        if scores.size == 0:
            # max() has no identity on an empty axis; exp() of nothing is nothing.
            return np.exp(scores)
        # Subtracting each row's max leaves the result unchanged mathematically
        # but keeps exp() from overflowing on large scores.
        shifted = scores - scores.max(axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / exponentials.sum(axis=1, keepdims=True)

    def _scaled_attention(self, queries, keys, values):
        """Return ``(probs, probs @ values)`` for scaled dot-product attention."""
        scores = np.dot(queries, keys.T) / np.sqrt(self.embedding_dim)
        probs = self._softmax(scores)
        return probs, np.dot(probs, values)

    def _combine_heads(self, head_outputs):
        """Stack per-head outputs and blend them with ``head_importance``.

        Returns ``(stacked, blended)`` where ``stacked`` has one leading entry
        per head and ``blended`` is the importance-weighted sum over heads.
        """
        stacked = np.stack(head_outputs)
        weighted = stacked * self.head_importance[:, np.newaxis, np.newaxis]
        return stacked, weighted.sum(axis=0)

    # ------------------------------------------------------------------
    # Attention variants
    # ------------------------------------------------------------------
    def attention(self, sentence):
        """Single-head attention that prints every intermediate array.

        Args:
            sentence: list of words, each present in ``word_to_id``.

        Returns:
            ``(attention_probs, newEmbeddings)``: the row-normalised attention
            matrix and the attention-weighted mix of the input vectors.
        """
        print('TinyBert --- attention()')
        print(sentence)
        # convert to embeddings
        word_ids = [self.word_to_id[word] for word in sentence]
        print(word_ids)
        vectors = self.embeddings[word_ids]
        print(vectors)

        vectors = vectors + self._positional_slice(len(sentence))
        print(vectors)

        # Attention scores measure how aligned two vectors are: embeddings that
        # point in a similar direction get a larger dot product.
        scores = np.dot(vectors, vectors.T)
        print(scores)
        scores = scores / np.sqrt(self.embedding_dim)  # scale scores
        print(scores)

        # Convert to probabilities. The plain exponentials are kept here (no
        # max-shift) because they are part of the printed step-by-step trace.
        probs = np.exp(scores)
        print(probs)
        attention_probs = probs / probs.sum(axis=1, keepdims=True)
        print(attention_probs)

        # New embedding for each word = attention-weighted sum of all word vectors.
        newEmbeddings = np.dot(attention_probs, vectors)
        print(newEmbeddings)
        return attention_probs, newEmbeddings

    def two_head_attention(self, sentence):
        """Attention with two heads whose projections are drawn fresh on every call.

        Each head projects the vectors with its own random matrix (emphasising
        different directions of the embedding), scores the projected vectors
        against themselves, and mixes the *unprojected* vectors with the
        resulting probabilities.

        Returns:
            ``(head1_probs, head2_probs, newEmbeddings)`` where
            ``newEmbeddings`` is the plain average of the two heads' outputs.
        """
        vectors = self._embed(sentence)

        # Create weights for two heads (not stored; new values each call).
        dim = self.embedding_dim
        head1_weights = np.random.randn(dim, dim) * 0.1
        head2_weights = np.random.randn(dim, dim) * 0.1

        head1_projected = np.dot(vectors, head1_weights)
        head2_projected = np.dot(vectors, head2_weights)

        head1_probs, head1_new_vectors = self._scaled_attention(
            head1_projected, head1_projected, vectors)
        head2_probs, head2_new_vectors = self._scaled_attention(
            head2_projected, head2_projected, vectors)

        # average between what the two heads say
        newEmbeddings = (head1_new_vectors + head2_new_vectors) / 2

        return head1_probs, head2_probs, newEmbeddings

    def n_head_attention(self, sentence):
        """Attention with ``n_heads`` heads using the stored ``head_weights``.

        Returns:
            ``(head_outputs, newEmbeddings)``: the stacked per-head output
            vectors and their ``head_importance``-weighted sum.
        """
        vectors = self._embed(sentence)

        head_outputs = []
        for i in range(self.n_heads):
            # Each head emphasises a different aspect of the embedding, then
            # scores the emphasised vectors against themselves.
            projected = np.dot(vectors, self.head_weights[i])
            _, head_new_vectors = self._scaled_attention(projected, projected, vectors)
            head_outputs.append(head_new_vectors)

        return self._combine_heads(head_outputs)

    def QKV_attention(self, sentence):
        """Multi-head attention with separate query, key and value projections.

        Returns:
            ``(head_outputs, newEmbeddings)``: the stacked per-head outputs and
            their ``head_importance``-weighted sum.
        """
        vectors = self._embed(sentence)

        head_outputs = []
        for i in range(self.n_heads):
            q = np.dot(vectors, self.q_weights[i])  # What each word is looking for
            k = np.dot(vectors, self.k_weights[i])  # What each word is advertising
            v = np.dot(vectors, self.v_weights[i])  # What each word has to offer

            # Query/key similarity is the attention score; the value vectors
            # are then mixed according to the resulting probabilities.
            _, head_output = self._scaled_attention(q, k, v)
            head_outputs.append(head_output)

        return self._combine_heads(head_outputs)

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def calculate_head_loss(self, sentence, expected_relationships):
        """Squared-error loss between expected and actual word-pair similarity.

        Args:
            sentence: list of words, each present in ``word_to_id``.
            expected_relationships: list of tuples
                ``(word1_idx, word2_idx, relationship_strength)``. For example
                ``[(0, 1, 1.0)]`` means the word at index 0 should be strongly
                related to the word at index 1.

        Returns:
            Sum over the relationships of ``(expected - actual) ** 2``, where
            ``actual`` is the dot product of the two words' output embeddings
            from ``n_head_attention``. ``0`` when the list is empty.
        """
        _, final_embeddings = self.n_head_attention(sentence)

        loss = 0
        for word1_idx, word2_idx, expected_strength in expected_relationships:
            actual_strength = np.dot(final_embeddings[word1_idx], final_embeddings[word2_idx])
            loss += (expected_strength - actual_strength) ** 2

        return loss

    def train_step(self, sentence, expected_relationships, learning_rate=0.01):
        """Take one gradient-descent step on ``head_weights``.

        The gradient is estimated numerically: each weight is bumped by
        ``_GRADIENT_EPSILON``, the loss is re-evaluated, and the weight is
        restored. Only ``head_weights`` is updated; the Q/K/V matrices are not
        touched because the loss is computed through ``n_head_attention``.
        """
        epsilon = self._GRADIENT_EPSILON
        original_weights = self.head_weights.copy()
        original_loss = self.calculate_head_loss(sentence, expected_relationships)

        gradients = np.zeros_like(self.head_weights)

        # Visit every (head, row, column) entry of the weight tensor.
        for index in np.ndindex(*self.head_weights.shape):
            self.head_weights[index] += epsilon
            new_loss = self.calculate_head_loss(sentence, expected_relationships)
            gradients[index] = (new_loss - original_loss) / epsilon
            # Restore from the snapshot so rounding from += / -= cannot accumulate.
            self.head_weights[index] = original_weights[index]

        # Update all weights at once using the calculated gradients.
        self.head_weights = original_weights - learning_rate * gradients

In [None]:
bert = TinyBERT()
sentence = ['the', 'cat', 'sleeps']
# NOTE(review): `animal_attention_example` is not defined on the TinyBERT class
# shown in this notebook; the recorded output below suggests an earlier version
# printed its results and returned None, which is why the two-value unpack
# fails with TypeError. Confirm the intended return value before relying on it.
head1_result, head2_result = bert.animal_attention_example()

Through animal-focused attention head:
horse: [1.2 0.2 0.2]
cat: [1. 0. 0.]
bike: [0.1 0.1 0.1]

Through ride-focused attention head:
horse: [0.2 1.2 0.2]
cat: [0.1 0.1 0.1]
bike: [0. 1. 0.]


TypeError: cannot unpack non-iterable NoneType object

In [None]:
# Run the single-head walk-through on a three-word sentence; attention()
# prints each intermediate array and returns the weights plus the mixed vectors.
sentence = ['cat', 'sits', 'the']
bert = TinyBERT()
weights, new_reps = bert.attention(sentence)

TinyBert --- attention()
['cat', 'sits', 'the']
[1, 2, 0]
[[-0.3  0.4  0.2]
 [ 0.1  0.3 -0.4]
 [ 0.2 -0.5  0.1]]
[[-0.25497954  0.4399019   0.11021631]
 [ 0.07694333  0.38871628 -0.50786617]
 [ 0.18630067 -0.41576185  0.31090431]]
[[ 0.27067589  0.09540292 -0.19613056]
 [ 0.09540292  0.41494866 -0.30517659]
 [-0.19613056 -0.30517659  0.30422735]]
[[ 0.1562748   0.0550809  -0.11323603]
 [ 0.0550809   0.23957072 -0.17619378]
 [-0.11323603 -0.17619378  0.17564574]]
[[1.16914744 1.05662609 0.89293987]
 [1.05662609 1.27070355 0.83845549]
 [0.89293987 0.83845549 1.1920157 ]]
[[0.37488133 0.33880192 0.28631675]
 [0.33376431 0.40138654 0.26484915]
 [0.30544451 0.28680725 0.40774824]]
[[-0.01617752  0.17756925 -0.04173089]
 [-0.00487748  0.19273487 -0.08472163]
 [ 0.02014957  0.07632611  0.01477595]]


In [None]:
# For every word (one row of the attention matrix), list how much weight it
# places on each word of the sentence.
print("\nAttention weights:")
for word, attention_row in zip(sentence, weights):
    print(f"\n{word} pays attention to each word:")
    for other_word, weight in zip(sentence, attention_row):
        print(f"{other_word}: {weight:.3f}")


Attention weights:

cat pays attention to each word:
cat: 0.375
sits: 0.339
the: 0.286

sits pays attention to each word:
cat: 0.334
sits: 0.401
the: 0.265

the pays attention to each word:
cat: 0.305
sits: 0.287
the: 0.408


In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
from typing import Dict, List
import torch.nn as nn

class ModelInspector:
    """Load a Hugging Face encoder and print a short tour of its outputs for one text."""

    def __init__(self, model_name="BAAI/bge-large-en-v1.5"):
        """Load the tokenizer and model registered under ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, output_attentions=True)
        self.attention_outputs = {}

    def inspect_text(self, text: str):
        """Run ``text`` through the model, print a summary, and return the raw outputs.

        Printed sections: the shape and first values of the last hidden state,
        the shape and first-row weights of head 0 for every attention layer,
        and the shape and first values of the pooled output.
        """
        encoded = self.tokenizer(text, padding=True, return_tensors='pt')

        # Inference only: no gradient bookkeeping is needed.
        with torch.no_grad():
            result = self.model(output_attentions=True, output_hidden_states=True, **encoded)

        print("\n=== Token Embeddings ===")
        token_states = result.last_hidden_state
        print(f"Shape: {token_states.shape}")
        print("First token, first 5 values:", token_states[0, 0, :5].tolist())

        print("\n=== Attention Layers ===")
        # result.attentions holds one tensor per layer, laid out as
        # [batch_size, num_heads, seq_length, seq_length].
        for depth, layer_attention in enumerate(result.attentions):
            print(f"\nLayer {depth}:")
            print(f"Shape: {layer_attention.shape}")
            print("First head attention weights (first token):")
            # First 5 weights of batch item 0, head 0, query token 0.
            print(layer_attention[0][0][0][:5].tolist())

        print("\n=== Final Pooled Output ===")
        pooled = result.pooler_output
        print(f"Shape: {pooled.shape}")
        print("First 5 values:", pooled[0][:5].tolist())

        return result

# Demo: inspect one short sentence, then show the start of its pooled vector.
text = "This is a test blah."
inspector = ModelInspector()
outputs = inspector.inspect_text(text)
print("\n=== Model Outputs ===")
print(outputs.pooler_output[0][:5])


=== Token Embeddings ===
Shape: torch.Size([1, 8, 1024])
First token, first 5 values: [0.47757381200790405, 0.19744722545146942, 0.29085487127304077, 0.5564297437667847, -1.2077648639678955]

=== Attention Layers ===

Layer 0:
Shape: torch.Size([1, 16, 8, 8])
First head attention weights (first token):
[0.029770266264677048, 0.6047151684761047, 0.007282047998160124, 0.294673889875412, 0.00579837104305625]

Layer 1:
Shape: torch.Size([1, 16, 8, 8])
First head attention weights (first token):
[0.9059060215950012, 0.00976244080811739, 0.025496404618024826, 0.008608806878328323, 0.016318518668413162]

Layer 2:
Shape: torch.Size([1, 16, 8, 8])
First head attention weights (first token):
[0.8583857417106628, 0.012287343852221966, 0.011142122559249401, 0.01574944704771042, 0.010025042109191418]

Layer 3:
Shape: torch.Size([1, 16, 8, 8])
First head attention weights (first token):
[0.8044492602348328, 0.014801014214754105, 0.013178297318518162, 0.009986610151827335, 0.022480595856904984]

Lay

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel
from typing import Dict, List, Tuple
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

class EnhancedModelInspector:
    """Collect parameter, attention and hidden-state statistics for a Hugging Face encoder."""

    def __init__(self, model_name="BAAI/bge-large-en-v1.5"):
        """Load the tokenizer and model registered under ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, output_attentions=True)
        self.layer_stats = defaultdict(dict)

    def get_parameter_stats(self) -> Dict:
        """Summarise every parameter that has ``requires_grad`` set.

        Returns:
            A dict mapping each parameter name to its shape, mean, std, min,
            max, element count, fraction of exact zeros and grad flag, plus a
            ``'total_parameters'`` entry holding the summed element count.
            Parameters with ``requires_grad=False`` are skipped and therefore
            not included in the total.
        """
        stats = {}
        total_params = 0
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            count = param.numel()
            stats[name] = {
                'shape': tuple(param.shape),
                'mean': param.mean().item(),
                'std': param.std().item(),
                'min': param.min().item(),
                'max': param.max().item(),
                'num_params': count,
                'sparsity': (param == 0).float().mean().item(),
                'gradient_enabled': param.requires_grad
            }
            total_params += count
        stats['total_parameters'] = total_params
        return stats

    def analyze_attention_patterns(self, attention_weights: torch.Tensor) -> Dict:
        """Summarise one layer's attention tensor.

        Args:
            attention_weights: tensor indexed as
                ``[batch_size, num_heads, seq_length, seq_length]``.

        Returns:
            Dict with the tensor shape, mean row entropy, per-head mean,
            fraction of exact zeros, maximum and overall mean.
        """
        attention_analysis = {
            'shape': attention_weights.shape,
            'entropy': self._compute_attention_entropy(attention_weights),
            'head_importance': self._compute_head_importance(attention_weights),
            'attention_sparsity': (attention_weights == 0).float().mean().item(),
            'max_attention': attention_weights.max().item(),
            'mean_attention': attention_weights.mean().item()
        }
        return attention_analysis

    def _compute_attention_entropy(self, attention_weights: torch.Tensor) -> float:
        """Return the entropy of each attention row, averaged over all rows.

        Lower values mean more focused attention, higher values more dispersed.
        """
        # Avoid log(0) by adding small epsilon
        eps = 1e-10
        row_entropy = -(attention_weights * torch.log(attention_weights + eps)).sum(dim=-1)
        return row_entropy.mean().item()

    def _compute_head_importance(self, attention_weights: torch.Tensor) -> List[float]:
        """Return the mean attention weight of each head (averaged over batch and both token axes).

        NOTE(review): if every attention row sums to 1 this mean is the same for
        all heads, so it may not discriminate between heads - confirm that this
        is the intended importance signal.
        """
        return attention_weights.mean(dim=[0, 2, 3]).tolist()

    def analyze_hidden_states(self, hidden_states: Tuple[torch.Tensor, ...]) -> Dict:
        """Summarise the activations of each hidden-state tensor.

        Returns:
            Dict keyed ``'layer_<index>'`` (index = position in
            ``hidden_states``) with shape, mean, std, fraction of exact zeros,
            max and min of that tensor.
        """
        hidden_analysis = {}
        for layer_idx, hidden_state in enumerate(hidden_states):
            hidden_analysis[f'layer_{layer_idx}'] = {
                'shape': hidden_state.shape,
                'mean_activation': hidden_state.mean().item(),
                'std_activation': hidden_state.std().item(),
                'activation_sparsity': (hidden_state == 0).float().mean().item(),
                'max_activation': hidden_state.max().item(),
                'min_activation': hidden_state.min().item()
            }
        return hidden_analysis

    def inspect_text(self, text: str, verbose: bool = True) -> Dict:
        """Run ``text`` through the model and return a nested analysis dict.

        Args:
            text: the input string to tokenize and encode.
            verbose: when True (default) print the raw layer-0 query/key/value
                weights followed by the formatted report; when False nothing
                is printed.

        Returns:
            Dict with input statistics, ``get_parameter_stats()``, per-layer
            attention analysis, per-layer hidden-state analysis and pooled
            output statistics.
        """
        # Tokenize and get model outputs
        inputs = self.tokenizer(text, return_tensors='pt', padding=True)
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        # Inference only: no gradient bookkeeping is needed.
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True, output_attentions=True)

        pooled = outputs.pooler_output
        analysis = {
            'text_length': len(text),
            'num_tokens': len(tokens),
            'tokens': tokens,
            'token_ids': inputs['input_ids'][0].tolist(),
            'parameter_stats': self.get_parameter_stats(),
            'attention_analysis': {},
            'hidden_state_analysis': self.analyze_hidden_states(outputs.hidden_states),
            'pooled_output_stats': {
                'shape': pooled.shape,
                'mean': pooled.mean().item(),
                'std': pooled.std().item(),
                'first_values': pooled[0][:10].tolist()
            }
        }

        if verbose:
            # Dump the first layer's projection matrices. This reaches into
            # `encoder.layer[0].attention.self`, so it is only attempted when
            # output was requested; silent runs neither print the tensors nor
            # depend on that module layout.
            first_self_attention = self.model.encoder.layer[0].attention.self
            layer_0_qkv = {
                'query': first_self_attention.query.weight,
                'key': first_self_attention.key.weight,
                'value': first_self_attention.value.weight
            }
            print(layer_0_qkv)

        # Analyze attention patterns for each layer
        for layer_idx, attention_layer in enumerate(outputs.attentions):
            analysis['attention_analysis'][f'layer_{layer_idx}'] = self.analyze_attention_patterns(attention_layer)

        if verbose:
            self._print_analysis(analysis)

        return analysis

    def _print_analysis(self, analysis: Dict):
        """Pretty print the dict produced by ``inspect_text`` as a five-section report."""
        print("\n=== Model Analysis Report ===")

        print("\n1. Input Text Statistics:")
        print(f"Text length: {analysis['text_length']} characters")
        print(f"Number of tokens: {analysis['num_tokens']}")
        print("First 10 tokens:", analysis['tokens'][:10])

        print("\n2. Model Parameters Overview:")
        print(f"Total parameters: {analysis['parameter_stats']['total_parameters']:,}")

        print("\n3. Attention Analysis:")
        for layer_name, layer_data in analysis['attention_analysis'].items():
            print(f"\n{layer_name}:")
            print(f"Attention entropy: {layer_data['entropy']:.4f}")
            print(f"Attention sparsity: {layer_data['attention_sparsity']:.4f}")
            print(f"Mean attention: {layer_data['mean_attention']:.4f}")

        print("\n4. Hidden State Analysis:")
        for layer_name, layer_data in analysis['hidden_state_analysis'].items():
            print(f"\n{layer_name}:")
            print(f"Mean activation: {layer_data['mean_activation']:.4f}")
            print(f"Activation sparsity: {layer_data['activation_sparsity']:.4f}")

        print("\n5. Pooled Output Statistics:")
        print(f"Shape: {analysis['pooled_output_stats']['shape']}")
        print(f"Mean: {analysis['pooled_output_stats']['mean']:.4f}")
        print(f"Standard deviation: {analysis['pooled_output_stats']['std']:.4f}")

# Demo: build the enhanced inspector and print the full report for one sentence.
text = "This is a test sentence for analysis."
inspector = EnhancedModelInspector()
analysis = inspector.inspect_text(text)

{'query': Parameter containing:
tensor([[ 0.0196, -0.0512, -0.0378,  ...,  0.0012, -0.0132,  0.0288],
        [ 0.0232,  0.0321,  0.0128,  ..., -0.0620,  0.0114, -0.0528],
        [ 0.0332, -0.0264, -0.0607,  ...,  0.0434,  0.0133, -0.0662],
        ...,
        [-0.0061,  0.0046, -0.0341,  ...,  0.0408,  0.0260,  0.0653],
        [ 0.0475,  0.0067,  0.0667,  ..., -0.0076,  0.0105, -0.0408],
        [ 0.0256, -0.0007, -0.0361,  ...,  0.0175,  0.0473,  0.0349]],
       requires_grad=True), 'key': Parameter containing:
tensor([[-0.0544,  0.0370, -0.0173,  ...,  0.0019,  0.0319,  0.0513],
        [-0.0426,  0.0233, -0.0215,  ..., -0.0511,  0.0392,  0.0296],
        [ 0.0237,  0.0242,  0.0622,  ...,  0.0844,  0.0631, -0.0113],
        ...,
        [-0.0196,  0.0176, -0.0171,  ..., -0.0310,  0.0187,  0.0322],
        [-0.0029,  0.0076,  0.0066,  ...,  0.0197, -0.0158,  0.0461],
        [ 0.0136, -0.0209, -0.0289,  ..., -0.0079,  0.0190, -0.0126]],
       requires_grad=True), 'value': Parame