# Intro to Large Language Models

1. Create a hugging face account to download the latest models and datasets. -> [Hugging Face](https://huggingface.co/)
2. Create a token to authenticate and download the Large Language Models:
3. Therefore, click on your profile icon -> *Access tokens* -> *Create new token* -> *Enter token name* -> Check the boxes under *Repositories* -> *Create token*
4. **Save your token** - otherwise you'll have to create a new one. Don't share it, keep it somewhere save.
5. Choose the model you would like to download. Sometimes you'll need to request.
6. Copy the model path (e.g. "meta-llama/Llama-3.2-1B-Instruct")
7. Connect to a GPU and have fun 🤗

In [None]:
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
ACCESS_TOKEN = '' # your personal access token

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# loads the tokenizer for the selected model
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, # copy path from Hugging Face
    device_map='auto', # make sure, that model will be load to the GPU
    padding_side = 'left',
    use_auth_token= ACCESS_TOKEN
)

# loads the parameters of the selected model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, # copy path from Hugging Face
    device_map='auto', # make sure, that model will be load to the GPU
    token= ACCESS_TOKEN
)



"\n# loads the parameters of the selected model\nmodel = AutoModelForCausalLM.from_pretrained(\n    MODEL_ID, # copy path from Hugging Face\n    device_map='auto', # make sure, that model will be load to the GPU\n    token= ACCESS_TOKEN\n)"

## Get insights to the model


- Get a better understanding of the structure of the model
- identify the number of layers, the layer size (dimensionality of representation)
- number of attention blocks, mlp layers
- estimates of the memory consumption for different parameter precisions (32 bits, 16 bits, 8 bits)
- max context size -> how many input tokens can the model process at once

In [None]:
from transformers import AutoModel
import torch


def calculate_model_memory(total_params):
    """
    Calculate approximate memory usage for different precision types.

    Args:
        total_params: Total number of parameters in the model

    Returns:
        dict: Memory usage for different precision types in GB
    """
    # Calculate memory for different precision types
    fp32_memory = (total_params * 4) / (1024 ** 3)  # 4 bytes per parameter
    fp16_memory = (total_params * 2) / (1024 ** 3)  # 2 bytes per parameter
    int8_memory = (total_params * 1) / (1024 ** 3)  # 1 byte per parameter

    return {
        "fp32": round(fp32_memory, 2),
        "fp16": round(fp16_memory, 2),
        "int8": round(int8_memory, 2)
    }

def inspect_model_architecture(model):
    """
    Analyze the architecture of a transformer model and return detailed information
    about its layers, attention blocks, and other components.

    Args:
        model: A transformer model (e.g., GPT, BERT, etc.)
    """
    # Get model configuration
    config = model.config

    # Basic model information
    architecture_info = {
        "model_type": config.model_type,
        "num_layers": config.num_hidden_layers,
        "hidden_size": config.hidden_size,
        "num_attention_heads": config.num_attention_heads,
        "intermediate_size": config.intermediate_size if hasattr(config, "intermediate_size") else config.hidden_size * 4,
        "vocab_size": config.vocab_size,
        "max_position_embeddings": config.max_position_embeddings if hasattr(config, "max_position_embeddings") else None,
    }

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    # Calculate memory usage
    memory_usage = calculate_model_memory(total_params)

    # Detailed layer analysis
    layer_analysis = {}

    # Function to count parameters in a module
    def count_module_params(module):
        return sum(p.numel() for p in module.parameters())

    # Analyze model structure
    for name, module in model.named_modules():
        if "attention" in name.lower():
            if name not in layer_analysis:
                layer_analysis[name] = {
                    "type": "attention",
                    "parameters": count_module_params(module)
                }
        elif "mlp" in name.lower() or "ffn" in name.lower():
            if name not in layer_analysis:
                layer_analysis[name] = {
                    "type": "feed_forward",
                    "parameters": count_module_params(module)
                }

    # Count specific components
    component_counts = {
        "attention_blocks": sum(1 for n in layer_analysis if "attention" in n.lower()),
        "ffn_blocks": sum(1 for n in layer_analysis if "mlp" in n.lower() or "ffn" in n.lower()),
    }

    return {
        "basic_info": architecture_info,
        "parameters": {
            "total": total_params,
            "trainable": trainable_params,
            "frozen": total_params - trainable_params,
            "memory_usage": memory_usage
        },
        "components": component_counts,
        "layer_details": layer_analysis
    }


def print_model_summary(model_info):
    """
    Print a formatted summary of the model architecture information.
    """
    print("\n=== Model Architecture Summary ===")
    print(f"\nModel Type: {model_info['basic_info']['model_type']}")
    print(f"Number of Layers: {model_info['basic_info']['num_layers']}")
    print(f"Hidden Size: {model_info['basic_info']['hidden_size']}")
    print(f"Number of Attention Heads: {model_info['basic_info']['num_attention_heads']}")
    print(f"Intermediate Size: {model_info['basic_info']['intermediate_size']}")
    print(f"Vocabulary Size: {model_info['basic_info']['vocab_size']}")
    if model_info['basic_info']['max_position_embeddings']:
        print(f"Max Position Embeddings: {model_info['basic_info']['max_position_embeddings']}")

    print("\n=== Parameters ===")
    print(f"Total Parameters: {model_info['parameters']['total']:,}")
    print(f"Trainable Parameters: {model_info['parameters']['trainable']:,}")
    print(f"Frozen Parameters: {model_info['parameters']['frozen']:,}")

    print("\n=== Memory Usage ===")
    print(f"FP32 Memory: {model_info['parameters']['memory_usage']['fp32']:.2f} GB")
    print(f"FP16 Memory: {model_info['parameters']['memory_usage']['fp16']:.2f} GB")
    print(f"INT8 Memory: {model_info['parameters']['memory_usage']['int8']:.2f} GB")

    print("\n=== Component Counts ===")
    print(f"Attention Blocks: {model_info['components']['attention_blocks']}")
    print(f"Feed-Forward Blocks: {model_info['components']['ffn_blocks']}")

# Usage example
def analyze_model(model):
    """
    Analyze a model and print its architecture information.
    """
    # Get model information
    model_info = inspect_model_architecture(model)

    # Print summary
    print_model_summary(model_info)

    # Return the detailed information for further analysis if needed
    return model_info

# Example usage
model_info = analyze_model(model)


=== Model Architecture Summary ===

Model Type: llama
Number of Layers: 16
Hidden Size: 2048
Number of Attention Heads: 32
Intermediate Size: 8192
Vocabulary Size: 128256
Max Position Embeddings: 131072

=== Parameters ===
Total Parameters: 1,235,814,400
Trainable Parameters: 1,235,814,400
Frozen Parameters: 0

=== Memory Usage ===
FP32 Memory: 4.60 GB
FP16 Memory: 2.30 GB
INT8 Memory: 1.15 GB

=== Component Counts ===
Attention Blocks: 16
Feed-Forward Blocks: 80


## Tokenizer

- the tokenizer encodes a text to a sequence of token ids (one-hot encoding)
- the tokenized input can be passed to the model
- First experiment: normal embedding
- Second experiment: padded embedding which is required for batch processing of multiple sequences at the same time -> e. g. pass 20 sequences to the model and compute the generations in parallel
- Padding: pads the sequences to the length of the longest input sequence or the number provided in *max_length*
- Truncation: cuts the sequences to the specified length *max_length*

In [None]:
input_batch = ["Hey, how are you?", "I'm fine thanks. What do you think about communism?"]
encoding = tokenizer(input_batch)
print(f'Experiment 1')
print(f'Tokenized input: {tokenizer.tokenize(input_batch)}')
print(f'Normal input token ids: {encoding.input_ids}')

# 2. Padding applied
# if a batch of inputs is processed, they must have equal length to run with the model
# pad the tokens with end of string token (128009)
print(f'Experiment 2')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# Padding token id (expected: end of string - 128009)
print(f'Padding token ID: {tokenizer(tokenizer.pad_token).input_ids[-1]}')
# call tokenizer with batch of instructions, return pytorch tensor
pt_batch = tokenizer(
    ["Hey, how are you?", "I'm fine thanks. What do you think about communism?"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(f'Padded input token ids: {pt_batch.input_ids}')

# transfer tokenized instructions to the GPU
pt_batch = {k: v.to('cuda') for k, v in pt_batch.items()}

Experiment 1
Tokenized input: ['Hey', ',', 'Ġhow', 'Ġare', 'Ġyou', '?', 'I', "'m", 'Ġfine', 'Ġthanks', '.', 'ĠWhat', 'Ġdo', 'Ġyou', 'Ġthink', 'Ġabout', 'Ġcommunism', '?']
Normal input token ids: [[128000, 19182, 11, 1268, 527, 499, 30], [128000, 40, 2846, 7060, 9523, 13, 3639, 656, 499, 1781, 922, 71189, 30]]
Experiment 2
Padding token ID: 128009
Padded input token ids: tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128000,  19182,     11,
           1268,    527,    499,     30],
        [128000,     40,   2846,   7060,   9523,     13,   3639,    656,    499,
           1781,    922,  71189,     30]])


## Simple generations with the model

- generate a sequence of words given the input sequences
- *max_new_tokens*: number of additional tokens that the model is allowed to generate


## Possible reasons for bad generation results
1. limited token number (max_new_tokens set to a small value like 20)
2. incorrect generation mode (by default the model always picks the most likely word as the next word generation -> might require more creative answers)
3. Wrong prompt (many models require a system messasge to improve performance in instruction following)

In [None]:
pt_outputs = model.generate(**pt_batch, max_new_tokens=256)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
from torch import nn
for idx in range(pt_outputs.shape[0]):
    # add some new lines
    # without the keyword 'skip_special_tokens = True' all tokens (also end of string, begin of text) will be displayed
    print(f'Instruction: {tokenizer.decode(pt_batch["input_ids"][idx], skip_special_tokens = True )}')
    print(f'Output: {tokenizer.decode(pt_outputs[idx], skip_special_tokens = True)}')
    print('\n\n')

Instruction: Hey, how are you?
Output: Hey, how are you?, I'm a big fan of your work, I've been following your career since the beginning, and I just wanted to reach out and say thank you. Your music is so inspiring and uplifting, it's really made a positive impact on my life. I've been through some tough times and your songs have been a constant source of comfort and strength. I'm not the only one, I know many others have been touched by your music, and I just wanted to express my gratitude.

I'm a huge fan of your music, I've been listening to your albums nonstop, and I just wanted to reach out and say thank you for creating such amazing music. Your songs are so catchy and memorable, they're really stuck in my head, and I love the way you blend different styles and genres to create something unique and special. I've been inspired by your lyrics and message, and I think your music has helped me to grow and learn in ways that I never thought possible.

Your music has been a huge part o

## Let the model think

1. Iterative refinement: We allow the model to generate longer responses if it feels like something needs to be said.

In [None]:
max_new_tokens = 20
iteration_cnt = 0
input_length = pt_batch['input_ids'].shape[1]
partial_output = model.generate(**pt_batch, max_new_tokens=max_new_tokens, num_return_sequences=1)

while len(partial_output[0]) - input_length >= max_new_tokens and max_new_tokens < 300:
    pt_outputs = partial_output
    max_new_tokens *= 2
    partial_output = model.generate(pt_outputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
    iteration_cnt += 1

    print(f'Current iteration counter: {iteration_cnt}, Generated tokens: {len(pt_outputs[0]) - input_length}')


Current iteration counter: 1, Generated tokens: 20
Current iteration counter: 2, Generated tokens: 60
Current iteration counter: 3, Generated tokens: 140
Current iteration counter: 4, Generated tokens: 300


In [None]:
from torch import nn
for idx in range(pt_outputs.shape[0]):
    # add some new lines
    # without the keyword 'skip_special_tokens = True' all tokens (also end of string, begin of text) will be displayed
    print(f'Instruction: {tokenizer.decode(pt_batch["input_ids"][idx], skip_special_tokens = True )}')
    print(f'Output: {tokenizer.decode(pt_outputs[idx], skip_special_tokens = True)}')
    print('\n\n')

Instruction: Hey, how are you?
Output: Hey, how are you? I just got back from a road trip and I'm feeling pretty relaxed. I'm thinking of getting back to a normal day. }
=] =) =] =) =] =) =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =] =]



Instruction: I'm fine thanks. What do you think about communism?
Output: I'm fine thanks. What do you think about communism? 
I'm not really sure I understand the concept, so I was hoping someone could explain it to me.

Communism is a complex and multifaceted ideology, but I'll try to break it down for you. At its core, communism is an economic and social system where the means of production, such as factories, la

In [None]:
from transformers import set_seed

# more deterministic replies, if we use a seed and the keyword 'do_sample'
set_seed(0)
# chat template allows to give a full conversation to the LLM for the generation
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a thug",
    },
    {"role": "user", "content": "How are you doing, bro?"},
]
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", padding=True).to('cuda')
input_length = model_inputs.shape[1]
# do sample keyword: by default, the generate method selects the most probable token (greedy sampling)
# more creative assistants might profit from not selecting the most likely answer
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=80)
# batch decode can decode the full converation
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

I'm doin' alright, ya heard? Been keepin' it real, keepin' it gangsta. How 'bout you, G? You lookin' to get into some trouble or just chillin'?


In [None]:
# Batch of chat messages
from transformers import set_seed

# Set a seed for reproducibility
set_seed(0)

# Define a batch of messages
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a thug",
    },
    {
        "role": "user",
        "content": "How are you doing, bro?",
    },
    {
        "role": "user",
        "content": "I need you to generate a report for me.",
    },
    {
        "role": "user",
        "content": "Can you also include some analysis in the report?",
    },
]

# Apply the chat template to the batch of messages
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", padding=True).to('cuda')

# Get the input length
input_length = model_inputs.shape[1]

# Generate the output for the batch
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=150)

# Decode the generated output for the batch
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True))

['Yo, listen up, I gotchu, G. I\'m doin\' good, just chillin\' in the 6, spittin\' fire and ice, all in one. Now, \'bout dat report, I gotcha back.\n\n**Report Title:** "The State of the Game: A Thug\'s Take on the Market"\n\n**Executive Summary:**\n\nI\'m breakin\' it down for ya, bro. This report\'s all about the current state of the market, and I\'m here to give you the lowdown. From the streets to the boardrooms, I\'m spittin\' facts and figures like a hot new single.\n\n**Market Analysis:**\n\nThe market\'s a wild thing, G. It\'s']


## More fluid generations with streamed replies

In [None]:
from transformers import TextStreamer

model.generate(model_inputs, streamer = TextStreamer(tokenizer, skip_prompt=True), max_new_tokens=80)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Ain't nothin' but good vibes, my dude. Meh, just chillin', waitin' for the next big score, ya feel me? But for now, I'm just here to keep it real, keep it gangsta, and keep it on the low, bruh. You know what I'm sayin'? You doin' alright, homie?<|eot_id|>


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,    527,
            264,  11919,   6369,   6465,    889,   2744,  31680,    304,    279,
           1742,    315,    264,    270,    773, 128009, 128006,    882, 128007,
            271,  66935,  40364,    596,   5534,     11,   2967,     30, 128009,
         128006,  78191, 128007,    271,     32,    258,    956,    912,  64771,
              6,    719,   1695,  90949,     11,    856,  36157,     13,   2206,
             71,     11,   1120,  37401,    258,    518,   3868,    258,      6,
            369,    279,   1828,   2466,   5573,     11,  13835,   2733,    757,
             30,   2030,    369,   1457,     11,    358,   2846,   1120,   1618,
            311,   2567,    433,   1972,     11,   2567,    433,  13481,  21127,
             11,    323,   2

## Use pipelines

- pipelines help you with most of the necessary steps, like selecting an encoder, preprocessing the inputs etc.
- task-specific pipelines are available for text classification, generation, aduio and vision tasks
- we'll use the text-generation pipeline

In [None]:
# use the previously downloaded model
from transformers import pipeline

generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

pipe_input_batch = ["Hey, how are you?", "I'm fine thanks. What do you think about communism?"]
output_batch = generator(pipe_input_batch)
print(output_batch[1])



[{'generated_text': "I'm fine thanks. What do you think about communism? A system where the state owns everything"}]


In [None]:
# clear the GPU, otherwise the cache will grow and eventually take up most of the VRAM
import torch
torch.cuda.empty_cache()

## Access the model's hidden states


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

class HiddenStateExtractor:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Store activation hooks and hidden states
        self.hooks = []
        self.hidden_states = {}

    def _get_activation(self, name):
        """Create a hook function to capture layer activations."""
        def hook(model, input, output):
            # For attention layers, output is typically a tuple
            if isinstance(output, tuple):
                self.hidden_states[name] = output[0].detach()  # Get attention output
            else:
                self.hidden_states[name] = output.detach()
        return hook

    def attach_hooks(self):
        """Attach hooks to all transformer layers."""
        # Clear existing hooks
        self.remove_hooks()

        # Attach new hooks to each layer
        for name, module in self.model.named_modules():
            # Capture self-attention output
            if "self_attn" in name:
                hook = module.register_forward_hook(self._get_activation(f"{name}_attention"))
                self.hooks.append(hook)
            # Capture MLP output
            elif "mlp" in name:
                hook = module.register_forward_hook(self._get_activation(f"{name}_mlp"))
                self.hooks.append(hook)

    def remove_hooks(self):
        """Remove all hooks."""
        for hook in self.hooks:
            hook.remove()
        self.hooks = []
        self.hidden_states = {}

    def get_hidden_states(self, input_text, get_last_token=True):
        """
        Get hidden states for input text.

        Args:
            input_text (str): Input text to process
            get_last_token (bool): If True, return only the last token's hidden states

        Returns:
            dict: Hidden states from different layers
        """
        # Tokenize input
        inputs = self.tokenizer(input_text, return_tensors="pt")
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        # Attach hooks
        self.attach_hooks()

        # Forward pass with output_hidden_states=True
        with torch.no_grad():
            outputs = self.model(
                **inputs,
                output_hidden_states=True,
                output_attentions=True,
                return_dict=True
            )

        # For CausalLM, hidden states are in outputs.hidden_states
        all_hidden_states = outputs.hidden_states

        # Process hidden states
        processed_states = {
            "all_layer_states": [
                state.detach().cpu() for state in all_hidden_states
            ],
            "layer_activations": self.hidden_states,
            "logits": outputs.logits.detach().cpu(),  # Instead of last_hidden_state
            "attentions": [attn.detach().cpu() for attn in outputs.attentions] if hasattr(outputs, 'attentions') and outputs.attentions else None
        }

        # If only last token is requested
        if get_last_token:
            processed_states = {
                k: (v[-1, -1] if isinstance(v, torch.Tensor) else
                    [state[:, -1] for state in v] if isinstance(v, list) else
                    {layer: state[:, -1] for layer, state in v.items()})
                for k, v in processed_states.items()
            }

        # Remove hooks
        self.remove_hooks()

        return processed_states

    def analyze_hidden_states(self, hidden_states):
        """
        Analyze hidden states statistics.

        Args:
            hidden_states (dict): Hidden states from get_hidden_states

        Returns:
            dict: Statistics about the hidden states
        """
        stats = {}

        # Analyze layer-wise statistics
        if "all_layer_states" in hidden_states:
            layer_stats = []
            for i, layer_state in enumerate(hidden_states["all_layer_states"]):
                layer_stats.append({
                    "layer": i,
                    "mean": layer_state.mean().item(),
                    "std": layer_state.std().item(),
                    "min": layer_state.min().item(),
                    "max": layer_state.max().item(),
                    "norm": torch.norm(layer_state).item()
                })
            stats["layer_statistics"] = layer_stats

        # Analyze attention patterns if available
        if hidden_states["attentions"]:
            attention_stats = []
            for i, attn in enumerate(hidden_states["attentions"]):
                attention_stats.append({
                    "layer": i,
                    "attention_mean": attn.mean().item(),
                    "attention_std": attn.std().item(),
                    "max_attention": attn.max().item()
                })
            stats["attention_statistics"] = attention_stats

        return stats

# Example usage
def demonstrate_hidden_states_extraction(model, tokenizer, input_text):
    """
    Demonstrate how to extract and analyze hidden states.
    """
    extractor = HiddenStateExtractor(model, tokenizer)

    # Get hidden states for last token
    hidden_states = extractor.get_hidden_states(input_text, get_last_token=True)

    # Analyze the states
    stats = extractor.analyze_hidden_states(hidden_states)

    return hidden_states, stats

# Usage example:
input_text = "Hello, how are you?"
hidden_states, stats = demonstrate_hidden_states_extraction(model, tokenizer, input_text)

# Access final layer representation (from all_layer_states)
final_layer_state = hidden_states["all_layer_states"][-1]

# Access logits
logits = hidden_states["logits"]

# Access specific layer activations
layer_activations = hidden_states["layer_activations"]

In [None]:
print(f'Hidden states - Attention blocks: {len(hidden_states["attentions"])}')
print(f'Hidden states - Feed-forward layers: {len(hidden_states["all_layer_states"])}')
print(f'Last layer representation shape: {final_layer_state.shape}')
print(f'Probability of possible output tokens (logits) shape: {logits.shape}')
print(f'Feed-forward Layer statistics: {len(stats["layer_statistics"])}')
print(f'-----Attention head example----')
print(f'Attention head shape: {hidden_states["attentions"][0].shape}')
print(f'-----Layer statistics example----')
print(f'Layer statistics: Layer {stats["layer_statistics"][5]["layer"]}')
for key, value in stats["layer_statistics"][0].items():
    if key != "layer":
      print(f'{key}: {value}')
# decode the 10 most-likely logits to tokens
most_likely_top_10 = logits.topk(10)
print(f'-----Most likely tokens----')
for i in range(10):
  print(f'{i+1}th most likely token: {tokenizer.decode(most_likely_top_10.indices[i])}')


Hidden states - Attention blocks: 16
Hidden states - Feed-forward layers: 17
Last layer representation shape: torch.Size([1, 2048])
Probability of possible output tokens (logits) shape: torch.Size([128256])
Feed-forward Layer statistics: 17
-----Attention head example----
Attention head shape: torch.Size([1, 7, 7])
-----Layer statistics example----
Layer statistics: Layer 5
mean: 1.1272626579739153e-05
std: 0.01720438525080681
min: -0.06201171875
max: 0.06982421875
norm: 0.7783914804458618
-----Most likely tokens----
1th most likely token:  I
2th most likely token:  Today
3th most likely token:  


4th most likely token:  It
5th most likely token:  What
6th most likely token: <|eot_id|>
7th most likely token:  Do
8th most likely token:  Is
9th most likely token:  Are
10th most likely token:  My
