In [None]:
%reload_ext autoreload
%autoreload 2
from transformers import GPT2Model, GPT2Tokenizer
import torch

# Probe the output of GPT-2's first self-attention layer for a short prompt.
# Eager attention is requested so that attention weights can be returned.
model = GPT2Model.from_pretrained("gpt2", attn_implementation="eager")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Tokenize input
text = "Hello world"
inputs = tokenizer(text, return_tensors="pt")  # returns input_ids and attention_mask
input_ids = inputs["input_ids"]  # (B, T) → (1, 2)

# STEP 1: Build the input to the first block: token + position embeddings.
# `model.wte` only looks up token embeddings; nothing adds positional
# information implicitly when the sub-modules are called by hand, so the
# position embeddings (`model.wpe`) have to be added explicitly here.
input_embeds = model.wte(input_ids)  # (B, T, D) → (1, 2, 768)
position_ids = torch.arange(input_ids.size(-1), device=input_ids.device).unsqueeze(0)  # (1, T)
position_embeds = model.wpe(position_ids)  # (1, T, D), broadcasts over the batch
hidden_states = input_embeds + position_embeds  # (B, T, D) → (1, 2, 768)

# STEP 2: Run through LayerNorm + Self-Attention from the first block
first_block = model.h[0]

# Apply layer norm before attention
normed_hidden = first_block.ln_1(hidden_states)  # (B, T, D) → (1, 2, 768)

# Run self-attention; the call returns a tuple whose first element is the
# attention output (the remaining elements are not used here).
attn_output = first_block.attn(normed_hidden, head_mask=None, output_attentions=True)
attn_output = attn_output[0]  # (B, T, D) → (1, 2, 768)

# Now you have the output just after self-attention!
print("Attention Output Shape:", attn_output.shape)  # (1, 2, 768)

# Example: you can take just the last token’s output if you're doing next-token prediction
last_token = attn_output[:, -1, :]  # (B, D) → (1, 768)

# Plug this into your custom head. The input width is read from the model
# config instead of being hard-coded, so other GPT-2 sizes work as well.
custom_head = torch.nn.Linear(model.config.n_embd, 42)  # Example head: projecting to 42 classes
custom_output = custom_head(last_token)  # (B, 42)


Attention Output Shape: torch.Size([1, 2, 768])


In [7]:
# Display the names of all entries in the model's state dict; handy for
# checking which keys a custom module must expose to receive these weights.
model.state_dict().keys()

odict_keys(['wte.weight', 'wpe.weight', 'h.0.ln_1.weight', 'h.0.ln_1.bias', 'h.0.attn.c_attn.weight', 'h.0.attn.c_attn.bias', 'h.0.attn.c_proj.weight', 'h.0.attn.c_proj.bias', 'h.0.ln_2.weight', 'h.0.ln_2.bias', 'h.0.mlp.c_fc.weight', 'h.0.mlp.c_fc.bias', 'h.0.mlp.c_proj.weight', 'h.0.mlp.c_proj.bias', 'h.1.ln_1.weight', 'h.1.ln_1.bias', 'h.1.attn.c_attn.weight', 'h.1.attn.c_attn.bias', 'h.1.attn.c_proj.weight', 'h.1.attn.c_proj.bias', 'h.1.ln_2.weight', 'h.1.ln_2.bias', 'h.1.mlp.c_fc.weight', 'h.1.mlp.c_fc.bias', 'h.1.mlp.c_proj.weight', 'h.1.mlp.c_proj.bias', 'h.2.ln_1.weight', 'h.2.ln_1.bias', 'h.2.attn.c_attn.weight', 'h.2.attn.c_attn.bias', 'h.2.attn.c_proj.weight', 'h.2.attn.c_proj.bias', 'h.2.ln_2.weight', 'h.2.ln_2.bias', 'h.2.mlp.c_fc.weight', 'h.2.mlp.c_fc.bias', 'h.2.mlp.c_proj.weight', 'h.2.mlp.c_proj.bias', 'h.3.ln_1.weight', 'h.3.ln_1.bias', 'h.3.attn.c_attn.weight', 'h.3.attn.c_attn.bias', 'h.3.attn.c_proj.weight', 'h.3.attn.c_proj.bias', 'h.3.ln_2.weight', 'h.3.ln_2.bia

In [None]:
%reload_ext autoreload
%autoreload 2
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from adic_components.prototype2 import P2GPTBlock
# Load GPT-2 tokenizer and base model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model_pretrained = GPT2Model.from_pretrained('gpt2')

# Get model config to know vocab size and hidden size
config = GPT2Config.from_pretrained('gpt2')
vocab_size = config.vocab_size
hidden_size = config.n_embd

# Copy the pretrained weights into the custom block. With strict=False any
# key mismatch is skipped silently, so report what was not loaded; otherwise
# a renamed parameter would leave part of the model randomly initialised.
gpt2_model = P2GPTBlock(config)
load_result = gpt2_model.load_state_dict(gpt2_model_pretrained.state_dict(), strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Missing keys (left at their initial values): {load_result.missing_keys}")
    print(f"Unexpected keys (ignored): {load_result.unexpected_keys}")
gpt2_model.eval()

# Define a language modeling head (linear layer that maps hidden state -> vocab)
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
lm_head.eval()

# Tie weights if desired (like original GPT2LMHeadModel)
lm_head.weight = gpt2_model.wte.weight  # Tie to input embedding weights

# Input text
text = "Ive been fan"
input_ids = tokenizer.encode(text, return_tensors='pt')  # shape: [1, seq_len]

# Pure inference: keep the whole forward pass, including the LM head and the
# softmax, under no_grad so no autograd graph is built.
with torch.no_grad():
    outputs = gpt2_model(input_ids)
    last_hidden_states = outputs.last_hidden_state  # shape: [1, seq_len, hidden_size]
    print(last_hidden_states.shape)

    # Get the hidden state of the last token
    last_token_hidden = last_hidden_states[:, -1, :]  # shape: [1, hidden_size]

    # Pass through the LM head to get logits over vocab
    logits = lm_head(last_token_hidden)  # shape: [1, vocab_size]

    # Softmax turns the logits into next-token probabilities
    preds = logits.softmax(dim=-1)  # shape: [1, vocab_size]

# Predict next token (greedy) and show its probability
next_token_id = torch.argmax(preds, dim=-1).item()
print(preds[0][next_token_id])
next_token = tokenizer.decode([next_token_id])

print(f"Next token: '{next_token}'")


torch.Size([1, 4, 768])
tensor([[ 4.7041e-01,  8.5578e-01, -1.3037e+00,  3.6347e-01, -7.6840e-01,
         -8.4986e-01,  2.2346e-01,  2.6831e-02,  2.9227e-01,  4.0725e-01,
          5.3422e-01, -2.6195e-01,  1.1958e-01, -5.9417e-01,  6.4630e-02,
         -4.4148e-01, -1.8101e-01, -9.5986e-01,  5.5671e-01, -7.2203e-01,
          4.3108e-02, -5.3994e-02,  9.0981e-03,  8.5935e-03, -1.1342e+00,
         -5.3440e-01, -9.5126e-02,  7.4563e-02, -3.6980e-01, -1.3001e+00,
          2.7258e-01, -3.4991e-01,  2.9148e-01,  2.2943e-01, -6.4044e-01,
          9.2986e-01,  5.2259e+01, -6.4153e-01,  9.4472e-01,  2.2320e-01,
          4.2378e-01,  6.6295e-04,  1.7511e-01, -1.3713e+00,  8.1432e-01,
         -7.6160e-01, -5.8805e-01, -2.6967e-01,  1.2243e-01,  4.4801e-01,
          4.5762e-02,  9.1929e-01, -4.2861e-01, -4.1535e-01,  4.3397e-01,
         -9.1940e-02, -5.9600e-02, -5.4464e-01,  7.7408e-01, -1.2186e+00,
          7.8279e-02,  2.8544e-01,  1.8112e-02,  1.3367e+00, -1.1104e+00,
          3.87

In [41]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Stock GPT-2 with its language-modelling head, plus the matching tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# Prompt to continue
input_text = "Ive been fan of the game for a long time and I'm sure I'll be playing it again. I'm sure I'll be playing it again."
input_ids = tokenizer.encode(input_text, return_tensors='pt')  # shape: [1, seq_len]
print(input_ids.shape)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(input_ids)
logits = outputs.logits  # shape: [1, seq_len, vocab_size]

# Keep only the scores at the final sequence position
last_token_logits = logits[:, -1]  # shape: [1, vocab_size]

# Greedy choice: highest-scoring vocabulary entry
predicted_token_id = last_token_logits.argmax(dim=-1).item()
predicted_token = tokenizer.decode([predicted_token_id])

print(f"Next token: '{predicted_token}'")


torch.Size([1, 32])
Next token: '
'


In [2]:
%reload_ext autoreload
%autoreload 2
import sys
from loguru import logger
# Drop the existing loguru handlers, then send everything down to TRACE level
# to stderr using a custom coloured format.
logger.remove()
logger_id = logger.add(
    sys.stderr,
    level="TRACE",
    colorize=True,
    format=(
        "<level>{level}</level>: {message} | "
        "{name}:{function}:{line} | "
        "{time:HH:mm:ss DD-MM-YYYY}"
    ),
)
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from adic_components.prototype2 import P2GPTBlock, P2ECDEC, P2Decoder
# Load GPT-2 tokenizer and base model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model_pretrained = GPT2Model.from_pretrained('gpt2')

# Get model config to know vocab size and hidden size
config = GPT2Config.from_pretrained('gpt2')
vocab_size = config.vocab_size
hidden_size = config.n_embd

# Copy the pretrained weights into the custom block. With strict=False any
# key mismatch is skipped silently, so report what was not loaded.
gpt2_model = P2GPTBlock(config)
load_result = gpt2_model.load_state_dict(gpt2_model_pretrained.state_dict(), strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Missing keys (left at their initial values): {load_result.missing_keys}")
    print(f"Unexpected keys (ignored): {load_result.unexpected_keys}")

# Swap the decoder's GPT-2 component for the one carrying pretrained weights.
decoder = P2Decoder(config)
decoder.gpt2_model = gpt2_model

# Positional arguments are presumably (channels, height, width, hidden size,
# decoder) — TODO confirm against P2ECDEC's signature.
encodeco = P2ECDEC(3, 224, 224, hidden_size, decoder)

# Freshly constructed modules start in training mode, so dropout (if the
# project modules use any) would make inference stochastic. Switch to eval
# mode, as is already done for the standalone P2GPTBlock experiment.
# NOTE(review): assumes P2ECDEC is an nn.Module (it is invoked like one).
gpt2_model.eval()
encodeco.eval()

# Define a language modeling head (linear layer that maps hidden state -> vocab)
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
lm_head.eval()

# Tie weights if desired (like original GPT2LMHeadModel)
lm_head.weight = gpt2_model.wte.weight  # Tie to input embedding weights

In [18]:

%reload_ext autoreload
%autoreload 2

# Input text
text = "The SNAP Marx Encourgeois stumbling stumbling over whether rele"
input_ids = tokenizer.encode(text, return_tensors='pt')  # shape: [1, seq_len]

# Dummy image input. A seeded local generator makes the image identical on
# every run without touching the global RNG state. This pins the input only;
# the model's own weight initialisation is controlled elsewhere.
image_rng = torch.Generator().manual_seed(0)
random_image = torch.randn(1, 3, 224, 224, generator=image_rng)

from adic_components.prototype2 import P2GPTBlock, P2ECDEC, P2Decoder

# Inference only. Anomaly detection was dropped: it instruments autograd's
# backward pass, and no backward pass happens under no_grad.
with torch.no_grad():
    outputs = encodeco(input_ids, random_image)  # [1, seq_len, vocab_size] per the trace log

    # Scores over the vocabulary at the last sequence position
    logits = outputs[:, -1, :]  # shape: [1, vocab_size]

    # Softmax turns the logits into next-token probabilities
    preds = logits.softmax(dim=-1)  # shape: [1, vocab_size]

# Predict next token (greedy) and show its probability
next_token_id = torch.argmax(preds, dim=-1).item()
print(preds[0][next_token_id])
next_token = tokenizer.decode([next_token_id])

print(f"Next token: '{next_token}'")

[36m[1mTRACE[0m: Decoder input shape: torch.Size([1, 10]) | adic_components.prototype2:forward:360 | 09:24:25 05-04-2025
[36m[1mTRACE[0m: Encoder output shape: torch.Size([1, 196, 768]) | adic_components.prototype2:forward:361 | 09:24:25 05-04-2025
[36m[1mTRACE[0m: Decoder output shape: torch.Size([1, 10, 768]) | adic_components.prototype2:forward:363 | 09:24:25 05-04-2025
[36m[1mTRACE[0m: Cross attention output shape: torch.Size([1, 10, 768]) | adic_components.prototype2:forward:366 | 09:24:25 05-04-2025
[36m[1mTRACE[0m: LM head output shape: torch.Size([1, 10, 50257]) | adic_components.prototype2:forward:368 | 09:24:26 05-04-2025


tensor(0.0002)
Next token: ' Anthem'
