In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs top-to-bottom on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "meta-llama/Llama-3.2-1B"

# Load the causal-LM weights and move them onto the selected device.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# Tokenizer that matches the checkpoint identified by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [2]:
# Text whose next-token distribution is inspected in the following cells.
prompt = "The capital of France is"

# Tokenize into PyTorch tensors, then move the encoded batch onto `device`.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

# Display the token ids as the cell output.
inputs["input_ids"]

tensor([[128000,    791,   6864,    315,   9822,    374]], device='cuda:0')

In [3]:
# Forward pass only: gradient tracking is disabled for this call.
with torch.no_grad():
    outputs = model(input_ids=inputs["input_ids"])

# Take the logits at the final position of the first sequence in the batch
# and normalise them into a probability distribution over the vocabulary axis.
last_token_logits = outputs.logits[0, -1]
probabilities = torch.softmax(last_token_logits, dim=-1)

# Show the three shapes as the cell output.
(outputs.logits.shape, last_token_logits.shape, probabilities.shape)

(torch.Size([1, 6, 128256]), torch.Size([128256]), torch.Size([128256]))

In [4]:
# Print the six highest-probability next tokens as a ranked table.
top_probs, top_indices = probabilities.topk(6)
ranked = zip(top_probs.tolist(), top_indices.tolist())
for rank, (prob, token_id) in enumerate(ranked, start=1):
    token = tokenizer.decode([token_id])
    # Tokens with leading/trailing whitespace go through repr(); the rest
    # are simply wrapped in single quotes.
    if token == token.strip():
        display_token = f"'{token}'"
    else:
        display_token = repr(token)
    print(f"{rank:2d}. {display_token:<15} {prob:.5f}")

 1. ' Paris'        0.39153
 2. ' a'            0.08419
 3. ' the'          0.07040
 4. ' one'          0.03096
 5. ' also'         0.03061
 6. ' home'         0.02528
