In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 1) Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Remove "device_map='auto'" and move the model to the GPU explicitly
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16
).to("cuda")  # <--- Move model to GPU

# 2) The user's visible question
user_text = "What is the capital of Turkey?"

# 3) Build the combined prompt
combined_prompt = (
    "<｜begin▁of▁sentence｜>"
    + "<｜User｜>"
    + user_text
    + "<｜Assistant｜>"
    + "<think>\n"
)

# 4) Tokenize combined prompt
encoded = tokenizer(combined_prompt, return_tensors="pt")

# Move tokenized input to GPU
input_ids = encoded.input_ids.to("cuda")
attention_mask = encoded.attention_mask.to("cuda")

# 5) Create a dictionary mapping input token IDs -> token text
input_ids_list = input_ids[0].tolist()
input_token_map = {
    tid: tokenizer.decode([tid], skip_special_tokens=False)
    for tid in input_ids_list
}

print("===== Visible User Prompt =====")
print(user_text)

print("\n===== Combined Prompt (Internal) =====")
print(combined_prompt)

print("\n===== Input Token Mapping (Combined) =====")
print(input_token_map)

# 6) Generate output using GPU
with torch.no_grad():
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1024,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

# 7) Map generated token IDs -> token text
generated_ids_list = output_ids[0].tolist()
generated_token_map = {
    tid: tokenizer.decode([tid], skip_special_tokens=False)
    for tid in generated_ids_list
}

print("\n===== Generated Token Mapping =====")
print(generated_token_map)

# 8) Decode final output (keep special tokens visible)
generated_text = tokenizer.decode(
    generated_ids_list,
    skip_special_tokens=False
)

print("\n===== Final Decoded Output (with special tokens) =====")
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


===== Visible User Prompt =====
What is the capital of Turkey?

===== Combined Prompt (Internal) =====
<｜begin▁of▁sentence｜><｜User｜>What is the capital of Turkey?<｜Assistant｜><think>


===== Input Token Mapping (Combined) =====
{151646: '<｜begin▁of▁sentence｜>', 151644: '<｜User｜>', 3838: 'What', 374: ' is', 279: ' the', 6722: ' capital', 315: ' of', 17009: ' Turkey', 30: '?', 151645: '<｜Assistant｜>', 151648: '<think>', 198: '\n'}

===== Generated Token Mapping =====
{151646: '<｜begin▁of▁sentence｜>', 151644: '<｜User｜>', 3838: 'What', 374: ' is', 279: ' the', 6722: ' capital', 315: ' of', 17009: ' Turkey', 30: '?', 151645: '<｜Assistant｜>', 151648: '<think>', 198: '\n', 32313: 'Okay', 11: ',', 773: ' so', 358: ' I', 1184: ' need', 311: ' to', 7071: ' figure', 700: ' out', 13: '.', 2776: "'m", 537: ' not', 6896: ' exactly', 2704: ' sure', 714: ' but', 1744: ' think', 92999: ' capitals', 525: ' are', 5990: ' usually', 1887: ' main', 9720: ' cities', 1251: ' people', 728: ' go', 369: ' for', 