In [7]:
from reasoning_from_scratch.qwen3 import download_qwen3_small
download_qwen3_small(kind="base", tokenizer_only=True, out_dir="qwen3")

In [9]:
from pathlib import Path
from reasoning_from_scratch.qwen3 import Qwen3Tokenizer

tokenizer_path = Path("qwen3") / "tokenizer-base.json"
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path) 

In [5]:
import torch

In [6]:
torch.__version__


'2.9.1+cu128'

In [41]:
prompt = "Explain LLM reasoning to me."
input_token_id_list = tokenizer.encode(prompt)
input_token_id_list

[840, 20772, 444, 10994, 32711, 311, 752, 13]

In [42]:
text = tokenizer.decode(input_token_id_list)
text

'Explain LLM reasoning to me.'

In [43]:
for i in input_token_id_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[840] --> Ex
[20772] --> plain
[444] -->  L
[10994] --> LM
[32711] -->  reasoning
[311] -->  to
[752] -->  me
[13] --> .


In [44]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [27]:
download_qwen3_small(kind="base", tokenizer_only=False, out_dir="qwen3")

qwen3-0.6B-base.pth: 100% (1433 MiB / 1433 MiB)


In [45]:
from reasoning_from_scratch.qwen3 import Qwen3Model, QWEN_CONFIG_06_B

model_path = Path('qwen3') / 'qwen3-0.6B-base.pth'
model = Qwen3Model(QWEN_CONFIG_06_B)
model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [46]:
model.to(device)

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

In [47]:
print(f"number of tokens : {len(input_token_id_list)}")

number of tokens : 8


In [48]:
input_tensor = torch.tensor(input_token_id_list)
input_tensor_fm = input_tensor.unsqueeze(0)
input_tensor_fm = input_tensor_fm.to(device)

In [49]:
output_tensor = model(input_tensor_fm)
output_tensor_fm = output_tensor.squeeze(0)
print(f"formatted output tensor: {output_tensor_fm.shape}")

formatted output tensor: torch.Size([8, 151936])


In [50]:
last_token = output_tensor_fm[-1].detach()
print(last_token)

tensor([ 8.3125,  2.4688,  6.5312,  ..., -1.6172, -1.6172, -1.6172],
       dtype=torch.bfloat16)


In [53]:
print(last_token.argmax(keepdims=True))

tensor([358])


In [54]:
print(tokenizer.decode([358]))

 I


In [65]:
@torch.inference_mode()
def generate_text_basic(
    model,
    token_ids,
    max_new_tokens,
    eos_token_id = tokenizer.eos_token_id
):
    input_len = token_ids.shape[1]
    model.eval()

    for _ in range(max_new_tokens):
        out = model(token_ids)[:,-1]
        next_token = torch.argmax(out, dim=-1, keepdims=True)
        
        if (eos_token_id != None and next_token.item() == eos_token_id):
            break
            
        token_ids = torch.concat([token_ids, next_token], dim = 1)
    return token_ids[:, input_len:]

In [75]:
@torch.inference_mode()
def generate_text_basic_stream(
    model,
    token_ids,
    max_new_tokens,
    eos_token_id = tokenizer.eos_token_id
):
    input_len = token_ids.shape[1]
    model.eval()

    for _ in range(max_new_tokens):
        out = model(token_ids)[:,-1]
        next_token = torch.argmax(out, dim=-1, keepdims=True)
        
        if (eos_token_id != None and next_token.item() == eos_token_id):
            break
        token_ids = torch.concat([token_ids, next_token], dim = 1)
        yield next_token

In [77]:
prompt = "explain LLMs in single sentence."
max_new_tokens = 100
input_token_ids = torch.tensor(tokenizer.encode(prompt), device=device).unsqueeze(0)
# output_ids = generate_text_basic(model, input_token_ids, max_new_tokens)

import time
start_time = time.time()
num_tokens = 0;
for token in generate_text_basic_stream(model, input_token_ids, max_new_tokens):
    # print(token)
    output_ids = token.squeeze(0)
    out_text = tokenizer.decode(output_ids.tolist())
    print(out_text, end = "" )
    num_tokens+=1
end_time = time.time()

 Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text, enabling them to process and respond to a wide range of natural language inputs.

In [81]:
time_taken = end_time - start_time
print(f"total time taken = {time_taken: .2f} s")
print(f"time per token: {num_tokens/time_taken : .2f} s/token")

total time taken =  24.94 s
time per token:  1.44 s/token


In [None]:
from reasoning_from_scratch.qwen3 import KVCache
