In [1]:
import logging
import os
import sys
from pathlib import Path

# --- Setup Logging and Paths ---
# Send INFO-and-above records to 'notebook.log'; filemode='w' starts the file
# afresh when the handler is first installed.
logging.basicConfig(filename='notebook.log', filemode='w', level=logging.INFO)
logger = logging.getLogger(__name__)

# The project root is the (symlink-resolved) current working directory; the
# importable sources live under '<root>/layered-context-graph/src'.
project_root = Path.cwd().resolve()
src_path = project_root.joinpath('layered-context-graph', 'src')

# Prepend the source directory to the import path exactly once, so re-running
# this cell does not stack duplicate entries.
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))

logger.info(f"Project root set to: {project_root}")

In [2]:
from models.qwq_model import QwQModel
import torch

In [3]:
# --- Cell 3: Model Loading ---
# Location of the GGUF weights. The default is the original dev-container path;
# set the QWQ_MODEL_PATH environment variable to point elsewhere so the notebook
# can run on other machines without editing this cell. An empty value falls
# back to the default.
MODEL_PATH = os.environ.get('QWQ_MODEL_PATH') or '/workspaces/layer_context_seg/qwq.gguf'
# Use CUDA when torch reports a usable GPU, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stays None if loading fails, so later cells can detect that and skip.
preloaded_qwq_model = None
try:
    # Record which weights/device were actually used, since the path is now
    # configurable.
    logger.info(f"Loading QwQModel from {MODEL_PATH} on {device}")
    preloaded_qwq_model = QwQModel(MODEL_PATH, device)
    logger.info("QwQModel pre-loaded successfully.")
    print("QwQModel pre-loaded successfully.")
except Exception as e:
    # Deliberately best-effort: the failure is logged with a traceback and
    # reported in the cell output instead of aborting the notebook run.
    logger.error(f"Error pre-loading QwQModel: {e}", exc_info=True)
    print(f"Error pre-loading QwQModel: {e}")
# NOTE(review): loading emits a very long llama.cpp log into this cell's
# output; consider clearing the output before committing the notebook.

llama_model_loader: loaded meta data with 33 key-value pairs and 771 tensors from /workspaces/layer_context_seg/qwq.gguf (version GGUF V3 (latest))


llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.


llama_model_loader: - kv   0:                       general.architecture str              = qwen2


llama_model_loader: - kv   1:                               general.type str              = model


llama_model_loader: - kv   2:                               general.name str              = QwQ 32B


llama_model_loader: - kv   3:                           general.basename str              = QwQ


llama_model_loader: - kv   4:                         general.size_label str              = 32B


llama_model_loader: - kv   5:                            general.license str              = apache-2.0


llama_model_loader: - kv   6:                       general.license.link str              = https://huggingface.co/Qwen/QWQ-32B/b...


llama_model_loader: - kv   7:                   general.base_model.count u32              = 1


llama_model_loader: - kv   8:                  general.base_model.0.name str              = Qwen2.5 32B


llama_model_loader: - kv   9:          general.base_model.0.organization str              = Qwen


llama_model_loader: - kv  10:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-32B


llama_model_loader: - kv  11:                               general.tags arr[str,2]       = ["chat", "text-generation"]


llama_model_loader: - kv  12:                          general.languages arr[str,1]       = ["en"]


llama_model_loader: - kv  13:                          qwen2.block_count u32              = 64


llama_model_loader: - kv  14:                       qwen2.context_length u32              = 40960


llama_model_loader: - kv  15:                     qwen2.embedding_length u32              = 5120


llama_model_loader: - kv  16:                  qwen2.feed_forward_length u32              = 27648


llama_model_loader: - kv  17:                 qwen2.attention.head_count u32              = 40


llama_model_loader: - kv  18:              qwen2.attention.head_count_kv u32              = 8


llama_model_loader: - kv  19:                       qwen2.rope.freq_base f32              = 1000000.000000


llama_model_loader: - kv  20:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000010


llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2


llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = qwen2


llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...


llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...


llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...


llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645


llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151643


llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 151643


llama_model_loader: - kv  29:               tokenizer.ggml.add_bos_token bool             = false


llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...


llama_model_loader: - kv  31:               general.quantization_version u32              = 2


llama_model_loader: - kv  32:                          general.file_type u32              = 15


llama_model_loader: - type  f32:  321 tensors


llama_model_loader: - type q4_K:  385 tensors


llama_model_loader: - type q6_K:   65 tensors


print_info: file format = GGUF V3 (latest)


print_info: file type   = Q4_K - Medium


print_info: file size   = 18.48 GiB (4.85 BPW) 


init_tokenizer: initializing tokenizer for type 2


load: control token: 151660 '<|fim_middle|>' is not marked as EOG


load: control token: 151659 '<|fim_prefix|>' is not marked as EOG


load: control token: 151653 '<|vision_end|>' is not marked as EOG


load: control token: 151648 '<|box_start|>' is not marked as EOG


load: control token: 151646 '<|object_ref_start|>' is not marked as EOG


load: control token: 151649 '<|box_end|>' is not marked as EOG


load: control token: 151655 '<|image_pad|>' is not marked as EOG


load: control token: 151651 '<|quad_end|>' is not marked as EOG


load: control token: 151647 '<|object_ref_end|>' is not marked as EOG


load: control token: 151652 '<|vision_start|>' is not marked as EOG


load: control token: 151654 '<|vision_pad|>' is not marked as EOG


load: control token: 151656 '<|video_pad|>' is not marked as EOG


load: control token: 151644 '<|im_start|>' is not marked as EOG


load: control token: 151661 '<|fim_suffix|>' is not marked as EOG


load: control token: 151650 '<|quad_start|>' is not marked as EOG


load: special tokens cache size = 26


load: token to piece cache size = 0.9311 MB


print_info: arch             = qwen2


print_info: vocab_only       = 0


print_info: n_ctx_train      = 40960


print_info: n_embd           = 5120


print_info: n_layer          = 64


print_info: n_head           = 40


print_info: n_head_kv        = 8


print_info: n_rot            = 128


print_info: n_swa            = 0


print_info: is_swa_any       = 0


print_info: n_embd_head_k    = 128


print_info: n_embd_head_v    = 128


print_info: n_gqa            = 5


print_info: n_embd_k_gqa     = 1024


print_info: n_embd_v_gqa     = 1024


print_info: f_norm_eps       = 0.0e+00


print_info: f_norm_rms_eps   = 1.0e-05


print_info: f_clamp_kqv      = 0.0e+00


print_info: f_max_alibi_bias = 0.0e+00


print_info: f_logit_scale    = 0.0e+00


print_info: f_attn_scale     = 0.0e+00


print_info: n_ff             = 27648


print_info: n_expert         = 0


print_info: n_expert_used    = 0


print_info: causal attn      = 1


print_info: pooling type     = -1


print_info: rope type        = 2


print_info: rope scaling     = linear


print_info: freq_base_train  = 1000000.0


print_info: freq_scale_train = 1


print_info: n_ctx_orig_yarn  = 40960


print_info: rope_finetuned   = unknown


print_info: model type       = 32B


print_info: model params     = 32.76 B


print_info: general.name     = QwQ 32B


print_info: vocab type       = BPE


print_info: n_vocab          = 152064


print_info: n_merges         = 151387


print_info: BOS token        = 151643 '<|endoftext|>'


print_info: EOS token        = 151645 '<|im_end|>'


print_info: EOT token        = 151645 '<|im_end|>'


print_info: PAD token        = 151643 '<|endoftext|>'


print_info: LF token         = 198 'Ċ'


print_info: FIM PRE token    = 151659 '<|fim_prefix|>'


print_info: FIM SUF token    = 151661 '<|fim_suffix|>'


print_info: FIM MID token    = 151660 '<|fim_middle|>'


print_info: FIM PAD token    = 151662 '<|fim_pad|>'


print_info: FIM REP token    = 151663 '<|repo_name|>'


print_info: FIM SEP token    = 151664 '<|file_sep|>'


print_info: EOG token        = 151643 '<|endoftext|>'


print_info: EOG token        = 151645 '<|im_end|>'


print_info: EOG token        = 151662 '<|fim_pad|>'


print_info: EOG token        = 151663 '<|repo_name|>'


print_info: EOG token        = 151664 '<|file_sep|>'


print_info: max token length = 256


load_tensors: loading model tensors, this can take a while... (mmap = true)


load_tensors: layer   0 assigned to device CPU, is_swa = 0


load_tensors: layer   1 assigned to device CPU, is_swa = 0


load_tensors: layer   2 assigned to device CPU, is_swa = 0


load_tensors: layer   3 assigned to device CPU, is_swa = 0


load_tensors: layer   4 assigned to device CPU, is_swa = 0


load_tensors: layer   5 assigned to device CPU, is_swa = 0


load_tensors: layer   6 assigned to device CPU, is_swa = 0


load_tensors: layer   7 assigned to device CPU, is_swa = 0


load_tensors: layer   8 assigned to device CPU, is_swa = 0


load_tensors: layer   9 assigned to device CPU, is_swa = 0


load_tensors: layer  10 assigned to device CPU, is_swa = 0


load_tensors: layer  11 assigned to device CPU, is_swa = 0


load_tensors: layer  12 assigned to device CPU, is_swa = 0


load_tensors: layer  13 assigned to device CPU, is_swa = 0


load_tensors: layer  14 assigned to device CPU, is_swa = 0


load_tensors: layer  15 assigned to device CPU, is_swa = 0


load_tensors: layer  16 assigned to device CPU, is_swa = 0


load_tensors: layer  17 assigned to device CPU, is_swa = 0


load_tensors: layer  18 assigned to device CPU, is_swa = 0


load_tensors: layer  19 assigned to device CPU, is_swa = 0


load_tensors: layer  20 assigned to device CPU, is_swa = 0


load_tensors: layer  21 assigned to device CPU, is_swa = 0


load_tensors: layer  22 assigned to device CPU, is_swa = 0


load_tensors: layer  23 assigned to device CPU, is_swa = 0


load_tensors: layer  24 assigned to device CPU, is_swa = 0


load_tensors: layer  25 assigned to device CPU, is_swa = 0


load_tensors: layer  26 assigned to device CPU, is_swa = 0


load_tensors: layer  27 assigned to device CPU, is_swa = 0


load_tensors: layer  28 assigned to device CPU, is_swa = 0


load_tensors: layer  29 assigned to device CPU, is_swa = 0


load_tensors: layer  30 assigned to device CPU, is_swa = 0


load_tensors: layer  31 assigned to device CPU, is_swa = 0


load_tensors: layer  32 assigned to device CPU, is_swa = 0


load_tensors: layer  33 assigned to device CPU, is_swa = 0


load_tensors: layer  34 assigned to device CPU, is_swa = 0


load_tensors: layer  35 assigned to device CPU, is_swa = 0


load_tensors: layer  36 assigned to device CPU, is_swa = 0


load_tensors: layer  37 assigned to device CPU, is_swa = 0


load_tensors: layer  38 assigned to device CPU, is_swa = 0


load_tensors: layer  39 assigned to device CPU, is_swa = 0


load_tensors: layer  40 assigned to device CPU, is_swa = 0


load_tensors: layer  41 assigned to device CPU, is_swa = 0


load_tensors: layer  42 assigned to device CPU, is_swa = 0


load_tensors: layer  43 assigned to device CPU, is_swa = 0


load_tensors: layer  44 assigned to device CPU, is_swa = 0


load_tensors: layer  45 assigned to device CPU, is_swa = 0


load_tensors: layer  46 assigned to device CPU, is_swa = 0


load_tensors: layer  47 assigned to device CPU, is_swa = 0


load_tensors: layer  48 assigned to device CPU, is_swa = 0


load_tensors: layer  49 assigned to device CPU, is_swa = 0


load_tensors: layer  50 assigned to device CPU, is_swa = 0


load_tensors: layer  51 assigned to device CPU, is_swa = 0


load_tensors: layer  52 assigned to device CPU, is_swa = 0


load_tensors: layer  53 assigned to device CPU, is_swa = 0


load_tensors: layer  54 assigned to device CPU, is_swa = 0


load_tensors: layer  55 assigned to device CPU, is_swa = 0


load_tensors: layer  56 assigned to device CPU, is_swa = 0


load_tensors: layer  57 assigned to device CPU, is_swa = 0


load_tensors: layer  58 assigned to device CPU, is_swa = 0


load_tensors: layer  59 assigned to device CPU, is_swa = 0


load_tensors: layer  60 assigned to device CPU, is_swa = 0


load_tensors: layer  61 assigned to device CPU, is_swa = 0


load_tensors: layer  62 assigned to device CPU, is_swa = 0


load_tensors: layer  63 assigned to device CPU, is_swa = 0


load_tensors: layer  64 assigned to device CPU, is_swa = 0


load_tensors: tensor 'token_embd.weight' (q4_K) (and 770 others) cannot be used with preferred buffer type CPU_REPACK, using CPU instead


load_tensors:   CPU_Mapped model buffer size = 18926.01 MiB


.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.




llama_context: constructing llama_context


llama_context: n_seq_max     = 1


llama_context: n_ctx         = 2048


llama_context: n_ctx_per_seq = 2048


llama_context: n_batch       = 512


llama_context: n_ubatch      = 512


llama_context: causal_attn   = 1


llama_context: flash_attn    = 0


llama_context: freq_base     = 1000000.0


llama_context: freq_scale    = 1


llama_context: n_ctx_per_seq (2048) < n_ctx_train (40960) -- the full capacity of the model will not be utilized


set_abort_callback: call


llama_context:        CPU  output buffer size =     0.58 MiB


create_memory: n_ctx = 2048 (padded)


llama_kv_cache_unified: layer   0: dev = CPU


llama_kv_cache_unified: layer   1: dev = CPU


llama_kv_cache_unified: layer   2: dev = CPU


llama_kv_cache_unified: layer   3: dev = CPU


llama_kv_cache_unified: layer   4: dev = CPU


llama_kv_cache_unified: layer   5: dev = CPU


llama_kv_cache_unified: layer   6: dev = CPU


llama_kv_cache_unified: layer   7: dev = CPU


llama_kv_cache_unified: layer   8: dev = CPU


llama_kv_cache_unified: layer   9: dev = CPU


llama_kv_cache_unified: layer  10: dev = CPU


llama_kv_cache_unified: layer  11: dev = CPU


llama_kv_cache_unified: layer  12: dev = CPU


llama_kv_cache_unified: layer  13: dev = CPU


llama_kv_cache_unified: layer  14: dev = CPU


llama_kv_cache_unified: layer  15: dev = CPU


llama_kv_cache_unified: layer  16: dev = CPU


llama_kv_cache_unified: layer  17: dev = CPU


llama_kv_cache_unified: layer  18: dev = CPU


llama_kv_cache_unified: layer  19: dev = CPU


llama_kv_cache_unified: layer  20: dev = CPU


llama_kv_cache_unified: layer  21: dev = CPU


llama_kv_cache_unified: layer  22: dev = CPU


llama_kv_cache_unified: layer  23: dev = CPU


llama_kv_cache_unified: layer  24: dev = CPU


llama_kv_cache_unified: layer  25: dev = CPU


llama_kv_cache_unified: layer  26: dev = CPU


llama_kv_cache_unified: layer  27: dev = CPU


llama_kv_cache_unified: layer  28: dev = CPU


llama_kv_cache_unified: layer  29: dev = CPU


llama_kv_cache_unified: layer  30: dev = CPU


llama_kv_cache_unified: layer  31: dev = CPU


llama_kv_cache_unified: layer  32: dev = CPU


llama_kv_cache_unified: layer  33: dev = CPU


llama_kv_cache_unified: layer  34: dev = CPU


llama_kv_cache_unified: layer  35: dev = CPU


llama_kv_cache_unified: layer  36: dev = CPU


llama_kv_cache_unified: layer  37: dev = CPU


llama_kv_cache_unified: layer  38: dev = CPU


llama_kv_cache_unified: layer  39: dev = CPU


llama_kv_cache_unified: layer  40: dev = CPU


llama_kv_cache_unified: layer  41: dev = CPU


llama_kv_cache_unified: layer  42: dev = CPU


llama_kv_cache_unified: layer  43: dev = CPU


llama_kv_cache_unified: layer  44: dev = CPU


llama_kv_cache_unified: layer  45: dev = CPU


llama_kv_cache_unified: layer  46: dev = CPU


llama_kv_cache_unified: layer  47: dev = CPU


llama_kv_cache_unified: layer  48: dev = CPU


llama_kv_cache_unified: layer  49: dev = CPU


llama_kv_cache_unified: layer  50: dev = CPU


llama_kv_cache_unified: layer  51: dev = CPU


llama_kv_cache_unified: layer  52: dev = CPU


llama_kv_cache_unified: layer  53: dev = CPU


llama_kv_cache_unified: layer  54: dev = CPU


llama_kv_cache_unified: layer  55: dev = CPU


llama_kv_cache_unified: layer  56: dev = CPU


llama_kv_cache_unified: layer  57: dev = CPU


llama_kv_cache_unified: layer  58: dev = CPU


llama_kv_cache_unified: layer  59: dev = CPU


llama_kv_cache_unified: layer  60: dev = CPU


llama_kv_cache_unified: layer  61: dev = CPU


llama_kv_cache_unified: layer  62: dev = CPU


llama_kv_cache_unified: layer  63: dev = CPU


llama_kv_cache_unified:        CPU KV buffer size =   512.00 MiB


llama_kv_cache_unified: size =  512.00 MiB (  2048 cells,  64 layers,  1 seqs), K (f16):  256.00 MiB, V (f16):  256.00 MiB


llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


llama_context: enumerating backends


llama_context: backend_ptrs.size() = 1


llama_context: max_nodes = 65536


llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0


graph_reserve: reserving a graph for ubatch with n_tokens =  512, n_seqs =  1, n_outputs =  512


graph_reserve: reserving a graph for ubatch with n_tokens =    1, n_seqs =  1, n_outputs =    1


graph_reserve: reserving a graph for ubatch with n_tokens =  512, n_seqs =  1, n_outputs =  512


llama_context:        CPU compute buffer size =   307.00 MiB


llama_context: graph nodes  = 2502


llama_context: graph splits = 1


CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | F16C = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
Model metadata: {'general.file_type': '15', 'tokenizer.ggml.bos_token_id': '151643', 'qwen2.attention.layer_norm_rms_epsilon': '0.000010', 'tokenizer.ggml.eos_token_id': '151645', 'qwen2.rope.freq_base': '1000000.000000', 'qwen2.attention.head_count': '40', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'qwen2.feed_forward_length': '27648', 'general.architecture': 'qwen2', 'tokenizer.ggml.padding_token_id': '151643', 'qwen2.embedding_length': '5120', 'general.basename': 'QwQ', 'tokenizer.ggml.add_bos_token': 'false', 'general.base_model.0.organization': 'Qwen', 'tokenizer.ggml.pre': 'qwen2', 'general.name': 'QwQ 32B', 'general.base_model.0.name': 'Qwen2.5 32B', 'qwen2.block_count': '64', 'general.type': 'model', 'general.size_label': '32B', 'tokenizer.chat_template': '{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n

QwQModel pre-loaded successfully.


In [4]:
# --- Cell 4: Test Text Generation ---
# Smoke test: ask the pre-loaded model to continue a short prompt and report
# the result both to the log file and to the cell output.
# The check is an explicit `is not None` because "model failed to load" is
# signalled by None; plain truthiness would depend on whatever __bool__ or
# __len__ the model object happens to define.
if preloaded_qwq_model is not None:
    try:
        prompt = "Once upon a time,"
        generated_text = preloaded_qwq_model.generate(prompt)
        logger.info(f"Generated text: {generated_text}")
        print(f"Generated text: {generated_text}")
    except Exception as e:
        # Best-effort: keep the notebook running, but record the traceback.
        logger.error(f"Error during text generation test: {e}", exc_info=True)
        print(f"Error during text generation test: {e}")
else:
    # Make the skip visible instead of producing an empty cell output.
    logger.warning("Skipping text generation test: QwQModel was not loaded.")
    print("Skipping text generation test: QwQModel was not loaded.")

llama_perf_context_print:        load time =    2926.12 ms


llama_perf_context_print: prompt eval time =    2925.99 ms /     5 tokens (  585.20 ms per token,     1.71 tokens per second)


llama_perf_context_print:        eval time =  111813.28 ms /   149 runs   (  750.42 ms per token,     1.33 tokens per second)


llama_perf_context_print:       total time =  114943.95 ms /   154 tokens


Generated text:  in a kingdom far away, there was a princess named Isabella. She lived in a grand castle with her parents, the king and queen. Isabella was known throughout the land for her kindness and her love of stories. Every night, her parents would tell her a story before she went to sleep. But one night, the stories stopped. The queen had fallen ill, and the king was too busy with the affairs of the kingdom to take her place. Isabella was sad and lonely. She missed the stories and the closeness she felt with her parents during that special time. One day, while exploring the castle, Isabella discovered an old, dusty book in the library. It was a book of fairy tales, but unlike any she had
