In [None]:
import logging
import sys
from pathlib import Path

import numpy as np
import torch

# --- Setup Logging and Paths ---
# Send INFO-and-above records to notebook.log; filemode='w' overwrites the
# file instead of appending to it.
logging.basicConfig(level=logging.INFO, filename='notebook.log', filemode='w')
logger = logging.getLogger(__name__)

# Put <project_root>/layered-context-graph/src at the front of sys.path so the
# `models` package below can be imported. The membership check keeps re-runs
# of this cell from inserting the same entry repeatedly.
project_root = Path('.').resolve()
src_path = project_root / 'layered-context-graph' / 'src'
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
logger.info("Project root set to: %s", project_root)

# Import the refactored model class (must come after the sys.path tweak above)
from models.qwq_model import QwQModel

In [4]:
# --- Cell 2: Model Initialization ---
# Constructing QwQModel is expected to be cheap: per its lazy-loading design
# the weights are only moved to the device on the first method call.

MODEL_PATH = './QwQ_LCoT_7B_Instruct'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("Initializing QwQModel (lazy)...")
model_instance = QwQModel(MODEL_PATH, device)
print("QwQModel initialized. No data has been loaded yet.")

Initializing QwQModel (lazy)...
QwQModel initialized. No data has been loaded yet.


In [5]:
# --- Cell 3: Test Text Generation ---
# Smoke test for generate(). This is the first method called on
# model_instance, so the lazy load to the GPU is expected to happen here.

try:
    prompt = "The best way to build a knowledge graph is"
    print(f"Testing generation with prompt: '{prompt}'")

    # First call into the model: triggers loading before generating.
    generated_text = model_instance.generate(prompt, max_tokens=50)

    logger.info(f"Generated text: {generated_text}")
    # Emit the banner, the text, and the closing rule one line at a time.
    for line in ("\n--- Generated Text ---", generated_text, "----------------------"):
        print(line)

except Exception as exc:
    # Best-effort check: record the traceback in the log file and show a short
    # message in the cell output instead of aborting the notebook run.
    logger.exception(f"Error during text generation test: {exc}")
    print(f"Error during text generation test: {exc}")

Testing generation with prompt: 'The best way to build a knowledge graph is'
Model device: meta
Model device after loading: cuda:0

--- Generated Text ---
The best way to build a knowledge graph is!! The!!! Best!!!!! Way!! is!!! to!!!!! start!!!!!!! with!!!!!!!!!!!!!!!!
----------------------


In [8]:
# --- Cell 4: Test Attention Extraction ---
# Smoke test for extract_attention(). The model was already loaded by the
# generation test, so this call should not pay the loading cost again.
import numpy as np
try:
    text_for_attention = "Attention mechanisms in Transformers allow the model to focus on relevant parts of the input."
    print(f"\nTesting attention extraction with text: '{text_for_attention}'")

    attention_data = model_instance.extract_attention(text_for_attention)

    # Verify the output: it must be a dict carrying an 'attentions' entry.
    if not isinstance(attention_data, dict) or 'attentions' not in attention_data:
        print("Attention extraction did not return the expected format.")
    else:
        # One entry per layer; report the count and the shape of the first one.
        num_layers = len(attention_data['attentions'])
        print(f"Successfully extracted attention for {num_layers} layers.")
        first_layer_shape = np.array(attention_data['attentions'][0]).shape
        print(f"Shape of attention tensor in the first layer: {first_layer_shape}")
        logger.info(f"Attention extraction successful for {num_layers} layers.")

except Exception as exc:
    # Best-effort check: keep the traceback in the log file and surface a
    # short message in the cell output rather than stopping the notebook.
    logger.exception(f"Error during attention extraction test: {exc}")
    print(f"Error during attention extraction test: {exc}")


Testing attention extraction with text: 'Attention mechanisms in Transformers allow the model to focus on relevant parts of the input.'
Successfully extracted attention for 28 layers.
Shape of attention tensor in the first layer: (28, 16, 16)
