In [1]:
import sys
from pathlib import Path
import logging
import torch
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# --- Setup Logging and Paths ---
logging.basicConfig(level=logging.INFO, filename='notebook.log', filemode='w')
logger = logging.getLogger(__name__)

project_root = Path('.').resolve()
src_path = project_root / 'layered-context-graph' / 'src'
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
logger.info(f"Project root set to: {project_root}")

from models.qwq_model import QwQModel
from partitioning.partition_manager import PartitionManager
from graph.simplicial_calculator import SimplicialComplexCalculator
from models.baai_model import BAAIModel

In [2]:
# --- Cell 2: Model and Partition Manager Initialization ---
MODEL_PATH = './QwQ_LCoT_7B_Instruct'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Initializing QwQModel (lazy)...")
qwq_model = QwQModel(MODEL_PATH, device)
print("QwQModel initialized.")

print("Initializing PartitionManager...")
partition_manager = PartitionManager(attention_extractor=qwq_model, cohesion_threshold=0.3)
print("PartitionManager initialized.")

Initializing QwQModel (lazy)...
QwQModel initialized.
Initializing PartitionManager...
PartitionManager initialized.


In [None]:
# --- Cell 3: Define Sample Text and Extract Attention ---
sample_text = ( """
Project AURA focuses on transforming linear documents into knowledge graphs. This process involves disassembly, reconstruction, and reassembly. The core idea is to use percolation theory to determine optimal chunking strategies, ensuring a 15-30% overlap between segments for global connectivity.
The system uses a language-guided processor. K-rules for disassembly define how to segment the text, for example, splitting at topic shifts or semantic boundaries. This is achieved by analyzing attention patterns from a transformer model.
Once the graph is constructed, G-rules for reassembly are applied. These rules organize the graph's nodes, which represent the segments, into a new, coherent document. This allows for the creation of purpose-driven outputs like summaries or tutorials from the same underlying knowledge structure.
A key challenge is handling long documents without exceeding GPU memory. The attention mechanism has a quadratic memory complexity, O(n^2), where n is the sequence length. To solve this, a sliding window approach is used. The document is processed in smaller, overlapping chunks, and the resulting attention matrices are stitched together to approximate the full document's attention patterns.
This ensures that the system can scale to documents of any length while still leveraging the power of attention-based semantic analysis for high-quality segmentation and graph construction. The final output is a rich, interconnected knowledge graph that can be reassembled in various ways.",
""" )

sample_text = open('Layer_Context_Window_Graphs', 'r').read() if Path('Layer_Context_Window_Graphs').exists() else sample_text

print("Extracting attention and calculating simplicial complexes using a tiling window...")
# Instantiate the calculator
s_calculator = SimplicialComplexCalculator(threshold=0.3)

# Pass the calculator and set use_sliding_window to False for tiling
simplicial_results = qwq_model.extract_attention(sample_text, use_sliding_window=False, calculator=s_calculator)

if simplicial_results:
    print(f"Successfully processed {len(simplicial_results)} windows.")
else:
    print("Failed to process attention windows.")

In [None]:
# --- Cell 4: Display Simplicial Complex Counts ---
if simplicial_results:
    for i, result in enumerate(simplicial_results):
        print(f"\n--- Window {i+1} Results ---")
        print(f"Snippet: {result['metadata']['text_snippet']}")
        for j, head_simplices in enumerate(result['simplices']):
            simplex_counts = {}
            for simplex in head_simplices:
                dim = len(simplex) - 1
                simplex_counts[dim] = simplex_counts.get(dim, 0) + 1
            print(f"\n-- Head {j+1} Simplex Counts --")
            for dim, count in sorted(simplex_counts.items()):
                print(f"  {dim}-simplices: {count}")
else:
    print("No simplicial complex data to display.")

In [3]:
# --- BAAI Model Tests ---
print("--- Starting BAAI Model Tests ---")
BAAI_MODEL_PATH = './bge-en-icl'
baai_model = BAAIModel(BAAI_MODEL_PATH, device)
print("BAAIModel initialized (lazy).")

--- Starting BAAI Model Tests ---
BAAIModel initialized (lazy).


In [4]:
# --- Cell 8: Test Encoding and Similarity ---
print("\nTesting encoding and similarity...")
text_a = "The cat sat on the mat."
text_b = "A feline was resting on the rug."
text_c = "The dog chased the ball."

embedding_a = baai_model.encode(text_a)
print(f"Embedding for text A (shape): {embedding_a.shape}")

similarity_ab = baai_model.compute_similarity(text_a, text_b)
similarity_ac = baai_model.compute_similarity(text_a, text_c)

print(f"Similarity (A vs B): {similarity_ab:.4f}")
print(f"Similarity (A vs C): {similarity_ac:.4f}")


Testing encoding and similarity...
Model device: meta
Model device after loading: cuda:0
Embedding for text A (shape): (4096,)
Similarity (A vs B): 0.2811
Similarity (A vs C): 0.4036


In [8]:
# --- Cell 9: Test Prompt-Based Segmentation ---
print("\nTesting prompt-based segmentation...")
segmentation_text = """
The first part of the process is data collection. We gather information from various sources.
Next, we move to data processing. This involves cleaning and transforming the raw data.
Finally, the analysis phase begins. We apply statistical models to uncover insights.
"""

segmentation_text = open('Project_Workflow_Segmentation', 'r').read() if Path('Project_Workflow_Segmentation').exists() else segmentation_text

segmentation_prompt = "Identify the distinct stages of the project workflow."

prompt_segments = baai_model.segment_by_prompt(segmentation_text, segmentation_prompt)

print(f"\nFound {len(prompt_segments)} segments based on the prompt:")
for i, seg in enumerate(prompt_segments):
    print(f"  Segment {i+1}: {seg['text']}")


Testing prompt-based segmentation...

Found 1 segments based on the prompt:
  Segment 1: 
The first part of the process is data collection. We gather information from various sources.
Next, we move to data processing. This involves cleaning and transforming the raw data.
Finally, the analysis phase begins. We apply statistical models to uncover insights.



In [11]:
# --- Cell 10: Test Semantic Boundary Detection ---
print("\nTesting semantic boundary detection...")
boundary_text = """
The sun rises in the east, casting long shadows across the landscape. Birds begin to sing their morning songs.
Suddenly, the topic shifts to astrophysics. The lifecycle of a star is a complex process, from nebula to supernova.
Returning to our original theme, the weather today is expected to be warm and sunny.
"""
boundary_text = open('Layer_Context_Window_Graphs', 'r').read() if Path('Layer_Context_Window_Graphs').exists() else boundary_text

boundaries = baai_model.find_semantic_boundaries(boundary_text)

print(f"\nFound {len(boundaries)} semantic boundaries at character positions: {boundaries}")


Testing semantic boundary detection...

Found 32 semantic boundaries at character positions: [257, 2034, 12100, 13962, 16024, 17581, 18729, 22864, 23278, 27093, 37334, 38557, 39606, 39755, 41132, 43263, 47664, 48404, 52324, 52499, 52646, 53347, 60668, 66126, 69518, 69619, 73122, 76180, 83572, 84233, 98823, 102667]


In [12]:
for i in range(len(boundaries) - 1):
    start = boundaries[i]
    end = boundaries[i + 1]
    segment = boundary_text[start:end]
    print(f"Segment {i+1}: {segment.strip()} (from {start} to {end})")

Segment 1: **Head Specialization Discovery**
```python
def discover_head_specializations(self, model, test_corpus):
    """Automatically find what each head is good at"""
    
    head_skills = {}
    
    for layer in range(model.config.num_hidden_layers):
        for head in range(model.config.num_attention_heads):
            # Test each head on different tasks
            boundary_score = test_boundary_detection(layer, head, test_corpus)
            relation_score = test_relation_detection(layer, head, test_corpus)
            cluster_score = test_clustering_ability(layer, head, test_corpus)
            
            head_skills[(layer, head)] = {
                'boundary': boundary_score,
                'relation': relation_score,
                'cluster': cluster_score
            }
    
    # Assign heads to rules based on their strengths
    self.boundary_heads = top_k_heads(head_skills, 'boundary', k=5)
    self.relation_heads = top_k_heads(head_skills, 'relation', k=5)
```


In [None]:
# --- Cell 11: Model Info and Teardown ---
print("\nGetting BAAI model info...")
model_info = baai_model.get_model_info()
print(model_info)

print("\nUnloading BAAI model...")
baai_model.unload()
print("BAAI model unloaded.")