In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Chunking Lesson plan with Langchain

In [2]:
# High Level explanation for lesson plan chunking: 
"""
1. Hierarchically chunked by markers and sections.
2. Embedded with metadata including grade, subject, topic, and section type.
3. Queried using metadata filters that match fields such as grade and topic.
"""

'\n1. Hierarchically chunked by markers and sections.\n2. Embedded with metadata including grade, subject, topic, and section type.\n3. Queried using metadata filters that match fields such as grade and topic.\n'

In [3]:
import re
import json
import uuid
from typing import List
from langchain.text_splitter import TextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
import chromadb

# -------------------------------
# 1. Our own Custom TextSplitter 
# note: we are using Hierarchical Chunking
# -------------------------------
class LessonPlanTextSplitter(TextSplitter):
    def split_text(self, text: str) -> List[Document]:
        stop_marker = r"Note: The following pages are intended for classroom use for students as a visual aid to learning\."
        split_block = re.split(stop_marker, text, maxsplit=1)
        content = split_block[0].strip() if split_block else text.strip()

        # Split into intro and remainder
        parts = re.split(r"Student/Teacher Actions:\s*", content, flags=re.IGNORECASE, maxsplit=1)
        if len(parts) == 2:
            intro, remainder = parts[0].strip(), parts[1].strip()
        else:
            intro, remainder = content, ""

        docs = []
        docs.append(Document(page_content=intro, metadata={"section": "intro_context"}))

        combined_pattern = r"(Assessment\s*\n|Extensions(?: and Connections)?\s*\n|Strategies for Differentiation\s*\n)"
        split_sec = re.split(combined_pattern, remainder)

        # Instructional steps (first segment)
        instr = split_sec[0].strip()
        if instr:
            docs.append(Document(page_content=instr, metadata={"section": "instructional_steps"}))

        # Subsequent header/content pairs
        for i in range(1, len(split_sec) - 1, 2):
            header = split_sec[i].strip().lower()
            content = split_sec[i + 1].strip()
            if header.startswith("assessment"):
                docs.append(Document(page_content=content, metadata={"section": "assessment"}))
            elif header.startswith("extensions"):
                docs.append(Document(page_content=content, metadata={"section": "extensions"}))
            elif header.startswith("strategies for differentiation"):
                docs.append(Document(page_content=content, metadata={"section": "differentiation"}))

        return docs

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# -------------------------------
# 2. Chunk the Lesson Plans 
# note: here our main file is test_with_notes.txt where all our lesson plan is stored
# -------------------------------
with open("test_with_notes_modified.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

# Split into individual lesson plan blocks
lesson_plan_blocks = re.findall(
    r"--- Start of Lesson Plan(.*?)--- End of Lesson Plan",
    full_text,
    re.DOTALL
)

splitter = LessonPlanTextSplitter()
all_docs: List[Document] = []

for lesson_index, block in enumerate(lesson_plan_blocks):
    docs = splitter.split_text(block)
    # Tag each chunk with its lesson index
    for doc in docs:
        doc.metadata["lesson_index"] = lesson_index
    all_docs.extend(docs)

In [5]:
# -------------------------------
# 3. Extract Lesson-Level Metadata (Updated with strand + cleaned subject)
# -------------------------------

def extract_metadata_from_intro(intro: str):
    special_grades = [
    "Kindergarten",
    "Algebra I",
    "Geometry",
    "Algebra 2",
    "Algebra, Functions & Data Analysis"
]

    # First try to match "Grade X"
    grade_match = re.search(r"Grade\s*(\d+)", intro)
    grade = grade_match.group(1) if grade_match else None

    # If no "Grade X" found, check for special labels
    if not grade:
        for sg in special_grades:
            if sg.lower() in intro.lower():
                grade = sg
                break

    # Try to extract subject between "Subject:" and "Strand:", if available
    subject_match = re.search(r"Subject:\s*(.*?)\s*Strand:", intro, flags=re.DOTALL)

    # If not found, try to extract subject from "Subject:" alone
    if not subject_match:
        subject_match = re.search(r"Subject:\s*(.+)", intro)

    strand = re.search(r"Strand:\s*(.+)", intro)
    topic = re.search(r"Topic:\s*(.+)", intro)
    title = intro.split("\n")[0].strip()

    # Clean subject text
    subject_text = subject_match.group(1).strip() if subject_match else None
    if subject_text:
        subject_text = re.sub(r'\s+', ' ', subject_text)

    return {
        "grade": grade,
        "subject": subject_text,
        "strand": strand.group(1).strip() if strand else None,
        "topic": topic.group(1).strip() if topic else None,
        "lesson_title": title
    }


# Build map: lesson_index -> metadata
lesson_metadata_map = {}
for doc in all_docs:
    if doc.metadata["section"] == "intro_context":
        lesson_metadata_map[doc.metadata["lesson_index"]] = extract_metadata_from_intro(doc.page_content)
# # -------------------------------
# # Print out the metadata for each lesson
# # -------------------------------
# for lesson_index, metadata in lesson_metadata_map.items():
#     print(f"Lesson Index: {lesson_index}")
#     print(f"  Lesson Title: {metadata.get('lesson_title')}")
#     print(f"  Grade: {metadata.get('grade')}")
#     print(f"  Strand: {metadata.get('strand')}")
#     print(f"  Subject: {metadata.get('subject')}")
#     print(f"  Topic: {metadata.get('topic')}")
#     print("-" * 2)

## Section 4

In [6]:
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient

model = SentenceTransformer("all-MiniLM-L6-v2")

client = PersistentClient(path="./chroma_store")
# client.delete_collection("lesson_plans")
collection = client.get_or_create_collection("lesson_plans")
# Optional: clear existing collection to avoid duplicates
# collection.delete()  # Uncomment this only if you want a fresh start

existing_ids = set(collection.get()["ids"])

for doc in all_docs:
    # Update metadata from lesson_metadata_map
    lesson_meta = lesson_metadata_map.get(doc.metadata["lesson_index"], {})
    doc.metadata.update(lesson_meta)

    # Clean nulls
    cleaned_metadata = {k: v for k, v in doc.metadata.items() if v is not None}

    doc_id = f"lesson{cleaned_metadata['lesson_index']}_{cleaned_metadata['section']}"
    if doc_id in existing_ids:
        continue

    embedding = model.encode(doc.page_content).tolist()
    collection.add(
        documents=[doc.page_content],
        metadatas=[cleaned_metadata],
        embeddings=[embedding],
        ids=[doc_id]
    )
    existing_ids.add(doc_id)

print("🔁 Re-embedding complete with updated metadata.")

🔁 Re-embedding complete with updated metadata.


In [7]:
# -------------------------------
# STEP 5: Querying the Lesson Plan
# note: trial sample query to check if its working
# -------------------------------
import json
from chromadb import PersistentClient

query_text = "How do I teach Absolute Value Equations and Inequalities for Algebra 2?"

metadata_filter = {
    "$and": [
        {"grade": "Algebra 2"},
        # {"strand": "Number and Number Sense"},
        {"subject": "Absolute Value Equations and Inequalities"}
    ]
}

results = collection.query(
    query_texts=[query_text],
    n_results=5,
    where=metadata_filter,
    include=["documents", "metadatas", "distances"]
)

flat = {
    "ids": results["ids"][0],
    "documents": results["documents"][0],
    "metadatas": results["metadatas"][0],
    "distances": results["distances"][0]
}

# Pretty-print for verification
print(json.dumps(flat, indent=4))

{
    "ids": [
        "lesson36_intro_context",
        "lesson36_instructional_steps",
        "lesson36_extensions",
        "lesson36_assessment",
        "lesson36_differentiation"
    ],
    "documents": [
        "Algebra 2-Equations and Inequalities-AII.3a - Absolute Value Equations and Inequalities.pdf ---\nSubject:\nAbsolute Value Equations and Inequalities\nStrand:\nEquations and Inequalities\nTopic:\nSolving absolute value equations and inequalities\nPrimary SOL:\nAII.3 The student will solve\na) The student will solve absolute value equations and\ninequalities.\nRelated SOL:\nAII.6\nMaterials:\n\uf0b7 Absolute Value Matching Cards activity sheet (attached)\n\uf0b7 Absolute Value Equations/Inequalities activity sheet (attached)\n\uf0b7 Absolute Value Stations Review activity sheet (attached)\n\uf0b7 Graphing utility\n\uf0b7 Colored pencils or highlighters.\nVocabulary\nabsolute value inequality, compound inequality, compound statement, intersection, linear\nequation, linear

In [8]:
# Verify grade distribution
sample = collection.get(include=["metadatas"])
grade_counts = {}

for meta in sample["metadatas"]:
    grade = meta.get("grade")
    if grade:
        grade_counts[grade] = grade_counts.get(grade, 0) + 1

print("📚 Document count by Grade:\n")
for grade in sorted(grade_counts):
    print(f"Grade: {grade:<45} → Count: {grade_counts[grade]}")

# Check if any expected grade is missing
expected_grades = [
    "Kindergarten",
    "Grade 1",
    "Grade 2",
    "Grade 3",
    "Grade 4",
    "Grade 5",
    "Grade 6",
    "Grade 7",
    "Grade 8",
    "Algebra I",
    "Geometry",
    "Algebra 2",
    "Algebra, Functions & Data Analysis"
]

print("\n🔎 Checking for expected grades:")
for g in expected_grades:
    if g not in grade_counts:
        print(f"⚠️  Missing: {g}")

📚 Document count by Grade:

Grade: 1                                             → Count: 144
Grade: 2                                             → Count: 156
Grade: 3                                             → Count: 170
Grade: 4                                             → Count: 190
Grade: 5                                             → Count: 112
Grade: 6                                             → Count: 122
Grade: 7                                             → Count: 106
Grade: 8                                             → Count: 112
Grade: Algebra 2                                     → Count: 112
Grade: Algebra I                                     → Count: 146
Grade: Algebra, Functions & Data Analysis            → Count: 78
Grade: Geometry                                      → Count: 96
Grade: Kindergarten                                  → Count: 128

🔎 Checking for expected grades:
⚠️  Missing: Grade 1
⚠️  Missing: Grade 2
⚠️  Missing: Grade 3
⚠️  Missing: Grade 4

In [9]:
# note: Just for confirmation we are printing full lesson plan
# Concatenate all 5 documents into a full lesson text if we want to use it as a single lesson
full_lesson = "\n\n".join(flat["documents"])

print(full_lesson)


Algebra 2-Equations and Inequalities-AII.3a - Absolute Value Equations and Inequalities.pdf ---
Subject:
Absolute Value Equations and Inequalities
Strand:
Equations and Inequalities
Topic:
Solving absolute value equations and inequalities
Primary SOL:
AII.3 The student will solve
a) The student will solve absolute value equations and
inequalities.
Related SOL:
AII.6
Materials:
 Absolute Value Matching Cards activity sheet (attached)
 Absolute Value Equations/Inequalities activity sheet (attached)
 Absolute Value Stations Review activity sheet (attached)
 Graphing utility
 Colored pencils or highlighters.
Vocabulary
absolute value inequality, compound inequality, compound statement, intersection, linear
equation, linear inequality, interval notation, union, set-builder notation, solution set

What should students be doing? What should teachers be doing?
Time: 90 minutes
1. Review graphing and writing inequalities with students. Have students work to fill in the
table following in c

# Chunking with Subject - OLD

In [1]:
import re
import json
import uuid
from typing import List
from langchain.text_splitter import TextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
import chromadb

# -------------------------------
# 1. Our own Custom TextSplitter 
# note: we are using Hierarchical Chunking
# -------------------------------
class LessonPlanTextSplitter(TextSplitter):
    def split_text(self, text: str) -> List[Document]:
        stop_marker = r"Note: The following pages are intended for classroom use for students as a visual aid to learning\."
        split_block = re.split(stop_marker, text, maxsplit=1)
        content = split_block[0].strip() if split_block else text.strip()

        # Split into intro and remainder
        parts = re.split(r"Student/Teacher Actions:\s*", content, flags=re.IGNORECASE, maxsplit=1)
        if len(parts) == 2:
            intro, remainder = parts[0].strip(), parts[1].strip()
        else:
            intro, remainder = content, ""

        docs = []
        docs.append(Document(page_content=intro, metadata={"section": "intro_context"}))

        combined_pattern = r"(Assessment\s*\n|Extensions(?: and Connections)?\s*\n|Strategies for Differentiation\s*\n)"
        split_sec = re.split(combined_pattern, remainder)

        # Instructional steps (first segment)
        instr = split_sec[0].strip()
        if instr:
            docs.append(Document(page_content=instr, metadata={"section": "instructional_steps"}))

        # Subsequent header/content pairs
        for i in range(1, len(split_sec) - 1, 2):
            header = split_sec[i].strip().lower()
            content = split_sec[i + 1].strip()
            if header.startswith("assessment"):
                docs.append(Document(page_content=content, metadata={"section": "assessment"}))
            elif header.startswith("extensions"):
                docs.append(Document(page_content=content, metadata={"section": "extensions"}))
            elif header.startswith("strategies for differentiation"):
                docs.append(Document(page_content=content, metadata={"section": "differentiation"}))

        return docs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -------------------------------
# 2. Chunk the Lesson Plans 
# note: here our main file is test_with_notes.txt where all our lesson plan is stored
# -------------------------------
with open("test_with_notes.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

# Split into individual lesson plan blocks
lesson_plan_blocks = re.findall(
    r"--- Start of Lesson Plan(.*?)--- End of Lesson Plan",
    full_text,
    re.DOTALL
)

splitter = LessonPlanTextSplitter()
all_docs: List[Document] = []

for lesson_index, block in enumerate(lesson_plan_blocks):
    docs = splitter.split_text(block)
    # Tag each chunk with its lesson index
    for doc in docs:
        doc.metadata["lesson_index"] = lesson_index
    all_docs.extend(docs)

In [3]:
# -------------------------------
# 3. Extract Lesson-Level Metadata original
# note: here our metda data is split based on lesson index and then we have grade, subject, topic and lesson title
# -------------------------------
def extract_metadata_from_intro(intro: str):
    grade = re.search(r"Grade\s*(\d+)", intro)
    subject = re.search(r"Strand:\s*(.+)", intro)
    topic = re.search(r"Topic:\s*(.+)", intro)
    title = intro.split("\n")[0].strip()
    return {
        "grade": grade.group(1) if grade else None,
        "subject": subject.group(1).strip() if subject else None,
        "topic": topic.group(1).strip() if topic else None,
        "lesson_title": title
    }

# Build map: lesson_index -> metadata
lesson_metadata_map = {}
for doc in all_docs:
    if doc.metadata["section"] == "intro_context":
        lesson_metadata_map[doc.metadata["lesson_index"]] = extract_metadata_from_intro(doc.page_content)


In [4]:
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient

model = SentenceTransformer("all-MiniLM-L6-v2")

# client = PersistentClient(path="./chroma_store1")
client = PersistentClient(path="./chroma_store")
collection = client.get_or_create_collection("lesson_plans")

# Fetch existing IDs
existing_ids = set(collection.get()["ids"])

for doc in all_docs:
    lm = lesson_metadata_map.get(doc.metadata["lesson_index"], {})
    doc.metadata.update(lm)
    cleaned_metadata = {k: v for k, v in doc.metadata.items() if v is not None}

    doc_id = f"lesson{cleaned_metadata['lesson_index']}_{cleaned_metadata['section']}"
    if doc_id in existing_ids:
        continue

    embedding = model.encode(doc.page_content).tolist()
    collection.add(
        documents=[doc.page_content],
        metadatas=[cleaned_metadata],
        embeddings=[embedding],
        ids=[doc_id]
    )
    existing_ids.add(doc_id)

print("Embedding complete. All chunks stored with full metadata.")

Embedding complete. All chunks stored with full metadata.


#### Query 

In [8]:
# -------------------------------
# STEP 5: Querying the Lesson Plan
# note: trial sample query to check if its working
# -------------------------------
import json
from chromadb import PersistentClient

query_text = "How do I teach decimal rounding in number and number sense for grade 5?"

metadata_filter = {
    "$and": [
        {"grade": "1"},
        {"strand": "Number and Number Sense"},
        {"subject": "Grouping and Counting–Part 2"}
    ]
}

results = collection.query(
    query_texts=[query_text],
    n_results=5,
    where=metadata_filter,
    include=["documents", "metadatas", "distances"]
)

flat = {
    "ids": results["ids"][0],
    "documents": results["documents"][0],
    "metadatas": results["metadatas"][0],
    "distances": results["distances"][0]
}

# Pretty-print for verification
print(json.dumps(flat, indent=4))

{
    "ids": [
        "lesson123_assessment",
        "lesson123_intro_context",
        "lesson123_differentiation",
        "lesson123_instructional_steps"
    ],
    "documents": [
        "\uf0b7 Questions\no Why do I have more groups when I count by twos than when I count by tens?\no What do you notice about the numbers when we count by \u2026\n\uf0a7 Twos? (It is every other number)\n\uf0a7 Fives? (Ends in a zero or a 5)\n\uf0a7 Tens (Ends in a zero)\u201d\n\uf0b7 Journal/writing prompts\no Given a collection of 50 objects, how many groups of twos, fives, and tens would\nI have? Show your work.\no How are counting by fives and tens alike and different.  Is it faster to count by\nfives or tens? Why?\u201d\n\uf0b7 Other Assessments\no Give students a collection of 110 objects and ask them to count them in a way\nthat would be the fastest.\no Using a blank 110 chart, ask students to fill in only the spots for the twos.  Have\nstudents erase and repeat for fives and tens.\nExtension

In [9]:
# note: Just for confirmation we are printing full lesson plan
# Concatenate all 5 documents into a full lesson text if we want to use it as a single lesson
full_lesson = "\n\n".join(flat["documents"])

print(full_lesson)


 Questions
o Why do I have more groups when I count by twos than when I count by tens?
o What do you notice about the numbers when we count by …
 Twos? (It is every other number)
 Fives? (Ends in a zero or a 5)
 Tens (Ends in a zero)”
 Journal/writing prompts
o Given a collection of 50 objects, how many groups of twos, fives, and tens would
I have? Show your work.
o How are counting by fives and tens alike and different.  Is it faster to count by
fives or tens? Why?”
 Other Assessments
o Give students a collection of 110 objects and ask them to count them in a way
that would be the fastest.
o Using a blank 110 chart, ask students to fill in only the spots for the twos.  Have
students erase and repeat for fives and tens.
Extensions and Connections (for all students)
 Show students how to skip count on a calculator.  To skip count by twos, clear the
calculator and then enter + 2 = and continue pressing =.  Students can color the
numbers shown on the calculator display on a 110 cha

# Chunking textbook using Langchain

In [9]:
# # High Level explanation
"""
1. Processed using both recursive (and optionally semantic) chunking approaches.

2. Chunked further into smaller text pieces with overlap to preserve context.

3.Embedded and stored with metadata (e.g., executive_skill and section_title).

4.Retrieval is performed using metadata filters such as "executive_skill": "Working Memory"
"""

'\n1. Processed using both recursive (and optionally semantic) chunking approaches.\n\n2. Chunked further into smaller text pieces with overlap to preserve context.\n\n3.Embedded and stored with metadata (e.g., executive_skill and section_title).\n\n4.Retrieval is performed using metadata filters such as "executive_skill": "Working Memory"\n'

In [10]:
pip install -U langchain langchain-experimental

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [11]:
import re
import json
from langchain.docstore.document import Document

# --- Step 1: Load and split the text file by markers ---
def split_by_markers(text: str):
    """
    Splits the text into sections based on markers of the form "---- <section title> ----"
    Returns a list of tuples: (section_title, content)
    """
    pattern = r"----\s*(.*?)\s*----\n(.*?)(?=----|$)"
    matches = re.findall(pattern, text, re.DOTALL)
    sections = [(title.strip(), content.strip()) for title, content in matches]
    return sections

with open("Smartbutscattered.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

sections = split_by_markers(book_text)
print(f"Total sections: {len(sections)}")
for i, (title, _) in enumerate(sections):
    print(f"Section {i+1}: {title}")

Total sections: 24
Section 1: introduction
Section 2: end of introduction
Section 3: Building Response Inhibition
Section 4: end of Building Response Inhibition
Section 5: Enhancing Working Memory
Section 6: end of Enhancing Working Memory
Section 7: Improving Emotional Control
Section 8: end of Improving Emotional Control
Section 9: Strengthening Sustained Attention
Section 10: end of Strengthening Sustained Attention
Section 11: Teaching Task Initiation
Section 12: end of Teaching Task Initiation
Section 13: Promoting, Planning, and Prioritizing
Section 14: end of Promoting, Planning, and Prioritizing
Section 15: Fostering Organization
Section 16: end of Fostering Organization
Section 17: Instilling Time Management
Section 18: end of Instilling Time Management
Section 19: Encouraging Flexibility
Section 20: end of Encouraging Flexibility
Section 21: Increasing Goal-Directed Persistence
Section 22: end of Increasing Goal-Directed Persistence
Section 23: Cultivating Metacognition
Secti

In [12]:
# --- Step 2: Semantic chunking using LangChain's SemanticChunker ---
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

chunker = SemanticChunker(embeddings=embedding_model)

docs = []
for section_title, content in sections:
    metadata = {"executive_skill": section_title, "section_title": section_title}
    chunks = chunker.split_text(content)
    for chunk in chunks:
        docs.append(Document(page_content=chunk, metadata=metadata))

print(f"Total semantic chunks created: {len(docs)}")

Total semantic chunks created: 119


### Section 4

In [13]:
# --- Step 3: Store chunks into Chroma DB with duplicate checking ---
import chromadb

# Initialize the Chroma client and create/get the "exec_skills" collection.
# client = chromadb.Client()
collection = client.get_or_create_collection("exec_skills")

existing_result = collection.get() 
existing_ids = set(existing_result.get("ids", []))

new_ids = []
new_docs = []
new_metadatas = []
new_embeddings = []

for i, doc in enumerate(docs):
    # Create a deterministic ID using a sequential number and the executive_skill (with spaces removed)
    doc_id = f"exec_{i}_{doc.metadata['executive_skill'].replace(' ', '_')}"
    if doc_id in existing_ids:
        continue

    embedding = embedding_model.embed_documents([doc.page_content])[0]
    new_ids.append(doc_id)
    new_docs.append(doc.page_content)
    new_metadatas.append(doc.metadata)
    new_embeddings.append(embedding)

if new_ids:
    collection.add(
        ids=new_ids,
        documents=new_docs,
        metadatas=new_metadatas,
        embeddings=new_embeddings
    )
    print(f"Added {len(new_ids)} new chunks to the collection.")
else:
    print("No new chunks to add.")


Added 119 new chunks to the collection.


In [14]:
print(collection.peek())

{'ids': ['exec_0_introduction', 'exec_1_introduction', 'exec_2_introduction', 'exec_3_introduction', 'exec_4_introduction', 'exec_5_introduction', 'exec_6_introduction', 'exec_7_introduction', 'exec_8_introduction', 'exec_9_introduction'], 'embeddings': array([[-0.0433003 ,  0.05404672,  0.10007481, ...,  0.04402868,
        -0.02650417,  0.01566317],
       [ 0.030204  , -0.02159898,  0.06438272, ...,  0.04969868,
        -0.00508368,  0.04048944],
       [ 0.01783722, -0.01273656,  0.02721443, ...,  0.1407281 ,
        -0.01282246,  0.01987847],
       ...,
       [-0.01212036,  0.04398706, -0.04357916, ...,  0.04279802,
        -0.06416749,  0.05369508],
       [ 0.09652939,  0.03130122,  0.01609589, ...,  0.10366048,
         0.03726649, -0.00696636],
       [ 0.0324617 , -0.09679312, -0.01684162, ...,  0.1221268 ,
         0.02831617, -0.01210798]]), 'documents': ['How Did Such a Smart Kid End Up So Scattered? Katie is 8 years old. It’s Saturday morning, and her mother has sent he

In [16]:
 #--- Step 4: Query test using metadata filtering ---
# For this example, we query for chunks where the executive_skill is "Enhancing Working Memory".
query_text = "What strategies can help improve working memory?"
metadata_filter = {"executive_skill": "Enhancing Working Memory"}

results = collection.query(
    query_texts=[query_text],
    n_results=5,
    where=metadata_filter,
    include=["documents", "metadatas", "distances"]
)

if results["documents"]:
    flat = {
        "documents": results["documents"][0],
        "metadatas": results["metadatas"][0],
        "distances": results["distances"][0]
    }
    print("Query results:")
    print(json.dumps(flat, indent=4))
else:
    print("No documents retrieved for the given query filter.")

Query results:
{
    "documents": [
        "Enhancing Working Memory Working memory is the capacity to hold information in mind while performing complex tasks. We rely on working memory all the time. It\u2019s the ability to run out to the store to buy a few things and remember what they are without having to write them down. When you remember to stop by the dry cleaner on your way home from work, you\u2019re using working memory. When you look up a phone number in the phone book and remember it long enough to make the call, you\u2019re using working memory. When your spouse asks you to do something and you say, \u201cI\u2019ll do it as soon as I finish loading the dishwasher,\u201d and then you actually remember to do it, chances are your working memory is pretty good. Odds are it\u2019s not so good, however, if you can\u2019t remember anyone\u2019s birthday, you tend to return home with only half your errands done unless you have a written agenda, and you\u2019ll do anything to avoi

# Extra things done - different methods used

## parabased chunking for tb

In [None]:
import re
import json
import uuid
from typing import List, Tuple
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
import chromadb

# -------------------------------
# 1. Top-Level Splitting by Explicit Section Markers
# -------------------------------
def split_by_markers(text: str) -> List[Tuple[str, str]]:
    """
    Splits text into sections based on markers that look like:
      ---- section title ----
    Ignores markers with "end" (case-insensitive).
    
    Returns a list of tuples: (section_title, section_content)
    """
    marker_pattern = r"(?m)^----\s*(.*?)\s*----\s*$"
    markers = list(re.finditer(marker_pattern, text))
    sections = []
    for i, m in enumerate(markers):
        title = m.group(1).strip()
        if "end" in title.lower():
            continue
        start_index = m.end()
        end_index = len(text)
        # Look for the next marker that is not an "end" marker or take rest of text.
        for j in range(i+1, len(markers)):
            next_title = markers[j].group(1).strip()
            if "end" not in next_title.lower():
                end_index = markers[j].start()
                break
            else:
                end_index = markers[j].start()
                break
        content = text[start_index:end_index].strip()
        sections.append((title, content))
    return sections

# -------------------------------
# 2. Second-Level Splitting by Paragraphs
# -------------------------------
def split_section_paragraphs(sections: List[Tuple[str, str]]) -> List[Document]:
    """
    Splits each section by paragraphs (using two or more newlines) and creates a Document for each chunk.
    Applies a simple mapping rule for sections representing executive skills.
    """
    docs = []
    
    def map_title(title: str) -> str:
        lower = title.lower()
        if lower.startswith("building response inhibition"):
            return "Response Inhibition"
        elif lower.startswith("enhancing working memory"):
            return "Working Memory"
        # Add additional mappings if needed.
        return title  # Default
    
    for title, content in sections:
        mapped_title = map_title(title)
        # Split by paragraph (two or more newlines)
        paragraphs = re.split(r"\n\s*\n", content)
        for para in paragraphs:
            para = para.strip()
            if para:
                docs.append(Document(
                    page_content=para,
                    metadata={"executive_skill": mapped_title, "section_title": title}
                ))
    return docs

# -------------------------------
# 3. Load "Smartbutscattered.txt" and Process
# -------------------------------
with open("Smartbutscattered.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

sections = split_by_markers(book_text)
exec_docs = split_section_paragraphs(sections)

# Optional: Save structured chunks to a JSON file for review
with open("exec_skill_chunks_parabased.json", "w", encoding="utf-8") as f:
    json.dump([{"executive_skill": doc.metadata["executive_skill"], "section_title": doc.metadata["section_title"], "content": doc.page_content} for doc in exec_docs], f, indent=4)

# -------------------------------
# 4. Embed & Store in ChromaDB Collection "exec_skills"
# -------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.Client()
exec_collection = client.get_or_create_collection("exec_skills")

# Use a deterministic ID: based on executive_skill and the chunk index
existing_exec_ids = set(exec_collection.get()["ids"])

for idx, doc in enumerate(exec_docs):
    skill = doc.metadata["executive_skill"].replace(" ", "_")
    doc_id = f"exec_{skill}_{idx}"
    if doc_id in existing_exec_ids:
        continue
    embedding = model.encode(doc.page_content).tolist()
    exec_collection.add(
        documents=[doc.page_content],
        metadatas=[doc.metadata],
        embeddings=[embedding],
        ids=[doc_id]
    )
    existing_exec_ids.add(doc_id)

print("Executive skill chunks embedded and stored in the 'exec_skills' collection.")

# -------------------------------
# 5. (Optional) Retrieval Example
# -------------------------------
query_text = "What are some strategies to improve Working Memory?"

# Use a single filter expression directly (do not wrap in $and if there's only one condition)
# metadata_filter = {"executive_skill": "Working Memory"}

results = exec_collection.query(
    query_texts=[query_text],
    n_results=5,
    # where=metadata_filter,
    include=["documents", "metadatas", "distances"]
)

# Unpack results from the first query batch.
flat = {
    "ids": results["ids"][0],
    "documents": results["documents"][0],
    "metadatas": results["metadatas"][0],
    "distances": results["distances"][0]
}

import json
print(json.dumps(flat, indent=4))


## Checking how are meta data is for tb

In [17]:
import json
import chromadb

# Initialize ChromaDB client and connect to your "exec_skills" collection.
client = chromadb.Client()
exec_collection = client.get_or_create_collection("exec_skills")

# Retrieve all records from the collection.
records = exec_collection.get()

# The "metadatas" field is expected to be a list of lists.
# We want to flatten that to a list of dictionaries.
flat_metadatas = []

# Loop over each batch (each batch is expected to be a list)
for batch in records.get("metadatas", []):
    if isinstance(batch, list):
        # Check each element in the batch:
        for meta in batch:
            # If the element is a dictionary, append it directly.
            if isinstance(meta, dict):
                flat_metadatas.append(meta)
            else:
                # If it is not a dict, it might be a string (i.e. a key),
                # so we skip or wrap it as needed.
                flat_metadatas.append({"value": meta})
    elif isinstance(batch, dict):
        # In case the batch itself is a dictionary, add it.
        flat_metadatas.append(batch)
    else:
        # Otherwise, add as a simple string wrapped in a dict.
        flat_metadatas.append({"value": batch})

# Print the metadata in a readable JSON format.
print("All metadata from the 'exec_skills' collection:")
print(json.dumps(flat_metadatas, indent=4))


All metadata from the 'exec_skills' collection:
[]


## Recursive Chunking

In [None]:
pip install nltk langchain chromadb sentence_transformers

In [None]:
import nltk 
nltk.download('punkt')

In [None]:
import re
import json
from typing import List, Tuple
from langchain.docstore.document import Document
import nltk
from nltk.tokenize import sent_tokenize

# -------------------------------
# Step 1: Top-Level Splitting by Explicit Markers
# -------------------------------
def split_by_markers(text: str) -> List[Tuple[str, str]]:
    """
    Splits the text using markers of the form:
       ---- Section Title ----
    Ignores markers that have "end" (case-insensitive) in the title.
    Returns a list of (section_title, section_content) tuples.
    """
    marker_pattern = r"(?m)^----\s*(.*?)\s*----\s*$"
    markers = list(re.finditer(marker_pattern, text))
    sections = []
    for i, m in enumerate(markers):
        title = m.group(1).strip()
        if "end" in title.lower():
            continue
        start_index = m.end()
        end_index = len(text)
        for j in range(i+1, len(markers)):
            next_title = markers[j].group(1).strip()
            # If next marker is an "end" marker or a new section, stop here.
            if "end" in next_title.lower() or next_title:
                end_index = markers[j].start()
                break
        content = text[start_index:end_index].strip()
        sections.append((title, content))
    return sections

# -------------------------------
# Step 2: Sentence-Based Chunking with Overlap
# -------------------------------
def group_sentences(sentences: List[str], max_chunk_size: int = 1000, overlap_sent_count: int = 2) -> List[str]:
    """
    Groups sentences into chunks that do not exceed max_chunk_size (by character count)
    and adds overlap of the last few sentences to the next chunk.
    """
    chunks = []
    current_sentences = []
    current_length = 0

    for sentence in sentences:
        # +1 for a space between sentences.
        if current_length + len(sentence) + (1 if current_sentences else 0) <= max_chunk_size:
            current_sentences.append(sentence)
            current_length = len(" ".join(current_sentences))
        else:
            # Current chunk is complete; join it.
            chunk = " ".join(current_sentences)
            chunks.append(chunk)
            # Now, determine overlap: use the last overlap_sent_count sentences.
            overlap_sentences = current_sentences[-overlap_sent_count:] if len(current_sentences) >= overlap_sent_count else current_sentences
            # Start a new chunk with the overlap and the current sentence.
            current_sentences = overlap_sentences + [sentence]
            current_length = len(" ".join(current_sentences))
    if current_sentences:
        chunks.append(" ".join(current_sentences))
    return chunks

def split_section_paragraphs(sections: List[Tuple[str, str]], max_chunk_size: int = 1000, overlap_sent_count: int = 2) -> List[Document]:
    """
    For each section (tuple of title and content), first split into paragraphs using blank lines,
    then for each paragraph split into sentences (using nltk.sent_tokenize) and group them into chunks.
    Each resulting chunk is saved as a Document with metadata.
    """
    docs = []

    def map_title(title: str) -> str:
        lower = title.lower()
        if lower.startswith("building response inhibition"):
            return "Response Inhibition"
        elif lower.startswith("enhancing working memory"):
            return "Working Memory"
        # Extend mappings for additional executive skill sections as needed.
        return title  # fallback: use the raw title

    for title, content in sections:
        canonical_skill = map_title(title)
        # First, split the section into paragraphs (using two or more newlines)
        paragraphs = re.split(r"\n\s*\n", content)
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            # Use nltk to split into sentences.
            sentences = sent_tokenize(para)
            # Group sentences into chunks with our function.
            chunk_texts = group_sentences(sentences, max_chunk_size=max_chunk_size, overlap_sent_count=overlap_sent_count)
            for chunk in chunk_texts:
                docs.append(Document(
                    page_content=chunk,
                    metadata={
                        "executive_skill": canonical_skill,
                        "section_title": title
                    }
                ))
    return docs

# -------------------------------
# Step 3: Load Text File and Process
# -------------------------------
with open("Smartbutscattered.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

sections = split_by_markers(book_text)
exec_docs = split_section_paragraphs(sections, max_chunk_size=1000, overlap_sent_count=2)

# (Optional) Write the resulting chunks (metadata and content) to a JSON file for inspection.
with open("exec_skill_chunks.json", "w", encoding="utf-8") as f:
    json.dump(
        [{"executive_skill": doc.metadata["executive_skill"],
          "section_title": doc.metadata["section_title"],
          "content": doc.page_content}
         for doc in exec_docs],
        f,
        indent=4
    )
# -------------------------------


## Langchain recursive

In [None]:
import os
import re
import json
from typing import List, Tuple
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# -------------------------------
# Step 1: Hierarchical Splitting by Explicit Markers
# -------------------------------
def split_by_markers_recursivelangchain(text: str) -> List[Tuple[str, str]]:
    """
    Splits the text using markers of the form:
        ---- Section Title ----
    Skips markers that have "end" in the title (case-insensitive). Returns a list of tuples:
    (section_title, section_content)
    """
    marker_pattern = r"(?m)^----\s*(.*?)\s*----\s*$"
    markers = list(re.finditer(marker_pattern, text))
    sections_recursivelangchain = []
    for i, m in enumerate(markers):
        title = m.group(1).strip()
        if "end" in title.lower():
            continue  # Skip markers that indicate an end section
        start_index = m.end()
        end_index = len(text)
        # Look for the next marker (regardless of its title) to define the end of this section.
        for j in range(i + 1, len(markers)):
            next_title = markers[j].group(1).strip()
            # We stop at the very next marker (even if it is an 'end' marker)
            end_index = markers[j].start()
            break
        content = text[start_index:end_index].strip()
        sections_recursivelangchain.append((title, content))
    return sections_recursivelangchain

# -------------------------------
# Step 2: Mapping Section Titles to Canonical Executive Skills
# -------------------------------
def map_title_to_skill_recursivelangchain(title: str) -> str:
    lower = title.lower()
    if lower.startswith("building response inhibition"):
        return "Response Inhibition"
    elif lower.startswith("enhancing working memory"):
        return "Working Memory"
    elif lower.startswith("improving emotional control"):
        return "Improving Emotional Control"
    elif lower.startswith("strengthening sustained attention"):
        return "Strengthening Sustained Attention"
    elif lower.startswith("teaching task initiation"):
        return "Teaching Task Initiation"
    elif lower.startswith("promoting, planning, and prioritizing"):
        return "Promoting, Planning, and Prioritizing"
    elif lower.startswith("fostering organization"):
        return "Fostering Organization"
    elif lower.startswith("instilling time management"):
        return "Instilling Time Management"
    elif lower.startswith("increasing goal-directed persistence"):
        return "Increasing Goal-Directed Persistence"
    elif lower.startswith("cultivating metacognition"):
        return "Cultivating Metacognition"
    # Fallback: return the raw title
    return title

# -------------------------------
# Step 3: Recursive Chunking Using RecursiveCharacterTextSplitter
# -------------------------------
def recursive_chunk_sections_recursivelangchain(
    sections: List[Tuple[str, str]], 
    chunk_size: int = 1000, 
    chunk_overlap: int = 200
) -> List[Document]:
    docs_recursivelangchain = []
    recursive_splitter_recursivelangchain = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    for title, content in sections:
        canonical_skill = map_title_to_skill_recursivelangchain(title)
        chunks = recursive_splitter_recursivelangchain.split_text(content)
        for chunk in chunks:
            docs_recursivelangchain.append(Document(
                page_content=chunk,
                metadata={
                    "executive_skill": canonical_skill,
                    "section_title": title
                }
            ))
    return docs_recursivelangchain

# -------------------------------
# Step 4: Load the Text File and Process into Documents
# -------------------------------
with open("Smartbutscattered.txt", "r", encoding="utf-8") as f:
    book_text_recursivelangchain = f.read()

sections_recursivelangchain = split_by_markers_recursivelangchain(book_text_recursivelangchain)
print(f"Total sections found: {len(sections_recursivelangchain)}")

exec_docs_recursivelangchain = recursive_chunk_sections_recursivelangchain(
    sections_recursivelangchain,
    chunk_size=1000,
    chunk_overlap=200
)
print(f"Total recursive chunks created: {len(exec_docs_recursivelangchain)}")

# (Optional) Write the resulting recursive chunks to a JSON file for inspection.
with open("exec_skill_chunks_recursive_recursivelangchain.json", "w", encoding="utf-8") as f:
    json.dump(
        [{
            "executive_skill": doc.metadata["executive_skill"],
            "section_title": doc.metadata["section_title"],
            "content": doc.page_content
        } for doc in exec_docs_recursivelangchain],
        f,
        indent=4
    )

# -------------------------------
# Step 5: Embed & Store Documents in ChromaDB (with duplicate check)
# -------------------------------
# Initialize the embedding model.
embedding_model_recursivelangchain = SentenceTransformer("all-MiniLM-L6-v2")

# Set the persistent directory.
persist_dir_recursivelangchain = "./chroma_exec_skills_db_recursivelangchain"

# Attempt to initialize a new Chroma client with our settings.
try:
    client_recursivelangchain = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_dir_recursivelangchain
    ))
except ValueError as e:
    # If an instance already exists with different settings, reuse the existing client.
    print("Chroma client already exists with different settings. Reusing the existing client.")
    client_recursivelangchain = chromadb.Client()  # reusing default ephemeral client

# Create (or get) the collection.
collection_recursivelangchain = client_recursivelangchain.get_or_create_collection(
    name="exec_skills_recursivelangchain"
)

# Fetch existing IDs from the collection (to avoid duplicates).
existing_ids_recursivelangchain = set()
try:
    for sublist in collection_recursivelangchain.get()["ids"]:
        existing_ids_recursivelangchain.update(sublist)
except Exception:
    pass

# Loop over each document chunk and add it if not already stored.
for idx, doc in enumerate(exec_docs_recursivelangchain):
    doc_id_recursivelangchain = f"skill_{doc.metadata['executive_skill']}_{idx}_recursivelangchain"
    if doc_id_recursivelangchain in existing_ids_recursivelangchain:
        continue  # Skip duplicate chunks
    embedding_recursivelangchain = embedding_model_recursivelangchain.encode(doc.page_content).tolist()
    collection_recursivelangchain.add(
        documents=[doc.page_content],
        metadatas=[doc.metadata],
        embeddings=[embedding_recursivelangchain],
        ids=[doc_id_recursivelangchain]
    )
    existing_ids_recursivelangchain.add(doc_id_recursivelangchain)

print("Embedding complete. All recursive chunks stored in the 'exec_skills_recursivelangchain' collection.")

# -------------------------------
# Step 6: Query Example
# -------------------------------
query_text_recursivelangchain = "What are some strategies to improve working memory?"
# Use a metadata filter that selects only the chunks with executive_skill = "Working Memory"
metadata_filter_recursivelangchain = {"executive_skill": "Working Memory"}

results_recursivelangchain = collection_recursivelangchain.query(
    query_texts=[query_text_recursivelangchain],
    n_results=5,
    where=metadata_filter_recursivelangchain,
    include=["documents", "metadatas", "distances"]
)

flat_results_recursivelangchain = {
    "documents": results_recursivelangchain["documents"][0],
    "metadatas": results_recursivelangchain["metadatas"][0],
    "distances": results_recursivelangchain["distances"][0]
}

import pprint
print("Query Results (for executive_skill 'Working Memory'):")
pprint.pprint(flat_results_recursivelangchain)


## Manual Semantic Chunking

In [None]:
!pip install nltk langchain chromadb sentence_transformers scikit-learn numpy

In [None]:
import nltk
nltk.download('punkt')

In [None]:
import re
import json
from typing import List, Tuple
from langchain.docstore.document import Document
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

# -------------------------------
# Step 1: Top-Level Splitting by Markers
# -------------------------------
def split_by_markers(text: str) -> List[Tuple[str, str]]:
    """
    Splits the text using markers (lines starting with "----" and ending with "----").
    Ignores markers that contain "end" (case-insensitive).
    Returns a list of tuples: (section_title, section_content)
    """
    marker_pattern = r"(?m)^----\s*(.*?)\s*----\s*$"
    markers = list(re.finditer(marker_pattern, text))
    sections = []
    for i, m in enumerate(markers):
        title = m.group(1).strip()
        if "end" in title.lower():
            continue
        start_index = m.end()
        end_index = len(text)
        for j in range(i+1, len(markers)):
            next_title = markers[j].group(1).strip()
            # If the next marker is an "end" marker or any marker then stop.
            if "end" in next_title.lower() or next_title:
                end_index = markers[j].start()
                break
        content = text[start_index:end_index].strip()
        sections.append((title, content))
    return sections

# -------------------------------
# Step 2: Sentence-Based Chunking with Overlap Using Semantic Grouping
# -------------------------------
def recursive_split_by_length(text: str, max_chunk_size: int) -> List[str]:
    """
    Fallback: Split a given text (which should already be a coherent chunk)
    into smaller pieces (using sentence boundaries) if it exceeds max_chunk_size.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        if current_length + len(sentence) + (1 if current_chunk else 0) <= max_chunk_size:
            current_chunk.append(sentence)
            current_length = len(" ".join(current_chunk))
        else:
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence) + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def semantic_chunking(text: str, model: SentenceTransformer, similarity_threshold: float = 0.7, 
                      max_chunk_size: int = 1000) -> List[str]:
    """
    Splits the text into sentences, embeds them, and clusters them using Agglomerative
    Clustering based on cosine similarity. Groups adjacent sentences that are semantically similar.
    If a group produces a chunk longer than max_chunk_size, further split it.
    Returns a list of text chunks.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return []
    
    # Compute embeddings for each sentence
    embeddings = model.encode(sentences)
    
    # Perform clustering using AgglomerativeClustering.
    # Replace the "affinity" parameter with "metric" to suit newer scikit-learn versions.
    clustering = AgglomerativeClustering(n_clusters=None, metric='cosine', linkage='average',
                                           distance_threshold=1 - similarity_threshold)
    clustering.fit(embeddings)
    cluster_labels = clustering.labels_
    
    chunks = []
    current_cluster = cluster_labels[0]
    current_sentences = []
    for i, sentence in enumerate(sentences):
        if cluster_labels[i] == current_cluster:
            current_sentences.append(sentence)
        else:
            chunk = " ".join(current_sentences)
            if len(chunk) > max_chunk_size:
                sub_chunks = recursive_split_by_length(chunk, max_chunk_size)
                chunks.extend(sub_chunks)
            else:
                chunks.append(chunk)
            # Start the new chunk with overlap: include the last sentence of the previous group
            overlap = [current_sentences[-1]] if current_sentences else []
            current_sentences = overlap + [sentence]
            current_cluster = cluster_labels[i]
    if current_sentences:
        chunk = " ".join(current_sentences)
        if len(chunk) > max_chunk_size:
            sub_chunks = recursive_split_by_length(chunk, max_chunk_size)
            chunks.extend(sub_chunks)
        else:
            chunks.append(chunk)
    return chunks

def split_section_semantically(sections: List[Tuple[str, str]], model: SentenceTransformer,
                               max_chunk_size: int = 1000, similarity_threshold: float = 0.7) -> List[Document]:
    """
    For each section, split the content into paragraphs (using blank lines), then apply semantic
    chunking to each paragraph. Each resulting chunk becomes a Document with metadata.
    """
    docs = []
    
    def map_title(title: str) -> str:
        lower = title.lower()
        if lower.startswith("building response inhibition"):
            return "Response Inhibition"
        elif lower.startswith("enhancing working memory"):
            return "Working Memory"
        # Extend mappings for additional sections as needed.
        return title  # fallback using raw title
    
    for title, content in sections:
        canonical_skill = map_title(title)
        paragraphs = re.split(r"\n\s*\n", content)
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            chunks = semantic_chunking(para, model, similarity_threshold=similarity_threshold,
                                       max_chunk_size=max_chunk_size)
            for chunk in chunks:
                docs.append(Document(
                    page_content=chunk,
                    metadata={"executive_skill": canonical_skill, "section_title": title}
                ))
    return docs

# -------------------------------
# Step 3: Load the Text File and Process
# -------------------------------
with open("Smartbutscattered.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

sections = split_by_markers(book_text)

# Initialize SentenceTransformer model once for chunking
model = SentenceTransformer("all-MiniLM-L6-v2")
exec_docs = split_section_semantically(sections, model=model, max_chunk_size=1000, similarity_threshold=0.7)

# (Optional) Write the resulting chunks to a JSON file for inspection.
with open("exec_skill_chunks.json", "w", encoding="utf-8") as f:
    json.dump(
        [{"executive_skill": doc.metadata["executive_skill"],
          "section_title": doc.metadata["section_title"],
          "content": doc.page_content}
         for doc in exec_docs],
        f,
        indent=4
    )

# Extra for lesson plan

In [None]:
import re
import json
import uuid
from sentence_transformers import SentenceTransformer
import chromadb

# -------------------------------
# STEP 1: Read & Chunk the Lesson Plans
# -------------------------------

# Read the file that contains all lesson plans.
with open("lessonplans.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

lesson_plan_blocks = re.findall(
    r"--- Start of Lesson Plan(.*?)--- End of Lesson Plan",
    full_text,
    re.DOTALL
)

# Define the stop marker (we stop processing at this note)
stop_marker = r"Note: The following pages are intended for classroom use for students as a visual aid to learning\."

section_headers = {
    "assessment": r"Assessment\s*\n",
    "extensions": r"Extensions(?: and Connections)?\s*\n",
    "differentiation": r"Strategies for Differentiation\s*\n"
}

chunked_lesson_plans = []

for block in lesson_plan_blocks:
    # 1. Limit processing to the content before the stop marker.
    split_block = re.split(stop_marker, block, maxsplit=1)
    content_to_process = split_block[0].strip() if split_block else block.strip()
    
    # 2. Split into intro_context and remainder using "Student/Teacher Actions:" (case-insensitive)
    parts = re.split(r"Student/Teacher Actions:\s*", content_to_process, flags=re.IGNORECASE, maxsplit=1)
    if len(parts) == 2:
        intro_context = parts[0].strip()
        remainder = parts[1].strip()
    else:
        # If no "Student/Teacher Actions:" found, treat entire content as intro_context.
        intro_context = content_to_process
        remainder = ""

    combined_pattern = r"(Assessment\s*\n|Extensions(?: and Connections)?\s*\n|Strategies for Differentiation\s*\n)"
    split_sections = re.split(combined_pattern, remainder)
    
    # The first part (before any extra header) is the instructional_steps.
    instructional_steps = split_sections[0].strip() if split_sections else ""
    
    # Initialize placeholders for extra sections.
    assessment_text = ""
    extensions_text = ""
    differentiation_text = ""
    
    # Process remaining parts in pairs: header then content.
    for i in range(1, len(split_sections) - 1, 2):
        header = split_sections[i].strip().lower()
        content = split_sections[i+1].strip()
        if header.startswith("assessment"):
            assessment_text = content
        elif header.startswith("extensions"):
            extensions_text = content
        elif header.startswith("strategies for differentiation"):
            differentiation_text = content

    # Build the JSON structure for this lesson plan.
    lesson_plan_json = {
        "intro_context": intro_context,
        "instructional_steps": instructional_steps,
        "assessment": assessment_text,
        "extensions": extensions_text,
        "differentiation": differentiation_text
    }
    
    chunked_lesson_plans.append(lesson_plan_json)

# Optionally, save the chunked lesson plans for later reference.
with open("chunked_lesson_plans.json", "w", encoding="utf-8") as outfile:
    json.dump(chunked_lesson_plans, outfile, indent=4)

In [None]:
# -------------------------------
# STEP 2: Extract Metadata & Embed Chunks into ChromaDB
# -------------------------------

def extract_metadata_from_intro(intro_context):
    """
    Extract additional metadata from the intro_context.
    Attempts to extract:
      - grade (e.g., "Grade 5")
      - subject (from the "Strand:" line)
      - topic (from the "Topic:" line)
      - lesson_title (from the first line)
    """
    grade_match = re.search(r"Grade\s*(\d+)", intro_context)
    grade = grade_match.group(1) if grade_match else None
    
    subject_match = re.search(r"Strand:\s*(.+)", intro_context)
    subject = subject_match.group(1).strip() if subject_match else None
    
    topic_match = re.search(r"Topic:\s*(.+)", intro_context)
    topic = topic_match.group(1).strip() if topic_match else None
    
    title_line = intro_context.split("\n")[0].strip() if intro_context else ""
    lesson_title = title_line if title_line else None
    
    return {
        "grade": grade,
        "subject": subject,
        "topic": topic,
        "lesson_title": lesson_title
    }

# Define a custom embedding function class that meets the new interface.
class SentenceTransformerEmbedding:
    def __init__(self, model):
        self.model = model
    def __call__(self, input):
        # 'input' should be a list of strings.
        return self.model.encode(input).tolist()

# Initialize the embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize a ChromaDB client using the new client constructor.
client = chromadb.Client()

embedding_fn = SentenceTransformerEmbedding(model)
collection = client.get_or_create_collection("lesson_plans", embedding_function=embedding_fn)

# Iterate over each lesson plan and each section.
for lesson_index, lesson in enumerate(chunked_lesson_plans):

    additional_metadata = extract_metadata_from_intro(lesson.get("intro_context", ""))
    

    for section in ["intro_context", "instructional_steps", "assessment", "extensions", "differentiation"]:
        text = lesson.get(section, "").strip()
        if text:
            # Create a unique ID for this chunk.
            doc_id = f"lesson{lesson_index}_{section}_{str(uuid.uuid4())[:8]}"
            # Compute the embedding.
            embedding = model.encode(text).tolist()  # Convert numpy array to list.
        
            metadata = {
                "lesson_index": lesson_index,
                "section": section,
                **additional_metadata  # Merge in grade, subject, topic, lesson_title.
            }
            collection.add(
                documents=[text],
                metadatas=[metadata],
                embeddings=[embedding],
                ids=[doc_id]
            )

print("All lesson plan chunks have been embedded and stored in the vector database.")

In [None]:
import json

# -------------------------------
# Verification Example 1: Metadata Filter
# -------------------------------

results_metadata = collection.query(
    query_texts="", 
    n_results=100, 
    where={"lesson_index": 0}
)

print("Results filtered by metadata (lesson_index == 0):")
print(json.dumps(results_metadata, indent=4))

query_texts = "Computation and Estimation with Decimals"
results_similarity = collection.query(
    query_texts=query_texts,
    n_results=5,
    where={"lesson_index": 0}
)

print("\nResults from similarity search with query '{}':".format(query_texts))
print(json.dumps(results_similarity, indent=4))