In [15]:
from langchain_text_splitters import RecursiveJsonSplitter

splitter = RecursiveJsonSplitter(max_chunk_size=400, min_chunk_size=30)

In [16]:
import json


# File path for the JSON file
file_path = "data/GM Core/GM_Core_processed.json"

# Load JSON file
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract the "text" field from each entry
text_chunks = [entry["text"] for entry in data if "text" in entry]

# Print the first few extracted texts to verify
print("\n".join(text_chunks[:5]))  # Displaying only the first few entries


GM CORE
4  SUBSYSTEMS                                                                                 182
- Introduction..................................................................................................................183
- Victory Points .............................................................................................................184
- Influence .....................................................................................................................187
- Research.......................................................................................................................190
- Chases..........................................................................................................................192
- Infiltration....................................................................................................................196
- Reputation .....................................................................................

In [17]:
# This section taken from community databricks project

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re

def perform_semantic_chunking(document, chunk_size=500, chunk_overlap=100):
    """
    Performs semantic chunking on a document using recursive character splitting 
    at logical text boundaries.
    
    Args:
        document (str): The text document to process
        chunk_size (int): The target size of each chunk in characters
        chunk_overlap (int): The number of characters of overlap between chunks
        
    Returns:
        list: The semantically chunked documents with metadata
    """
    # Create the text splitter with semantic separators
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    
    # Split the text into semantic chunks
    semantic_chunks = text_splitter.split_text(document)
    print(f"Document split into {len(semantic_chunks)} semantic chunks")
    
    # Determine section titles for enhanced metadata
    section_patterns = [
        r'^#+\s+(.+)$',      # Markdown headers
        r'^.+\n[=\-]{2,}$',  # Underlined headers
        r'^[A-Z\s]+:$'       # ALL CAPS section titles
    ]
    
    # Convert to Document objects with enhanced metadata
    documents = []
    current_section = "Introduction"
    
    for i, chunk in enumerate(semantic_chunks):
        # Try to identify section title from chunk
        chunk_lines = chunk.split('\n')
        for line in chunk_lines:
            for pattern in section_patterns:
                match = re.match(pattern, line, re.MULTILINE)
                if match:
                    current_section = match.group(0)
                    break
        
        # Calculate semantic density (ratio of non-stopwords to total words)
        words = re.findall(r'\b\w+\b', chunk.lower())
        stopwords = ['the', 'and', 'is', 'of', 'to', 'a', 'in', 'that', 'it', 'with', 'as', 'for']
        content_words = [w for w in words if w not in stopwords]
        semantic_density = len(content_words) / max(1, len(words))
        
        doc = Document(
            page_content=chunk,
            metadata={
                "chunk_id": i,
                "total_chunks": len(semantic_chunks),
                "chunk_size": len(chunk),
                "chunk_type": "semantic",
                "section": current_section,
                "semantic_density": round(semantic_density, 2)
            }
        )
        documents.append(doc)
    
    return documents

In [18]:
# Combining all the text chunks into a single document
full_doc = "\n".join(text_chunks)

# Perform semantic chunking on the full document
semantic_documents = perform_semantic_chunking(full_doc)

# A little peak at the results
index_to_peak = 500
for doc in semantic_documents[index_to_peak:index_to_peak+5]:
    print(f"Chunk ID: {doc.metadata['chunk_id']}, Size: {len(doc.page_content)}")
    
    # Printing the section title
    print(f"Section: {doc.metadata['section']}")
    
    # Printing the content of the document
    print(doc.page_content)
    
    # Printing the semantic density
    print(f"Semantic Density: {doc.metadata['semantic_density']}")

Document split into 6072 semantic chunks
Chunk ID: 500, Size: 447
Section: # ENDING THE ENCOUNTER
Analysis :
  • The image conveys a sense of power imbalance, with the demoness clearly overpowering the knight.
  • The presence of additional figures in the background suggests an ongoing or escalating conflict.
  • The use of contrasting warm and cool colors enhances the drama and highlights the central figures against the icy setting.
initiative order and give each PC the option to pursue
any one fleeing foe. Each PC can declare one action,
Semantic Density: 0.71
Chunk ID: 501, Size: 442
Section: # ENDING THE ENCOUNTER
any one fleeing foe. Each PC can declare one action,
spell, or other ability to use to try to keep up. Then,
compare the PC's Speed to that of the target, assess how
much the pursuer's chosen spell or ability would help,
and factor in any abilities the quarry has that would aid
escape. If you determine that the pursuer catches up, go
back into combat with the original ini

Now that we have chunks, the next step is to create some embeddings. However, something to consider is whether or not the embeddings should contain the repetitive nature of the `Section` headers. Depending on the descriptiveness of the headers, it is permissible to do so. Unfortunately, with the output we currently have, it is not feasible. We will embed purely the chunks as they stand, without including the metadata, such as `Section`.

In [20]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

# List notation for storing documents, metadata is stored separately for use later
docs_for_store = [
    Document(
        page_content=f"passage: {doc.page_content}",
        metadata={ # Dict of metadata for each document, dropped chunk_type because I don't know what it does
            "section": doc.metadata["section"],
            "chunk_id": doc.metadata["chunk_id"],
            "chunk_size": doc.metadata["chunk_size"],
            "source":   "Player Core 2"
        }
    )
    for doc in semantic_documents
]

# Embedding step, utilizing the e5 model
emb_fn = HuggingFaceEmbeddings(
    model_name="intfloat/e5-large-v2",
    encode_kwargs={"normalize_embeddings": True},   # E5 wants norm=True
)

# 4️⃣  Persist in Chroma (or your favourite store)
vectordb = Chroma.from_documents(docs_for_store, emb_fn, persist_directory="chroma_db")

This is running QA locally

In [22]:
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
import os

value = os.getenv('GEMINI_KEY') 
retrievalQA = RetrievalQA.from_llm(llm=ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", google_api_key=value), retriever=vectordb.as_retriever())
query = "What do Tengus look like?"
retrievalQA.run(query)

  retrievalQA.run(query)


'Tengu have many avian characteristics. Their faces are tipped with sharp beaks, and their scaled forearms and lower legs end in talons. They are rarely more than 5 feet tall and are even lighter than their smaller frames would suggest, as they have hollow bones. A small number of tengu have vestigial wings.'

Here is persisting a DB using chroma

In [23]:
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# os.environ["OPENAI_API_KEY"] = "My API Key Here"

# In a notebook setting, we call persist to ensure embeddings are written to the disk.
# As a script, this wouldn't be necessary.

# vectordb.persist() # Persist the vector store disk
# vectordb = None # Clear the variable to free memory

# Load the persisted database from disk, and use it as normal
vectordb = Chroma(persist_directory="chroma_db", embedding_function=emb_fn)
retrievalQA = RetrievalQA.from_llm(llm=ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", google_api_key=value), retriever=vectordb.as_retriever())

  vectordb = Chroma(persist_directory="chroma_db", embedding_function=emb_fn)


In [None]:
query = "What do Tengus look like?"
retrievalQA.run(query)

'Tengu have many avian characteristics. Their faces are tipped with sharp beaks and their scaled forearms and lower legs end in talons. They are rarely more than 5 feet tall, and they are even lighter than their smaller frames would suggest, as they have hollow bones. A small number of tengu have vestigial wings.'

Cleanup time

In [None]:
# vectordb.delete_collection()