In [2]:
from langchain_text_splitters import RecursiveJsonSplitter

# JSON-aware splitter bounded to chunks of 30-400 units (as measured by the splitter).
# NOTE(review): `splitter` is not referenced by any later cell shown in this notebook.
splitter = RecursiveJsonSplitter(
    max_chunk_size=400,
    min_chunk_size=30,
)

In [3]:
import json


# JSON file to read
file_path = "data/PC 2/PC_2_combined_cleaned.json"

# Read the whole file and parse it as JSON
with open(file_path, mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())

# Keep the "text" value of every entry that has one; entries without it are skipped
text_chunks = [record["text"] for record in data if "text" in record]

# Sanity check: show the first five extracted texts, one per line
print("\n".join(text_chunks[0:5]))


# INTRODUCTION *Pathfinder is a game of imagination where you can bring nearly any idea to life. This book combines with Player Core, expanding on its options and giving you even more ways to play the character you want!*
## MORE OF EVERYTHING *Player Core 2* provides even more options for player characters. Along with the ancestries and classes summarized on the following page, there are additional general and skill feats, equipment, spells, and treasure round out your characters' abilities.
Alongside the eight new multiclass archetypes for the classes in this book, you can find archetypes for all sorts of characters starting on page 172. These can expand any Pathfinder character in unexpected directions, helping them discover new abilities to match their developing stories. Many of these archetypes support combat specializations, like the bastion and wrestler, while others show off the roles characters may be stepping into, like the blessed one and marshal. Even characters' growing s

In [4]:
# This section is taken from a community Databricks project

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re

# Header detectors, compiled once instead of being re-parsed for every line.
_MARKDOWN_HEADER_RE = re.compile(r'^#+\s+(.+)$')  # "# Title", "## Title", ...
_UNDERLINE_RE = re.compile(r'^[=\-]{2,}$')        # "=====" / "-----" line under a title
_CAPS_HEADER_RE = re.compile(r'^[A-Z\s]+:$')      # "SECTION NAME:"

# Words ignored when computing semantic density (set => O(1) membership test).
_STOPWORDS = frozenset({
    'the', 'and', 'is', 'of', 'to', 'a', 'in', 'that', 'it', 'with', 'as', 'for'
})


def _find_section_title(chunk, current_section):
    """Return the title of the last header found in `chunk`, else `current_section`."""
    lines = chunk.split('\n')
    for idx, line in enumerate(lines):
        # Markdown header: keep the captured title, not the leading '#' markup.
        markdown = _MARKDOWN_HEADER_RE.match(line)
        if markdown:
            title = markdown.group(1).strip()
            if title:
                current_section = title
            continue

        # Underlined header: the title is on this line and the underline on the
        # NEXT one, so it can only be detected by looking at the pair of lines.
        next_line = lines[idx + 1] if idx + 1 < len(lines) else ""
        if (
            line.strip()
            and not _UNDERLINE_RE.match(line)
            and _UNDERLINE_RE.match(next_line)
        ):
            current_section = line.strip()
            continue

        # ALL CAPS header ending with a colon: stored as written.
        caps = _CAPS_HEADER_RE.match(line)
        if caps:
            current_section = caps.group(0)
    return current_section


def _semantic_density(chunk):
    """Return the ratio of non-stopwords to total words in `chunk`, rounded to 2 places."""
    words = re.findall(r'\b\w+\b', chunk.lower())
    content_count = sum(1 for word in words if word not in _STOPWORDS)
    # max(1, ...) avoids division by zero for chunks that contain no words.
    return round(content_count / max(1, len(words)), 2)


def perform_semantic_chunking(document, chunk_size=500, chunk_overlap=100):
    """
    Performs semantic chunking on a document using recursive character splitting
    at logical text boundaries.

    Each chunk is labelled with the most recent section header seen so far. If a
    chunk contains several headers, the last one wins, and the label carries over
    to the following chunks until another header is found. Three header styles
    are recognised: markdown (``# Title``), underlined (a title line followed by
    a line of ``=`` or ``-``) and ALL CAPS lines ending with a colon.

    Args:
        document (str): The text document to process
        chunk_size (int): The target size of each chunk in characters
        chunk_overlap (int): The number of characters of overlap between chunks

    Returns:
        list: One Document per chunk. Metadata keys: ``chunk_id``,
        ``total_chunks``, ``chunk_size``, ``chunk_type`` (always ``"semantic"``),
        ``section`` (``"Introduction"`` until a header is seen) and
        ``semantic_density``.

    Side effects:
        Prints the number of chunks produced.
    """
    # Splitter configured with paragraph, line, sentence, word and character separators
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # Split the text into semantic chunks
    semantic_chunks = text_splitter.split_text(document)
    total_chunks = len(semantic_chunks)
    print(f"Document split into {total_chunks} semantic chunks")

    # Convert to Document objects with enhanced metadata
    documents = []
    current_section = "Introduction"

    for i, chunk in enumerate(semantic_chunks):
        current_section = _find_section_title(chunk, current_section)

        documents.append(
            Document(
                page_content=chunk,
                metadata={
                    "chunk_id": i,
                    "total_chunks": total_chunks,
                    "chunk_size": len(chunk),
                    "chunk_type": "semantic",
                    "section": current_section,
                    "semantic_density": _semantic_density(chunk)
                }
            )
        )

    return documents

In [5]:
# Stitch every extracted text entry into one newline-separated document
full_doc = "\n".join(text_chunks)

# Run the chunker over the combined text
semantic_documents = perform_semantic_chunking(full_doc)

# A little peek at the results: up to five consecutive chunks starting at this index
index_to_peak = 500
for doc in semantic_documents[index_to_peak:index_to_peak + 5]:
    # One report per chunk: id and size, section title, content, semantic density
    print(
        f"Chunk ID: {doc.metadata['chunk_id']}, Size: {len(doc.page_content)}",
        f"Section: {doc.metadata['section']}",
        doc.page_content,
        f"Semantic Density: {doc.metadata['semantic_density']}",
        sep="\n",
    )

Document split into 3769 semantic chunks
Chunk ID: 500, Size: 116
Section: # COMMON BACKGROUNDS *Your character's abilities don't spring into existence at the moment they take up the adventuring life. Their background—the role they had before they became an adventurer—also provides a number of abilities.*
. You're trained in the Occultism skill and the Herbalism Lore skill. You gain the Root Magic skill feat (page 233).
Semantic Density: 0.76
Chunk ID: 501, Size: 454
Section: # COMMON BACKGROUNDS *Your character's abilities don't spring into existence at the moment they take up the adventuring life. Their background—the role they had before they became an adventurer—also provides a number of abilities.*
SABOTEUR BACKGROUND Whether you do it for personal enjoyment or at the behest of a mercenary company or military organization, you have a knack for destroying things. You have a sense for an object or structure's weak spots and know where to deliver a hammer strike or alchemical bomb. Y

Now that we have chunks, the next step is to create embeddings. One thing to consider is whether each embedded chunk should also include its `Section` header, which would repeat the same text across many chunks. When the headers are short and descriptive, including them is reasonable. Unfortunately, with the output we currently have, it is not feasible: as the preview above shows, a detected `Section` can be an entire header paragraph rather than a short title. We will therefore embed purely the chunks as they stand, without including metadata such as `Section`.

In [27]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

# Documents to store: the chunk text carries the E5 "passage: " prefix, and the
# metadata travels alongside it for later use.
docs_for_store = [
    Document(
        page_content=f"passage: {doc.page_content}",
        metadata={
            # chunk_type is left out: it is the constant "semantic" for every chunk
            "section": doc.metadata["section"],
            "chunk_id": doc.metadata["chunk_id"],
            "chunk_size": doc.metadata["chunk_size"],
            "source":   "Player Core 2"
        }
    )
    for doc in semantic_documents
]


class E5Embeddings(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings that adds the E5 "query: " prefix to search queries.

    E5 models expect "passage: " on stored texts and "query: " on search queries.
    The passages are prefixed in `docs_for_store`; without this override the
    retriever would embed raw, unprefixed questions.
    """

    def embed_query(self, text):
        return super().embed_query(f"query: {text}")


# Embedding step, utilizing the e5 model
emb_fn = E5Embeddings(
    model_name="intfloat/e5-large-v2",
    encode_kwargs={"normalize_embeddings": True},   # E5 wants norm=True
)

# Deterministic ids make this cell safe to re-run: the same chunk is written under
# the same id instead of being appended again to the persisted collection.
# NOTE(review): if a re-run yields fewer chunks, higher-numbered ids from the
# previous run stay in the collection; delete the collection first in that case.
doc_ids = [f"player-core-2-{doc.metadata['chunk_id']}" for doc in docs_for_store]

# Persist in Chroma (or your favourite store)
vectordb = Chroma.from_documents(
    docs_for_store,
    emb_fn,
    ids=doc_ids,
    persist_directory="chroma_db",
)

The cell below runs QA locally, directly against the `vectordb` object created above.

In [None]:
# from langchain.vectorstores import Chroma
# from langchain.chains import RetrievalQA
# from langchain.llms import OpenAI
# import os

# os.environ["OPENAI_API_KEY"] = "My API Key Here"
# retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=vectordb.as_retriever())
# query = "What do Tengus look like?"
# retrievalQA.run(query)

' Tengu have avian characteristics, such as beaks and talons, and are rarely more than 5 feet tall with hollow bones. Some Tengus also have vestigial wings.'

Persisting a DB with Chroma: the cell below reloads the database persisted under `chroma_db` and builds the QA chain on top of it.

In [30]:
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os

# os.environ["OPENAI_API_KEY"] = "My API Key Here"

# In a notebook setting, we call persist to ensure embeddings are written to the disk.
# As a script, this wouldn't be necessary.

# vectordb.persist() # Persist the vector store disk
# vectordb = None # Clear the variable to free memory

# Re-open the store persisted under "chroma_db", reusing the same embedding
# function so that queries are embedded the same way as the stored chunks
vectordb = Chroma(
    persist_directory="chroma_db",
    embedding_function=emb_fn,
)

# Question-answering chain: an OpenAI LLM fed by a retriever over the reloaded store
retrievalQA = RetrievalQA.from_llm(
    llm=OpenAI(),
    retriever=vectordb.as_retriever(),
)

In [31]:
query = "If I wanted to play a pirate, what would be a suitable background?"

# Chain.run() is deprecated in LangChain 0.1+. invoke() takes the chain's input
# key ("query" for RetrievalQA) and returns a dict; "result" holds the answer text.
retrievalQA.invoke({"query": query})["result"]

'\nA suitable background for playing a pirate could be a sailor or criminal background, as they would have the necessary skills and experience for a life of piracy.'

Cleanup time

In [None]:
# vectordb.delete_collection()