In [3]:
from langchain_text_splitters import RecursiveJsonSplitter

# JSON splitter configured with a minimum chunk size of 30 and a maximum of 400
splitter = RecursiveJsonSplitter(
    min_chunk_size=30,
    max_chunk_size=400,
)

In [4]:
import json


# Where the combined, cleaned JSON file lives (relative to the notebook)
file_path = "data/PC 2/PC_2_combined_cleaned.json"

# Parse the whole file into Python objects
with open(file_path, mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Keep the "text" value of every entry that has one; entries without it are skipped
text_chunks = []
for entry in data:
    if "text" in entry:
        text_chunks.append(entry["text"])

# Sanity check: show only the first five extracted texts, one per line
first_texts = text_chunks[:5]
print("\n".join(first_texts))


# INTRODUCTION *Pathfinder is a game of imagination where you can bring nearly any idea to life. This book combines with Player Core, expanding on its options and giving you even more ways to play the character you want!*
## MORE OF EVERYTHING *Player Core 2* provides even more options for player characters. Along with the ancestries and classes summarized on the following page, there are additional general and skill feats, equipment, spells, and treasure round out your characters' abilities.
Alongside the eight new multiclass archetypes for the classes in this book, you can find archetypes for all sorts of characters starting on page 172. These can expand any Pathfinder character in unexpected directions, helping them discover new abilities to match their developing stories. Many of these archetypes support combat specializations, like the bastion and wrestler, while others show off the roles characters may be stepping into, like the blessed one and marshal. Even characters' growing s

In [5]:
# This section is taken from a community Databricks project

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re

def perform_semantic_chunking(document, chunk_size=500, chunk_overlap=100):
    """
    Split a document at logical text boundaries and wrap every chunk in a
    Document that carries descriptive metadata.

    The text is split with RecursiveCharacterTextSplitter, trying the
    separators "\\n\\n", "\\n", ". ", " " and "" in that order. Each chunk is
    then scanned line by line for section headers; the most recently seen
    header is carried forward to later chunks until a new one appears.

    Args:
        document (str): The text document to process
        chunk_size (int): The target size of each chunk in characters
        chunk_overlap (int): The number of characters of overlap between chunks

    Returns:
        list: One Document per chunk, in split order. Each Document's metadata
        holds:
            chunk_id (int): zero-based position of the chunk
            total_chunks (int): number of chunks produced for the document
            chunk_size (int): len() of the chunk text
            chunk_type (str): always "semantic"
            section (str): the last header line seen so far, or
                "Introduction" if none has been seen yet
            semantic_density (float): share of words that are not in the
                built-in stopword list, rounded to 2 decimals
    """
    # Create the text splitter with semantic separators
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # Split the text into semantic chunks
    semantic_chunks = text_splitter.split_text(document)
    total_chunks = len(semantic_chunks)
    print(f"Document split into {total_chunks} semantic chunks")

    # Header detectors, compiled once instead of once per line.
    markdown_header = re.compile(r'^#+\s+(.+)$')   # Markdown headers
    header_underline = re.compile(r'^[=\-]{2,}$')  # Underline of a two-line header
    caps_title = re.compile(r'^[A-Z\s]+:$')        # ALL CAPS section titles
    word_pattern = re.compile(r'\b\w+\b')

    # Loop-invariant; a frozenset gives constant-time membership tests.
    stopwords = frozenset(
        ['the', 'and', 'is', 'of', 'to', 'a', 'in', 'that', 'it', 'with', 'as', 'for']
    )

    # Convert to Document objects with enhanced metadata
    documents = []
    current_section = "Introduction"

    for i, chunk in enumerate(semantic_chunks):
        chunk_lines = chunk.split('\n')
        last_index = len(chunk_lines) - 1

        # Every line is examined, so the LAST header in a chunk wins.
        for line_no, line in enumerate(chunk_lines):
            if markdown_header.match(line):
                current_section = line
            elif (
                line
                and line_no < last_index
                and header_underline.match(chunk_lines[line_no + 1])
            ):
                # Underlined headers span two lines, so they can only be
                # recognised by looking at a line together with the one that
                # follows it. A title whose underline falls into the next
                # chunk is not detected.
                current_section = line
            elif caps_title.match(line):
                current_section = line

        # Calculate semantic density (ratio of non-stopwords to total words)
        words = word_pattern.findall(chunk.lower())
        content_word_count = sum(1 for word in words if word not in stopwords)
        # max(1, ...) avoids dividing by zero for chunks without any word
        semantic_density = content_word_count / max(1, len(words))

        documents.append(
            Document(
                page_content=chunk,
                metadata={
                    "chunk_id": i,
                    "total_chunks": total_chunks,
                    "chunk_size": len(chunk),
                    "chunk_type": "semantic",
                    "section": current_section,
                    "semantic_density": round(semantic_density, 2)
                }
            )
        )

    return documents

In [6]:
# Merge every extracted text into one newline-separated document
full_doc = "\n".join(text_chunks)

# Run the semantic chunker over the merged document
semantic_documents = perform_semantic_chunking(full_doc)

# A little peek at the results: five consecutive chunks starting at this index
index_to_peak = 500
preview_docs = semantic_documents[index_to_peak:index_to_peak + 5]
for doc in preview_docs:
    meta = doc.metadata

    # Header line: chunk position and the length of its text
    print(f"Chunk ID: {meta['chunk_id']}, Size: {len(doc.page_content)}")

    # Section the chunk was assigned to
    print(f"Section: {meta['section']}")

    # The chunk text itself
    print(doc.page_content)

    # Share of non-stopwords in the chunk
    print(f"Semantic Density: {meta['semantic_density']}")

Document split into 3769 semantic chunks
Chunk ID: 500, Size: 116
Section: # COMMON BACKGROUNDS *Your character's abilities don't spring into existence at the moment they take up the adventuring life. Their background—the role they had before they became an adventurer—also provides a number of abilities.*
. You're trained in the Occultism skill and the Herbalism Lore skill. You gain the Root Magic skill feat (page 233).
Semantic Density: 0.76
Chunk ID: 501, Size: 454
Section: # COMMON BACKGROUNDS *Your character's abilities don't spring into existence at the moment they take up the adventuring life. Their background—the role they had before they became an adventurer—also provides a number of abilities.*
SABOTEUR BACKGROUND Whether you do it for personal enjoyment or at the behest of a mercenary company or military organization, you have a knack for destroying things. You have a sense for an object or structure's weak spots and know where to deliver a hammer strike or alchemical bomb. Y