### LangChain

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Load the source document. The encoding is passed explicitly so the file is
# decoded as UTF-8 regardless of the platform's default encoding, consistent
# with the plain `open("./bank.txt", ..., encoding="utf-8")` read of the same
# file later in this notebook.
loader = TextLoader("./bank.txt", encoding="utf-8")
docs = loader.load()

In [None]:
# Chunking configuration for the LangChain splitter:
#   - split on blank lines (the separator is a literal string, not a regex)
#   - chunk size and overlap are both measured with `len`
splitter_options = {
    "separator": "\n\n",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = CharacterTextSplitter(**splitter_options)

In [None]:
# Split the loaded documents with the configured splitter and dump the
# resulting chunks for inspection.
texts = text_splitter.split_documents(documents=docs)
print(texts)

### Semantic text splitter

In [None]:
# Read the whole source document into memory as a single UTF-8 decoded string.
with open("./bank.txt", encoding="utf-8") as source_file:
    content = source_file.read()

In [None]:
# Imported under an alias: LangChain's `CharacterTextSplitter` is already
# imported under that exact name earlier in this notebook, and a plain import
# here would silently rebind it, so re-running the LangChain cells afterwards
# would construct the wrong class.
from semantic_text_splitter import (
    CharacterTextSplitter as SemanticCharacterTextSplitter,
)

# Upper bound on the size of each chunk, in characters.
max_characters = 200
# NOTE(review): trim_chunks=False presumably keeps the surrounding whitespace
# of each chunk instead of stripping it -- confirm against the library docs.
splitter = SemanticCharacterTextSplitter(trim_chunks=False)

chunks = splitter.chunks(content, max_characters)

In [None]:
# Report how many chunks the most recent split produced.
chunk_count = len(chunks)
print(chunk_count)

In [None]:
from semantic_text_splitter import HuggingFaceTextSplitter
from tokenizers import Tokenizer

# Build a splitter that measures chunk size with the "bert-base-uncased"
# tokenizer rather than in raw characters.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)

# Upper bound on the size of each chunk, in tokens.
max_tokens = 200

In [None]:
# Split the document using `max_tokens` as the chunk capacity.
chunks = splitter.chunks(content, chunk_capacity=max_tokens)

In [None]:
# Report how many chunks the most recent split produced.
chunk_count = len(chunks)
print(chunk_count)

In [None]:
# Chunk capacity given as a (minimum, maximum) pair instead of a single
# upper bound.
MIN_TOKENS, MAX_TOKENS = 200, 1000
token_range = (MIN_TOKENS, MAX_TOKENS)

chunks = splitter.chunks(content, chunk_capacity=token_range)

In [None]:
# Report how many chunks the most recent split produced.
chunk_count = len(chunks)
print(chunk_count)

In [None]:
from semantic_text_splitter import TiktokenTextSplitter

# Upper bound on the size of each chunk, in tokens.
max_tokens = 500
# NOTE(review): the original trailing comment on this line was cut off
# ("no model needed, just uses b..."). Presumably the model name only selects
# which tokenizer is used for counting tokens -- confirm.
splitter = TiktokenTextSplitter("gpt-3.5-turbo", trim_chunks=False)

chunks = splitter.chunks(content, chunk_capacity=max_tokens)

In [None]:
# Report how many chunks the most recent split produced.
chunk_count = len(chunks)
print(chunk_count)