### Langchain

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default encoding; the raw read of the same file later in this notebook
# also uses UTF-8.
loader = TextLoader("./bank.txt", encoding="utf-8")
docs = loader.load()

In [None]:
# Recursive character splitter with a chunk size of 200 and no overlap
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=200)

In [None]:
# Rebind `docs` from the loaded documents to their split chunks and report
# how many chunks were produced.
docs = text_splitter.split_documents(documents=docs)
print(len(docs))

In [None]:
# Print every LangChain chunk with a 1-based label.
for number, piece in enumerate(docs, start=1):
    print(f"CHUNK {number}: ", piece)

### Semantic text splitter

In [None]:
# Read the whole file into memory as a single UTF-8 decoded string.
with open("./bank.txt", encoding="utf-8") as source:
    content = source.read()

In [None]:
from semantic_text_splitter import CharacterTextSplitter

# Character-based splitting: no tokenizer involved, the capacity passed to
# `chunks` is `max_characters`.
splitter = CharacterTextSplitter(trim_chunks=False)
max_characters = 200

chunks_no_model = splitter.chunks(content, chunk_capacity=max_characters)

In [None]:
# Print the character-based chunks with a 1-based label.
for number, piece in enumerate(chunks_no_model, start=1):
    print(f"CHUNK {number}: ", piece)

In [None]:
from semantic_text_splitter import HuggingFaceTextSplitter
from tokenizers import Tokenizer

# Tokenizer-backed splitting. Note that this rebinds `splitter`, replacing
# the character-based splitter created in the earlier cell.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)
max_tokens = 200

In [None]:
# Split the text with `max_tokens` as the single chunk capacity.
chunks = splitter.chunks(content, chunk_capacity=max_tokens)

In [None]:
# Print the fixed-capacity tokenizer chunks with a 1-based label.
for number, piece in enumerate(chunks, start=1):
    print(f"CHUNK {number}: ", piece)

In [None]:
# Capacity supplied as a (min, max) pair rather than a single limit.
MIN_TOKENS, MAX_TOKENS = 100, 1000

chunks_with_model = splitter.chunks(
    content,
    chunk_capacity=(MIN_TOKENS, MAX_TOKENS),
)

In [None]:
# Print the ranged-capacity tokenizer chunks with a 1-based label.
for number, piece in enumerate(chunks_with_model, start=1):
    print(f"CHUNK {number}: ", piece)

### How good are the embeddings based on the semantic-text-splitter?

In [None]:
import os
from getpass import getpass

from langchain_openai import OpenAIEmbeddings

# Never hard-code the API key in the notebook: reuse the key already present
# in the environment and only prompt (without echoing) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding client used for both the chunks and the question.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Embed each chunk list with one batched `embed_documents` call instead of
# issuing a separate `embed_query` request for every chunk.
embeddings_no_model = embeddings.embed_documents(list(chunks_no_model))
embeddings_with_model = embeddings.embed_documents(list(chunks_with_model))

# The single question is embedded on its own.
question = embeddings.embed_query("What does your bank do for the local community?")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def _scores_against_question(vectors):
    """Return the cosine similarity between `question` and each vector, in order."""
    question_row = [question]
    scores = []
    for vector in vectors:
        # One row against one row yields a 1x1 matrix; keep its only entry.
        pairwise = cosine_similarity(question_row, [vector])
        scores.append(pairwise[0][0])
    return scores


cos_sim_no_model = _scores_against_question(embeddings_no_model)
cos_sim_with_model = _scores_against_question(embeddings_with_model)


In [None]:
import matplotlib.pyplot as plt


# One bar chart per chunking strategy, side by side, both with the y-axis
# fixed to the 0-1 range.
panels = (
    ('Cosine Similarity with No Model Embeddings', cos_sim_no_model),
    ('Cosine Similarity with Model Embeddings', cos_sim_with_model),
)

plt.figure(figsize=(12, 6))

for position, (heading, scores) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)
    plt.bar(range(len(scores)), scores)
    plt.title(heading)
    plt.xlabel('Vector Index')
    plt.ylabel('Cosine Similarity')
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()