In [1]:
import openai
import langchain
import faiss
import pathlib
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
import re
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
from getpass import getpass
import tiktoken

In [11]:
os.environ['OPENAI_API_KEY'] = getpass()  #insert OpenAI API key here

#DOCS_FOLDER = "docs"  # Folder to check out to
#REPO_DOCUMENTS_PATH = "collections/_products/categories/devops/ship-hats"  # Set to "" to index the whole data folder
DOCUMENT_BASE_URL = "https://www.developer.tech.gov.sg/products/categories/devops/ship-hats"  # Actual URL
DATA_STORE_DIR = "data_store"  # Folder to save/load the database

In [48]:
DOCS_FOLDER = "test"  # Folder to check out to
REPO_DOCUMENTS_PATH = ""

In [49]:
name_filter = "**/*.md"
separator = "\n### "  # This separator assumes Markdown docs from the repo uses ### as logical main header most of the time
chunk_size_limit = 1000
max_chunk_overlap = 20

repo_path = pathlib.Path(os.path.join(DOCS_FOLDER, REPO_DOCUMENTS_PATH))
document_files = list(repo_path.glob(name_filter))

def convert_path_to_doc_url(doc_path):
  # Convert from relative path to actual document url
  return re.sub(f"{DOCS_FOLDER}/{REPO_DOCUMENTS_PATH}/(.*)\.[\w\d]+", f"{DOCUMENT_BASE_URL}/\\1", str(doc_path))

documents = [
    Document(
        page_content=open(file, "r").read(),
        metadata={"source": convert_path_to_doc_url(file)}
    )
    for file in document_files
]

text_splitter = CharacterTextSplitter(separator=separator, chunk_size=chunk_size_limit, chunk_overlap=max_chunk_overlap)
split_docs = text_splitter.split_documents(documents)

In [50]:
document_files

[WindowsPath('test/README_22.md')]

In [51]:
# create a GPT-4 encoder instance
enc = tiktoken.encoding_for_model("gpt-4")

total_word_count = sum(len(doc.page_content.split()) for doc in split_docs)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in split_docs)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.0004 / 1000}")


Total word count: 255

Estimated tokens: 345

Estimated cost of embedding: $0.00013800000000000002


In [52]:
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(split_docs, embeddings)

In [29]:
vector_store.save_local(DATA_STORE_DIR)

In [53]:
from IPython.display import display, Markdown

search_result = vector_store.similarity_search_with_score("What is SHIP-HATS?")

line_separator = "\n"# {line_separator}Source: {r[0].metadata['source']}{line_separator}Score:{r[1]}{line_separator}
display(Markdown(f"""
## Search results:{line_separator}
{line_separator.join([
  f'''
  ### Source:{line_separator}{r[0].metadata['source']}{line_separator}
  #### Score:{line_separator}{r[1]}{line_separator}
  #### Content:{line_separator}{r[0].page_content}{line_separator}
  '''
  for r in search_result
])}
"""))


## Search results:


  ### Source:
test\README_22.md

  #### Score:
0.6939088106155396

  #### Content:
# Text Split Explorer

![ui.png](ui.png)

Many of the most important LLM applications involve connecting LLMs to external sources of data.
A prerequisite to doing this is to ingest data into a format where LLMs can easily connect to them.
Most of the time, that means ingesting data into a vectorstore.
A prerequisite to doing this is to split the original text into smaller chunks.

While this may seem trivial, it is a nuanced and overlooked step.
When splitting text, you want to ensure that each chunk has cohesive information - e.g. you don't just want to split in the middle of sentence.
What "cohesive information" means can differ depending on the text type as well.
For example, with Markdown you have section delimiters (`##`) so you may want to keep those together, while for splitting Python code you may want to keep all classes and methods together (if possible).

This repo (and associated Streamlit app) are designed to help explore different types of text splitting.
You can adjust different parameters and choose different types of splitters.
By pasting a text file, you can apply the splitter to that text and see the resulting splits.
You are also shown a code snippet that you can copy and use in your application

## Hosted App

To use the hosted app, head to [https://langchain-text-splitter.streamlit.app/](https://langchain-text-splitter.streamlit.app/)

## Running locally

To run locally, first set up the environment by cloning the repo and running:

```shell
pip install -r requirements
```

Then, run the Streamlit app with:

```shell
streamlit run splitter.py
```

  


In [56]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

In [57]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

chain_type_kwargs = {"prompt": prompt}
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=256)  # Modify model_name if you have access to GPT-4
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

from IPython.display import display, Markdown
def print_result(result):
  output_text = f"""### Question:
  {query}
  ### Answer:
  {result['answer']}
  ### Sources:
  {result['sources']}
  ### All relevant sources:
  {' '.join(list(set([doc.metadata['source'] for doc in result['source_documents']])))}
  """
  display(Markdown(output_text))

In [58]:
query = "What is SHIP-HATS?"
result = chain(query)
print_result(result)

### Question:
  What is SHIP-HATS?
  ### Answer:
  I don't know what SHIP-HATS is.
  ### Sources:
  
  ### All relevant sources:
  test\README_22.md
  

In [59]:
query = "What is the purpose of splitting text when using LLMs?"
result = chain(query)
print_result(result)

### Question:
  What is the purpose of splitting text when using LLMs?
  ### Answer:
  The purpose of splitting text when using LLMs (Language Model Models) is to ingest data into a format where LLMs can easily connect to them. Splitting text into smaller chunks allows LLMs to process and analyze the data more efficiently. It ensures that each chunk of text contains cohesive information, such as keeping sentences or specific sections together, depending on the text type. This step is crucial for connecting LLMs to external sources of data and enabling them to perform tasks like language generation, translation, summarization, and more. 


  ### Sources:
  test\README_22.md
  ### All relevant sources:
  test\README_22.md
  