Install dependencies

In [1]:
! pip install -r requirements.txt --quiet

Default chunking (if semantic doesn't work)

In [78]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


loader = DirectoryLoader("data/")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=150)
documents = loader.load()
texts = text_splitter.split_documents(documents)
len(texts)

804

Semantic chunking

In [4]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from config import azure_openai_key, azure_openai_endpoint, azure_openai_api_version, azure_openai_embedding_deployment
import os

os.environ["AZURE_OPENAI_API_KEY"] = azure_openai_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_openai_endpoint
os.environ["AZURE_OPENAI_API_VERSION"] = azure_openai_api_version

embeddings = AzureOpenAIEmbeddings(
    deployment=azure_openai_embedding_deployment,
    chunk_size=1024
)

loader = DirectoryLoader("data/")
documents = loader.load()

# documents[0].page_content
text_splitter = SemanticChunker(embeddings=embeddings,
                                breakpoint_threshold_type='interquartile')

chunks = text_splitter.create_documents([documents[i].page_content for i in range(len(documents))])

len(chunks)

Create embeddings and save to folder

In [1]:
import os
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from config import azure_openai_embedding_deployment
from langchain_chroma import Chroma
from config import azure_openai_key, azure_openai_endpoint, azure_openai_api_version
import os

os.environ["AZURE_OPENAI_API_KEY"] = azure_openai_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_openai_endpoint
os.environ["AZURE_OPENAI_API_VERSION"] = azure_openai_api_version

persist_directory = "chroma"

embeddings = AzureOpenAIEmbeddings(
    deployment=azure_openai_embedding_deployment,
    chunk_size=1024
)

if not os.path.exists(persist_directory):
    os.makedirs(persist_directory)

# vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
# db = Chroma()
# db.from_documents

NameError: name 'chunks' is not defined

Create persist folder for reusing model

In [2]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.chains import RetrievalQA
from config import azure_openai_gpt_deployment, azure_openai_api_version


persist_directory = "chroma"

embeddings = AzureOpenAIEmbeddings(
    deployment=azure_openai_embedding_deployment,
    chunk_size=1024
)

vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

gpt = AzureChatOpenAI(
    deployment_name=azure_openai_gpt_deployment,
    api_version=azure_openai_api_version
)

qa_chain = RetrievalQA.from_chain_type(
    llm=gpt,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True
)

prompt = """
System:
```
You are a search assistant.
You are an expert in "Consistent evolvement".
You will be asked a question.
Question may contain multiple answers.
Provide the full text from the retrieved documents that supports your answer.

All information will be provided in Ukrainian language.
Answer in Ukrainian language.

You should not use any external resources or make up information, but 
if you cannot find context for the answer, say 
"У контексті не було надано інфморації, шукаю по зовнішнім ресурсам" and use your knowledge and external recources.

You should think step by step and give every though.
When you wrote 10 words in 1 line, you should start a new line.
Every line should contain no more than 10 words.
Example:
```
---Відповідь---
...
---Пояснення---
...
---Думки---
...
```

```
"""

user = """
User:
```

```
"""
query = prompt + user + "\nAnswer:"
answer = qa_chain.invoke({"query": query})
print(answer.get("result"))
#print()
#answer.get("source_documents")

---Відповідь---
У контексті не було надано інформації, шукаю по зовнішнім ресурсам.

---Пояснення---
В наданих документах не міститься інформації щодо "Consistent evolvement" 
або "послідовного розвитку". 

---Думки---
1. Перевірив наданий контекст.
2. Не знайшов згадки про "Consistent evolvement".
3. Переходжу до використання зовнішніх ресурсів.
