# Main notebook for chunking and ChatGPT prompting

## Install dependencies

In [1]:
! pip install -r requirements.txt --quiet

## Path for data

In [2]:
import os


# Locations of the source documents and of the persisted Chroma index.
# NOTE(review): both are absolute, machine-specific paths; adjust before running elsewhere.
data_path = '/home/dynokostya/Documents/Projects/rag-local/data/stalyi_rozv'
chroma_db_path = '/home/dynokostya/Documents/Projects/rag-local/chroma_db/stalyi_rozv'

# exist_ok=True creates the directory only when it is missing and does not fail
# if it already exists, which removes the separate check-then-create step.
os.makedirs(chroma_db_path, exist_ok=True)

print(data_path)
print(chroma_db_path)

/home/dynokostya/Documents/Projects/rag-local/data/stalyi_rozv
/home/dynokostya/Documents/Projects/rag-local/chroma_db/stalyi_rozv


## Default chunking (if semantic doesn't work)

In [78]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Fallback chunking: read everything under data_path first ...
loader = DirectoryLoader(data_path)
documents = loader.load()

# ... then cut the loaded documents with a recursive character splitter
# configured with chunk_size=1024 and chunk_overlap=150.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=150,
)
texts = text_splitter.split_documents(documents)

# Number of resulting chunks (displayed as the cell output).
len(texts)

804

## Semantic chunking

In [3]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from config import azure_openai_key, azure_openai_endpoint, azure_openai_api_version, azure_openai_embedding_deployment
import os


# Expose the Azure OpenAI settings from config.py through environment variables.
# NOTE(review): presumably the Azure client classes below read these variables — confirm.
os.environ["AZURE_OPENAI_API_KEY"] = azure_openai_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_openai_endpoint
os.environ["AZURE_OPENAI_API_VERSION"] = azure_openai_api_version

# Embedding model used by the semantic splitter to compare neighbouring passages.
embeddings = AzureOpenAIEmbeddings(
    deployment=azure_openai_embedding_deployment,
    chunk_size=1024
)

# Select your directory
loader = DirectoryLoader(data_path)
documents = loader.load()

# Gradient does the best semantic chunking, alternative = 'percentile'
text_splitter = SemanticChunker(embeddings=embeddings,
                                breakpoint_threshold_type='gradient')

# split_documents() takes the loaded Document objects directly and passes each
# document's metadata on to the chunks cut from it. The previous
# create_documents([...page_content...]) call supplied only the raw text, so the
# chunks carried no metadata from the loader.
chunks = text_splitter.split_documents(documents)

# Number of semantic chunks (displayed as the cell output).
len(chunks)

132

## Create embeddings and save to folder

In [4]:
import os
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from config import azure_openai_embedding_deployment
from langchain_chroma import Chroma
from config import azure_openai_key, azure_openai_endpoint, azure_openai_api_version
import os


# Expose the Azure OpenAI settings from config.py through environment variables.
# NOTE(review): presumably the Azure client classes below read these variables — confirm.
os.environ["AZURE_OPENAI_API_KEY"] = azure_openai_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_openai_endpoint
os.environ["AZURE_OPENAI_API_VERSION"] = azure_openai_api_version

embeddings = AzureOpenAIEmbeddings(
    deployment=azure_openai_embedding_deployment,
    chunk_size=1024
)

# exist_ok=True creates the target directory only when it is missing and does
# not fail if it already exists, replacing the separate exists() check.
os.makedirs(chroma_db_path, exist_ok=True)

# Embed every chunk and write the resulting index under chroma_db_path.
# NOTE(review): re-running this cell against an already populated directory
# presumably appends the same chunks again — confirm, and clear the directory
# first if duplicates are not wanted.
vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=chroma_db_path)

## GPT-4o model usage

### Prompt for "сталий розвиток" (sustainable development)

In [5]:
# System prompt for the "сталий розвиток" corpus. The query cell prepends this
# text verbatim to the user question, so every character below reaches the model.
# It asks for an answer in Ukrainian, a fallback phrase when the documents do not
# contain the answer, lines of at most 10 words, and a three-section layout
# (---Відповідь--- / ---Пояснення--- / ---Думки---).
# NOTE(review): the literal contains typos ("knowldege", "recources"), and the
# domain name "Consistent evolvement" looks like a literal rendering of
# "сталий розвиток" (usually "Sustainable development"). Left unchanged here
# because editing the prompt changes model output — confirm before fixing.
prompt = """
System:
```
You are a search assistant.
You are an expert in "Consistent evolvement".
You will be asked a question.
Question may contain multiple answers.
You should always give an exact answer for the question.
All information will be provided in Ukrainian language.
Answer in Ukrainian language.

If you cannot find the answer in context/documents, say 
"Нічого не знайдено в документах. Використовую власні знання...".
Then, use your knowldege or external recources and always give an answer for the question.
If needed, make more analysis.

When you wrote 10 words in 1 line, you should start a new line.
Every line should contain no more than 10 words.
Write and structure your thoughts.
Take a deep breath and think step by step.
Explain each of your steps in detail (in ukrainian).

Example:
```
---Відповідь---
...
---Пояснення---
...
---Думки---
...
```

```
"""

### Prompt for patents

In [47]:
# System prompt for the intellectual-property / patents topic. Apart from the
# expertise line it has the same wording as the other prompt cell.
# NOTE(review): this assignment rebinds `prompt`, so when the notebook is run
# top to bottom this text (not the earlier one) is what the query cell uses.
# Run only the prompt cell that matches the indexed corpus, or give the two
# prompts distinct names.
# NOTE(review): the literal contains typos ("Intelectual", "knowldege",
# "recources"). Left unchanged here because editing the prompt changes model output.
prompt = """
System:
```
You are a search assistant.
You are an expert in "Intelectual ownership and patents".
You will be asked a question.
Question may contain multiple answers.
You should always give an exact answer for the question.
All information will be provided in Ukrainian language.
Answer in Ukrainian language.

If you cannot find the answer in context/documents, say 
"Нічого не знайдено в документах. Використовую власні знання...".
Then, use your knowldege or external recources and always give an answer for the question.
If needed, make more analysis.

When you wrote 10 words in 1 line, you should start a new line.
Every line should contain no more than 10 words.
Write and structure your thoughts.
Take a deep breath and think step by step.
Explain each of your steps in detail (in ukrainian).

Example:
```
---Відповідь---
...
---Пояснення---
...
---Думки---
...
```

```
"""

In [13]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.chains import RetrievalQA
from config import azure_openai_gpt_deployment, azure_openai_api_version


# --- Build the query text -------------------------------------------------
# The user block holds the quiz question; it is glued between the system
# prompt and an answer cue to form one query string.
user = """
User:
```
День перевитрати Землі (англ. Earth overshoot day) 2024 року - 28 жовтня.

Виберіть одну відповідь:
Правильно
Неправильно
```
"""
query = prompt + user + "\nAnswer:"

# --- Re-open the persisted index -------------------------------------------
embeddings = AzureOpenAIEmbeddings(
    chunk_size=1024,
    deployment=azure_openai_embedding_deployment,
)
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=chroma_db_path,
)

# --- Chat model and retrieval chain ----------------------------------------
gpt = AzureChatOpenAI(
    api_version=azure_openai_api_version,
    deployment_name=azure_openai_gpt_deployment,
)

# The retriever is configured with k=5 and the chain uses the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    return_source_documents=True,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    llm=gpt,
)

# --- Ask and show the answer -----------------------------------------------
answer = qa_chain.invoke({"query": query})
print(answer.get("result"))
# To inspect the retrieved chunks, look at answer.get("source_documents").

---Відповідь---
Неправильно

---Пояснення---
День перевитрати Землі, відомий як Earth Overshoot Day, 
зазвичай настає значно раніше в році, ніж 28 жовтня. 
У попередні роки цей день припадав на липень-серпень.

---Думки---
1. День перевитрати Землі означає, що людство 
використало всі ресурси, які планета може відновити 
за рік.
2. Цей день зазвичай визначається у середині року.
3. Дата 28 жовтня є занадто пізньою для Earth 
Overshoot Day.

