In [1]:
import os
import weaviate

# Connection settings are read from the environment. When a variable is unset
# or empty, the variable's own name is used as a placeholder value.
weaviate_api_key = os.environ.get("weaviate_api_key") or "weaviate_api_key"
weaviate_url = os.environ.get("weaviate_url") or "weaviate_url"
cohere_api_key = os.environ.get("COHERE_API_KEY") or "COHERE_API_KEY"

# API-key credentials for the Weaviate instance.
auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)

# The Cohere key is sent along as an extra request header.
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers={"X-Cohere-Api-Key": cohere_api_key},
)

# Last expression of the cell: displays the readiness check result.
client.is_ready()

True

In [2]:
### Step 1 - configure Weaviate Batch, which optimizes CRUD operations in bulk
# - starting batch size of 50
# - dynamically increase/decrease based on performance
# - add timeout retries if something goes wrong
# The settings are applied to `client.batch`, the shared batch object on `client`.

client.batch.configure(
    batch_size=50,
    dynamic=True,
    timeout_retries=3,
)

<weaviate.batch.crud_batch.Batch at 0x7f33e8f1ed30>

In [4]:
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from pathlib import Path

# Step 2 - load every markdown help article into the "Help" class.
# For each file one object holding the full text is added, followed by one
# object per chunk; all of them share the same title/category/slug metadata.

chunk_size = 300
chunk_overlap = 50

input_directory = Path('./data/help')

# The splitters are configured identically for every file, so they are built
# once here instead of once per file.
# Context-aware chunking, see:
# https://python.langchain.com/docs/use_cases/question_answering/how_to/document-context-aware-QA
headers_to_split_on = [
    ("#", "Title"),
    ("##", "Category"),
    ("###", "Section"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

with client.batch as batch:
    # Iterate through the files in the directory and its subdirectories
    for root, _dirs, files in os.walk(input_directory):
        for file_name in files:
            # Only markdown files are ingested
            if not file_name.endswith('.md'):
                continue
            file_path = os.path.join(root, file_name)

            # Read the whole file, then release the handle before chunking
            with open(file_path, "r", encoding="utf-8") as file:
                file_content = file.read()

            # First split on markdown headers, then cap each piece at chunk_size
            md_header_splits = markdown_splitter.split_text(file_content)
            splits = text_splitter.split_documents(md_header_splits)

            lines = file_content.split("\n")
            # Title: first line without its first two characters
            # (assumes a leading "# " - TODO confirm against the help files).
            title = lines[0][2:].strip()
            # Category: second line without its first 13 characters
            # (assumes a "## Category: " prefix - TODO confirm). A file with a
            # single line gets an empty category instead of raising IndexError.
            category = lines[1][13:].strip() if len(lines) > 1 else ""
            # Slug: file name minus the ".md" suffix only, so names containing
            # additional dots are not truncated at the first dot.
            slug = file_name[:-len('.md')]

            metadata = dict(title=title, category=category, slug=slug)

            # Whole-document object
            properties = dict(text=file_content, **metadata)
            batch.add_data_object(properties, "Help")

            # One object per chunk
            for split in splits:
                text = split.page_content
                properties = dict(text=text, **metadata)
                batch.add_data_object(properties, "Help")

In [None]:
# NOTE(review): `data_object` is not assigned in any cell shown in this notebook;
# presumably left over from an earlier kernel session - TODO define it or drop this cell.
len(data_object)

In [47]:
# Inspect a single record by position.
# NOTE(review): `data_object` is not assigned in any cell shown in this notebook,
# so this fails on a fresh kernel - TODO confirm where it comes from.
data_object[135]

{'text': 'using the Write Off process. See Transaction Sheet Asset Input for AR and Inventory Write Off inputs.',
 'title': '# Cash-Beginning Working Capital',
 'category': 'Cash',
 'slug': 'Cash_CashBeginningWorkingCapital'}

In [62]:
# Debug inspection: displays whatever `category` currently holds; the ingestion
# loop above reassigns it for every markdown file it processes.
category

'Exit Strategy'