In [1]:
import os
import weaviate

def _env_or_placeholder(name):
    """Return the value of environment variable `name`.

    Falls back to the variable's own name when it is unset or empty, so the
    cell still evaluates (with an obviously fake value) without configuration.
    """
    return os.getenv(name) or name


# Connection settings; each one is looked up under the exact name shown.
weaviate_api_key = _env_or_placeholder("weaviate_api_key")
weaviate_url = _env_or_placeholder("weaviate_url")
cohere_api_key = _env_or_placeholder("COHERE_API_KEY")

# API-key credentials for the Weaviate instance itself.
auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)

# The Cohere key is sent along as an extra request header.
client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers={"X-Cohere-Api-Key": cohere_api_key},
)

# Last expression of the cell: displays whether the instance reports ready.
client.is_ready()

True

In [2]:
### Step 1 - configure Weaviate Batch, which optimizes CRUD operations in bulk
# - starting batch size of 50 (batch_size)
# - dynamically increase/decrease the batch size based on performance (dynamic=True)
# - retry up to 3 times when a request times out (timeout_retries=3)
# The call is the cell's last expression, so the notebook displays its return value.

client.batch.configure(
    batch_size=50,
    dynamic=True,
    timeout_retries=3,
)

<weaviate.batch.crud_batch.Batch at 0x7f8528addd30>

In [3]:
import json
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from pathlib import Path

input_file_path = Path('./data/finetune/questions.json')

# Load every question/answer record before opening the batch, so a missing or
# malformed JSON file fails before anything is queued for import.
# The file is read as UTF-8, the same encoding used for the source documents below.
with open(input_file_path, "r", encoding="utf-8") as input_file:
    questions = json.load(input_file)

# Metadata of the most recently read source document. Consecutive records that
# share the same "source" reuse these values instead of re-reading the file.
file_name = None
title = None
category = None
slug = None

# Leaving the `with` block hands control back to the batch context manager,
# which is expected to send any objects still queued.
with client.batch as batch:
    for qna in questions:
        if qna["source"] != file_name:
            # New source document: refresh the cached metadata.
            file_name = qna["source"]
            with open(file_name, "r", encoding="utf-8") as file:
                lines = file.read().split("\n")
            # First line minus its first 2 characters
            # (presumably a "# " markdown heading marker - verify against the data).
            title = lines[0][2:].strip()
            # Second line minus its first 13 characters
            # (presumably a fixed-width category prefix - verify against the data).
            category = lines[1][13:].strip()
            # Base file name up to its first dot, e.g. "docs/a/foo.md" -> "foo".
            slug = file_name.split("/")[-1].split(".")[0]

        # Queue one "Help" object for EVERY question. This must stay at loop
        # level: nested under the `if` above, only the first question of each
        # source document would be imported.
        properties = dict(
            text=f"Question: {qna['question']}\nAnswer: {qna['answer']}",
            title=title,
            category=category,
            slug=slug
        )
        batch.add_data_object(properties, "Help")

In [4]:
# Number of question/answer records loaded from the questions JSON file.
len(questions)

9758