In [None]:
import os
import shutil
from dotenv import load_dotenv
import gradio as gr
import requests
from typing import List, Dict

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Chat model identifier (presumably handed to ChatOpenAI in a later cell — confirm).
MODEL: str = "gpt-4o-mini"
# Directory on disk where the Chroma vector store is persisted.
DB_DIR: str = "vector_db"
# Relevance cut-off. NOTE(review): not referenced in the cells visible here —
# confirm how later cells compare retrieval scores against it.
RELEVANCE_THRESHOLD: float = 0.3

In [None]:
# Pull settings from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# NOTE(review): the fallbacks below are placeholders, not working credentials.
# If a key is missing, nothing fails here; the failure surfaces later when the
# corresponding API is first called.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key")
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "your-serper-api-key")

In [None]:
# Folders that make up the knowledge base; every Markdown file beneath them
# (searched recursively) is read as UTF-8 text.
folders = ["knowledge-base"]
markdown_loader_options = dict(
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

documents = []
for folder in folders:
    loader = DirectoryLoader(folder, **markdown_loader_options)
    documents += loader.load()

# Break the loaded documents into overlapping chunks (size 1000, overlap 200)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = text_splitter.split_documents(documents)
print(f"Loaded {len(chunks)} chunks from knowledge base.")

In [None]:
# Embedding model used to vectorise every chunk before it is stored.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Reset vector db
# Drop the previously persisted collection through Chroma itself instead of
# deleting DB_DIR with shutil.rmtree: when this cell is re-run in the same
# kernel, an earlier Chroma client may still hold the on-disk database open,
# and removing the files from under it can leave the new store pointing at a
# stale or read-only database (or fail outright on Windows).
if os.path.exists(DB_DIR):
    Chroma(persist_directory=DB_DIR, embedding_function=embeddings).delete_collection()

# Embed all chunks and persist them under DB_DIR.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper; kept
# as in the original cell to report how many entries were stored.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")