# Building RAG Chatbots for Technical Documentation - Data Preparation

In [1]:
import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point in the kernel session is suppressed.
# NOTE(review): presumably meant to keep noisy library warnings out of the
# cell outputs - be aware it also hides deprecation notices.
warnings.filterwarnings('ignore')

`(1) Requirements (Python 3.11.10)`

In [None]:
! pip install -r requirements.txt

## Parts 1 & 2: Split the documents / generate and store embeddings

In [2]:
import Constants
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import shutil
import os

# Build the Chroma vector store from the PDF corpus:
#   1. confirm with the user before replacing an existing store,
#   2. load every PDF from the data directory,
#   3. split the pages into overlapping chunks,
#   4. embed the chunks and persist them to `persist_directory`.
persist_directory = Constants.PERSIST_DIRECTORY
data_directory = "../Data"
pdf_glob = "*.pdf"

# An existing store is only deleted after explicit confirmation.
rebuild_store = True
if os.path.exists(persist_directory):
    # strip() so that answers such as "y " or " Y" are still accepted.
    user_input = input(f"The directory '{persist_directory}' already exists. Do you want to overwrite it? (y/n): ").strip().lower()
    if user_input == 'y':
        shutil.rmtree(persist_directory)
        print(f"Removed existing directory: {persist_directory}")
    else:
        # Do not call exit() here: inside a Jupyter kernel it shuts the
        # kernel down and discards all session state. Skipping the build
        # keeps the existing store and the kernel intact.
        rebuild_store = False
        print("Operation cancelled. Exiting.")

if rebuild_store:
    # Create the directory if it doesn't exist
    os.makedirs(persist_directory, exist_ok=True)
    print(f"Created directory: {persist_directory}")

    # Load Documents
    loader = DirectoryLoader(data_directory, glob=pdf_glob, loader_cls=PyPDFLoader)
    docs = loader.load()

    # Fail fast with a clear message when nothing was loaded (e.g. wrong
    # working directory) instead of attempting to embed an empty corpus.
    if not docs:
        raise ValueError(
            f"No documents were loaded from '{data_directory}' (glob '{pdf_glob}'); nothing to embed."
        )

    # Split into 1000-character chunks with a 200-character overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Embed
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'tokenizer_kwargs': {'clean_up_tokenization_spaces': True}}
    )

    # Create and persist the vector store
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    print("Created and persisted new database.")

Removed existing directory: ../embeddings_db
Created directory: ../embeddings_db
Created and persisted new database.


Now that the embeddings have been generated and stored, this process only needs to be repeated if you want to extract information from a different set of source documents (i.e., build a different database).

*If you'd like to re-run the notebook, it's recommended to restart the kernel to avoid potential errors.*

#### Next Notebook: [RAG-Fusion](2-RAG-Fusion.ipynb)