Splits documents into chunks, converts the chunks into embeddings, and stores them in a FAISS vector store

In [None]:
# import packages
from dotenv import load_dotenv
import os
import getpass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_ibm import WatsonxLLM
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes, DecodingMethods
from ibm_watsonx_ai import Credentials

# root_path is the directory three levels above the current working directory;
# every data location below is resolved relative to it.
notebook_dir = os.getcwd()
root_path = os.path.dirname(os.path.dirname(os.path.dirname(notebook_dir)))

# Both the source document and the vector stores live under <root>/ai/rag.
rag_dir = os.path.join(root_path, 'ai', 'rag')
textbook_extracted_path = os.path.join(rag_dir, 'documents', 'kang_math_textbook_edited.md')
vector_path = os.path.join(rag_dir, 'vector-stores')

In [None]:
# Read the whole edited textbook into memory as one UTF-8 string.
## To embed a different document, change the path that is opened here.
with open(textbook_extracted_path, mode="r", encoding="utf-8") as textbook_file:
    extracted_text = textbook_file.read()

In [None]:
# Split the textbook into overlapping chunks that are small enough to embed.
#
# RecursiveCharacterTextSplitter tries the separators in order and only falls
# back to the next one for pieces that are still longer than chunk_size, so the
# list must run from coarsest to finest. The previous list ('#', '##', '###')
# always matched the bare '#' first, which made the longer markers unreachable,
# cut the text at *any* hash character, and left no finer separator to break up
# sections longer than chunk_size. The heading markers are now anchored to the
# start of a line, followed by paragraph, line, word and character fallbacks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n# ",     # level-1 Markdown headings
        "\n## ",    # level-2 headings
        "\n### ",   # level-3 headings
        "\n\n",     # paragraphs
        "\n",       # lines
        " ",        # words
        "",         # last resort: split between characters
    ],
    chunk_size=1000,       # Maximum number of characters in each chunk
    chunk_overlap=200,     # Number of characters shared by consecutive chunks
    length_function=len,   # Chunk length is measured in characters
)

# List of chunk strings, in document order.
texts = text_splitter.split_text(extracted_text)

In [None]:
# Bare expression: in a notebook this echoes the chunk list as the cell output
# so the split can be inspected by eye.
texts

In [None]:
# Convert the text chunks into embeddings (dense vectors that capture semantic
# information), index them with FAISS, and persist the index to disk.

# NOTE(review): this HuggingFace model is instantiated but not used below (the
# FAISS store is built with the Cohere embeddings). It is kept in case a later
# cell relies on `embedding_model` -- confirm and remove if not.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Never hard-code credentials in the notebook. The key is read from the
# COHERE_API_KEY environment variable (optionally populated from a local .env
# file) and, if it is missing, requested interactively without echoing it.
# SECURITY: the key that used to be committed on this line must be treated as
# compromised and revoked.
load_dotenv()
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    cohere_api_key = getpass.getpass("Cohere API key: ")
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)

# Build the FAISS (Facebook AI Similarity Search) vector store; this embeds
# every chunk in `texts` with the Cohere model.
faiss_store = FAISS.from_texts(texts, embeddings)

# Destination directory and name of the vector store, relative to the repo root.
save_path = vector_path
vector_store_name = 'test_vector_store'

full_save_path = os.path.join(save_path, vector_store_name)
os.makedirs(full_save_path, exist_ok=True)

# Save the FAISS vector store to disk under that name.
faiss_store.save_local(full_save_path)