Splits documents into chunks, converts the chunks into embeddings, and stores them in a vector store

In [10]:
# import packages
from dotenv import load_dotenv
import os
import getpass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings

# All paths are resolved relative to the directory the notebook runs from.
cwd = os.getcwd()

# NOTE: change files as needed
# Input: the pre-processed textbook (markdown) one level above the notebook dir.
textbook_extracted_path = os.path.join(
    cwd, '..', 'data_processing', 'processed_data', 'kang_math_textbook_edited.md'
)
# Output: folder in which the vector store is saved.
vector_path = os.path.join(cwd, 'vector-stores', 'test_vector_store')

# Read the .env file located in the parent of the working directory, then
# pick the Cohere API key out of the environment (None when it is not set).
env_path = os.path.join(os.path.dirname(cwd), '.env')
load_dotenv(dotenv_path=env_path)
COHERE_KEY = os.environ.get("COHERE_KEY")

In [2]:
# Read the whole pre-processed textbook into memory as a single string.
## CHANGE FILE HERE FOR EMBEDS (see textbook_extracted_path)
with open(textbook_extracted_path, encoding="utf-8") as file:
    extracted_text = file.read()

In [3]:
# Create a RecursiveCharacterTextSplitter object to split the text into chunks.
#
# Separators are tried in order, coarsest first. Each heading separator is
# anchored to a line start ("\n") and ends with a space, so "\n# " matches only
# level-1 headings and never the "#" characters inside "##"/"###" headings or
# inline text (a bare "#" listed first would match all of those and make the
# later separators unreachable). The generic separators at the end let a
# section that is still longer than chunk_size be split further on paragraphs,
# lines, words, and finally characters instead of being kept as one oversized
# chunk.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n# ', '\n## ', '\n### ', '\n\n', '\n', ' ', ''],
    chunk_size=1000,       # Maximum number of characters in each chunk
    chunk_overlap=200,     # Number of characters that overlap between consecutive chunks
    length_function=len,   # Function to measure the length of chunks
)

# List of chunk strings that is embedded in the next step.
texts = text_splitter.split_text(extracted_text)

In [None]:
# Display the chunks produced by the splitter for a quick visual check
texts

In [11]:
# Turn the text chunks into embeddings (dense vectors that capture the
# semantic content of each chunk) and index them with FAISS.

# Embedding backend: Cohere, authenticated with the key loaded from .env.
# Alternative backend using a model hosted on HuggingFace:
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = CohereEmbeddings(cohere_api_key=COHERE_KEY)

# Build the FAISS (Facebook AI Similarity Search) vector store; every chunk
# in `texts` is embedded with the model above as part of this call.
faiss_store = FAISS.from_texts(texts=texts, embedding=embeddings)

# Make sure the target folder exists before writing to it.
os.makedirs(vector_path, exist_ok=True)

# Persist the vector store to disk under vector_path.
faiss_store.save_local(folder_path=vector_path)