# RAG Pipeline
This notebook splits the processed data into chunks and stores them in a vector store as embeddings.
- Modify the path parameters to use different data.
- Modify the splitter parameters to split the documents semantically.
- Try different embedding models to test the correctness of the retrieved documents.

In [11]:
# import packages
from dotenv import load_dotenv
import os
import getpass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings

In [12]:
# Resolve working locations and load environment variables.
# Every path below is built from the current working directory.
cwd = os.getcwd()

# The .env file is read from the parent of the working directory.
env_path = os.path.join(os.path.dirname(cwd), ".env")
load_dotenv(dotenv_path=env_path)

# Modify the path parameters below as needed.
# Input: the processed textbook markdown file.
textbook_extracted_path = os.path.join(
    cwd,
    "..",
    "data_processing",
    "processed_data",
    "kang_math_textbook_edited.md",
)
# Location of the vector store on disk.
vector_path = os.path.join(cwd, "vector-stores", "test_vector_store")


## Part 1: Text Splitting

In [13]:
# Splitter parameters
# Markdown heading markers used as split points, in the order handed to the splitter.
separators = ["#", "##", "###"]
chunk_size = 1_000  # maximum number of characters per chunk
chunk_overlap = 0  # splits happen at heading tags, so no overlap is needed to protect cut-off tokens

In [14]:
# Read the whole organized textbook into memory as a single string
with open(textbook_extracted_path, mode="r", encoding="utf-8") as textbook_file:
    extracted_text = textbook_file.read()

In [15]:
# Build the splitter that breaks the textbook text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # chunk length is measured with len(), i.e. in characters
    chunk_size=chunk_size,  # maximum length of each chunk
    chunk_overlap=chunk_overlap,  # characters shared between consecutive chunks
    separators=separators,
)

# Same input text, two representations: document objects and raw strings
text_docs = text_splitter.create_documents([extracted_text])
texts = text_splitter.split_text(extracted_text)

In [16]:
# Dump every chunk to its own file so the split points can be inspected by hand
new_directory = os.path.join(cwd, 'chunks')
os.makedirs(new_directory, exist_ok=True)
# NOTE: chunk files left over from an earlier run that produced more chunks are not removed.
for i, text in enumerate(texts):
    file_path = os.path.join(new_directory, f'chunk{i}.md')
    # Write as UTF-8 explicitly, matching how the source text was read; the
    # platform default encoding may be unable to represent the text.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

## Part 2: Convert Text Chunks into Embeddings

In [17]:
# Cohere API key, read from the environment (None when the variable is not set)
COHERE_KEY = os.getenv("COHERE_KEY")

# NOTE: Swap the embedding model here as needed, e.g.
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = CohereEmbeddings(
    model="embed-multilingual-v3.0",
    cohere_api_key=COHERE_KEY,
)

# Embed the raw text chunks and index them in a FAISS (Facebook AI Similarity Search) vector store
faiss_store = FAISS.from_texts(texts, embeddings)

# Make sure the target directory exists, then persist the vector store there
os.makedirs(vector_path, exist_ok=True)
faiss_store.save_local(vector_path)

In [18]:
# Test retrieval behavior
# Import only the helper that is used; a wildcard import could silently
# overwrite names defined in earlier cells.
from tests.test_retriever import test_retriever

# Ask the vector store for up to 5 chunks per query
retriever = faiss_store.as_retriever(search_kwargs={"k": 5})
query = "列出第三單元的學習重點"
# Path handed to the test helper (presumably where retrieved documents are written; verify in tests/test_retriever)
retrieved_path = os.path.join(cwd, 'retrieved_docs')
docs = test_retriever(retriever, query, retrieved_path, print_docs=False)

Elapsed time: 3.27 seconds


#### Dev Notes
- The right chapters seem to be retrieved now that we have specified a multilingual embedding model (LOL).
- Came across re-ranking (CohereRerank), which does not really change the documents originally retrieved by the model.
- Also stumbled upon the "compression retriever", which can potentially help reduce distracting tokens, but we will come back to it if the current solution doesn't work.