# RAG Pipeline
This notebook splits each source document into chunks and stores those chunks as vector embeddings.

In [50]:
# import packages
from dotenv import load_dotenv
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import JSONLoader

In [51]:
# Resolve the working directory and load environment variables from the
# `.env` file located one directory above it.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
env_path = os.path.join(parent_dir, '.env')
load_dotenv(dotenv_path=env_path)

# Modify the path parameters as needed.
# Shared list that collects every document chunk before embedding.
text_docs = list()


## Part 1: Text Splitting

In [52]:
# Splitter parameters
# NOTE(review): the recursive splitter presumably tries separators in list
# order, and '#' also matches inside '##' / '###' — confirm this ordering
# produces the intended heading-level splits.
separators = [
    '#',
    '##',
    '###',
]
chunk_size = 1_000
# No overlap: we are splitting by tags, so no tokens should get cut.
chunk_overlap = 0
processed_data_dir = os.path.join(cwd, '..', 'data_processing', 'processed_data')
textbook_extracted_path = os.path.join(processed_data_dir, 'kang_math_textbook_edited.md')


In [59]:
# Read the organized textbook markdown into memory
with open(textbook_extracted_path, mode="r", encoding="utf-8") as textbook_file:
    extracted_text = textbook_file.read()

# Build the RecursiveCharacterTextSplitter from the splitter parameters
splitter_config = {
    "separators": separators,
    "chunk_size": chunk_size,        # maximum chunk length, as measured by length_function
    "chunk_overlap": chunk_overlap,  # amount shared between consecutive chunks
    "length_function": len,          # measure chunk length in characters
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

# Split the textbook and add the resulting documents to the shared list
textbook_chunks = text_splitter.create_documents([extracted_text])
text_docs.extend(textbook_chunks)


In [60]:
# Load the JSON course plan files
dir_path = os.path.join(cwd,'..', 'data_processing', 'processed_data','course_plan')
documents = []

# os.listdir() returns entries in arbitrary order; sort them so the document
# order (and the chunk{i}.md numbering derived from text_docs) is reproducible
# across runs and machines. Note that the sort is lexicographic.
for filename in sorted(os.listdir(dir_path)):
    json_path = os.path.join(dir_path, filename)
    # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and
    # sub-directories: they are not course plan files and cannot be parsed.
    if filename.startswith('.') or not os.path.isfile(json_path):
        continue
    # jq_schema="." selects the whole JSON document of each file
    loader = JSONLoader(file_path=json_path, jq_schema=".", text_content=False)
    data = loader.load()
    # Append loaded documents to the list
    documents.extend(data)

# Add the course plan documents after the textbook chunks
text_docs.extend(documents)


In [55]:
# Dump every chunk into its own markdown file so that the semantic
# separation of the chunks can be inspected by hand.
new_directory = os.path.join(cwd, 'chunks')
os.makedirs(new_directory, exist_ok=True)
for i, doc in enumerate(text_docs):
    chunk_path = os.path.join(new_directory, f'chunk{i}.md')
    with open(chunk_path, mode='w', encoding='utf-8') as chunk_file:
        chunk_file.write(doc.page_content)

## Part 2: Convert Text Chunks into Embeddings

In [56]:
# Cohere API key, read from the process environment
COHERE_KEY = os.environ.get("COHERE_KEY")
# Folder in which the vector store is persisted
vector_path = os.path.join(cwd, 'vector-stores', 'test_vector_store')

embeddings = CohereEmbeddings(
    model="embed-multilingual-v3.0",
    cohere_api_key=COHERE_KEY,
)

# Build the FAISS (Facebook AI Similarity Search) vector store: every
# document in text_docs is converted into an embedding and indexed.
faiss_store = FAISS.from_documents(documents=text_docs, embedding=embeddings)

# Make sure the target folder exists, then save the vector store to disk
os.makedirs(vector_path, exist_ok=True)
faiss_store.save_local(folder_path=vector_path)

In [57]:
# Test retrieval behavior
# Import only the helper that is used: a wildcard import would pull every
# public name of the test module into the notebook namespace and could
# silently shadow variables defined in earlier cells.
from tests.test_retriever import test_retriever

# Retriever over the FAISS store, with k=5 passed as a search kwarg
retriever = faiss_store.as_retriever(search_kwargs={"k": 5})
# Query (English): "Generate a sixteen-week semester schedule, including the
# content each class needs to cover every week"
query = "生成十六週的學期進度，包含每週每堂課需要涵蓋的內容"
# NOTE(review): presumably test_retriever stores the retrieved documents
# under this folder — verify against tests/test_retriever.
retrieved_path = os.path.join(cwd, 'retrieved_docs')
docs = test_retriever(retriever, query, retrieved_path, print_docs=False)

Elapsed time: 0.23 seconds
