# RAG Pipeline
This notebook loads the source documents, splits them into chunks, converts the chunks into vector embeddings, and saves them to a FAISS vector store.

In [33]:
# import packages
from dotenv import load_dotenv
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import JSONLoader

In [34]:
# Working directory and environment variables.
# The `.env` file is expected one level above the current working directory.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
env_path = os.path.join(parent_dir, '.env')
load_dotenv(dotenv_path=env_path)

# Shared accumulator for every document that will be embedded; the loading
# cells below extend it. Adjust the path parameters in those cells as needed.
text_docs = list()


## Part 1: Text Splitting

In [35]:
# Location of the pre-processed textbook markdown
textbook_extracted_path = os.path.join(
    cwd, '..', 'data_processing', 'processed_data', 'kang_math_textbook_edited.md'
)

# Splitter parameters
# NOTE(review): '#' is a substring of '##' and '###', so the splitter will
# presumably always match '#' first -- confirm the later entries are ever used.
separators = ['#', '##', '###']
chunk_size = 1000
# No overlap: the text is split at heading tags, so no content should be cut mid-way
chunk_overlap = 0


In [36]:
# Read the organized textbook markdown into memory
with open(textbook_extracted_path, mode="r", encoding="utf-8") as textbook_file:
    extracted_text = textbook_file.read()

# Build the RecursiveCharacterTextSplitter from the parameters defined above
splitter_options = {
    "separators": separators,
    "chunk_size": chunk_size,        # maximum number of characters per chunk
    "chunk_overlap": chunk_overlap,  # characters shared by consecutive chunks
    "length_function": len,          # chunk length is measured in characters
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Textbook chunks are currently NOT added to text_docs (line kept disabled):
# text_docs.extend(text_splitter.create_documents([extracted_text]))


In [37]:
# Load the JSON course plan files
dir_path = os.path.join(cwd, '..', 'data_processing', 'processed_data', 'course_plan')
documents = []

# os.listdir returns entries in arbitrary order and also lists non-JSON
# entries (e.g. .DS_Store, .ipynb_checkpoints). Sort and filter so that the
# document order -- and therefore the chunk file numbering below -- is
# reproducible and only JSON files reach the loader.
json_filenames = sorted(
    name for name in os.listdir(dir_path) if name.lower().endswith('.json')
)
if not json_filenames:
    # Fail here with a clear message instead of later while building the index
    raise FileNotFoundError(f"No .json course plan files found in {dir_path}")

for filename in json_filenames:
    # jq_schema="." selects the whole JSON document of each file
    loader = JSONLoader(
        file_path=os.path.join(dir_path, filename),
        jq_schema=".",
        text_content=False,
    )
    documents.extend(loader.load())

# NOTE: this appends to the shared list, so re-running this cell without
# re-running the cell that resets `text_docs` adds the same documents again.
text_docs.extend(documents)


In [38]:
# Dump every document into its own markdown file so the chunk boundaries can
# be inspected manually for semantic coherence
new_directory = os.path.join(cwd, 'chunks')
os.makedirs(new_directory, exist_ok=True)
for chunk_index, chunk_doc in enumerate(text_docs):
    chunk_path = os.path.join(new_directory, f'chunk{chunk_index}.md')
    with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
        chunk_file.write(chunk_doc.page_content)

## Part 2: Convert Text Chunks into Embeddings

In [39]:
# Cohere API key (read from the environment) and the vector store output directory
COHERE_KEY = os.getenv("COHERE_KEY")
vector_path = os.path.join(cwd, 'vector-stores', 'test_vector_store')

# Embedding model used for both indexing and querying
embeddings = CohereEmbeddings(
    model="embed-multilingual-v3.0",
    cohere_api_key=COHERE_KEY,
)

# Embed every collected document and build the FAISS
# (Facebook AI Similarity Search) index from the result
faiss_store = FAISS.from_documents(text_docs, embeddings)

# Make sure the target directory exists, then persist the index to disk
os.makedirs(vector_path, exist_ok=True)
faiss_store.save_local(vector_path)

## Part 3: Testing

In [45]:
# Test retrieval behavior
# Import only the helper that is used; a wildcard import hides where names
# come from and can silently overwrite notebook variables.
from tests.test_retriever import test_retriever

retriever = faiss_store.as_retriever(search_kwargs={"k": 5})
retrieved_path = os.path.join(cwd, 'retrieved_docs')

# A test passes when the expected tag appears within the first
# TAG_PREFIX_CHARS characters of one of the top DOCS_TO_CHECK retrieved docs.
DOCS_TO_CHECK = 2
TAG_PREFIX_CHARS = 40

# Queries that should retrieve the whole-semester plan document
test_retrieve_semester_summary = [
    {"query":"生成十六週的學期進度，包含每週每堂課需要涵蓋的內容",
     "expected_tag0": "整學期"},
    {"query":"建立學期計畫",
     "expected_tag0": "整學期"},
    {"query":"幫我出這學期的進度",
     "expected_tag0": "整學期"},
    {"query":"Give me a semester plan that convers all the course material in 16 weeks",
     "expected_tag0": "整學期"},
    {"query":"草擬學期計畫",
     "expected_tag0": "整學期"},
    {"query":"給我課程大綱",
     "expected_tag0": "整學期"}
    ]

# Queries that should retrieve the document of one specific chapter
test_retrieve_individual_chapter_learning_objective = [
    {"query":"第一單元有什麼學習重點？",
     "expected_tag0": "第1單元"},
    {"query":"列出第四章學習重點",
     "expected_tag0": "第4單元"},
    {"query":"幫我找第六課的相關影片",
     "expected_tag0": "第6單元"},
    {"query":"幫我出第十單元的題目",
     "expected_tag0": "第10單元"},
    {"query":"What are some key takeaway in chapter three?",
     "expected_tag0": "第3單元"},
    {"query":"List out key learning points of chapter 9",
     "expected_tag0": "第9單元"},
    {"query":"列出第二章的活動內容",
     "expected_tag0": "第2單元"},
]

test_cases = [
    *test_retrieve_semester_summary,
    *test_retrieve_individual_chapter_learning_objective
    ]

# Indices of the test cases whose expected tag was not found
failed_tests = []

for i, test in enumerate(test_cases):
    query = test["query"]
    expected = test["expected_tag0"]
    docs = test_retriever(retriever, query, retrieved_path, print_docs=False, save_docs=False)
    print(f"Test {i}: {query}")
    for j, doc in enumerate(docs[:DOCS_TO_CHECK]):
        if expected in doc.page_content[:TAG_PREFIX_CHARS]:
            print(f"✅ Success: detected tag {expected} at doc number {j}")
            break
        print(f"Fail: expected {expected} but the retrieved doc is {doc.page_content[:TAG_PREFIX_CHARS]}..." )
    else:
        # Loop finished without `break`: none of the inspected docs matched
        print(f"❌ Fail: query = {query}; expected = {expected}")
        failed_tests.append(i)

# One-line summary so failures are not lost in the per-query output
print(f"Passed {len(test_cases) - len(failed_tests)}/{len(test_cases)} retrieval tests; failed test indices: {failed_tests}")



Elapsed time: 0.28 seconds
Test 0: 生成十六週的學期進度，包含每週每堂課需要涵蓋的內容
✅ Success: detected tag 整學期 at doc number 0
Elapsed time: 0.23 seconds
Test 1: 建立學期計畫
✅ Success: detected tag 整學期 at doc number 0
Elapsed time: 0.24 seconds
Test 2: 幫我出這學期的進度
✅ Success: detected tag 整學期 at doc number 0
Elapsed time: 0.22 seconds
Test 3: Give me a semester plan that convers all the course material in 16 weeks
✅ Success: detected tag 整學期 at doc number 0
Elapsed time: 0.24 seconds
Test 4: 草擬學期計畫
✅ Success: detected tag 整學期 at doc number 0
Elapsed time: 0.21 seconds
Test 5: 給我課程大綱
✅ Success: detected tag 整學期 at doc number 0
Elapsed time: 0.22 seconds
Test 6: 第一單元有什麼學習重點？
✅ Success: detected tag 第1單元 at doc number 0
Elapsed time: 0.21 seconds
Test 7: 列出第四章學習重點
✅ Success: detected tag 第4單元 at doc number 0
Elapsed time: 0.21 seconds
Test 8: 幫我找第六課的相關影片
✅ Success: detected tag 第6單元 at doc number 0
Elapsed time: 0.22 seconds
Test 9: 幫我出第十單元的題目
✅ Success: detected tag 第10單元 at doc number 0
Elapsed time: 0.22 seconds
Te

In [41]:
# Inspect the documents retrieved for one specific query
query = "給我課程大綱"
docs = test_retriever(
    retriever,
    query,
    retrieved_path,
    print_docs=True,
    save_docs=False,
)

[{'tags': ['整學期', '全部內容', '學期計畫', '教學進度'....
[{'tags': ['第1單元', 'chapter1', '第一單元', '....
[{'tags': ['第4單元', 'chapter4', '第四單元', '....
[{'tags': ['第2單元', 'chapter2', '第二單元', '....
[{'tags': ['第6單元', 'chapter6', '第六單元', '....
Elapsed time: 0.21 seconds
