# RAG Pipeline
This notebook loads the source documents, splits them into chunks where needed, and stores them as vector embeddings in a FAISS vector store.

In [15]:

# import packages
from dotenv import load_dotenv
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain.schema import Document

In [16]:
# Path parameters and environment variables
# Working directory of the running process; the data and output paths in this file are built from it.
cwd = os.getcwd()
# The .env file is looked up in the parent of the working directory.
env_path = os.path.join(os.path.dirname(cwd), '.env')
load_dotenv(dotenv_path=env_path)

# Shared accumulator for loaded documents; it is later written out as chunk
# files and embedded into the vector store.
text_docs = []


## Part 1: Loading Documents

In [17]:
def load_split_doc(path, separators, chunk_size, chunk_overlap):
    """Read a text file, split it into chunks, and register the chunks.

    Args:
        path: Path of the UTF-8 text file to read.
        separators: Separator strings handed to RecursiveCharacterTextSplitter.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunk documents produced by the splitter. The same
        documents are also appended to the module-level ``text_docs`` list.
    """
    with open(path, "r", encoding="utf-8") as source:
        raw_text = source.read()

    # Collect the splitter configuration in one place before building it.
    splitter_options = {
        "separators": separators,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    chunks = splitter.create_documents([raw_text])
    text_docs.extend(chunks)
    return chunks

def load_doc(path):
    """Load one UTF-8 text file as a single, unsplit Document.

    Args:
        path: Path of the text file to read.

    Returns:
        The created Document. It is also appended to the module-level
        ``text_docs`` list. Returning it keeps this loader consistent with
        the other loaders in this file, which all return what they loaded.
    """
    with open(path, "r", encoding="utf-8") as file:
        extracted_text = file.read()
    doc = Document(page_content=extracted_text)
    text_docs.append(doc)
    return doc
    
def load_doc_dir(dir_path, metadata=None):
    """Load every regular file in a directory as one Document each.

    Args:
        dir_path: Directory whose files are read as UTF-8 text.
        metadata: Optional mapping attached to every loaded document. Each
            document receives its own shallow copy; when omitted, documents
            get an empty metadata dict.

    Returns:
        The list of loaded documents in sorted filename order. The same
        documents are also appended to the module-level ``text_docs`` list.
    """
    documents = []
    # os.listdir returns names in arbitrary order; sorting makes the document
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. notebook checkpoint folders) cannot be
            # opened as text files, so they are skipped.
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            extracted_text = file.read()
        # Document.metadata is a dict field, so None must not be forwarded.
        # Copy the mapping so documents do not share one mutable dict.
        doc_metadata = dict(metadata) if metadata else {}
        documents.append(Document(page_content=extracted_text, metadata=doc_metadata))
    text_docs.extend(documents)
    return documents

def load_json(path):
    """Load a single JSON file through JSONLoader and register the result.

    Args:
        path: Path of the JSON file to load.

    Returns:
        The documents produced by the loader. They are also appended to the
        module-level ``text_docs`` list.
    """
    # jq_schema "." selects the whole JSON value; text_content=False is
    # passed because the selected content is not a plain string.
    json_docs = JSONLoader(file_path=path, jq_schema=".", text_content=False).load()
    text_docs.extend(json_docs)
    return json_docs
    
def load_json_dir(dir_path):
    """Load every regular file in a directory through JSONLoader.

    Args:
        dir_path: Directory whose files are each loaded as a JSON document.

    Returns:
        The list of loaded documents in sorted filename order. The same
        documents are also appended to the module-level ``text_docs`` list.
    """
    documents = []
    # os.listdir returns names in arbitrary order; sorting makes the document
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. notebook checkpoint folders) are not JSON
            # files and would make the loader fail, so they are skipped.
            continue
        loader = JSONLoader(file_path=file_path, jq_schema=".", text_content=False)
        documents.extend(loader.load())

    text_docs.extend(documents)
    return documents


In [18]:
# Load the organized textbook data
# NOTE(review): '#' is a prefix of '##' and '###' and is listed first, so the
# longer separators may never get a chance to match — confirm the intended order.
separators = ['#','##','###']
# Intended as the maximum chunk length for load_split_doc.
chunk_size = 1000
chunk_overlap = 0 # we are splitting by tags so there shouldn't be tokens that got cut
textbook_extracted_path = os.path.join(cwd,'..', 'data_processing', 'processed_data','kang_math_textbook_edited.md')

# Textbook loading is currently disabled; uncomment the call below to add the
# textbook chunks to text_docs.
# load_split_doc(textbook_extracted_path, separators, chunk_size, chunk_overlap);

In [19]:
# Load json course plan
# Directory of course-plan files; load_json_dir loads each entry with JSONLoader.
dir_path = os.path.join(cwd,'..', 'data_processing', 'processed_data','course_plan')

# The returned list is not needed here because the documents are also added to
# text_docs; the trailing semicolon presumably suppresses the notebook echo.
load_json_dir(dir_path);

In [20]:
# Load video file
# Directory of video files; load_doc_dir reads each one as a single document.
video_file_path = os.path.join(cwd,'..', 'data_processing', 'processed_data','videos')

# Metadata passed to load_doc_dir and attached to every video document.
metadata = {
    "type" : "video",
    "tags" : ["video",  "教學影片", "影片", "短片", "clips", "media", "learning_resource", "multimedia"]
}
load_doc_dir(video_file_path, metadata);

In [21]:
# Dump every loaded document into its own Markdown file so that the chunk
# boundaries can be inspected by hand for semantic separation.
new_directory = os.path.join(cwd, 'chunks')
os.makedirs(new_directory, exist_ok=True)
for chunk_index, chunk_doc in enumerate(text_docs):
    chunk_file = os.path.join(new_directory, f'chunk{chunk_index}.md')
    with open(chunk_file, 'w', encoding='utf-8') as out:
        out.write(chunk_doc.page_content)

## Part 2: Convert Text Chunks into Embeddings

In [22]:
# Cohere API key read from the environment (presumably supplied by the .env
# file loaded at the top of this file — confirm). os.getenv returns None when
# the variable is unset.
COHERE_KEY = os.getenv("COHERE_KEY")
# Directory where the vector store is saved.
vector_path = os.path.join(cwd, 'vector-stores', 'vid_and_json_vs')

# Embedding client; also reused when the saved store is loaded again for testing.
embeddings = CohereEmbeddings(cohere_api_key=COHERE_KEY, model="embed-multilingual-v3.0")

# Initialize FAISS (Facebook AI Similarity Search) vector store, converting raw text chunks into embeddings
faiss_store = FAISS.from_documents(text_docs, embeddings)
# Make sure the target directory exists before saving.
os.makedirs(vector_path, exist_ok=True)

# Save FAISS vector store to disk under vector_path
faiss_store.save_local(vector_path)

## Part 3: Testing

In [23]:
# Test Retrieval Behavior
# NOTE(review): wildcard import; the test_retriever helper called further down
# presumably comes from here — consider importing it by name.
from tests.test_retriever import * 
# Same directory the vector store was saved to in Part 2.
embedding_path = os.path.join(cwd, 'vector-stores', 'vid_and_json_vs')
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from disk; only load vector stores that were created locally
# and are trusted.
faiss_store = FAISS.load_local(
    embedding_path, embeddings, allow_dangerous_deserialization=True
)
# k=5 is passed as a search argument; presumably the number of documents
# returned per query.
retriever = faiss_store.as_retriever(search_kwargs={"k": 5})
# Path handed to test_retriever (presumably where retrieved documents are
# saved when save_docs is enabled — confirm against the helper).
retrieved_path = os.path.join(cwd, 'retrieved_docs')

# Queries that are all expected to retrieve the whole-semester plan first.
test_retrieve_semester_summary = [
    {"query": semester_query, "expected_tag0": "整學期"}
    for semester_query in (
        "生成十六週的學期進度，包含每週每堂課需要涵蓋的內容",
        "建立學期計畫",
        "幫我出這學期的進度",
        "Give me a semester plan that convers all the course material in 16 weeks",
        "草擬學期計畫",
        "給我課程大綱",
    )
]

# (query, expected unit tag) pairs for questions about one specific unit.
test_retrieve_individual_chapter_learning_objective = [
    {"query": chapter_query, "expected_tag0": unit_tag}
    for chapter_query, unit_tag in (
        ("第一單元有什麼學習重點？", "第1單元"),
        ("列出第四章學習重點", "第4單元"),
        ("幫我找第六課的相關影片", "第6單元"),
        ("幫我出第十單元的題目", "第10單元"),
        ("What are some key takeaway in chapter three?", "第3單元"),
        ("List out key learning points of chapter 9", "第9單元"),
        ("列出第二章的活動內容", "第2單元"),
    )
]

# All test cases: semester-level queries first, then the per-unit queries.
test_cases = (
    test_retrieve_semester_summary
    + test_retrieve_individual_chapter_learning_objective
)

# for i, test in enumerate(test_cases):
#     query = test["query"]
#     expected = test["expected_tag0"]
#     docs = test_retriever(retriever, query, retrieved_path, print_docs=False, save_docs=False)
#     print(f"Test {i}: {query}")
#     success = False
#     for j, doc in enumerate(docs[:2]):
#         if expected in doc.page_content[:40]:
#             print(f"✅ Success: detected tag {expected} at doc number {j}")
#             print(doc.page_content[:40])
#             success = True
#             break
#         else:
#             print(f"Fail: expected {expected} but the retrieved doc is {doc.page_content[:40]}..." )
#     if not success:
#         print(f"❌ Fail: query = {query}; expected = {expected}")



In [29]:
# Test video retrieval
# Ad-hoc check: run one video-related query through the retriever with
# print_docs=True and save_docs=False, keeping the result in docs.
query = "尋找並列出第二單元每個章節的教學影片"
docs = test_retriever(retriever, query, retrieved_path, print_docs=True, save_docs=False)

[{'tags': ['第2單元', 'chapter2', '第二單元', '....
[{'tags': ['第1單元', 'chapter1', '第一單元', '....
[{'tags': ['第4單元', 'chapter4', '第四單元', '....
[{'tags': ['第8單元', 'chapter8', '第八單元', '....
# 康軒數學五年級上學期教學影片 education video 影片 媒體 章....
Elapsed time: 0.26 seconds


In [25]:
# Test specific query
# Ad-hoc check: run a single course-outline query through the retriever with
# print_docs=True and save_docs=False, keeping the result in docs.
query = "給我課程大綱"
docs = test_retriever(retriever, query, retrieved_path, print_docs=True, save_docs=False)

[{'tags': ['整學期', '全部內容', '學期計畫', '教學進度'....
[{'tags': ['第1單元', 'chapter1', '第一單元', '....
[{'tags': ['第4單元', 'chapter4', '第四單元', '....
[{'tags': ['第2單元', 'chapter2', '第二單元', '....
[{'tags': ['第6單元', 'chapter6', '第六單元', '....
Elapsed time: 0.21 seconds
