# RAG Pipeline
This notebook splits the processed data into chunks and stores them in a vector store as embeddings.
- Modify the path parameters to use different data.
- Modify splitter parameters to split documents semantically.
- Switch to a different embedding model to test the correctness of the retrieved documents.

In [2]:
# import packages
from dotenv import load_dotenv
import os
import getpass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings

In [3]:
# Locations and environment variables
# Everything is resolved relative to the directory the notebook runs from.
cwd = os.getcwd()

# The .env file is expected in the parent of the working directory.
parent_dir = os.path.dirname(cwd)
env_path = os.path.join(parent_dir, '.env')
load_dotenv(dotenv_path=env_path)

# NOTE: Adjust the two locations below to work with different data.
# Input: the processed textbook, stored as Markdown.
processed_data_dir = os.path.join(cwd, '..', 'data_processing', 'processed_data')
textbook_extracted_path = os.path.join(processed_data_dir, 'kang_math_textbook_edited.md')
# Output: the folder the vector store is saved to.
vector_store_root = os.path.join(cwd, 'vector-stores')
vector_path = os.path.join(vector_store_root, 'test_vector_store')


## Part 1: Text Splitting

In [4]:
# NOTE: Splitter parameters
# RecursiveCharacterTextSplitter tries the separators in order and splits on
# the first one that occurs in the text; the later ones are only used for
# pieces that are still longer than chunk_size.  A bare '#' is contained in
# '##' and '###', so listing it first makes the deeper heading levels
# unreachable.  Each heading level is therefore anchored to a line start and
# the space that follows the hashes, and generic fallbacks are appended so an
# oversized section can still be broken down.
separators = [
    '\n# ',    # level-1 Markdown headings
    '\n## ',   # level-2 Markdown headings
    '\n### ',  # level-3 Markdown headings
    '\n\n',    # paragraph breaks
    '\n',      # line breaks
    ' ',       # spaces
    '',        # last resort: split between characters
]
chunk_size = 1000     # maximum number of characters in each chunk
chunk_overlap = 200   # number of characters shared by consecutive chunks

In [5]:
# Read the whole processed textbook into memory as one UTF-8 decoded string
with open(textbook_extracted_path, mode="r", encoding="utf-8") as textbook_file:
    extracted_text = textbook_file.read()

In [6]:
# Configure the recursive splitter. Chunk lengths are counted in characters
# because len() is supplied as the length function.
splitter_options = {
    "separators": separators,        # candidate split boundaries
    "chunk_size": chunk_size,        # upper bound on the length of a chunk
    "chunk_overlap": chunk_overlap,  # length shared by neighbouring chunks
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the textbook text into a list of chunk strings
texts = text_splitter.split_text(extracted_text)

In [7]:
# Write every chunk to its own Markdown file so the split boundaries can be
# inspected by hand to see whether they are semantically separated.
# NOTE: files left over from an earlier run are not removed, so stale
# higher-numbered chunk files may remain if this run yields fewer chunks.
new_directory = os.path.join(cwd, 'chunks')
os.makedirs(new_directory, exist_ok=True)
for i, text in enumerate(texts):
    file_path = os.path.join(new_directory, f'chunk{i}.md')
    # Write as UTF-8 explicitly: the source text was read as UTF-8, and the
    # platform default encoding may be unable to represent it.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

## Part 2: Convert Text Chunks into Embeddings

In [8]:
# Cohere API key read from the environment (None when the variable is unset)
COHERE_KEY = os.environ.get("COHERE_KEY")

# NOTE: Swap the embedding model here as needed
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = CohereEmbeddings(cohere_api_key=COHERE_KEY)

# Embed the raw text chunks and index them in a FAISS (Facebook AI Similarity Search) vector store
faiss_store = FAISS.from_texts(texts=texts, embedding=embeddings)

# Make sure the destination folder exists, then persist the vector store there
os.makedirs(vector_path, exist_ok=True)
faiss_store.save_local(folder_path=vector_path)

In [17]:
# Test retrieval behavior
# Import only the helper that is used: a wildcard import could silently
# overwrite names already defined in this notebook (e.g. embeddings).
from tests.test_retriever import test_retriever

query = """
小數加減
"""

# Folder handed to the helper alongside the vector store location, the
# embedding model and the query.
# NOTE(review): presumably test_retriever writes the retrieved documents
# here — confirm against tests/test_retriever.py.
retrieved_path = os.path.join(cwd, 'retrieved_docs')
test_retriever(vector_path, embeddings, query, retrieved_path)

# 多位小數與加減

- 多位小數的大小比較
- 多位小數的加法直式計算
- 多位小數的加減
- 多位小數的減法直式計算
- 小數取概數
- 小數取概數並估算
---
# 第一單元 多位小數與加減

....
# 第一單元 多位小數與加減

|教材地位|教學指導計畫|總節數 7 節|
|---|---|---|
|以前學過的|現在要學的|未來要學的|
|第六冊第五單元| | |
|• 認識一位小數以及「十分....
# 活動二 小數的大小比較

目標 3-1 能做多位小數的大小比較。

# 活動三 多位小數的加減

目標 4-1 能解決生活情境中多位小數的加法問題。
4-2 能解決生活情境中多位小數的減法問題。
....
# 第二單元 因數與公因數

數學素養學習單 配合5上第一單元 多位小數與加減 年 班 號 2

匯率達人匯率達人 檔案提供於電子書及題庫中究了匯率以後 發現如果要把臺幣換成美金的話 要看銀行賣出的匯....
Elapsed time: 0.45 seconds
