## 1. Create environment using conda

`# Set up conda environment`<br>
`conda create -n tcm-ai-rag python=3.10`

`# Activate environment`<br>`conda activate tcm-ai-rag`

## 2. Install dependencies

In [None]:
# !pip install llama-index
# !pip install llama-index-embeddings-huggingface
# !pip install llama-index-llms-huggingface

In [None]:
# Install CUDA-version Pytorch
# !pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118

## 3. Download models

In [None]:
# Install modelscope
# !pip install modelscope

### 3.1 Download Embedding model weights

In [None]:
# Download the BAAI/bge-base-zh-v1.5 embedding model with the ModelScope SDK.
# The files are cached under the given cache_dir; the value returned by
# snapshot_download is kept in model_dir (presumably the local model path —
# confirm against the ModelScope documentation).
from modelscope import snapshot_download

model_dir = snapshot_download(
    model_id="BAAI/bge-base-zh-v1.5",
    cache_dir="D:/work/AIProject/modelscope",
)

### 3.2 Download LLM weights

In [None]:
# Download the Qwen/Qwen2.5-7B-Instruct weights with the ModelScope SDK into
# the same cache directory used for the embedding model.
from modelscope import snapshot_download

model_dir = snapshot_download(
    model_id="Qwen/Qwen2.5-7B-Instruct",
    cache_dir="D:/work/AIProject/modelscope",
)

## 4. Construct a Question-and-Answer System for Traditional Chinese Medicine Clinical Diagnosis and Treatment Terminology and Syndromes

### 4.1 Corpus Preparation
#### For the corpus, please download the Traditional Chinese Medicine clinical diagnosis and treatment terminology materials issued by the National Health Commission and the National Administration of Traditional Chinese Medicine.

#### **Partial Content Preview**

#### **气机阻滞证 syndrome/pattern of obstructed qi movement**<br>泛指因各种原因导致气机不畅，或气郁而不散，阻滞脏腑、经络、官窍等所引起的一类证候。


#### **气机郁滞证 syndrome/pattern of qi activity stagnation**<br>因气机郁结，阻滞经络或脏腑官窍所致。临床以头颈肩背或胸胁脘腹等处闷胀，或攻窜作痛，常随紧张、抑郁等情绪缓解，或得太息、嗳气、肠鸣、矢气而减轻，脉弦，可伴见大便时秘或泻，小便不利，耳鸣、耳聋，嘶哑、呃逆等为特征的证候。


#### **气滞耳窍证 syndrome/pattern of qi stagnation in the ears**<br>因肝气郁结，气机不利，气滞耳窍所致。临床以突然耳窍失聪，或耳内堵塞，耳鸣，眩晕，脉弦，伴见胸胁胀闷，情绪抑郁等为特征的证候。


#### **气滞声带证 syndrome/pattern of qi stagnation in the vocal fold**<br>因气机阻滞，痹阻声带所致。临床以声音不扬、嘶哑，言语费劲或磕巴，脉弦，可伴见咽喉不适，胸闷，胁胀等为特征的证候。


### 4.2 Rapid Knowledge Base Construction Based on LlamaIndex

#### 4.2.1 Import necessary packages

In [None]:
import logging
import sys
import torch
from llama_index.core import PromptTemplate, Settings, SimpleDirectoryReader, VectorStoreIndex, load_index_from_storage, StorageContext
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.node_parser import SentenceSplitter

#### 4.2.2 Configure logging

In [None]:

# Send INFO-level (and above) log records to stdout so library activity shows
# up in the cell output.
# Only one root handler is installed: basicConfig(stream=sys.stdout) already
# attaches a stdout StreamHandler, so adding a second one by hand would print
# every record twice.
# force=True replaces any handlers already present on the root logger, which
# keeps this cell idempotent when it is re-run in the same kernel.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

#### 4.2.3 Construct System Prompt

In [None]:
# System instruction embedded in the wrapper template below.
SYSTEM_PROMPT = """You are a helpful AI assistant."""

# Template that wraps a query in an [INST] / <<SYS>> envelope. The {query_str}
# placeholder is left unformatted here; it is passed to HuggingFaceLLM as
# query_wrapper_prompt in a later cell.
# NOTE(review): this envelope looks like the Llama-2 chat format, while the
# model loaded later is Qwen2.5-Instruct — confirm whether a different chat
# template would be more appropriate.
query_wrapper_prompt = PromptTemplate(
    f"[INST]<<SYS>>\n{SYSTEM_PROMPT}<</SYS>>\n\n{{query_str}}[/INST]"
)

#### 4.2.4 Load the downloaded Local LLM

In [None]:

# Local directory that holds both the Qwen2.5-7B-Instruct weights and its
# tokenizer files (same path is used for model_name and tokenizer_name).
qwen_dir = 'D:/work/AIProject/modelscope/Qwen/Qwen2___5-7B-Instruct'

# Build the local Hugging Face LLM: half-precision weights, automatic device
# placement, sampling disabled.
llm = HuggingFaceLLM(
    model_name=qwen_dir,
    tokenizer_name=qwen_dir,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    model_kwargs={"torch_dtype": torch.float16},
    device_map="auto",
)

# Register it as the LLM in LlamaIndex's global Settings.
Settings.llm = llm

#### 4.2.5 Load the downloaded local embedding model

In [None]:
# Load the local bge-base-zh-v1.5 embedding model and register it in
# LlamaIndex's global Settings.
# The path must be the absolute ModelScope cache location used by the download
# cell (cache_dir="D:/work/AIProject/modelscope"). A leading "./" in front of
# the drive letter would turn it into a relative path that does not exist.
Settings.embed_model = HuggingFaceEmbedding(
    model_name='D:/work/AIProject/modelscope/BAAI/bge-base-zh-v1___5'
)

#### 4.2.6 Load the documents from the directory

In [None]:
# Read the corpus from ./documents, restricting the reader to .txt files.
reader = SimpleDirectoryReader("./documents", required_exts=[".txt"])
documents = reader.load_data()

#### 4.2.7 Build the index

In [None]:
# Split the documents with a SentenceSplitter (chunk_size=256) and build a
# vector index from the resulting chunks; the texts are converted into vectors
# using the configured embedding model.
splitter = SentenceSplitter(chunk_size=256)
index = VectorStoreIndex.from_documents(documents, transformations=[splitter])

#### 4.2.8 Construct the query engine

In [None]:
# Create a query engine on top of the index; similarity_top_k=5 requests the
# five most similar chunks for each query.
query_engine = index.as_query_engine(
    similarity_top_k=5,
)

#### 4.2.9 Generate the response

# Ask a sample question and print the answer produced by the query engine.
question = "不耐疲劳，口渴、咽干可能是哪些证候？"
response = query_engine.query(question)
print(response)

## 5. Vector storage and loading

### 5.1 Vector storage

In [None]:
# Persist the index's storage context (vectors and vector index) to the local
# "doc_emb" directory so it can be reloaded without re-embedding.
index.storage_context.persist(
    persist_dir="doc_emb",
)

In the persist_dir, you will find these JSON files:<br>- **default__vector_store.json**: stores the embedding vectors<br>- **docstore.json**: stores the split document segments<br>- graph_store.json: stores graph-structured data (empty here because only plain-text documents are indexed)<br>- image__vector_store.json: stores image data (empty here because only plain-text documents are indexed)<br>- **index_store.json**: stores the vector index

### 5.2 Vector loading

In [None]:
 
# Point a storage context at the files previously persisted under "doc_emb".
storage_context = StorageContext.from_defaults(
    persist_dir="doc_emb",
)

# Rebuild the index object from that storage context.
index = load_index_from_storage(storage_context)

## 6. Track retrieved document segments

In [None]:
# QueryBundle is not imported by the notebook's import cell, so bring it into
# scope here; without it the retrieve() call below raises NameError.
from llama_index.core import QueryBundle

# Load the vector and vector index from storage
storage_context = StorageContext.from_defaults(persist_dir="doc_emb")

# Reconstruct the index
index = load_index_from_storage(storage_context)

# Construct the query engine
query_engine = index.as_query_engine(similarity_top_k=5)

# Use one query string for both retrieval and answer generation so the printed
# chunks are guaranteed to belong to the question that is answered below.
question = "不耐疲劳，口燥、咽干可能是哪些证候？"

# Fetch the top 5 similar document segments
contexts = query_engine.retrieve(QueryBundle(question))

separator = '-' * 10 + 'ref' + '-' * 10
print(separator)
for i, context in enumerate(contexts):
    print('*' * 10 + f'chunk {i} start' + '*' * 10)
    # MetadataMode.LLM selects the metadata view of the node content
    # intended for the LLM.
    content = context.node.get_content(metadata_mode=MetadataMode.LLM)
    print(content)
    print('*' * 10 + f'chunk {i} end' + '*' * 10)
print(separator)

# Generate the response
response = query_engine.query(question)
print(response)

## 7. Underlying Implementation Details of RAG Retrieval

In [None]:
import logging
import sys
import torch
from llama_index.core import PromptTemplate, Settings, StorageContext, load_index_from_storage
from llama_index.core.callbacks import LlamaDebugHandler, CallbackManager
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

# Configure logging: send INFO-level records to stdout through a single root
# handler. basicConfig(stream=sys.stdout) already installs a stdout handler,
# so no extra StreamHandler is added (that would duplicate every record).
# force=True replaces handlers left over from earlier cells or re-runs.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Construct system prompt
SYSTEM_PROMPT = """You are a helpful AI assistant."""
query_wrapper_prompt = PromptTemplate(
    "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
)

# Load the downloaded local LLM.
# The path matches the ModelScope cache directory used by the download cells
# (cache_dir="D:/work/AIProject/modelscope").
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name='D:/work/AIProject/modelscope/Qwen/Qwen2___5-7B-Instruct',
    model_name='D:/work/AIProject/modelscope/Qwen/Qwen2___5-7B-Instruct',
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
)
Settings.llm = llm

# Use LlamaDebugHandler to track LlamaIndex calls; print_trace_on_end=True
# asks the handler to print a trace when a traced operation finishes.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
Settings.callback_manager = callback_manager

# Use llama-index-embeddings-huggingface to load the local embedding model
# (same ModelScope cache directory as the LLM).
Settings.embed_model = HuggingFaceEmbedding(
    model_name="D:/work/AIProject/modelscope/BAAI/bge-base-zh-v1___5"
)

# Load the vector and vector index from storage
storage_context = StorageContext.from_defaults(persist_dir="doc_emb")
index = load_index_from_storage(storage_context)

# Construct the query engine
query_engine = index.as_query_engine(similarity_top_k=5)

# Generate the response
response = query_engine.query("不耐疲劳，口燥、咽干可能是哪些证候？")
print(response)

# Return the start and end events of each LLM call
event_pairs = llama_debug.get_llm_inputs_outputs()
# print(event_pairs[0][1].payload.keys())
# event_pairs[0][1] is the end event of the first recorded LLM call; its
# payload carries the fully formatted prompt that was sent to the model.
print(event_pairs[0][1].payload["formatted_prompt"])

## 8. Implement a simplified Chinese prompt template

In [None]:
import logging
import sys
import torch
from llama_index.core import PromptTemplate, Settings, StorageContext, load_index_from_storage
from llama_index.core.callbacks import LlamaDebugHandler, CallbackManager
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

# Configure logging: send INFO-level records to stdout through a single root
# handler. basicConfig(stream=sys.stdout) already installs a stdout handler,
# so no extra StreamHandler is added (that would duplicate every record).
# force=True replaces handlers left over from earlier cells or re-runs.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Construct system prompt
SYSTEM_PROMPT = """你是一个医疗人工智能助手。"""
query_wrapper_prompt = PromptTemplate(
    "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
)

# Construct Q&A prompt: retrieved context first, then the instruction and the
# user's query.
qa_prompt_tmpl_str = (
    "上下文信息如下。\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "请根据上下文信息而不是先验知识来回答以下的查询。"
    "作为一个医疗人工智能助手，你的回答要尽可能严谨。\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

# Construct refine prompt.
# Each segment ends with an explicit newline: adjacent string literals are
# concatenated as-is, so without "\n" the query, the existing answer, the
# separators and the new context would all run together on one line.
refine_prompt_tmpl_str = (
    "原始查询如下：{query_str}\n"
    "我们提供了现有答案：{existing_answer}\n"
    "我们有机会通过下面的更多上下文来完善现有答案（仅在需要时）。\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "考虑到新的上下文，优化原始答案以更好地回答查询。 如果上下文没有用，请返回原始答案。\n"
    "Refined Answer:"
)
refine_prompt_tmpl = PromptTemplate(refine_prompt_tmpl_str)

# Load the downloaded local LLM.
# The path matches the ModelScope cache directory used by the download cells
# (cache_dir="D:/work/AIProject/modelscope").
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name='D:/work/AIProject/modelscope/Qwen/Qwen2___5-7B-Instruct',
    model_name='D:/work/AIProject/modelscope/Qwen/Qwen2___5-7B-Instruct',
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
)
Settings.llm = llm

# Use LlamaDebugHandler to track LlamaIndex calls
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
Settings.callback_manager = callback_manager

# Use llama-index-embeddings-huggingface to load the local embedding model
# (same ModelScope cache directory as the LLM).
Settings.embed_model = HuggingFaceEmbedding(
    model_name="D:/work/AIProject/modelscope/BAAI/bge-base-zh-v1___5"
)

# Load the vector and vector index from storage
storage_context = StorageContext.from_defaults(persist_dir="doc_emb")
index = load_index_from_storage(storage_context)

# Construct the query engine
query_engine = index.as_query_engine(similarity_top_k=5)

# Print all available prompt types in the query engine
prompts_dict = query_engine.get_prompts()
print(list(prompts_dict.keys()))

# Update the query engine with the customized prompt templates
query_engine.update_prompts(
    {
        "response_synthesizer:text_qa_template": qa_prompt_tmpl,
        "response_synthesizer:refine_template": refine_prompt_tmpl,
    }
)

# Generate the response
response = query_engine.query("不耐疲劳，口燥、咽干可能是哪些证候？")
print(response)

# Print formatted_prompt: event_pairs[0][1] is the end event of the first
# recorded LLM call, whose payload holds the prompt actually sent to the model.
event_pairs = llama_debug.get_llm_inputs_outputs()
print(event_pairs[0][1].payload["formatted_prompt"])