# IntelHealth - RAG Embedding 入库（Colab）

说明：使用 `ingest_pgvector_jsonl.py` 将文本分块 + embedding 写入 Supabase pgvector。


In [None]:
# Clone repo + pull latest
import os
if not os.path.exists('Intel_Health'):
    !git clone https://github.com/DemonRain7/Intel_Health.git
else:
    !cd Intel_Health && git pull
%cd Intel_Health

In [None]:
# Install third-party packages quietly (-q). `supabase` is imported later in
# this notebook; `openai` and `tiktoken` are presumably required by the
# ingestion script, which is not visible here.
!pip -q install openai supabase tiktoken


## 1) 设置环境变量

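请在 Colab 左侧的「Secrets（密钥）」面板中添加 `SUPABASE_URL`、`SUPABASE_SERVICE_ROLE_KEY`、`OPENAI_API_KEY`，并授予本 notebook 访问权限；不要把 key 直接写进 notebook，也不要提交到仓库。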


In [None]:
import os
from google.colab import userdata


def _load_secret(name):
    """Return the value for *name* from Colab Secrets, else from os.environ.

    ``userdata.get`` raises instead of returning ``None`` when the secret is
    missing or the notebook has not been granted access to it, so those
    errors are caught here to let the environment-variable fallback run.

    Returns a whitespace-stripped string; it is empty when neither source
    provides a value.
    """
    try:
        value = userdata.get(name)
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        value = None
    return (value or os.environ.get(name, "")).strip()


for _name in ("SUPABASE_URL", "SUPABASE_SERVICE_ROLE_KEY", "OPENAI_API_KEY"):
    os.environ[_name] = _load_secret(_name)
    if not os.environ[_name]:
        # Surface the problem here rather than as an opaque failure later on.
        print(f"WARNING: {_name} is empty; set it in Colab Secrets before running ingestion.")

# Text files that make up the "diagnosis" corpus (example list; edit as needed).
_diagnosis_docs = (
    "datasets/rag/diagnosis/诊断学.txt",
    "datasets/rag/diagnosis/传染病学.txt",
    "datasets/rag/diagnosis/神经病学.txt",
    "datasets/rag/diagnosis/病理学.txt",
    "datasets/rag/diagnosis/病理生理学.txt",
    "datasets/rag/diagnosis/医学影像学.txt",
    "datasets/rag/diagnosis/儿科学.txt",
    "datasets/rag/diagnosis/妇产科学.txt",
    "datasets/rag/diagnosis/医学免疫学.txt",
    "datasets/rag/diagnosis/医学微生物学.txt",
    "datasets/rag/diagnosis/医学心理学.txt",
    "datasets/rag/diagnosis/精神病学.txt",
    "datasets/rag/diagnosis/耳鼻咽喉头颈外科学.txt",
)

# Apply every setting in one call; all values are stored as strings.
os.environ.update({
    # Embedding model
    "RAG_EMBEDDING_MODEL": "text-embedding-3-small",
    # Corpus label and the comma-separated list of its source files
    "RAG_DOCS_CORPUS": "diagnosis",
    "RAG_DOCS_PATHS": ",".join(_diagnosis_docs),
    # Run controls
    "RAG_BATCH_SIZE": "50",
    "RAG_TEXT_CHUNK_SIZE": "1200",
    "RAG_TOKEN_CHUNK_SIZE": "700",
    "RAG_TOKEN_CHUNK_OVERLAP": "100",
    "RAG_RESUME": "1",
})

In [None]:
# Run the ingestion script in a subprocess. Values placed in os.environ by the
# earlier cells are inherited by that process; presumably the script reads its
# RAG_* / SUPABASE_* / OPENAI_* settings from there (script not shown here).
!python llm_funcs/ingest_pgvector_jsonl.py


## 2) 用药语料（临床药理学）

把 `RAG_DOCS_CORPUS` 改成 `drug`，并把 `RAG_DOCS_PATHS` 指向用药语料文件，然后执行相同的入库命令。


In [None]:
# Switch the corpus label to "drug" and point the document list at the single
# clinical-pharmacology text file.
os.environ["RAG_DOCS_CORPUS"] = "drug"
os.environ["RAG_DOCS_PATHS"] = "datasets/rag/drug/临床药理学.txt"
# Re-run the same ingestion script. This cell changes only the two variables
# above; other os.environ entries set earlier in the session are left as-is.
!python llm_funcs/ingest_pgvector_jsonl.py


## 3) 验证入库数量

In [None]:
# Report how many rows `rag_documents` holds per corpus and in total.
from supabase import create_client as _sc

_sb = _sc(
    os.environ["SUPABASE_URL"],
    os.environ["SUPABASE_SERVICE_ROLE_KEY"],
)


def _count_query():
    """Start a fresh `rag_documents` query that requests an exact row count."""
    return _sb.table("rag_documents").select("id", count="exact")


# Per-corpus counts filter on the `corpus` field inside the metadata column;
# the total uses the same query without a filter.
diag = _count_query().eq("metadata->>corpus", "diagnosis").execute()
drug = _count_query().eq("metadata->>corpus", "drug").execute()
total = _count_query().execute()

for _line in (
    f"diagnosis: {diag.count} rows",
    f"drug:      {drug.count} rows",
    f"total:     {total.count} rows",
):
    print(_line)