In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

## PDF切块

In [2]:
def process_pdfs_from_directory(pdf_directory):
    """处理目录中的所有PDF文件（传统方式）"""
    
    # 加载PDF文件
    loader = DirectoryLoader(
        pdf_directory, 
        glob="**/*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    print(f"处理了 {len(documents)} 个PDF文档")
    
    # 文本切割
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # 每块大小
        chunk_overlap=200,  # 重叠部分
        separators=["\n\n", "\n", " ", ""]  # 分割符优先级
    )
    
    chunks = text_splitter.split_documents(documents)
    print(f"切割成 {len(chunks)} 个文本块")
    
    return chunks


## 向量化存储

In [3]:
import os
import time
import torch

print("\n开始加载向量库...")
start_time = time.time()

# 检查设备可用性
print(f"CUDA 是否可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU 数量: {torch.cuda.device_count()}")
    print(f"当前 GPU: {torch.cuda.get_device_name(0)}")
    device = 'cuda:0'
else:
    print("未检测到 CUDA，将使用 CPU")
    device = 'cpu'

persist_dir = "./chroma_db"
collection_name = "pdf_collection"

# 全局模型缓存：只在首次运行时加载模型
if 'embedding' not in globals():
    print("首次加载嵌入模型...")
    model_start = time.time()
    
    embedding = HuggingFaceEmbeddings(
        model_name='Qwen/Qwen3-Embedding-0.6B',
        cache_folder='Models',
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )
    print(f"✅ 嵌入模型加载完成 (设备: {device})，耗时: {time.time() - model_start:.2f}秒")
else:
    print("使用已缓存的嵌入模型 ✓")

# 检查向量库是否存在
if os.path.exists(persist_dir) and os.listdir(persist_dir):
    print("发现已存在的向量库，直接加载...")
    load_start = time.time()
    
    vector_store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding,
        collection_name=collection_name
    )
    
    print(f"✅ 向量库加载完成，耗时: {time.time() - load_start:.2f}秒")
    print(f"总耗时: {time.time() - start_time:.2f}秒")
else:
    print("向量库不存在，需要重新创建...")
    pdf_chunks = process_pdfs_from_directory("Paper/2ATAKKQD")
    vector_store = Chroma.from_documents(
        pdf_chunks,
        embedding=embedding,
        persist_directory=persist_dir,
        collection_name=collection_name
    )


开始加载向量库...
CUDA 是否可用: True
GPU 数量: 1
当前 GPU: NVIDIA GeForce RTX 4060 Laptop GPU
首次加载嵌入模型...
✅ 嵌入模型加载完成 (设备: cuda:0)，耗时: 18.02秒
发现已存在的向量库，直接加载...
✅ 向量库加载完成，耗时: 0.12秒
总耗时: 19.73秒


## 检索器设置

In [4]:

base_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

## prompt模版

In [5]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful AI assistant. Use the following pieces of context to answer the question at the end, and you need to show the source of your answer.
{context}
If you don't know the answer, just say you don't know. Don't try to make up an answer.
Question: {question}
Answer in English.
""",
)

## 大语言模型设置

In [6]:
from langchain_openai import ChatOpenAI
import os
from langchain_core.messages import HumanMessage

api_key = os.environ.get("QWEN_API_KEY")

if not api_key:
    raise ValueError("QWEN_API_KEY environment variable is not set")

# 使用更新的参数名称避免警告
llm = ChatOpenAI(
    model="qwen3-next-80b-a3b-instruct",
    api_key=api_key,  # 使用api_key而不是openai_api_key
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # 使用base_url而不是openai_api_base
    temperature=0.1
)

# deepseek-chat 示例代码 --- IGNORE ---
# api_key = os.environ.get("DEEPSEEK_API_KEY")

# if not api_key:
#     raise ValueError("DEEPSEEK_API_KEY environment variable is not set")

# # 使用更新的参数名称避免警告
# llm = ChatOpenAI(
#     model="deepseek-chat",
#     api_key=api_key,  # 使用api_key而不是openai_api_key
#     base_url="https://api.deepseek.com/v1",  # 使用base_url而不是openai_api_base
#     temperature=0.1
# )

## 检索增强

In [7]:
relevant_docs = base_retriever.get_relevant_documents("What is the relationship between microbes and drugs?")
print(f"找到 {len(relevant_docs)} 个相关文档")

prompt_text = prompt.format(
    context="\n".join([doc.page_content for doc in relevant_docs]),
    question="What is the relationship between microbes and drugs?"
)
print("提示词:", prompt_text)

  relevant_docs = base_retriever.get_relevant_documents("What is the relationship between microbes and drugs?")
  attn_output = torch.nn.functional.scaled_dot_product_attention(


找到 3 个相关文档
提示词: 
You are a helpful AI assistant. Use the following pieces of context to answer the question at the end, and you need to show the source of your answer.
∗T o whom correspondence should be addressed.
Associate Editor: XXXXXXX
Received on XXXXX; revised on XXXXX; accepted on XXXXX
Abstract
Motivation: The microbes in human body play a crucial role in influencing the functions of drugs, as they can regulate the activities and
toxicities of drugs. Most recent methods for predicting drug-microbe associations are based on graph learning. However, the relationships
among multiple drugs and microbes are complex, diverse and heterogeneous. Existing methods often fail to fully model the relationships.
In addition, the attributes of drug-microbe pairs exhibit long-distance spatial correlations, which previous methods have not integrated
effectively.
Results: We propose a new prediction method named DHDMP which is designed to encode the relationships among multiple drugs
and microbe

## 生成回答

In [8]:
# 使用现代的LangChain调用方式
from langchain_core.messages import HumanMessage

response = llm([HumanMessage(content=prompt_text)])
print("回答:", response.content)

  response = llm([HumanMessage(content=prompt_text)])


回答: The microbes in the human body play a crucial role in influencing the functions of drugs, as they can regulate the activities and toxicities of drugs. 

Source: Abstract from the provided context.


## RAGAs评估

In [11]:
'''
{
    "user_input":[], <-- 问题基于Context
    "response":[], <-- 答案基于LLM生成
    "retrieved_contexts":[], <-- 检索到的上下文（改名）
    "reference":[] <-- 标准答案（改名）
}
'''
from datasets import Dataset

user_input = [
    "What is the relationship between microbes and drugs?"
]
response_list = [
    response.content  # 使用ChatOpenAI的response.content
]
retrieved_contexts = [
    [doc.page_content for doc in relevant_docs]  # 改为列表格式
]
reference = [
    "Microbes can influence the metabolism of drugs, affecting their efficacy and toxicity. They can activate, inactivate, or modify drugs through various biochemical processes. This interaction can lead to variations in drug response among individuals based on their unique microbiome composition."
]

data = {
    "user_input": user_input,
    "response": response_list,
    "retrieved_contexts": retrieved_contexts,
    "reference": reference
}

dataset_ragas = Dataset.from_dict(data)
dataset_ragas

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts', 'reference'],
    num_rows: 1
})

## RAGAs评估

In [23]:
import pandas as pd
from ragas import EvaluationDataset
evaluation_dataset = EvaluationDataset.from_list(dataset_ragas)

from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    LLMContextRecall,
    LLMContextPrecisionWithReference,
    Faithfulness,
    AnswerRelevancy
)
from ragas.embeddings import LangchainEmbeddingsWrapper

# 使用已经全局缓存的embedding模型
print("🔄 使用全局缓存的embedding模型...")
print(f"模型: {embedding.model_name}")

# 将LangChain的embedding包装为RAGAs可用的格式
evaluate_embedding = LangchainEmbeddingsWrapper(embedding)
evaluator_llm = LangchainLLMWrapper(llm)

# 使用RAGAs内置的嵌入模型
result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), LLMContextPrecisionWithReference(),AnswerRelevancy()],llm=evaluator_llm,embeddings=evaluate_embedding)


🔄 使用全局缓存的embedding模型...
模型: Qwen/Qwen3-Embedding-0.6B


  evaluate_embedding = LangchainEmbeddingsWrapper(embedding)


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
result_df = result.to_pandas()
result_df

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,faithfulness,llm_context_precision_with_reference,answer_relevancy
0,What is the relationship between microbes and ...,[∗T o whom correspondence should be addressed....,"Hello! 😊 \nI'm Qwen, a large-scale language m...",Microbes can influence the metabolism of drugs...,0.333333,,1.0,0.323818
