In [1]:
import os
from dotenv import load_dotenv

# Load the OpenAI API environment variables from the .env file.
load_dotenv()

True

In [10]:
import os

from langchain.chains import LLMChain
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


# 初始化向量数据库
def initialize_faiss_vectorstore(file_paths, chunk_size=1000, chunk_overlap=0, encoding=None):
    """Build a FAISS vector store from plain-text files.

    Each file is read with ``TextLoader``, the loaded documents are split with
    ``CharacterTextSplitter``, and the resulting chunks are embedded with
    ``OpenAIEmbeddings`` and indexed through ``FAISS.from_documents``.

    Args:
        file_paths: A single path or an iterable of paths to text files.
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter`` as ``chunk_overlap``.
        encoding: Forwarded to ``TextLoader`` as ``encoding``; ``None`` keeps
            the loader's default.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If no documents are loaded, or splitting yields no chunks.
    """
    # A lone path would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    # 1. Load the documents.
    documents = []
    for file_path in file_paths:
        documents.extend(TextLoader(file_path, encoding=encoding).load())
    if not documents:
        # Fail early with a clear message instead of handing FAISS nothing to index.
        raise ValueError("no documents were loaded from file_paths")

    # 2. Split the documents into chunks.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    if not docs:
        raise ValueError("splitting the loaded documents produced no chunks")

    # 3. Initialise the embedding model.
    embeddings = OpenAIEmbeddings()

    # 4. Create and populate the FAISS vector store.
    return FAISS.from_documents(docs, embeddings)


# 定义多重查询生成器
def generate_queries(original_query):
    """Ask an OpenAI LLM for several search queries related to ``original_query``.

    The completion is split into lines; surrounding whitespace is stripped and
    blank lines are dropped, so no empty string is handed on as a search
    query. If nothing usable comes back, the original query is returned as the
    only entry.

    The prompt asks for four queries, but the number of lines the model
    actually returns is not enforced.

    NOTE: a later cell in this notebook defines another ``generate_queries``;
    once that cell has run, the name refers to that version instead.

    Args:
        original_query: The user's question; substituted for ``{question}``
            in the prompt.

    Returns:
        A non-empty list of queries.
    """
    prompt = ChatPromptTemplate.from_template(
        "You are a helpful assistant that generates multiple search queries based on a single input query.\n\nGenerate multiple search queries related to: {question}\n\nOutput (4 queries):"
    )
    chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
    raw_output = chain.invoke({"question": original_query})["text"]

    # A plain split("\n") keeps blank separator lines and stray whitespace.
    queries = [line.strip() for line in raw_output.splitlines() if line.strip()]

    # Never return an empty list: fall back to the user's own wording.
    return queries or [original_query]

# 定义倒数排名融合算法
def reciprocal_rank_fusion(results, k=60):
    """Fuse several ranked document lists with reciprocal rank fusion.

    Every document contributes ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its zero-based position in that list. Contributions are
    summed per document and the documents are returned by descending score.

    Documents are identified by ``str(doc)``, so entries whose string forms
    are equal are treated as the same document. The first object seen for
    each key is the one returned.

    Args:
        results: Iterable of ranked lists of documents, best match first.
        k: Constant added to the rank. Any positive value is safe; ``k=0``
            divides by zero for the top-ranked item because its rank is 0.

    Returns:
        List of ``(document, fused_score)`` tuples, highest score first.
        Documents with equal scores keep their first-seen order.
    """
    fused_scores = {}
    first_seen = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            doc_key = str(doc)
            # Keep the original object: str(doc) is only a dictionary key, not
            # a serialized form that could be turned back into a document.
            first_seen.setdefault(doc_key, doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # sorted() is stable, so ties stay in first-seen order.
    ranked_keys = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return [(first_seen[doc_key], fused_scores[doc_key]) for doc_key in ranked_keys]

# 定义完整的 RAG-Fusion 链
def rag_fusion_chain(original_query, docs_per_query=4, top_n=4):
    """Answer ``original_query`` with the RAG-Fusion pipeline.

    Steps: generate several search queries, run a similarity search for each
    one, fuse the ranked lists with ``reciprocal_rank_fusion``, and ask the
    LLM to answer from the text of the top fused documents.

    Reads the module-level ``vectorstore``, which must be bound before this
    function is called.

    Args:
        original_query: The user's question.
        docs_per_query: Number of documents requested per generated query
            (passed to ``similarity_search`` as ``k``).
        top_n: Number of fused documents whose text is used as context.

    Returns:
        The value returned by ``chain.invoke``, unmodified.
    """
    # Generate the search queries.
    queries = generate_queries(original_query)

    # Run a vector search per query; blank entries carry no information and
    # are skipped rather than embedded and searched.
    results = [
        vectorstore.similarity_search(query, k=docs_per_query)
        for query in queries
        if query and query.strip()
    ]

    # Re-rank the retrieved documents with the RRF algorithm.
    reranked_results = reciprocal_rank_fusion(results)

    # Extract the text of the best-ranked documents.
    top_docs = [doc.page_content for doc, _score in reranked_results[:top_n]]

    # Generate the final answer.
    prompt = ChatPromptTemplate.from_template(
        "Answer the following question based on this context:\n{context}\nQuestion: {question}\n"
    )
    chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
    return chain.invoke({"context": "\n".join(top_docs), "question": original_query})

# Example run: index the sample files, then answer a sample query.
file_paths = [
    "doc1.txt",
    "doc2.txt",
    "doc3.txt",
]
original_query = "示例查询内容"
# rag_fusion_chain looks up the module-level name `vectorstore`, so bind it first.
vectorstore = initialize_faiss_vectorstore(file_paths)
result = rag_fusion_chain(original_query)
print(result)

# 第二种实现

In [None]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

# 1. Load the document
# NOTE(review): the path looks like a placeholder — point it at a real file before running.
loader = TextLoader("path_to_your_document.txt")
documents = loader.load()

# 2. Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# 3. Create the embedding model and the vector store
# NOTE: this rebinds the module-level name `vectorstore`, which the earlier cell
# also assigns and which `rag_fusion_chain` reads at call time.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

# 4. 定义查询生成函数
def generate_queries(question):
    """Expand ``question`` into four query variants using fixed templates.

    NOTE: an earlier cell in this notebook defines an LLM-based function with
    the same name; running this cell replaces it.

    Args:
        question: The text to expand.

    Returns:
        A list of four items: ``question`` itself followed by three rephrased
        strings built from it.
    """
    # Any method can be used here to generate multiple queries, e.g. synonym
    # replacement or question expansion.
    expansions = [
        f"What is {question}?",
        f"Explain {question} in detail.",
        f"Can you provide more information about {question}?",
    ]
    return [question, *expansions]

# 5. 多轮检索
def multi_retrieval(queries, vectorstore):
    """Run a similarity search for every query and concatenate the hits.

    Args:
        queries: Iterable of query strings.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.

    Returns:
        One flat list holding the results of each search (``k=3`` per query),
        in query order. Duplicates are not removed here.
    """
    return [
        hit
        for query in queries
        for hit in vectorstore.similarity_search(query, k=3)
    ]

# 6. 结果融合（简单的去重）
def fuse_results(retrieved_docs):
    """Drop documents whose ``page_content`` duplicates another one.

    Args:
        retrieved_docs: Iterable of objects with a ``page_content`` attribute.

    Returns:
        A list with one document per distinct ``page_content``. Entries are
        ordered by the first appearance of each content value; when several
        documents share the same content, the last one seen is kept.
    """
    by_content = {}
    for doc in retrieved_docs:
        # Re-assigning an existing key keeps its position but swaps the value.
        by_content[doc.page_content] = doc
    return list(by_content.values())

# 7. 生成答案
def generate_answer(question, docs):
    """Answer ``question`` with an OpenAI LLM, using ``docs`` as context.

    Builds a question-answering chain via ``load_qa_chain`` with
    ``chain_type="stuff"`` and a prompt that tells the model to admit when it
    does not know the answer.

    Args:
        question: The question to answer.
        docs: Documents passed to the chain as ``input_documents``.

    Returns:
        The value returned by ``chain.run``.
    """
    model = OpenAI(temperature=0)
    template_text = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}
    Answer:"""
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )
    qa_chain = load_qa_chain(model, chain_type="stuff", prompt=qa_prompt)
    return qa_chain.run(input_documents=docs, question=question)

# 8. RAG-Fusion 主流程
def rag_fusion(question):
    """Run the pipeline: expand the query, retrieve, de-duplicate, answer.

    Reads the module-level ``vectorstore``.

    Args:
        question: The user's question.

    Returns:
        The answer produced by ``generate_answer``.
    """
    candidates = multi_retrieval(generate_queries(question), vectorstore)
    return generate_answer(question, fuse_results(candidates))

# 9. Test: run the whole pipeline on a sample question and print the answer.
question = "What is the capital of France?"
answer = rag_fusion(question)
print(answer)