In [1]:
# Load the dataset (JSON Lines: one JSON object per line).
import json

# Location of the dataset, relative to the notebook's working directory.
DATA_PATH = '../data/crag_data_200.jsonl'

data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would reject with a JSONDecodeError.
        if line.strip():
            data.append(json.loads(line))

print(f"Loaded {len(data)} records.")


Loaded 200 records.


In [2]:
# Create the vector database
import chromadb

# Initialize the Chroma client with its default settings.
client = chromadb.Client()

# Fetch the collection if it already exists, otherwise create it.
# create_collection raises when 'crag_documents' is already present, which
# happens as soon as this cell is re-run in the same kernel; the
# get_or_create variant keeps the cell re-runnable.
collection = client.get_or_create_collection('crag_documents')


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings gathered in one place: chunk size 64 with an overlap of 2
# between neighbouring chunks, and add_start_index=True so each chunk's
# metadata records where it starts in the source text.
splitter_options = dict(chunk_size=64, chunk_overlap=2, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [4]:
# Prepare the chunks and insert them into Chroma
documents = []
metadatas = []
ids = []

# Number of chunks sent per collection.add call. Chroma caps how many records
# one call may carry, and embedding every chunk in a single call gives no
# incremental progress.
# NOTE(review): 1000 is assumed to be below the client's maximum batch size -
# confirm against the installed chromadb version.
INSERT_BATCH_SIZE = 1000

for i, record in enumerate(data):
    query = record["query"]
    answer = record["answer"]
    search_results = record["search_results"]

    # Concatenate the snippets of all search results for this record.
    # A space separator keeps the last word of one snippet from fusing with
    # the first word of the next; a missing/None snippet counts as empty.
    text = ' '.join((result.get("page_snippet") or "") for result in search_results)

    # create_documents already splits the text into chunks, so no second
    # split_documents pass is needed (it would re-split the chunks and
    # overwrite their start_index metadata).
    all_splits = text_splitter.create_documents([text])

    # One entry per chunk in each of the three parallel lists.
    for j, split in enumerate(all_splits):
        documents.append(split.page_content)
        metadatas.append({
            "query": query,
            "answer": answer,
            "split_index": j
        })
        ids.append(f"doc_{i}_split_{j}")

# Add the chunks to the collection batch by batch. With no chunks at all the
# loop body never runs, so collection.add is never called with empty lists.
for start in range(0, len(ids), INSERT_BATCH_SIZE):
    end = start + INSERT_BATCH_SIZE
    collection.add(
        documents=documents[start:end],  # embedded and indexed by Chroma
        metadatas=metadatas[start:end],  # usable as filters at query time
        ids=ids[start:end]  # unique identifier of each chunk
    )

print("Data has been inserted into Chroma.")


KeyboardInterrupt: 

In [8]:
# Query
query = "which is a better investment, gold or silver, when considering long-term return?"

# Upper bound on the number of chunks returned by the similarity search.
N_RESULTS = 96

# Restrict the search to chunks whose "query" metadata equals this query,
# then return up to N_RESULTS of them ranked by similarity.
results = collection.query(
    query_texts=[query],
    n_results=N_RESULTS,
    where={"query": query},  # keep only the chunks stored for this query
)

# results['documents'] holds one list of chunks per query text; flatten it.
retrieved_docs = [doc for result in results['documents'] for doc in result]

# Report the number of chunks actually retrieved rather than a fixed label.
print(f"Top {len(retrieved_docs)} relevant documents:")
for doc in retrieved_docs:
    print(doc)

# Context handed to the LLM: every retrieved chunk followed by a newline.
context = "".join(doc + "\n" for doc in retrieved_docs)

print("Context:")
print(context)


Top 2 relevant documents:
['can you tell me the original language of the pact? en In domestic politics, pacts are usually between two or more political parties or other organizations. ... Bretton Woods Pact made stable financial and commercial relations viable between United States, Canada, Western European countries, Australia, and Japan (after 1944) Kellogg–Briand Pact, a multilateral treaty against war (1928) London Debt Agreement between London and Germany was a debt relief pact (1953) London Pact between Italy and the ...In domestic politics, pacts are usually between two or more political parties or other organizations. ... Bretton Woods Pact made stable financial and commercial relations viable between United States, Canada, Western European countries, Australia, and Japan (after 1944) Kellogg–Briand Pact, a multilateral treaty against war (1928) London Debt Agreement between London and Germany was a debt relief pact (1953) London Pact between Italy and the Triple Entente (Great

In [5]:
# Test case: a fixed question plus a hand-written context paragraph.
query = "Who was the first president of the United States?"
# The context is a single paragraph with one newline before and one after it.
context = "\n".join([
    "",
    'The first president of the United States was George Washington. He served two terms as president from 1789 to 1797. Washington is often referred to as the "Father of His Country" for his pivotal role in the founding of the United States. Before becoming president, he served as the commander-in-chief of the Continental Army during the American Revolutionary War and presided over the convention that drafted the U.S. Constitution.',
    "",
])

In [6]:
import requests
import os

# Baidu API credentials, read from the environment so that no secret is stored
# in (or committed with) the notebook. Set BAIDU_API_KEY and BAIDU_SECRET_KEY
# before starting the kernel; an unset variable yields an empty string.
# NOTE(review): the key pair that used to be hardcoded here has been exposed
# and should be revoked and re-issued.
API_KEY = os.environ.get("BAIDU_API_KEY", "")
SECRET_KEY = os.environ.get("BAIDU_SECRET_KEY", "")

def get_access_token():
    """
    Exchange the API key and secret key for an access token.

    Returns:
        The ``access_token`` value from the token endpoint's JSON response.

    Raises:
        requests.HTTPError: if the endpoint answers with an HTTP error status.
        RuntimeError: if the response contains no ``access_token``.
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {
        "grant_type": "client_credentials",
        "client_id": API_KEY,
        "client_secret": SECRET_KEY,
    }
    # params= lets requests URL-encode the credentials; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    response = requests.post(url, params=params, timeout=30)
    response.raise_for_status()
    body = response.json()
    access_token = body.get("access_token")
    if not access_token:
        # Fail here instead of returning None, which the caller would
        # otherwise send on as the literal token "None".
        raise RuntimeError(f"Baidu token request returned no access_token: {body}")
    return access_token

def call_baidu_chat_api(query, context):
    """
    Send a question together with its supporting context to the chat endpoint.

    Args:
        query: The question text.
        context: Text placed after the question in the same user message.

    Returns:
        The decoded JSON body of the endpoint's response.

    Raises:
        requests.HTTPError: if the endpoint answers with an HTTP error status.
    """
    access_token = get_access_token()
    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/yi_34b_chat"
    # Question and context travel in a single user message.
    payload = {
        "messages": [
            {
                "role": "user",
                "content": f"Q: {query}\nContext: {context}"
            }
        ]
    }
    # json= serializes the payload and sets the JSON Content-Type header;
    # params= URL-encodes the token; the timeout bounds a stalled request.
    response = requests.post(
        url,
        params={"access_token": access_token},
        json=payload,
        timeout=60,
    )
    response.raise_for_status()
    return response.json()

In [7]:
# Call the Baidu API to get an answer for the current query and context
response = call_baidu_chat_api(query, context)
print("Response from Baidu API:")
print(response)

# Print the answer text on its own when the response has a non-empty "result".
answer_text = response["result"] if "result" in response else None
if answer_text:
    print(answer_text)

Response from Baidu API:
{'id': 'as-zjqjgv5c5u', 'object': 'chat.completion', 'created': 1719217206, 'result': 'A: The first president of the United States was George Washington. He served two terms as president from 1789 to 1797. Washington is often referred to as the "Father of His Country" for his pivotal role in the founding of the United States. Before becoming president, he served as the commander-in-chief of the Continental Army during the American Revolutionary War and presided over the convention that drafted the U.S. Constitution.', 'is_truncated': False, 'need_clear_history': False, 'usage': {'prompt_tokens': 112, 'completion_tokens': 96, 'total_tokens': 208}}
A: The first president of the United States was George Washington. He served two terms as president from 1789 to 1797. Washington is often referred to as the "Father of His Country" for his pivotal role in the founding of the United States. Before becoming president, he served as the commander-in-chief of the Continent