In [2]:
from typing import List
from pypdf import PdfReader
import os
import glob

def split_into_chunks(doc_file: str, chunk_size: int = 500) -> List[str]:
    """读取单个PDF文件并切分成chunks"""
    try:
        reader = PdfReader(doc_file)
        text = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
        content = "\n".join(text)

        # 切分成 chunk（按长度）
        chunks = []
        start = 0
        while start < len(content):
            end = min(start + chunk_size, len(content))
            chunk_text = content[start:end].strip()
            if chunk_text:  # 只添加非空chunk
                chunks.append(chunk_text)
            start = end
        return chunks
    except Exception as e:
        print(f"读取文件 {doc_file} 时出错: {e}")
        return []

def load_all_reports(report_dir: str = "Annual Survey Report", chunk_size: int = 500) -> List[dict]:
    """读取所有年度报告文件"""
    all_chunks = []
    all_metadata = []
    
    # 获取所有PDF文件
    pdf_files = glob.glob(os.path.join(report_dir, "*.pdf"))
    pdf_files.sort()  # 按文件名排序
    
    print(f"找到 {len(pdf_files)} 个PDF文件:")
    for pdf_file in pdf_files:
        print(f"  - {os.path.basename(pdf_file)}")
    
    print("\n开始处理文件...")
    
    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file)
        print(f"正在处理: {filename}")
        
        chunks = split_into_chunks(pdf_file, chunk_size)
        
        if chunks:
            # 为每个chunk添加元数据
            for i, chunk in enumerate(chunks):
                metadata = {
                    'source_file': filename,
                    'chunk_index': i,
                    'year': extract_year_from_filename(filename)
                }
                all_metadata.append(metadata)
            
            all_chunks.extend(chunks)
            print(f"  - 提取了 {len(chunks)} 个chunks")
        else:
            print(f"  - 未能提取内容")
    
    print(f"\n总共处理了 {len(all_chunks)} 个chunks")
    return all_chunks, all_metadata

def extract_year_from_filename(filename: str) -> str:
    """从文件名中提取年份"""
    import re
    # 匹配4位数字的年份
    year_match = re.search(r'20\d{2}', filename)
    if year_match:
        return year_match.group()
    return "unknown"

# 调用函数读取所有报告
all_chunks, all_metadata = load_all_reports()

# 显示前几个chunks作为示例
print(f"\n前3个chunks示例:")
for i, chunk in enumerate(all_chunks[:3]):
    metadata = all_metadata[i]
    print(f"[{i}] 来源: {metadata['source_file']} ({metadata['year']})")
    print(f"内容: {chunk[:200]}...\n")

找到 18 个PDF文件:
  - 2015AnnualSurveyReport.pdf
  - 2016AnnualSurveyReport.pdf
  - 2017AnnualSurveyReport.pdf
  - 2018AnnualSurveyReport.pdf
  - 2019AnnualSurveyReport.pdf
  - 2020AnnualSurveyReport.pdf
  - 2021AnnualSurveyReport.pdf
  - 2022AnnualSurveyReport.pdf
  - 2023AnnualSurveyReport.pdf
  - 2024AnnualSurveyReport.pdf
  - RCMSUR2007.pdf
  - RCMSUR2008.pdf
  - RCMSUR2009.pdf
  - RCMSUR2010.pdf
  - RCMSUR2011.pdf
  - RCMSUR2012.pdf
  - RCMSUR2013.pdf
  - RCMSUR2014.pdf

开始处理文件...
正在处理: 2015AnnualSurveyReport.pdf
  - 提取了 336 个chunks
正在处理: 2016AnnualSurveyReport.pdf
  - 提取了 340 个chunks
正在处理: 2017AnnualSurveyReport.pdf
  - 提取了 384 个chunks
正在处理: 2018AnnualSurveyReport.pdf
  - 提取了 333 个chunks
正在处理: 2019AnnualSurveyReport.pdf
  - 提取了 331 个chunks
正在处理: 2020AnnualSurveyReport.pdf
  - 提取了 299 个chunks
正在处理: 2021AnnualSurveyReport.pdf
  - 提取了 300 个chunks
正在处理: 2022AnnualSurveyReport.pdf
  - 提取了 408 个chunks
正在处理: 2023AnnualSurveyReport.pdf
  - 提取了 433 个chunks
正在处理: 2024AnnualSurveyReport.pdf
  -

In [3]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("shibing624/text2vec-base-chinese")

def embed_chunk(chunk: str) -> List[float]:
    embedding = embedding_model.encode(chunk, normalize_embeddings=True)
    return embedding.tolist()


embedding = embed_chunk("测试内容")
print(len(embedding))
print(embedding)

  from .autonotebook import tqdm as notebook_tqdm


768
[0.026805469766259193, 0.008382044732570648, 0.0003433591336943209, 0.007298996206372976, 0.05433321371674538, -0.05325592681765556, 0.0013655625516548753, -0.001318183378316462, -0.03671126440167427, 0.07188180834054947, -0.0072706774808466434, -0.007053022272884846, 0.04253280535340309, -0.03675280883908272, -0.05475054681301117, -0.009598606266081333, 0.017105495557188988, 0.059153590351343155, -0.03335002437233925, 0.06237659230828285, -0.004888521041721106, -0.034539539366960526, -0.07407601177692413, 0.04422200098633766, 0.010516893118619919, -0.03707779571413994, -0.027029866352677345, 0.038303591310977936, 0.0212824996560812, -0.011811444535851479, -0.00540876854211092, 0.0026590467896312475, -0.023298583924770355, 0.05299092084169388, 0.005149427335709333, 0.029624197632074356, -0.0308096744120121, -0.017856156453490257, 0.042446065694093704, -0.00769231328740716, -0.010638119652867317, 0.03210863843560219, -0.06592466682195663, -0.01210093405097723, 0.006814629305154085, 

In [4]:
# 为所有chunks生成embeddings
print(f"开始为 {len(all_chunks)} 个chunks生成embeddings...")
embeddings = []

# 批量处理，避免内存问题
batch_size = 50
for i in range(0, len(all_chunks), batch_size):
    batch_chunks = all_chunks[i:i+batch_size]
    print(f"处理第 {i+1}-{min(i+batch_size, len(all_chunks))} 个chunks...")
    
    batch_embeddings = [embed_chunk(chunk) for chunk in batch_chunks]
    embeddings.extend(batch_embeddings)

print(f"完成！总共生成了 {len(embeddings)} 个embeddings")
print(f"每个embedding的维度: {len(embeddings[0]) if embeddings else 0}")

开始为 5063 个chunks生成embeddings...
处理第 1-50 个chunks...
处理第 51-100 个chunks...
处理第 101-150 个chunks...
处理第 151-200 个chunks...
处理第 201-250 个chunks...
处理第 251-300 个chunks...
处理第 301-350 个chunks...
处理第 351-400 个chunks...
处理第 401-450 个chunks...
处理第 451-500 个chunks...
处理第 501-550 个chunks...
处理第 551-600 个chunks...
处理第 601-650 个chunks...
处理第 651-700 个chunks...
处理第 701-750 个chunks...
处理第 751-800 个chunks...
处理第 801-850 个chunks...
处理第 851-900 个chunks...
处理第 901-950 个chunks...
处理第 951-1000 个chunks...
处理第 1001-1050 个chunks...
处理第 1051-1100 个chunks...
处理第 1101-1150 个chunks...
处理第 1151-1200 个chunks...
处理第 1201-1250 个chunks...
处理第 1251-1300 个chunks...
处理第 1301-1350 个chunks...
处理第 1351-1400 个chunks...
处理第 1401-1450 个chunks...
处理第 1451-1500 个chunks...
处理第 1501-1550 个chunks...
处理第 1551-1600 个chunks...
处理第 1601-1650 个chunks...
处理第 1651-1700 个chunks...
处理第 1701-1750 个chunks...
处理第 1751-1800 个chunks...
处理第 1801-1850 个chunks...
处理第 1851-1900 个chunks...
处理第 1901-1950 个chunks...
处理第 1951-2000 个chunks...
处理第 2001-20

In [6]:
import chromadb

chromadb_client = chromadb.PersistentClient(path="chroma_db1")
chromadb_collection = chromadb_client.get_or_create_collection(name="coralkita_reports")

def save_embeddings(chunks: List[str], embeddings: List[List[float]], metadata: List[dict]) -> None:
    """保存embeddings到ChromaDB，包含元数据"""
    print(f"开始保存 {len(chunks)} 个文档到ChromaDB...")
    
    # 批量保存，避免内存问题
    batch_size = 100
    for i in range(0, len(chunks), batch_size):
        end_idx = min(i + batch_size, len(chunks))
        batch_chunks = chunks[i:end_idx]
        batch_embeddings = embeddings[i:end_idx]
        batch_metadata = metadata[i:end_idx]
        
        # 准备数据
        ids = [f"chunk_{i+j}" for j in range(len(batch_chunks))]
        metadatas = []
        
        for j, meta in enumerate(batch_metadata):
            metadatas.append({
                'source_file': meta['source_file'],
                'chunk_index': meta['chunk_index'],
                'year': meta['year'],
                'global_index': i + j
            })
        
        # 添加到集合
        chromadb_collection.add(
            documents=batch_chunks,
            embeddings=batch_embeddings,
            metadatas=metadatas,
            ids=ids
        )
        
        print(f"已保存第 {i+1}-{end_idx} 个文档")

save_embeddings(all_chunks, embeddings, all_metadata)
print("所有文档已保存到ChromaDB!")

开始保存 5063 个文档到ChromaDB...
已保存第 1-100 个文档
已保存第 101-200 个文档
已保存第 201-300 个文档
已保存第 301-400 个文档
已保存第 401-500 个文档
已保存第 501-600 个文档
已保存第 601-700 个文档
已保存第 701-800 个文档
已保存第 801-900 个文档
已保存第 901-1000 个文档
已保存第 1001-1100 个文档
已保存第 1101-1200 个文档
已保存第 1201-1300 个文档
已保存第 1301-1400 个文档
已保存第 1401-1500 个文档
已保存第 1501-1600 个文档
已保存第 1601-1700 个文档
已保存第 1701-1800 个文档
已保存第 1801-1900 个文档
已保存第 1901-2000 个文档
已保存第 2001-2100 个文档
已保存第 2101-2200 个文档
已保存第 2201-2300 个文档
已保存第 2301-2400 个文档
已保存第 2401-2500 个文档
已保存第 2501-2600 个文档
已保存第 2601-2700 个文档
已保存第 2701-2800 个文档
已保存第 2801-2900 个文档
已保存第 2901-3000 个文档
已保存第 3001-3100 个文档
已保存第 3101-3200 个文档
已保存第 3201-3300 个文档
已保存第 3301-3400 个文档
已保存第 3401-3500 个文档
已保存第 3501-3600 个文档
已保存第 3601-3700 个文档
已保存第 3701-3800 个文档
已保存第 3801-3900 个文档
已保存第 3901-4000 个文档
已保存第 4001-4100 个文档
已保存第 4101-4200 个文档
已保存第 4201-4300 个文档
已保存第 4301-4400 个文档
已保存第 4401-4500 个文档
已保存第 4501-4600 个文档
已保存第 4601-4700 个文档
已保存第 4701-4800 个文档
已保存第 4801-4900 个文档
已保存第 4901-5000 个文档
已保存第 5001-5063 个文档
所有文档已保存到ChromaDB!


In [9]:
def retrieve(query: str, top_k: int) -> tuple:
    """检索相关文档，返回文档内容和元数据"""
    query_embedding = embed_chunk(query)
    results = chromadb_collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results['documents'][0], results['metadatas'][0], results['distances'][0]

def display_retrieved_docs(query: str, top_k: int = 5):
    """显示检索结果，包含来源信息"""
    documents, metadatas, distances = retrieve(query, top_k)
    
    print(f"查询: {query}")
    print(f"找到 {len(documents)} 个相关文档:\n")
    
    for i, (doc, meta, distance) in enumerate(zip(documents, metadatas, distances)):
        print(f"[{i+1}] 来源: {meta['source_file']} ({meta['year']})")
        print(f"    相似度分数: {1-distance:.3f}")
        print(f"    内容: {doc[:300]}...")
        print("-" * 80)

# 测试查询
query = "马来西亚2024年珊瑚礁情况如何 中文回复"
display_retrieved_docs(query, 5)

查询: 马来西亚2024年珊瑚礁情况如何 中文回复
找到 5 个相关文档:

[1] 来源: 2024AnnualSurveyReport.pdf (2024)
    相似度分数: 0.091
    内容: variation over the years.  
• The cause of the drastic deterioration in 2021 was not known. 
• The deterioration in 2024 was due to a combination of physical damage caused by human activities and/or 
storm, raised level of nutrient in the waters around the island and the 4th Global Coral Bleaching E...
--------------------------------------------------------------------------------
[2] 来源: 2024AnnualSurveyReport.pdf (2024)
    相似度分数: 0.066
    内容: deterioration in coral reef health noted in our 2023 survey programme has continued into 2024. 
63% of the islands/areas surveyed saw a decrease in Live Coral Cover (LCC) , a key coral reef health 
indicator.  
• As in previous years, the abundance of most fish and invertebrate indicators continues ...
--------------------------------------------------------------------------------
[3] 来源: 2023AnnualSurveyReport.pdf (2023)
    相似度分数: 0.05

In [None]:
from sentence_transformers import CrossEncoder

def rerank(query: str, documents: List[str], metadatas: List[dict], top_k: int) -> tuple:
    """重排序检索结果，返回排序后的文档和元数据"""
    cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')
    pairs = [(query, doc) for doc in documents]
    scores = cross_encoder.predict(pairs)

    # 组合文档、元数据和分数
    scored_items = list(zip(documents, metadatas, scores))
    scored_items.sort(key=lambda x: x[2], reverse=True)

    # 分离排序后的结果
    reranked_docs = [item[0] for item in scored_items[:top_k]]
    reranked_metas = [item[1] for item in scored_items[:top_k]]
    reranked_scores = [item[2] for item in scored_items[:top_k]]
    
    return reranked_docs, reranked_metas, reranked_scores

def display_reranked_docs(query: str, top_k: int = 3):
    """显示重排序后的结果"""
    # 先检索更多文档
    documents, metadatas, distances = retrieve(query, 10)
    
    # 重排序
    reranked_docs, reranked_metas, reranked_scores = rerank(query, documents, metadatas, top_k)
    
    print(f"重排序后的前 {top_k} 个结果:")
    print("=" * 80)
    
    for i, (doc, meta, score) in enumerate(zip(reranked_docs, reranked_metas, reranked_scores)):
        print(f"[{i+1}] 来源: {meta['source_file']} ({meta['year']})")
        print(f"    重排序分数: {score:.3f}")
        print(f"    内容: {doc[:300]}...")
        print("-" * 80)

# 获取重排序后的结果用于后续生成
documents, metadatas, distances = retrieve(query, 10)
reranked_chunks, reranked_metas, reranked_scores = rerank(query, documents, metadatas, 3)

# 显示重排序结果
display_reranked_docs(query, 3)

重排序后的前 3 个结果:
[1] 来源: RCMSUR2009.pdf (2009)
    重排序分数: 4.192
    内容: rts. 
 
 

6 
 
Status of Coral Reefs in Malaysia 2009   
1.3 Reef Check in Malaysia 
 
Malaysia is part of the “Coral Triangle”, the area of the world’s oceans recognized by scientists as 
having the highest marine biodiversity. Coral reefs represent an economically important ecosystem 
and are the...
--------------------------------------------------------------------------------
[2] 来源: RCMSUR2014.pdf (2014)
    重排序分数: 2.548
    内容: s are generally in “fair” or 
“good” condition, though it is acknowledged that these averages mask variations in different reef areas.  
 
Coral reefs are an important biological and economic resource in Malaysia, providing food and jobs for 
thousands of people. Reefs must be conserved for the bene...
--------------------------------------------------------------------------------
[3] 来源: RCMSUR2009.pdf (2009)
    重排序分数: 1.988
    内容: efs for future generations. 
37 
 
Status of Coral 

In [7]:
from dotenv import load_dotenv
from google import genai

load_dotenv()
google_client = genai.Client()

def generate_with_sources(query: str, chunks: List[str], metadatas: List[dict]) -> str:
    """生成答案并包含来源信息"""
    # 构建带来源信息的prompt
    sources_info = []
    for i, (chunk, meta) in enumerate(zip(chunks, metadatas)):
        source_desc = f"片段{i+1} (来源: {meta['source_file']}, 年份: {meta['year']})"
        sources_info.append(f"{source_desc}:\n{chunk}")
    
    prompt = f"""你是一位珊瑚知识助手，请根据用户的问题和下列片段生成准确的回答。

用户问题: {query}

相关片段:
{"\n\n".join(sources_info)}

请基于上述内容作答，不要编造信息。在回答末尾，请列出你参考的主要来源文件。"""

    print("正在生成答案...")
    print(f"使用的片段来源: {[meta['source_file'] for meta in metadatas]}")
    print("-" * 80)

    try:
        response = google_client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt
        )
        return response.text
    except Exception as e:
        return f"生成答案时出错: {e}"

def display_final_answer(query: str):
    """显示完整的问答结果"""
    print(f"问题: {query}")
    print("=" * 80)
    
    # 生成答案
    answer = generate_with_sources(query, reranked_chunks, reranked_metas)
    
    print("答案:")
    print("-" * 40)
    print(answer)
    
    print("\n参考来源:")
    print("-" * 40)
    for i, meta in enumerate(reranked_metas):
        print(f"{i+1}. {meta['source_file']} ({meta['year']})")

# 显示最终答案
display_final_answer(query)

问题: General situation of coral reefs in Malaysia in 2014
正在生成答案...
使用的片段来源: ['RCMSUR2009.pdf', 'RCMSUR2014.pdf', 'RCMSUR2009.pdf']
--------------------------------------------------------------------------------
答案:
----------------------------------------
根据2014年的报告，马来西亚的珊瑚礁总体状况被评估为“一般”（fair）或“良好”（good）。然而，需要注意的是，这些平均状况可能掩盖了不同珊瑚礁区域的差异。

珊瑚礁是马来西亚重要的生物和经济资源，为成千上万的人提供食物和就业机会。尽管当时的状况看似相对稳定，但报告强调不能因此自满，并指出必须为子孙后代保护珊瑚礁。

**主要来源文件：**
*   RCMSUR2014.pdf

参考来源:
----------------------------------------
1. RCMSUR2009.pdf (2009)
2. RCMSUR2014.pdf (2014)
3. RCMSUR2009.pdf (2009)
