# 多模态多意图检索系统 - Vespa本地部署

本notebook演示如何使用Vespa在本地部署一个多模态多意图检索系统，基于ColQwen2.5模型和BGE嵌入模型。整个流程包括：

1. 初始化必要的模型（文本嵌入、图像嵌入、重排序器等）
2. 在本地Docker中部署Vespa服务
3. 将PDF文档索引到Vespa中
4. 执行多意图检索测试

该系统能够处理包含多个信息需求的复杂查询，并通过意图拆分和精化技术提高检索效果。

## 1. 环境准备和依赖安装

In [None]:
# Install the packages this notebook depends on (IPython shell escape).
! pip install pyvespa vespacli torch FlagEmbedding pdf2image pytesseract paddleocr colpali-engine tqdm python-dotenv matplotlib numpy pillow

### 安装poppler-utils（PDF处理必需）

In [3]:
import sys
import subprocess

# Detect the operating system and install poppler.
# The install commands are left commented out; uncomment the branch that
# matches your platform to run it.
# if sys.platform.startswith('linux'):
#     # Linux (Ubuntu/Debian)
#     !sudo apt-get install -y poppler-utils
# elif sys.platform == 'darwin':
#     # macOS
#     !brew install poppler
# else:
#     print("请手动安装poppler-utils，Windows上请参考pdf2image文档")

### 检查Docker是否已安装并正在运行

In [4]:
def check_docker_running():
    """Report whether the ``docker info`` command succeeds.

    The command's stdout and stderr are captured and discarded. A status
    message is printed in either case.

    Returns:
        True when ``docker info`` exits with status 0; False when it exits
        with a non-zero status or the ``docker`` executable is not found.
    """
    docker_info_cmd = ["docker", "info"]
    try:
        subprocess.run(docker_info_cmd, check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Docker未运行或未安装，请确保Docker服务已启动")
        return False
    else:
        print("Docker正在运行，可以继续操作")
        return True

check_docker_running()

Docker正在运行，可以继续操作


True

## 2. 导入必要的库和设置日志

In [5]:
import os
import sys
import json
import time
import socket
import logging
import traceback
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import io
import base64
from copy import deepcopy
from textwrap import dedent
from pdf2image import convert_from_path
from pathlib import Path
from dotenv import load_dotenv

# Create the log directory.
# NOTE(review): the path is absolute and machine-specific, and mkdir() is
# called without parents=True, so the parent directory must already exist.
log_dir = Path("/Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/log")
log_dir.mkdir(exist_ok=True)
log_file = log_dir / "multimodal_intent_test.log"

# Remove handlers already attached to the root logger; basicConfig() does
# nothing when the root logger already has handlers (e.g. on a cell re-run).
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)

# Log to both the file (append mode, UTF-8) and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(str(log_file), mode='a', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Emit a first record to confirm that logging is working.
logger.info("=== 日志系统初始化完成，日志文件: %s ===", log_file)

# Add the project source directory to the import path (relative to the
# current working directory).
sys.path.append("DeepRAG_Multimodal/deep_retrieve")
# Load environment variables from the project's .env file.
load_dotenv(
    "/Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/DeepRAG_Multimodal/configs/.env")

2025-04-26 22:23:38,554 - INFO - === 日志系统初始化完成，日志文件: /Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/log/multimodal_intent_test.log ===


True

In [6]:
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor
from FlagEmbedding import FlagModel

# Text embedding model (BGE) placed on the CPU.
text_model = FlagModel(
    "BAAI/bge-large-en-v1.5",
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
    device='cpu'
)


# ColQwen2.5 model used for page images and queries, loaded in float16 on
# the CPU and switched to eval mode.
image_model = ColQwen2_5.from_pretrained(
    "vidore/colqwen2.5-v0.2",
    torch_dtype=torch.float16,
    device_map='cpu'
).eval()

# NOTE(review): the processor is loaded from the "v0.1" checkpoint while the
# model above uses "v0.2" — confirm that this mismatch is intentional.
processor = ColQwen2_5_Processor.from_pretrained(
    "vidore/colqwen2.5-v0.1",
    size={"shortest_edge": 512, "longest_edge": 1024})

2025-04-26 22:34:21,797 - INFO - NumExpr defaulting to 16 threads.
2025-04-26 22:34:22,542 - INFO - PyTorch version 2.5.1 available.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
def load_test_data(path=None, sample_size=1, seed=42):
    """Load the JSONL test set and return a reproducible random sample.

    Args:
        path: JSONL file to read; every non-blank line is parsed as one JSON
            record. Defaults to the project's LongDocURL selection file.
        sample_size: Number of records to draw without replacement. Values
            larger than the number of available records are clamped.
        seed: Value passed to ``np.random.seed`` before sampling so that the
            same records are selected on every run. Note that this reseeds
            NumPy's global random state.

    Returns:
        A list holding the sampled records; empty when the file contains no
        records.
    """
    if path is None:
        path = "/Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/DeepRAG_Multimodal/picked_LongDoc/selected_LongDocURL_public_with_subtask_category.jsonl"

    logger.info(f"加载测试数据: {path}")

    with open(path, 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]

    np.random.seed(seed)  # fixed seed for reproducibility
    k = max(0, min(sample_size, len(records)))
    # Sample positions rather than the records themselves: the draw is the
    # same, but NumPy never has to coerce the records into an array, and an
    # empty file no longer makes np.random.choice raise.
    picked = np.random.choice(len(records), k, replace=False) if k > 0 else []
    test_data = [records[i] for i in picked]

    logger.info(f"成功加载 {len(test_data)} 条测试数据")
    return test_data

def process_single_document(doc_data, base_dir=None):
    """Build one document record per PDF page from page images and OCR text.

    The PDF named by ``doc_data["pdf_path"]`` is rendered to page images and
    paired, page by page, with pre-computed OCR text read from
    ``<base_dir>/paddleocr_save/<pdf stem>.json``.

    Args:
        doc_data: Mapping with a ``"pdf_path"`` entry that is relative to
            ``base_dir``.
        base_dir: Directory holding the PDFs and the ``paddleocr_save``
            folder. Defaults to the project's ``picked_LongDoc`` directory.

    Returns:
        A list of dicts with the keys ``"text"``, ``"image"`` and
        ``"metadata"`` (1-based ``"page_index"`` and ``"pdf_path"``).
        Returns an empty list when the PDF cannot be converted or the OCR
        file does not exist.
    """
    if base_dir is None:
        base_dir = '/Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/DeepRAG_Multimodal/picked_LongDoc'

    # Render the PDF pages as images.
    pdf_path = os.path.join(base_dir, doc_data["pdf_path"])
    try:
        pages = convert_from_path(pdf_path)
    except Exception as e:
        logger.error(f"转换PDF时出错：{str(e)}")
        return []
    logger.info(f"成功将PDF转换为 {len(pages)} 页图像")

    # Locate the pre-computed OCR result. splitext() swaps only the final
    # extension, unlike str.replace(), which rewrites every ".pdf" in the name.
    pdf_stem = os.path.splitext(os.path.basename(doc_data['pdf_path']))[0]
    ocr_file = os.path.join(base_dir, "paddleocr_save", f"{pdf_stem}.json")

    if not os.path.exists(ocr_file):
        logger.warning(f"找不到预处理文本文件: {ocr_file}")
        return []
    with open(ocr_file, 'r', encoding='utf-8') as f:
        loaded_data = json.load(f)
    logger.info(f"成功读取预处理文本文件: {ocr_file}")

    # Page texts in file order. A JSON object is expected (its values are
    # taken in key order); a JSON array is accepted as well.
    if isinstance(loaded_data, dict):
        page_texts = list(loaded_data.values())
    else:
        page_texts = list(loaded_data)

    if len(page_texts) != len(pages):
        logger.warning(f"OCR数据页数 ({len(page_texts)}) 与PDF页数 ({len(pages)}) 不匹配")

    documents = []
    # zip() stops at the shorter sequence, so a page-count mismatch simply
    # limits the result to the pages that have both an image and a text.
    for page_number, (page, page_text) in enumerate(zip(pages, page_texts), start=1):
        width, height = page.size
        if width <= 0 or height <= 0:
            logger.warning(f"跳过无效页面 {page_number}：尺寸 {width}x{height}")
            continue

        documents.append({
            "text": page_text,
            "image": page,
            "metadata": {
                "page_index": page_number,
                "pdf_path": doc_data.get("pdf_path", "")
            }
        })

    logger.info(f"成功创建 {len(documents)} 个文档对象")
    return documents

# Load the sampled test record and build the per-page documents for it.
test_data = load_test_data()
documents = process_single_document(test_data[0])
# NOTE: this prints every page record, including its image object.
print(documents)

2025-04-26 22:45:33,696 - INFO - 加载测试数据: /Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/DeepRAG_Multimodal/picked_LongDoc/selected_LongDocURL_public_with_subtask_category.jsonl
2025-04-26 22:45:33,699 - INFO - 成功加载 1 条测试数据
2025-04-26 22:45:35,236 - INFO - 成功将PDF转换为 83 页图像
2025-04-26 22:45:35,243 - INFO - 成功读取预处理文本文件: /Users/chloe/Documents/Academic/AI/Project/基于Colpali的多模态检索标准框架/multimodal-RAG/DeepRAG_Multimodal/picked_LongDoc/paddleocr_save/4126467.json
2025-04-26 22:45:35,243 - INFO - 成功创建 83 个文档对象




In [16]:
from vespa.application import Vespa
# Client handle for the Vespa endpoint on localhost port 8080.
vespa_url = "http://localhost:8080"
app = Vespa(url=vespa_url)

In [19]:
def _compute_text_embedding(text: str, dim: int = 384):
    """Return a unit-length text embedding truncated to ``dim`` components.

    The embedding is produced by the global ``text_model``. It is truncated
    first and normalised afterwards, so the returned vector really has unit
    norm (normalising before truncation leaves a vector shorter than 1).

    Args:
        text: Text to embed.
        dim: Number of leading components to keep.
            NOTE(review): presumably this matches the size of the Vespa
            ``text_embedding`` field — confirm against the schema. Plain
            truncation discards all remaining components of the model output.

    Returns:
        A NumPy vector with at most ``dim`` components. A zero vector of
        length ``dim`` is returned for empty/blank text or when embedding
        fails (the error is logged).
    """
    if not text or not text.strip():
        return np.zeros(dim)

    try:
        with torch.no_grad():
            if hasattr(text_model, 'encode'):
                # BGE / FlagModel style interface.
                embedding = text_model.encode([text])
                if isinstance(embedding, torch.Tensor):
                    vector = embedding[0].cpu().numpy()
                else:
                    vector = embedding[0]
            else:
                # Fallback for models that are called directly on processed inputs.
                inputs = processor.process_queries([text]).to(text_model.device)
                outputs = text_model(**inputs)
                vector = outputs[0].cpu().numpy() if isinstance(outputs, torch.Tensor) else outputs[0]

        # Truncate, then normalise. float32 avoids losing precision in the
        # norm when the model returns half-precision values.
        vector = np.asarray(vector, dtype=np.float32)[:dim]
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector = vector / norm
        return vector
    except Exception as e:
        logger.error(f"计算文本嵌入时出错: {str(e)}")
        return np.zeros(dim)
    
def index_documents(documents):
    """Build the Vespa feed payload for a list of page documents.

    Despite its name, this function does not send anything to Vespa: it
    embeds every page image with ``image_model``, binarises the per-patch
    embeddings, computes a text embedding, and returns the feed entries.

    Args:
        documents: Page records with ``"image"``, ``"metadata"``
            (``"pdf_path"``, ``"page_index"``) and optionally ``"text"``.

    Returns:
        A list of dicts shaped ``{"id": ..., "fields": {...}}``, or ``False``
        when the global ``app`` is ``None``.
    """
    if app is None:
        logger.warning("Vespa应用未初始化，无法索引文档")
        return False

    logger.info(f"开始索引 {len(documents)} 个文档...")

    # Embed the page images in small batches.
    images = [doc["image"] for doc in documents]
    img_embeddings = []
    batch_size = 2

    for start in range(0, len(images), batch_size):
        batch_images = images[start:start + batch_size]
        with torch.no_grad():
            batch_inputs = processor.process_images(batch_images).to('cpu')
            batch_embeddings = image_model(**batch_inputs)
        # One multi-vector embedding per image.
        img_embeddings.extend(torch.unbind(batch_embeddings.to("cpu")))

    vespa_feed = []
    for doc, embedding in zip(documents, img_embeddings):
        metadata = doc["metadata"]

        # Document id: the path without its final extension plus the page
        # number. rpartition() strips only the last suffix, so dotted names
        # such as "report.v2.pdf" are not cut at the first dot.
        pdf_path = metadata["pdf_path"]
        stem, dot, _ = pdf_path.rpartition('.')
        pdf_name = stem if dot else pdf_path
        doc_id = f"{pdf_name}_page{metadata['page_index']}"
        text = doc.get("text", "")

        # Binarise every patch vector (bit set where the value is > 0), pack
        # the bits into bytes and hex-encode them.
        embedding_dict = {
            idx: (
                np.packbits(patch_embedding.numpy() > 0)
                .astype(np.int8)
                .tobytes()
                .hex()
            )
            for idx, patch_embedding in enumerate(embedding)
        }

        vespa_feed.append({
            "id": doc_id,
            "fields": {
                "id": doc_id,
                "pdf_path": metadata['pdf_path'],
                "page_index": metadata['page_index'],
                # Use the safely fetched text so a record without "text"
                # does not raise KeyError.
                "text": text,
                "embedding": embedding_dict,
                "text_embedding": _compute_text_embedding(text).tolist()
            }
        })

    return vespa_feed

# Build the feed payload for every page document.
vespa_feed = index_documents(documents)
# NOTE: this prints the complete payload, including all hex-encoded embeddings.
print(vespa_feed)

2025-04-28 17:11:15,178 - INFO - 开始索引 83 个文档...




## 6. 索引PDF文档

本步骤将上一步为所选PDF文档生成的 `vespa_feed` 逐条写入Vespa的 `pdf_page` schema，完成索引。

In [20]:
def callback(response, id):
    """Log a warning for a document whose feed operation did not succeed.

    Args:
        response: Feed response; must provide ``is_successful()``,
            ``get_status_code()`` and ``json``.
        id: Identifier of the document the response belongs to.
    """
    if response.is_successful():
        return

    status = response.get_status_code()
    logger.warning(f"Failed to feed document {id}: {status}")
    logger.warning(response.json)

try:
    # Use a small batch size.
    BATCH_SIZE = 1  # reduced batch size: each feed_iterable() call receives one document
    logger.info(f"批量索引 {len(vespa_feed)} 个文档到Vespa...")
    for i in range(0, len(vespa_feed), BATCH_SIZE):
        batch = vespa_feed[i:i + BATCH_SIZE]
        try:
            logger.info(
                f"索引批次 {i // BATCH_SIZE + 1}/{(len(vespa_feed) - 1) // BATCH_SIZE + 1}，共 {len(batch)} 个文档...")
            app.feed_iterable(
                batch,
                schema="pdf_page",
                callback=callback,
                timeout=180  # raised timeout
            )
            # Pause briefly after every batch.
            time.sleep(1)
        except Exception as e:
            # A failing batch is logged and skipped; the loop continues.
            logger.error(f"批量索引文档时出错 (批次 {i // BATCH_SIZE + 1}): {str(e)}")
            logger.error(traceback.format_exc())
    # NOTE(review): this message is logged even when batches above raised.
    logger.info("文档索引完成")

except Exception as e:
    logger.error(f"批量索引文档时出错: {str(e)}")
    logger.error(traceback.format_exc())

2025-04-28 17:14:41,548 - INFO - 批量索引 83 个文档到Vespa...
2025-04-28 17:14:41,549 - INFO - 索引批次 1/83，共 1 个文档...
2025-04-28 17:14:42,944 - INFO - 索引批次 2/83，共 1 个文档...
2025-04-28 17:14:44,015 - INFO - 索引批次 3/83，共 1 个文档...
2025-04-28 17:14:45,078 - INFO - 索引批次 4/83，共 1 个文档...
2025-04-28 17:14:46,129 - INFO - 索引批次 5/83，共 1 个文档...
2025-04-28 17:14:47,185 - INFO - 索引批次 6/83，共 1 个文档...
2025-04-28 17:14:48,250 - INFO - 索引批次 7/83，共 1 个文档...
2025-04-28 17:14:49,315 - INFO - 索引批次 8/83，共 1 个文档...
2025-04-28 17:14:50,366 - INFO - 索引批次 9/83，共 1 个文档...
2025-04-28 17:14:51,431 - INFO - 索引批次 10/83，共 1 个文档...
2025-04-28 17:14:52,482 - INFO - 索引批次 11/83，共 1 个文档...
2025-04-28 17:14:53,543 - INFO - 索引批次 12/83，共 1 个文档...
2025-04-28 17:14:54,599 - INFO - 索引批次 13/83，共 1 个文档...
2025-04-28 17:14:55,646 - INFO - 索引批次 14/83，共 1 个文档...
2025-04-28 17:14:56,711 - INFO - 索引批次 15/83，共 1 个文档...
2025-04-28 17:14:57,766 - INFO - 索引批次 16/83，共 1 个文档...
2025-04-28 17:14:58,825 - INFO - 索引批次 17/83，共 1 个文档...
2025-04-28 17:14:59,

## 7. 设置查询和执行检索测试

In [23]:
from vespa.application import Vespa
import json

def query_pdf_documents(pdf_name, limit=10):
    """Query Vespa for the ``pdf_page`` documents of one PDF file.

    Args:
        pdf_name: PDF file name to match against the ``pdf_path`` field
            (for example ``"4126467.pdf"``).
        limit: Maximum number of hits to return.

    Returns:
        The response object returned by ``app.query``.
    """
    # Escape backslashes and double quotes so that the name cannot end the
    # YQL string literal early and break (or alter) the query.
    safe_name = str(pdf_name).replace("\\", "\\\\").replace('"', '\\"')

    # int() guarantees that only a number is interpolated as the limit.
    yql_query = f'select * from pdf_page where pdf_path contains "{safe_name}" limit {int(limit)}'

    return app.query(yql=yql_query)

def print_result_summary(response):
    """Print a short, human-readable summary of a Vespa query response.

    Hits are plain dicts (``{"relevance": ..., "fields": {...}}``), so they
    are read with key access; attribute access would raise AttributeError.

    Args:
        response: Query response exposing ``hits``,
            ``number_documents_returned`` and ``number_documents_retrieved``.
            A falsy response or one without hits prints a "nothing found"
            message.
    """
    if not response or not hasattr(response, 'hits') or not response.hits:
        print("没有找到匹配的文档")
        return

    total_count = response.number_documents_returned
    total_found = response.number_documents_retrieved

    print(f"\n找到 {total_found} 个匹配文档，返回了 {total_count} 个结果:")
    print("-" * 50)

    for position, hit in enumerate(response.hits, start=1):
        fields = hit.get('fields', {})

        print(f"文档 {position}:")
        print(f"  ID: {fields.get('id', 'N/A')}")
        print(f"  PDF路径: {fields.get('pdf_path', 'N/A')}")
        print(f"  页码: {fields.get('page_index', 'N/A')}")

        # Show at most the first 100 characters of the page text.
        text = fields.get('text', '') or ''
        if len(text) > 100:
            text = text[:100] + "..."
        print(f"  文本内容: {text}")
        print(f"  相关度得分: {hit.get('relevance', 'N/A')}")
        print("-" * 50)

# Fetch the indexed pages of one PDF and print the raw hit list.
response = query_pdf_documents("4126467.pdf")
print(response.hits)
# print_result_summary(response)

[{'id': 'id:pdf_page:pdf_page::4126467_page1', 'relevance': 0.0, 'source': 'pdf_pages', 'fields': {'sddocname': 'pdf_page', 'documentid': 'id:pdf_page:pdf_page::4126467_page1', 'id': '4126467_page1', 'pdf_path': '4126467.pdf', 'page_index': 1}}, {'id': 'id:pdf_page:pdf_page::4126467_page2', 'relevance': 0.0, 'source': 'pdf_pages', 'fields': {'sddocname': 'pdf_page', 'documentid': 'id:pdf_page:pdf_page::4126467_page2', 'id': '4126467_page2', 'pdf_path': '4126467.pdf', 'page_index': 2}}, {'id': 'id:pdf_page:pdf_page::4126467_page3', 'relevance': 0.0, 'source': 'pdf_pages', 'fields': {'sddocname': 'pdf_page', 'documentid': 'id:pdf_page:pdf_page::4126467_page3', 'id': '4126467_page3', 'pdf_path': '4126467.pdf', 'page_index': 3}}, {'id': 'id:pdf_page:pdf_page::4126467_page4', 'relevance': 0.0, 'source': 'pdf_pages', 'fields': {'sddocname': 'pdf_page', 'documentid': 'id:pdf_page:pdf_page::4126467_page4', 'id': '4126467_page4', 'pdf_path': '4126467.pdf', 'page_index': 4}}, {'id': 'id:pdf_page

In [26]:
# Query text to embed: one question followed by an answer-format instruction.
queries = [
    "Which tables outline the required flow and openings for flushing pipelines?\nSelect table names from the doc that best answer the question, do not alter or analyze the table names themselves."
]

# Encode the query with the ColQwen2.5 processor and model on the CPU,
# without tracking gradients.
query_inputs = processor.process_queries(queries).to('cpu')
with torch.no_grad():
    query_embeddings = image_model(**query_inputs)
query_embeddings = query_embeddings.to("cpu")
print(query_embeddings.shape)
# Keep the embedding of the first (and only) query.
query_embedding = torch.unbind(query_embeddings)[0]
print(query_embedding)

torch.Size([1, 46, 128])
tensor([[-0.0112,  0.1169, -0.0855,  ..., -0.0449, -0.0049, -0.1042],
        [ 0.0161,  0.1156, -0.1057,  ..., -0.0567,  0.0234, -0.1076],
        [-0.0842, -0.0551, -0.0086,  ..., -0.0038, -0.0201,  0.1089],
        ...,
        [ 0.1017,  0.0293,  0.0124,  ..., -0.1560,  0.0668, -0.1667],
        [ 0.0989,  0.0322,  0.0102,  ..., -0.1637,  0.0629, -0.1626],
        [ 0.0952,  0.0351,  0.0076,  ..., -0.1708,  0.0612, -0.1642]],
       dtype=torch.float16)


In [27]:
# Convert the query embedding into a dict keyed by vector index, holding
# plain Python lists.
has_multiple_vectors = len(getattr(query_embedding, 'shape', ())) > 1
if has_multiple_vectors:
    # Multi-vector form, e.g. (n_tokens, vector_dim): one entry per row.
    vespa_qt_format = {
        position: row.tolist() for position, row in enumerate(query_embedding)
    }
else:
    # Single vector: stored under index 0.
    vespa_qt_format = {0: query_embedding.tolist()}

# Query inputs sent along with the search request.
request_body = {
    "input.query(qt)": vespa_qt_format,
    "input.query(text_weight)": 0.6,  # text weight; adjust as needed
    "presentation.timing": True,
}
print(request_body)

{'input.query(qt)': {0: [-0.0112152099609375, 0.116943359375, -0.08551025390625, -0.06146240234375, -0.11761474609375, 0.0755615234375, 0.1990966796875, 0.10333251953125, 0.057159423828125, -0.0302886962890625, -0.09808349609375, 0.095947265625, 0.07513427734375, 0.0300140380859375, 0.01206207275390625, 0.1268310546875, -0.01300811767578125, 0.0802001953125, -0.08856201171875, 0.0614013671875, 0.006999969482421875, -0.031982421875, -0.08782958984375, -0.05902099609375, -0.0701904296875, -0.059844970703125, 0.08685302734375, -0.07635498046875, -0.00821685791015625, -0.047210693359375, 0.048797607421875, -0.11083984375, -0.09674072265625, 0.030029296875, -0.06671142578125, -0.0267486572265625, -0.07403564453125, -0.04730224609375, -0.04150390625, -0.0030422210693359375, 0.007228851318359375, -0.058441162109375, 0.01523590087890625, -0.0301971435546875, 0.0125274658203125, -0.070068359375, -0.090576171875, 0.1357421875, 0.1724853515625, 0.0887451171875, -0.0249786376953125, 0.1328125, 0.0

In [45]:

try:
    # Text query restricted to one PDF; the query inputs built earlier are
    # sent through `body`.
    response = app.query(
        yql="select id, pdf_path, page_index, text from pdf_page where userInput(@userQuery) and pdf_path contains '4126467.pdf'",
        ranking="default",
        userQuery=queries[0],
        timeout=120,
        hits=5,  # return the top 5 results
        body=request_body,
    )

    # Check the response.
    if response.is_successful():
        print("查询成功!")

    else:
        print(f"查询失败: {response.status_code}")
        print(f"错误信息: {response.json}")

except Exception as e:
    # Any exception raised by the query call is reported here, not re-raised.
    print(f"查询过程中出错: {str(e)}")



查询过程中出错: [{'code': 17, 'summary': 'Bad request.', 'message': 'Invalid request [/search/?yql=select+id%2C+pdf_path%2C+page_index%2C+text+from+pdf_page+where+userInput%28%40userQuery%29+and+pdf_path+contains+%274126467.pdf%27&ranking=default&userQuery=Which+tables+outline+the+required+flow+and+openings+for+flushing+pipelines%3F%0ASelect+table+names+from+the+doc+that+best+answer+the+question%2C+do+not+alter+or+analyze+the+table+names+themselves.&timeout=120&hits=5]: invalid presentation.summary=true'}]


In [40]:
print(response.hits)

[{'id': 'index:pdf_pages/0/f809a40d5ba64116493e4f6a', 'relevance': 20.486672378339286, 'source': 'pdf_pages', 'fields': {'id': '4126467_page77', 'pdf_path': '4126467.pdf', 'page_index': 77}}, {'id': 'index:pdf_pages/0/9b737e1b8d3596a15d8d2393', 'relevance': 17.35146883447797, 'source': 'pdf_pages', 'fields': {'id': '4126467_page76', 'pdf_path': '4126467.pdf', 'page_index': 76}}, {'id': 'index:pdf_pages/0/1c2a202d7376db7313acfd90', 'relevance': 14.646819545275003, 'source': 'pdf_pages', 'fields': {'id': '4126467_page67', 'pdf_path': '4126467.pdf', 'page_index': 67}}, {'id': 'index:pdf_pages/0/c6b06f518e14be034c800aa6', 'relevance': 14.324492702582633, 'source': 'pdf_pages', 'fields': {'id': '4126467_page71', 'pdf_path': '4126467.pdf', 'page_index': 71}}, {'id': 'index:pdf_pages/0/bfed7cae8156181a9e2a1f8e', 'relevance': 13.845469925449228, 'source': 'pdf_pages', 'fields': {'id': '4126467_page21', 'pdf_path': '4126467.pdf', 'page_index': 21}}]


In [41]:
# Print rank, relevance and the main fields of every hit (hits are dicts).
for i, hit in enumerate(response.hits):
    print(f"\n结果 #{i+1} (相关度: {hit['relevance']})")
    fields = hit['fields']
    
    print(f"  PDF路径: {fields.get('pdf_path', 'N/A')}")
    print(f"  页码: {fields.get('page_index', 'N/A')}")
    
    # Print the text, cut to 200 characters; empty when the hit has no 'text' field.
    text = fields.get('text', '')
    if len(text) > 200:
        text = text[:200] + "..."
    print(f"  文本摘要: {text}")


结果 #1 (相关度: 20.486672378339286)
  PDF路径: 4126467.pdf
  页码: 77
  文本摘要: 

结果 #2 (相关度: 17.35146883447797)
  PDF路径: 4126467.pdf
  页码: 76
  文本摘要: 

结果 #3 (相关度: 14.646819545275003)
  PDF路径: 4126467.pdf
  页码: 67
  文本摘要: 

结果 #4 (相关度: 14.324492702582633)
  PDF路径: 4126467.pdf
  页码: 71
  文本摘要: 

结果 #5 (相关度: 13.845469925449228)
  PDF路径: 4126467.pdf
  页码: 21
  文本摘要: 


In [44]:
print(json.dumps(response.json, indent=2))

{
  "timing": {
    "querytime": 0.013000000000000001,
    "summaryfetchtime": 0.0,
    "searchtime": 0.014
  },
  "root": {
    "id": "toplevel",
    "relevance": 1.0,
    "fields": {
      "totalCount": 83
    },
    "coverage": {
      "coverage": 100,
      "documents": 688,
      "full": true,
      "nodes": 1,
      "results": 1,
      "resultsFull": 1
    },
    "children": [
      {
        "id": "index:pdf_pages/0/f809a40d5ba64116493e4f6a",
        "relevance": 20.486672378339286,
        "source": "pdf_pages",
        "fields": {
          "id": "4126467_page77",
          "pdf_path": "4126467.pdf",
          "page_index": 77
        }
      },
      {
        "id": "index:pdf_pages/0/9b737e1b8d3596a15d8d2393",
        "relevance": 17.35146883447797,
        "source": "pdf_pages",
        "fields": {
          "id": "4126467_page76",
          "pdf_path": "4126467.pdf",
          "page_index": 76
        }
      },
      {
        "id": "index:pdf_pages/0/1c2a202d7376db7313ac

In [55]:
import requests

def get_text_by_docapi(doc_id, vespa_url="http://localhost:8080", timeout=10.0):
    """Fetch the ``text`` field of one ``pdf_page`` document via /document/v1.

    Args:
        doc_id: User-specified part of the document id (e.g. ``"4126467_page77"``).
        vespa_url: Base URL of the Vespa endpoint.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled server would block the call indefinitely.

    Returns:
        The document's text (``""`` when the field is absent), or ``None``
        when the server answers with a status other than 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    from urllib.parse import quote

    # Percent-encode the id so characters such as "/" or spaces cannot
    # change the request path.
    encoded_id = quote(str(doc_id), safe="")
    url = f"{vespa_url}/document/v1/pdf_page/pdf_page/docid/{encoded_id}"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"请求失败: {response.status_code}")
        return None

    fields = response.json().get("fields", {})
    return fields.get("text", "")

# Fetch and print the stored text of every hit from the last query.
for hit in response.hits:
    doc_id = hit['fields']['id']
    text_content = get_text_by_docapi(doc_id)
    print(text_content)


TABLE 5 Required flow and openings to flush pipelines (40 psi residual pressure in
water main)*
 Flow Required
To Produce
Size of Tap, in.
2.5 ft/s(approx)
1
11/2
2
Number of
 Pipe Diameter
Velocity in Main
21/2-in.
in.
gpm
Number of Taps on Pipet
Hydrant Outlets
4
100
1
1
6
200
--
1
1
8
400
2
1
1
-
10
600
3
2
1
12
900
2
2
16
1600
4
2
* With a 40-psi pressure in the main and the hydrant flowing to atmosphere, a 2%/2-in. hydrant outlet will
discharge approximately 1000 gpm; and a 41/z in. hydrant outet will discharge approximately 2500 gpm.
+ Number of taps on pipe based on discharge through 5 feet of galvanized iron (Gl) pipe with one 90°
elbow.
P rocedure for chlorinating the main
1.
Water supplied from a temporary, backflow-protected connection to the
existing distribution system or other approved source of supply shall be made to flow at
a constant, measured rate into the newly installed water main. In the absence of a
meter, the rate may be approximated by methods such as placing a