# 1.图片存储为image

In [None]:
import fitz  # PyMuPDF
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
base_dir = '/data/coding/patent_qa/train/'
# Keep only files whose name ends in ".pdf"; a plain substring test would also
# pick up unrelated files that merely contain "pdf" somewhere in their name.
pdf_file_list = [x for x in os.listdir(base_dir+'/documents/') if x.lower().endswith('.pdf')]

for file_name in tqdm(pdf_file_list):
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(base_dir+'/documents/'+file_name) as pdf_document:
        # Output folder is named after the text before the first '.', which is
        # the same convention the OCR and embedding cells use to find the pages.
        img_dir = base_dir+'/pdf_img/'+file_name.split('.')[0]
        os.makedirs(img_dir, exist_ok=True)
        # Render every page: load_page is 0-based, output file names are 1-based.
        for i in range(pdf_document.page_count):
            page = pdf_document.load_page(i)
            # 600 dpi was judged sufficient for these documents.
            pix = page.get_pixmap(dpi=600)
            pix.save(img_dir+'/'+str(i+1)+'.jpg')

In [6]:
# 2.OCR

In [None]:
from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, load_dataset
from swift.plugin import InferStats
from swift.llm import VllmEngine
import os
# Environment variables assigned before the engine is created.
os.environ.update({
    'VIDEO_MAX_PIXELS': '50176',
    "CUDA_VISIBLE_DEVICES": "0,1",
    "MAX_PIXELS": "1229312",
    'FPS_MAX_FRAMES': "2",
})
model_path = "/data/coding/llm_model/Qwen/Qwen2___5-VL-7B-Instruct/"
# For multi-GPU runs, tensor_parallel_size is set to the number of GPUs.
engine = VllmEngine(
    model_path,
    model_type='qwen2_5_vl',
    gpu_memory_utilization=0.9,
    limit_mm_per_prompt={"image": 1},
    tensor_parallel_size=2,
)
def infer_batch(engine, infer_requests):
    """Send the requests to ``engine`` and return the text of the first reply.

    Args:
        engine: Object exposing ``infer(infer_requests, request_config)``.
        infer_requests: List of requests; only the response to the first
            request is returned.

    Returns:
        ``message.content`` of the first choice of the first response.
    """
    # temperature=0, at most 10240 generated tokens per request.
    request_config = RequestConfig(max_tokens=10240, temperature=0)
    resp_list = engine.infer(infer_requests, request_config)
    return resp_list[0].choices[0].message.content

In [7]:
def qwen_ocr_cn(img_path):
    """OCR one page image using the Chinese instruction prompt.

    Args:
        img_path: Image location, passed as the ``image`` field of the request.

    Returns:
        Whatever ``infer_batch`` returns for the single request.
    """
    prompt = "你是一个OCR专家，请提取以下图片中的所有文字内容，并原样返回。"
    image_part = {"type": "image", "image": img_path}
    text_part = {"type": "text", "text": prompt}
    # Single user turn: the image first, then the instruction.
    user_turn = {"role": "user", "content": [image_part, text_part]}
    request = InferRequest(messages=[user_turn])
    return infer_batch(engine, [request])

def qwen_ocr_en(img_path):
    """OCR one page image using the English instruction prompt.

    Args:
        img_path: Image location, passed as the ``image`` field of the request.

    Returns:
        Whatever ``infer_batch`` returns for the single request.
    """
    prompt = "You are an OCR expert. Please extract all text from the following images, which have been converted from PDFs, and return the exact text content found in the attached image."
    image_part = {"type": "image", "image": img_path}
    text_part = {"type": "text", "text": prompt}
    # Single user turn: the image first, then the instruction.
    user_turn = {"role": "user", "content": [image_part, text_part]}
    request = InferRequest(messages=[user_turn])
    return infer_batch(engine, [request])

In [None]:
# 进行ocr
import fitz  # PyMuPDF
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
base_dir = '/data/coding/patent_qa/train/'
# pdf_img holds one entry per converted PDF.
pdf_file_list = os.listdir(base_dir+'/pdf_img/')
for pdf_file in pdf_file_list:
    file_name = pdf_file.split('.')[0]
    img_dir = base_dir+'/pdf_img/'+file_name
    # Page images rendered for this PDF.
    file_list = [entry for entry in os.listdir(img_dir) if 'jpg' in entry]
    os.makedirs(base_dir+'/pdf_ocr/'+file_name,exist_ok=True)
    for img_name in tqdm(file_list):
        img_path = img_dir + '/' + img_name
        # Image files are named "<page>.jpg".
        page_num = int(img_name.split('.')[0])
        response = qwen_ocr_cn(img_path)
        # Store the OCR result as "<page>.txt" next to the other pages of this PDF.
        with open(base_dir+f'/pdf_ocr/{file_name}/{page_num}.txt', 'w', encoding='utf-8') as text_file:
            text_file.write(response)
    print(file_name,'  ok')

# 2. OCR的结果存入向量库

In [None]:
import os
# The environment variables are assigned before gme_inference is imported and the model is built.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1'
os.environ["MAX_PIXELS"] = '1229312' # NOTE(review): 1003520 looks like an alternative value that was tried — confirm
from gme_inference import GmeQwen2VL
# Embedding model used by the following cells for text and image embeddings.
gme = GmeQwen2VL(model_name='/data/coding/llm_model/iic/gme-Qwen2-VL-7B-Instruct',max_image_tokens=1280)

In [2]:
import os
import pandas as pd
import numpy as np
import tqdm
from warnings import filterwarnings
# Suppress all warnings.
filterwarnings("ignore")
base_dir = '/data/coding/patent_qa/train/'
pdf_file_list = os.listdir(base_dir+'/pdf_img/')
# Count the OCR text files over all documents.
files_total_cnt = 0
for pdf_file in pdf_file_list:
    file_name = pdf_file.split('.')[0]
    ocr_entries = os.listdir(base_dir+'/pdf_ocr/'+file_name)
    file_list = [entry for entry in ocr_entries if 'txt' in entry]
    files_total_cnt = files_total_cnt + len(file_list)

In [None]:
# There are several documents, so every embedding row also records the file
# and page it came from.
ocr_page_num_list = []
ocr_name_list = []
# One embedding per OCR text file, in the same order as the two lists above.
# Collecting rows (instead of filling a pre-sized np.empty buffer) means the
# matrix can never contain uninitialised rows if the file count changes.
ocr_rows = []
for pdf_file in pdf_file_list:
    file_name = pdf_file.split('.')[0]
    ocr_dir = base_dir+'/pdf_ocr/'+file_name
    for txt_name in os.listdir(ocr_dir):
        if 'txt' not in txt_name:  # only the OCR text files
            continue
        with open(ocr_dir + '/' + txt_name, 'r', encoding='utf-8') as f:
            text_content = f.read()
        e_text = gme.get_text_embeddings(texts=[text_content])
        ocr_rows.append(e_text[0].to('cpu').numpy())
        # Text files are named "<page>.txt".
        ocr_page_num_list.append(int(txt_name.split('.')[0]))
        ocr_name_list.append(file_name)
# Stored as float64, one row per page; 3584 is the embedding width used when
# there is nothing to stack.
if ocr_rows:
    ocr_vectors = np.asarray(ocr_rows, dtype=np.float64)
else:
    ocr_vectors = np.empty((0, 3584))
# Row i of the matrix belongs to mapping row with index == i.
ocr_page_num_mapping = pd.DataFrame({'index': range(len(ocr_page_num_list)), 'page_num': ocr_page_num_list, 'file_name': ocr_name_list})
# Persist both the vectors and the row -> (file, page) mapping.
np.save('train_ocr_vectors.npy', ocr_vectors)
ocr_page_num_mapping.to_csv('train_ocr_page_num_mapping.csv', index=False)


# 3. 图片的结果存入向量库

In [4]:
import os
import pandas as pd
import numpy as np
import tqdm
from warnings import filterwarnings
# Suppress all warnings.
filterwarnings("ignore")
base_dir = '/data/coding/patent_qa/train/'
pdf_file_list = os.listdir(base_dir+'/pdf_img/')
# Count the page images over all documents.
files_total_cnt = 0
for pdf_file in pdf_file_list:
    file_name = pdf_file.split('.')[0]
    img_entries = os.listdir(base_dir+'/pdf_img/'+file_name)
    file_list = [entry for entry in img_entries if 'jpg' in entry]
    files_total_cnt = files_total_cnt + len(file_list)

In [None]:
# There are several documents, so every embedding row also records the file
# and page it came from.
img_page_num_list = []
img_name_list = []
# One embedding per page image, in the same order as the two lists above.
# Collecting rows (instead of filling a pre-sized np.empty buffer) means the
# matrix can never contain uninitialised rows if the file count changes.
img_rows = []
for pdf_file in pdf_file_list:
    file_name = pdf_file.split('.')[0]
    img_dir = base_dir+'/pdf_img/'+file_name
    for img_name in os.listdir(img_dir):
        if 'jpg' not in img_name:  # only the rendered page images
            continue
        image_path = img_dir + '/' + img_name
        e_image = gme.get_image_embeddings(images=[image_path])
        img_rows.append(e_image[0].to('cpu').numpy())
        # Image files are named "<page>.jpg".
        img_page_num_list.append(int(img_name.split('.')[0]))
        img_name_list.append(file_name)
# Stored as float64, one row per page; 3584 is the embedding width used when
# there is nothing to stack.
if img_rows:
    img_vectors = np.asarray(img_rows, dtype=np.float64)
else:
    img_vectors = np.empty((0, 3584))
# Row i of the matrix belongs to mapping row with index == i.
img_page_num_mapping = pd.DataFrame({'index': range(len(img_page_num_list)), 'page_num': img_page_num_list, 'file_name': img_name_list})
# Persist both the vectors and the row -> (file, page) mapping.
np.save('train_pdf_img_vectors.npy', img_vectors)
img_page_num_mapping.to_csv('train_pdf_img_page_num_mapping.csv', index=False)

# 4. 读取问题生成问题的向量

In [7]:
# Load the training questions from a JSON Lines file (lines=True: one record per line).
df_question = pd.read_json('/data/coding/patent_qa/train/questions.jsonl',lines=True)

In [None]:
# Embed every question and save the resulting matrix (one row per question).
question_vectors = np.empty((len(df_question), 3584))
# Iterate the column values positionally so the loop does not depend on the
# DataFrame carrying a 0..n-1 label index.
for i, (question, options) in enumerate(zip(df_question['question'], df_question['options'])):
    # Options are joined by spaces and appended directly to the question text.
    full_question = question + ' '.join(options)
    query_vec = gme.get_text_embeddings(texts=[full_question])
    question_vectors[i] = query_vec[0].to('cpu').numpy()
# Persist the question vectors.
np.save('all_train_question_vectors.npy', question_vectors)

In [10]:
# Reload the question table and preview its first rows.
df_question = pd.read_json("/data/coding/patent_qa/train/questions.jsonl",lines=True)
df_question.head()

Unnamed: 0,question,document,options,answer,group,id
0,根据专利文本，以下哪个是该货物靠边规整处理机构的主要功能？,CN213444549U.pdf,"[A. 实现对货物的加热处理。, B. 实现对货物的靠边规整处理。, C. 实现对货物的分拣...",B,1,1
1,根据专利文本，关于转辊倾斜角度的描述，以下哪项是正确的？,CN213444549U.pdf,"[A. 转辊的倾斜角度为15-20°。, B. 转辊的倾斜角度为5-10°。, C. 转辊的...",B,1,2
2,在文件中第5页提供的图片中，编号为4的部件是什么？,CN213444549U.pdf,"[A. 转轴, B. 侧板, C. 联动皮带, D. 支架]",C,2,3
3,在文件中第5页提供的图片中，编号为5的部件是什么？,CN213444549U.pdf,"[A. 转辊, B. 支架, C. 联动皮带, D. 侧板]",B,2,4
4,在文件中第5页的示意图中，如果货物靠着侧板1移动，那么最可能的原因是？,CN213444549U.pdf,"[A. 转辊2是水平设置的, B. 联动皮带4松动, C. 货物移动到了侧边并且不能再移动,...",C,3,5


In [11]:
import numpy as np
import pandas as pd
# Load the saved question vectors, the OCR row -> (file, page) mapping and the OCR vectors.
# NOTE(review): `quesion_vector` is misspelled, but the retrieval code below reads it under this spelling.
quesion_vector = np.load('all_train_question_vectors.npy')
ocr_page_num_mapping = pd.read_csv('train_ocr_page_num_mapping.csv')
train_ocr_vectors = np.load('train_ocr_vectors.npy')

In [27]:
# Worked example: find the OCR pages most similar to question 0.
querstion_idx = 0
document_name = df_question.document[querstion_idx].split('.')[0]
# Rows of the vector matrix that belong to this document.
vec_idx = ocr_page_num_mapping[ocr_page_num_mapping['file_name']==document_name]['index'].values
candidate_vec = train_ocr_vectors[vec_idx]
query_vec = quesion_vector[querstion_idx]
# Cosine similarity needs one norm per candidate row (axis=1); the norm of the
# whole matrix would only rescale the raw dot products.
cos_sim = np.dot(candidate_vec, query_vec) / (np.linalg.norm(candidate_vec, axis=1) * np.linalg.norm(query_vec))
# Indices of the k most similar candidates, best first.
k = 5
top_k_indices = np.argsort(cos_sim)[-k:][::-1]
retrived_idx = vec_idx[top_k_indices] # mapping rows of the k closest pages
retrived_page_num = ocr_page_num_mapping.loc[retrived_idx]['page_num'].to_list()



In [28]:
# Display the retrieved page numbers.
retrived_page_num

[3, 4, 1, 2, 6]

In [29]:
def get_similar_text_embedding(base_dir,document_name,quesion_idx,top_k):
    """Return the OCR text of the pages most similar to one question.

    Reads the module-level ``ocr_page_num_mapping``, ``train_ocr_vectors`` and
    ``quesion_vector``.

    Args:
        base_dir: Dataset root; texts are read from
            ``<base_dir>/pdf_ocr/<document>/<page>.txt``.
        document_name: Document to search; anything after the first ``.``
            (such as ``.pdf``) is ignored.
        quesion_idx: Row of ``quesion_vector`` holding the question embedding.
        top_k: Maximum number of pages to return.

    Returns:
        List of page texts ordered from most to least similar, with at most
        ``top_k`` entries (empty when the document has no rows).
    """
    if top_k <= 0:
        # A slice of [-0:] would otherwise select every candidate.
        return []
    doc_stem = document_name.split('.')[0]
    doc_rows = ocr_page_num_mapping[ocr_page_num_mapping['file_name']==doc_stem]
    vec_idx = doc_rows['index'].values
    candidate_vec = train_ocr_vectors[vec_idx]
    query_vec = quesion_vector[quesion_idx]
    # Cosine similarity: one norm per candidate row (axis=1). The floor on the
    # denominator keeps an all-zero vector from producing NaN scores.
    denom = np.linalg.norm(candidate_vec, axis=1) * np.linalg.norm(query_vec)
    cos_sim = np.dot(candidate_vec, query_vec) / np.maximum(denom, np.finfo(float).eps)
    # Positions of the top_k best candidates, best first.
    top_k_indices = np.argsort(cos_sim)[-top_k:][::-1]
    retrived_page_num = doc_rows['page_num'].values[top_k_indices]
    text_list = []
    for page_num in retrived_page_num:
        text_file = base_dir + '/pdf_ocr/' + doc_stem + '/' + str(page_num) +'.txt'
        # The OCR step writes these files as UTF-8.
        with open(text_file,'r',encoding='utf-8') as f:
            text_list.append(f.read())
    return text_list


In [None]:
# 试一下qwen3的llm是不是效果好一些
from vllm import LLM, SamplingParams
import os
# NOTE(review): CUDA_VISIBLE_DEVICES is assigned after vllm has already been imported above — confirm the setting still takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
# Text-only LLM used by get_text_answer.
text_qa_llm = LLM(model="/home/octopus/data/dxw/pretrain_model/Qwen/Qwen3-8B",gpu_memory_utilization=0.9)

In [None]:
def get_text_answer(document_name,question,options,quesion_idx):
    """Answer a single-choice question from retrieved OCR text.

    Args:
        document_name: Document the question refers to.
        question: Question text.
        options: Iterable of option strings, joined with spaces in the prompt.
        quesion_idx: Index of the question, used to look up its embedding.

    Returns:
        Text of the first completion produced by ``text_qa_llm``.
    """
    prompt="你是一个专利内容分析专家，请根据我提供的专利内容回答我的单选题。\n"
    retrived_list = get_similar_text_embedding(base_dir,document_name,quesion_idx,top_k=3)
    # Assemble the model input in its own variable so the `question` argument
    # is not overwritten; the instruction preamble goes first.
    llm_input = ''.join([
        prompt,
        "【我的问题】【", question, "】\n",
        "【选项】【", ' '.join(options), "】\n",
        "请在分析我提供的专利问题后回答我的单选题，回答选项字母。专利内容为：\n",
        '\n'.join(retrived_list),
        "请你分析专利内容后，回答我的单选题，回答选项字母，你的答案为：\n",
    ])
    # NOTE(review): no max_tokens is set, so the library default applies —
    # confirm it is long enough for the expected answer.
    sampling_params = SamplingParams(temperature=0)
    outputs = text_qa_llm.generate(llm_input,sampling_params)
    return outputs[0].outputs[0].text

In [None]:
# The image-embedding cell saves its matrix as 'train_pdf_img_vectors.npy',
# so that is the file name to load here.
train_pdf_image_vectors = np.load("train_pdf_img_vectors.npy")
train_pdf_image_page_num_mapping = pd.read_csv('train_pdf_img_page_num_mapping.csv')
def get_similar_image_embedding(base_dir,document_name,quesion_idx,top_k):
    """Return the page images most similar to one question.

    Reads the module-level ``train_pdf_image_page_num_mapping``,
    ``train_pdf_image_vectors`` and ``quesion_vector``.

    Args:
        base_dir: Dataset root; images live at
            ``<base_dir>/pdf_img/<document>/<page>.jpg``.
        document_name: Document to search; anything after the first ``.``
            (such as ``.pdf``) is ignored.
        quesion_idx: Row of ``quesion_vector`` holding the question embedding.
        top_k: Maximum number of pages to return.

    Returns:
        List of image file paths ordered from most to least similar, with at
        most ``top_k`` entries (empty when the document has no rows).
    """
    if top_k <= 0:
        # A slice of [-0:] would otherwise select every candidate.
        return []
    doc_stem = document_name.split('.')[0]
    doc_rows = train_pdf_image_page_num_mapping[train_pdf_image_page_num_mapping['file_name']==doc_stem]
    vec_idx = doc_rows['index'].values
    candidate_vec = train_pdf_image_vectors[vec_idx]
    query_vec = quesion_vector[quesion_idx]
    # Cosine similarity: one norm per candidate row (axis=1). The floor on the
    # denominator keeps an all-zero vector from producing NaN scores.
    denom = np.linalg.norm(candidate_vec, axis=1) * np.linalg.norm(query_vec)
    cos_sim = np.dot(candidate_vec, query_vec) / np.maximum(denom, np.finfo(float).eps)
    # Positions of the top_k best candidates, best first.
    top_k_indices = np.argsort(cos_sim)[-top_k:][::-1]
    retrived_page_num = doc_rows['page_num'].values[top_k_indices]
    return [
        base_dir + '/pdf_img/' + doc_stem + '/' + str(page_num) +'.jpg'
        for page_num in retrived_page_num
    ]

In [None]:
from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, load_dataset
from swift.plugin import InferStats
from swift.llm import VllmEngine
import os
# Environment variables assigned before the engine is created.
os.environ.update({
    'VIDEO_MAX_PIXELS': '50176',
    "CUDA_VISIBLE_DEVICES": "0,1",
    "MAX_PIXELS": "1229312",
    'FPS_MAX_FRAMES': "2",
})
model_path = "/data/coding/llm_model/Qwen/Qwen2___5-VL-7B-Instruct"
# Up to three images are allowed per prompt for the QA step.
engine = VllmEngine(
    model_path,
    model_type='qwen2_5_vl',
    gpu_memory_utilization=0.9,
    limit_mm_per_prompt={"image": 3},
)
def infer_batch(engine, infer_requests):
    """Send the requests to ``engine`` and return the text of the first reply.

    Args:
        engine: Object exposing ``infer(infer_requests, request_config)``.
        infer_requests: List of requests; only the response to the first
            request is returned.

    Returns:
        ``message.content`` of the first choice of the first response.
    """
    # temperature=0, at most 1024 generated tokens per request.
    request_config = RequestConfig(max_tokens=1024, temperature=0)
    resp_list = engine.infer(infer_requests, request_config)
    return resp_list[0].choices[0].message.content

def get_image_answer(document_name,question,options,quesion_idx):
    """Answer a single-choice question from the most similar page images.

    Args:
        document_name: Document the question refers to.
        question: Question text.
        options: Iterable of option strings, joined with spaces in the prompt.
        quesion_idx: Index of the question, used to look up its embedding.

    Returns:
        The reply text returned by ``infer_batch``.
    """
    prompt="你是一个专利内容分析专家，请根据我提供的专利内容回答我的单选题。\n"
    # The instruction preamble goes first in the leading text part.
    question1 = prompt + "【我的问题】【"
    question1 += (question +"】\n")
    question1 += "【选项】【"
    question1 += (' '.join(options) + "】\n")
    question1 += ("请在分析我提供的专利问题后回答我的单选题，回答选项字母。专利内容为：\n")
    # The retriever takes exactly (base_dir, document_name, quesion_idx, top_k).
    retrived_list = get_similar_image_embedding(base_dir,document_name,quesion_idx,top_k=2)
    question2 = ("请你分析专利内容后，回答我的单选题，回答选项字母，你的答案为：\n")
    # One image part per retrieved page, so zero, one or two pages all work.
    content = [{"type": "text", "text": question1}]
    content.extend({"type": "image", "image": image_path} for image_path in retrived_list)
    content.append({"type": "text", "text": question2})
    messages = [{"role": "user", "content": content}]
    infer_requests = [InferRequest(messages=messages)]
    return infer_batch(engine, infer_requests)

In [None]:
# When the question names a page, that page image is supplied directly and the
# most similar OCR text is retrieved to go with it.
def get_mix_answer(document_name,pic_page_num,question,options,question_idx):
    """Answer a single-choice question from a given page image plus retrieved text.

    Args:
        document_name: Document the question refers to; anything after the
            first ``.`` (such as ``.pdf``) is dropped when locating the image.
        pic_page_num: Page number of the image referenced by the question.
        question: Question text.
        options: Iterable of option strings, joined with spaces in the prompt.
        question_idx: Index of the question, used to look up its embedding.

    Returns:
        The reply text returned by ``infer_batch``.
    """
    prompt="你是一个专利内容分析专家，请根据我提供的专利内容回答我的单选题。\n"
    # The instruction preamble goes first in the leading text part.
    question1 = prompt + "【我的问题】【"
    question1 += (question +"】\n")
    question1 += "【选项】【"
    question1 += (' '.join(options) + "】\n")
    question1 += ("请在分析我提供的专利问题后回答我的单选题，回答选项字母。专利内容为：\n")
    # The retriever takes exactly (base_dir, document_name, quesion_idx, top_k).
    retrived_list = get_similar_text_embedding(base_dir,document_name,question_idx,top_k=2)
    question2 = ("请你分析专利内容后，回答我的单选题，回答选项字母，你的答案为：\n")
    # Image folders are named after the document without its extension, and the
    # page comes from the pic_page_num argument rather than any global.
    image_file = base_dir + '/pdf_img/' + document_name.split('.')[0] + '/' + str(pic_page_num) +'.jpg'
    messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question1},
                    {
                        "type": "image",
                        "image": image_file,
                    },
                    {"type": "text", "text": '\n'.join(retrived_list)},
                    {"type": "text", "text": question2},
                ],
            }
    ]
    infer_requests = [InferRequest(messages=messages)]
    return infer_batch(engine, infer_requests)

In [None]:
import re  # used for the page-number pattern below

for i in range(len(df_question)):
    question = df_question.loc[i,'question']
    document_name = df_question.loc[i,'document']
    options = df_question.loc[i,'options']
    true_answer = df_question.loc[i,'answer']
    full_question = question + ' '.join(options)
    # A question refers to a figure when it names a page ("第N页") and also
    # mentions "图". Questions without a parsable page number fall through to
    # the retrieval-only branch instead of raising.
    page_match = re.search(r"第(\d+)页", question)
    if page_match and "图" in question:
        pic_page_num = int(page_match.group(1))
        answer = get_mix_answer(document_name,pic_page_num,question,options,i)
    else:
        text_answer = get_text_answer(document_name,question,options,i) # answer from retrieved text
        image_answer = get_image_answer(document_name,question,options,i) # answer from retrieved images
    