In [2]:
import glob
import pdfplumber
import re
from collections import defaultdict
import json

class PDFProcessor:
    """Flatten a PDF into an ordered list of text lines and table rows.

    Every extracted line or table row is stored in ``all_text`` under a
    running integer key as a dict with the keys ``page``, ``allrow``,
    ``type`` and ``inside``.  ``type`` is ``'text'`` for text lines,
    ``'excel'`` for table rows (``inside`` is then ``str(list_of_cells)``),
    and is rewritten to ``'页眉'`` (page header) / ``'页脚'`` (page footer)
    for rows recognised as such.

    The instance may be used as a context manager so that the underlying
    pdfplumber document is closed when processing is finished.
    """

    # Accumulated text ending in one of these tokens is considered a finished
    # line, so a following visual line is never glued onto it.
    _LINE_END_RE = re.compile(
        r'(?:。|；|单位：元|单位：万元|币种：人民币|\d|报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)$')
    # A first line ending in "报告" / "报告全文" (optionally followed by a
    # revision marker) and not preceded by "计" is tagged as the page header.
    _HEADER_RE = re.compile(r'[^计](?:报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)$')
    # A last line starting with a digit, backslash, slash, "第", "共", "页",
    # "-", "_" or a space is tagged as the page footer.  The original non-raw
    # literal collapsed ``\\|\/`` into the regex ``\|\/`` (a literal "|/"), so
    # neither a lone backslash nor a lone slash was actually accepted.
    _FOOTER_RE = re.compile(r'^(?:\d|\\|/|第|共|页|-|_| )+')

    def __init__(self, filepath):
        """Open ``filepath`` with pdfplumber and reset the row accumulator."""
        self.filepath = filepath
        self.pdf = pdfplumber.open(filepath)
        # Running row index -> row dict (see the class docstring).
        self.all_text = defaultdict(dict)
        # Index the next emitted row will receive.
        self.allrow = 0
        # Index of the last row emitted by the previously processed page.
        self.last_num = 0

    def close(self):
        """Close the underlying pdfplumber document."""
        self.pdf.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def check_lines(self, page, top, buttom):
        """Join the words of ``page`` into newline-separated logical lines.

        Only words inside the requested vertical band are emitted:

        * ``top == ''`` and ``buttom == ''``: every word on the page;
        * ``top == ''`` only: words whose ``top`` is greater than ``buttom``;
        * otherwise: words with ``buttom < word['top'] < top``.

        A word is appended to the current line when its ``top`` is within 2
        units of the previous word's.  A word on a new visual line is still
        glued to the running text when the previous word's right edge (``x1``)
        lay beyond 85% of the page width, the word sits above 90% (whole page)
        or 85% (banded) of the page height, and the text so far does not end
        with a line terminator.  In every other case a ``'\\n'`` is inserted
        first, so a non-empty result always starts with ``'\\n'``.

        Returns:
            The joined text (``''`` when no word falls inside the band).
        """
        whole_page = top == '' and buttom == ''
        # The wrap-continuation cutoff is slightly lower when working in bands.
        wrap_limit = page.height * (0.9 if whole_page else 0.85)
        right_margin = page.width * 0.85

        text = ''
        last_top = 0
        last_check = 0
        for word in page.extract_words():
            word_top = word['top']
            if whole_page:
                in_band = True
            elif top == '':
                in_band = word_top > buttom
            else:
                in_band = buttom < word_top < top

            if in_band:
                same_line = abs(last_top - word_top) <= 2
                wrapped = (last_check > 0
                           and (wrap_limit - word_top) > 0
                           and not self._LINE_END_RE.search(text))
                if same_line or wrapped:
                    text += word['text']
                else:
                    text += '\n' + word['text']

            # Position tracking follows every word, including skipped ones.
            last_top = word_top
            last_check = word['x1'] - right_margin

        return text

    def drop_empty_cols(self, data):
        """Return ``data`` (a list of rows) without columns that are all ``''``."""
        # Compare by value: ``cell is ''`` relied on string interning and
        # raises a SyntaxWarning on current Python versions.
        kept_columns = [col for col in zip(*data) if any(cell != '' for cell in col)]
        return [list(row) for row in zip(*kept_columns)]

    @staticmethod
    def keep_visible_lines(obj):
        """
        If the object is a ``rect`` type, keep it only if the lines are visible.

        A visible line is the one having ``non_stroking_color`` not null.
        Rects smaller than 1 unit in both width and height are dropped as
        well; ``char`` objects are kept only when both their stroking and
        non-stroking colours are set.  Every other object is kept.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return obj['stroking_color'] is not None and obj['non_stroking_color'] is not None
        return True

    def _append_rows(self, page, row_type, values):
        """Append one ``all_text`` entry of ``row_type`` per item of ``values``."""
        for value in values:
            self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                          'type': row_type, 'inside': value}
            self.allrow += 1

    @staticmethod
    def _merge_continuation_rows(rows):
        """Fold rows whose first cell is ``None`` into the preceding full row.

        Non-blank cells of such a continuation row are concatenated onto the
        same column of the closest earlier row that has a first cell, and are
        cleared in the continuation row.  ``rows`` is modified in place and
        returned.
        """
        anchor = None
        for row in rows:
            if row[0] is not None:
                anchor = row
                continue
            if anchor is None:
                # No earlier row to merge into.  The original indexed
                # ``rows[-1]`` here and appended these cells to the *last* row
                # of the table; keep the row in place instead.
                row[0] = ''
                anchor = row
                continue
            for col, cell in enumerate(row):
                if cell is not None and cell not in ('', ' '):
                    if anchor[col] is None:
                        anchor[col] = cell
                    else:
                        anchor[col] += cell
                    row[col] = None
        return rows

    def _clean_table(self, rows):
        """Drop continuation/blank rows, strip newlines and remove empty columns."""
        cleaned = []
        for row in rows:
            if row[0] is None:
                continue
            cells = ['' if cell is None else cell.replace('\n', '') for cell in row]
            if any(cell != '' for cell in cells):
                cleaned.append(cells)
        return self.drop_empty_cols(cleaned)

    def _tag_header_footer(self, page):
        """Retag the first real line / last line of ``page`` as header / footer."""
        # The text of a page starts with '\n', so its first emitted row is
        # empty and the first real line is the second row of the page.
        first_idx = 1 if self.last_num == 0 else self.last_num + 2
        last_idx = len(self.all_text) - 1
        # ``.get`` does not trigger the defaultdict factory; plain indexing
        # used to insert empty rows for missing keys, which shifted
        # ``last_num`` and leaked ``{}`` lines into the saved output.
        first_row = self.all_text.get(first_idx)
        last_row = self.all_text.get(last_idx)
        if first_row is None or last_row is None:
            # Same diagnostic as before: report pages too short to inspect.
            print(page.page_number)
            return

        first_text = str(first_row['inside'])
        end_text = str(last_row['inside'])
        if '[' in end_text:
            # Neither tag is applied when the last row contains '['
            # (table rows are stored as ``str(list)``).
            return
        if self._HEADER_RE.search(first_text):
            first_row['type'] = '页眉'
        # The footer is checked independently of the header on every page; the
        # first page used to require a header match before looking at it.
        if self._FOOTER_RE.search(end_text):
            last_row['type'] = '页脚'

    def extract_text_and_tables(self, page):
        """Append the text lines and table rows of ``page`` to ``all_text``.

        Tables are handled in the order returned by ``page.find_tables()``.
        For each one the text between the previous table and this one is
        emitted first, then the cleaned table rows.  A table whose bottom
        edge lies above the bottom of an already processed table is skipped.
        Text below the last processed table is emitted once all tables are
        done.  Finally the page's header/footer rows are tagged and
        ``last_num`` is advanced.
        """
        buttom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if tables:
            for table in tables:
                if table.bbox[3] < buttom:
                    continue
                top = table.bbox[1]
                self._append_rows(page, 'text', self.check_lines(page, top, buttom).split('\n'))

                buttom = table.bbox[3]
                table_rows = self._clean_table(self._merge_continuation_rows(table.extract()))
                self._append_rows(page, 'excel', [str(row) for row in table_rows])

            # Emitted unconditionally: the previous per-table countdown never
            # reached zero when a table was skipped, silently dropping all
            # text below the last table.
            self._append_rows(page, 'text', self.check_lines(page, '', buttom).split('\n'))
        else:
            self._append_rows(page, 'text', self.check_lines(page, '', '').split('\n'))

        self._tag_header_footer(page)
        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Run ``extract_text_and_tables`` on every page, in document order."""
        for page in self.pdf.pages:
            self.extract_text_and_tables(page)

    def save_all_text(self, path):
        """Write every collected row to ``path`` as one UTF-8 JSON object per line."""
        with open(path, 'w', encoding='utf-8') as file:
            file.writelines(json.dumps(row, ensure_ascii=False) + '\n'
                            for row in self.all_text.values())


def process_all_pdfs_in_folder(folder_path):
    """Convert every file directly inside ``folder_path`` to ``alltxt/<name>.txt``.

    Files are handled in reverse lexicographic order of their path.  Each
    path is printed before it is processed.  A failure on one file is
    reported (``check`` followed by the exception) and does not stop the
    remaining files.

    Args:
        folder_path: Directory whose entries are matched with ``<folder>/*``.
    """
    import os  # local import: the surrounding cell does not import ``os``

    file_paths = sorted(glob.glob(f'{folder_path}/*'), reverse=True)

    for file_path in file_paths:
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            try:
                processor.process_pdf()
                # ``basename`` copes with the backslash-separated paths glob
                # returns on Windows, which ``split('/')`` left untouched.
                file_name = os.path.basename(file_path)
                stem, ext = os.path.splitext(file_name)
                # Swap only a real ``.pdf`` extension, not any ".pdf" substring.
                if ext.lower() == '.pdf':
                    file_name = stem + '.txt'
                os.makedirs('alltxt', exist_ok=True)
                processor.save_all_text(os.path.join('alltxt', file_name))
            finally:
                # Release the file handle held by pdfplumber.
                processor.pdf.close()
        except Exception as exc:
            # Best effort per file, but surface what went wrong instead of
            # hiding it (and let KeyboardInterrupt propagate).
            print('check', repr(exc))




# [notebook stderr residue - presumably the SyntaxWarning echo for `is` with a literal]
#   filtered_data = [col for col in transposed_data if not all(cell is '' for cell in col)]


In [4]:
if __name__ == '__main__':
    # Single-file run: convert one PDF and dump its rows as JSON lines.
    # NOTE(review): both paths are machine-specific absolute paths.
    pdf_path = r'D:/微信文件/WeChat Files/a35857710/FileStorage/File/2023-11/红土盐田港REIT：红土创新盐田港仓储物流封闭式基础设施证券投资基金招募说明书（更新）2021年第1期(1)/test.pdf'
    out_path = r'D:/reits知识库/具体项目/test_pdf2textnew.txt'
    processor = PDFProcessor(pdf_path)
    processor.process_pdf()
    processor.save_all_text(out_path)

# folder_path = 'allpdf'
# process_all_pdfs_in_folder(folder_path)

In [None]:
from preprocess import extract_pdf_text, extract_pdf_tables

In [None]:
import os
import json
import shutil
# import pdfplumber
# import camelot
from multiprocessing import Pool
from loguru import logger
# from langchain.document_loaders import UnstructuredPDFLoader
# from langchain.document_loaders import PDFPlumberLoader
# from langchain.document_loaders import TextLoader
# from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# from langchain.schema import Document
# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS

from config import cfg
from file import load_pdf_info
# from chinese_text_splitter import ChineseTextSplitter
# from pdf2txt import PDFProcessor
from pdf_util import PdfExtractor
from financial_state import (extract_basic_info, extract_employee_info,
    extract_cbs_info, extract_cscf_info, extract_cis_info, extract_dev_info, merge_info)


def setup_xpdf():
    """Change into ``cfg.XPDF_PATH`` and make ``pdftotext`` executable.

    The working directory of the process stays switched afterwards.
    """
    os.chdir(cfg.XPDF_PATH)
    # Relative name: resolved against the directory entered just above.
    os.system('chmod +x pdftotext')


def extract_pure_content(idx, key, pdf_path):
    """Extract the pure text of one PDF into ``<text dir>/<key>/pure_content.txt``.

    Args:
        idx: Position of the PDF in the batch; only used in the log message.
        key: Name of the per-PDF sub-directory below
            ``cfg.DATA_PATH/cfg.PDF_TEXT_DIR``.
        pdf_path: Path of the PDF handed to ``PdfExtractor``.
    """
    logger.info('Extract text for {}:{}'.format(idx, key))
    key_dir = os.path.join(cfg.DATA_PATH, cfg.PDF_TEXT_DIR, key)
    # Creates missing parents too and tolerates an existing directory, so the
    # function also works when called on its own and has no check-then-create
    # race.
    os.makedirs(key_dir, exist_ok=True)
    save_path = os.path.join(key_dir, 'pure_content.txt')
    # Always start from a clean output file.
    try:
        os.remove(save_path)
    except FileNotFoundError:
        pass
    PdfExtractor(pdf_path).extract_pure_content_and_save(save_path)

# def extract_text(idx, key, pdf_path):
#     print(idx, key, pdf_path)
#     save_dir = os.path.join(cfg.DATA_PATH, __pdf_text_dir__)
#     key_dir = os.path.join(save_dir, key)
#     if not os.path.exists(key_dir):
#         os.mkdir(key_dir)
#     save_path = os.path.join(key_dir, 'docs.txt')
#     # if os.path.exists(save_path):
#     #     return
#     # else:
#         # os.chdir(__xpdf_path__)
#         # cmd = './pdftotext -lineprinter "{}" "{}"'.format(pdf_path, save_path)
#         # print(cmd)
#         # os.system(cmd)
#     try:
#         processor = PDFProcessor(pdf_path)
#         processor.process_pdf()
#         processor.save_all_text(save_path)
#         # PdfExtractor(pdf_path).extract_and_save(save_path)
#     except Exception as e:
#         print(e, pdf_path)


def extract_pdf_text(extract_func=extract_pure_content):
    """Run ``extract_func`` over every known PDF using a process pool.

    Args:
        extract_func: Callable invoked as ``extract_func(index, key, pdf_path)``
            for each entry returned by ``load_pdf_info()``.
    """
    setup_xpdf()

    # Make sure the shared output directory exists before workers start.
    text_root = os.path.join(cfg.DATA_PATH, cfg.PDF_TEXT_DIR)
    if not os.path.exists(text_root):
        os.mkdir(text_root)

    pdf_info = load_pdf_info()

    # Serial alternative for debugging: call
    # extract_func(i, k, v['pdf_path']) in a plain loop over pdf_info.items().
    with Pool(processes=cfg.NUM_PROCESSES) as pool:
        jobs = [(index, key, info['pdf_path'])
                for index, (key, info) in enumerate(pdf_info.items())]
        pool.starmap(extract_func, jobs)


def extract_pdf_tables():
    """Extract each kind of table info for every PDF key, then merge it.

    The stages run one after another; each stage maps its extractor over all
    keys in a fresh process pool and then calls ``merge_info`` with the
    stage name.
    """
    pdf_keys = list(load_pdf_info().keys())

    # (extractor, name passed to merge_info), in execution order.
    stages = (
        (extract_basic_info, 'basic_info'),
        (extract_employee_info, 'employee_info'),
        (extract_cbs_info, 'cbs_info'),
        (extract_cscf_info, 'cscf_info'),
        (extract_cis_info, 'cis_info'),
        (extract_dev_info, 'dev_info'),
    )
    for extractor, info_name in stages:
        with Pool(processes=cfg.NUM_PROCESSES) as pool:
            pool.map(extractor, pdf_keys)
        merge_info(info_name)


# def generate_embedding_vector(key, embedding):
#     text_path = os.path.join(cfg.DATA_PATH, __pdf_text_dir__, key, 'docs.txt')
#     loader = TextLoader(text_path, encoding='utf-8')
#     docs = loader.load_and_split(text_splitter=RecursiveCharacterTextSplitter(
#         separators=['\n'], keep_separator=False,
#         chunk_size=1024, chunk_overlap=0,
#         length_function=len, add_start_index=True))
#     # for doc in docs:
#     #     print(len(doc.page_content))
#     #     print(doc.page_content)
#     #     print(doc.metadata)
#     #     print('*'*100)
#     # exit(0)
    
#     doc_vecs = FAISS.from_documents(docs, embedding)
#     doc_vecs.save_local(os.path.join(cfg.DATA_PATH, __pdf_text_dir__, key, 'doc_vecs'))


# def generate_embedding_all():
#     os.environ['CUDA_VISIBLE_DEVICES'] = '3'

#     # embeddings = None
#     connection_error = True
#     while connection_error:
#         try:
#             embeddings = HuggingFaceEmbeddings(model_name='GanymedeNil/text2vec-large-chinese')
#             connection_error = False
#         except Exception as e:
#             print(e)
#             continue
#     with open(os.path.join(cfg.DATA_PATH, 'pdf_info.json')) as f:
#         pdf_info = json.load(f)

#     for k, v in pdf_info.items():
#         print(k)
#         generate_embedding_vector(k, embeddings)


if __name__ == '__main__':
    # Ad-hoc debugging entry point: dump the tables of one page of a local PDF.
    import os
    import time
    # import ghostscript
    # Prepend a Ghostscript bin directory to PATH for this process.
    # NOTE(review): hard-coded, machine-specific Windows install path.
    os.environ['PATH'] = r'C:\Program Files\gs\gs10.01.2\bin;' + os.environ['PATH']
    # import ctypes
    # from ctypes.util import find_library
    # lib = find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll")))
    # print(lib)
    # import camelot
    # generate_embedding_all()
    # extract_text_all(extract_func=extract_pure_content)


    # extract_pure_content(0, '2020-03-25__南京钢铁股份有限公司__600282__南钢股份__2019年__年度报告.pdf',
    #     '/raidnvme/czc/MODELSCOPE_CACHE_HOME/modelscope/hub/datasets/modelscope/chatglm_llm_fintech_raw_dataset/master/data_files/1106979bbfe796043d45ea0f4831c916802713a7b08a580e98421d91d8ba0eb3')

    # NOTE(review): machine-specific paths; out_path is only used by the
    # commented-out extract_pure_content_and_save call below.
    pdf_path = r'C:\Users\CHENZHAOCAI\Downloads\test.pdf'
    out_path = r'C:\Users\CHENZHAOCAI\Downloads\test.txt'

    # pdf_path = '/raidnvme/czc/MODELSCOPE_CACHE_HOME/modelscope/hub/datasets/modelscope/chatglm_llm_fintech_raw_dataset/master/data_files/011af0d314a605ab3cff699f48af52248d2d9fabe417b811321d11107fa49c97'


    # start = time.time()
    # Only the page listed here is processed.
    PdfExtractor(pdf_path).extract_table_of_pages([103])
    # PdfExtractor(pdf_path).extract_pure_content_and_save(out_path, True)

    # end = time.time()
    # print(end - start)

    # from file import load_pdf_info, load_pdf_pure_text
    # pdf_info = load_pdf_info()

    # for k, v in pdf_info.items():
    #     # print(k, v['pdf_path'])
    #     text_lines = load_pdf_pure_text(k) 
    #     if len(text_lines) == 0:
    #         extract_pure_content(0, k, v['pdf_path'])