# In [9]:  (Jupyter notebook cell prompt)
import os
from docx import Document
import PyPDF2
import openpyxl
from pptx import Presentation

def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Reads the paragraphs exposed by ``Document(file_path).paragraphs`` and
    joins their text with newline characters (no trailing newline).
    """
    document = Document(file_path)
    lines = [para.text for para in document.paragraphs]
    return '\n'.join(lines)

def read_pdf(file_path):
    """Return the extracted text of every page of a PDF file.

    Pages are processed in document order; the text extracted from each
    page is followed by a single newline character.
    """
    with open(file_path, 'rb') as pdf_file:
        # The reader pulls page data from the open handle, so the join has
        # to be evaluated before the ``with`` block closes the file.
        pages = PyPDF2.PdfReader(pdf_file).pages
        return ''.join(page.extract_text() + '\n' for page in pages)

def read_xlsx(file_path):
    """Return the cell contents of an .xlsx workbook as tab-separated text.

    Every worksheet is dumped in workbook order. Each row becomes one line:
    cell values are converted with ``str`` (``None`` becomes an empty
    string), separated by tabs, and terminated by a newline.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The concatenated text of all worksheets ('' if there are no rows).
    """
    wb = openpyxl.load_workbook(file_path)
    lines = []
    # Iterate ``wb.worksheets`` rather than ``wb.sheetnames``: the latter
    # also lists chartsheets, which have no ``iter_rows`` and would raise
    # AttributeError.
    for ws in wb.worksheets:
        for row in ws.iter_rows(values_only=True):
            lines.append(
                '\t'.join('' if cell is None else str(cell) for cell in row)
            )
    # Single join instead of repeated ``+=`` on a growing string.
    return ''.join(line + '\n' for line in lines)

def read_pptx(file_path):
    """Return the text of a .pptx presentation, one shape per line.

    Slides and their shapes are visited in order; every shape that has a
    ``text`` attribute contributes that text followed by a newline.
    Shapes without a ``text`` attribute are skipped.
    """
    presentation = Presentation(file_path)
    chunks = []
    for slide in presentation.slides:
        chunks.extend(
            shape.text + '\n'
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    return ''.join(chunks)

def convert_to_txt(input_path, output_folder):
    """Extract the text of a document and save it as a UTF-8 ``.txt`` file.

    The reader is chosen from the lower-cased extension of *input_path*
    (``.docx``, ``.pdf``, ``.xlsx`` or ``.pptx``). The text is written to
    ``<output_folder>/<input base name>.txt``, overwriting any existing file
    of that name.

    Success, unsupported formats and failures are all reported with
    ``print``; errors are not re-raised.

    Args:
        input_path: Path of the document to convert.
        output_folder: Directory for the ``.txt`` file; created if missing.

    Returns:
        None.
    """
    # One splitext call yields both the stem and the extension.
    base_name, ext = os.path.splitext(os.path.basename(input_path))
    ext = ext.lower()
    output_path = os.path.join(output_folder, f"{base_name}.txt")

    try:
        # exist_ok=True removes the check-then-create race of
        # ``if not exists: makedirs``. Creating the directory inside the
        # ``try`` reports a failure here the same way as any other error.
        os.makedirs(output_folder, exist_ok=True)

        if ext == '.docx':
            content = read_docx(input_path)
        elif ext == '.pdf':
            content = read_pdf(input_path)
        elif ext == '.xlsx':
            content = read_xlsx(input_path)
        elif ext == '.pptx':
            content = read_pptx(input_path)
        else:
            print(f"❌ 不支持的文件格式: {ext}")
            return

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"✅ 已成功保存为: {output_path}")

    except Exception as e:
        # Best-effort boundary: report the problem and continue.
        print(f"❌ 处理文件时出错: {e}")

# Fixed-path test run: convert one hard-coded PDF into ./output.
# NOTE(review): these statements execute every time this code is run and the
# input path is machine-specific — confirm that is intended.
input_file = "E:/2025金种子/knowledege_graph/代理计证.pdf"
output_dir = "./output"
convert_to_txt(input_file, output_dir)

# Output of the cell above:
# ✅ 已成功保存为: ./output\代理计证.txt
