# Back2Content

In [33]:
from PyPDF2 import PdfWriter, PdfReader, PdfMerger
from PyPDF2.generic import AnnotationBuilder
from reportlab.pdfgen import canvas
import io
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.pagesizes import letter,A4
import os
from os import path
from glob import glob
# Accumulators: (path, page_count) pairs for readable reports, and the member
# names that were matched against a file path.
pdf_pages = []
submitted_name = []
all_name = ["xxxx"]

matching_files = glob('周报_xxx组_20240308/*.pdf')

for pdf_file in matching_files:
    # Guard clause: only paths ending in a lower-case ".pdf" are processed.
    if not pdf_file.endswith('.pdf'):
        continue
    # NOTE(review): file_extension is not read anywhere else in this cell.
    file_extension = os.path.splitext(pdf_file)[1]
    # First known name that occurs in the path, or None when nobody matches.
    name = next((candidate for candidate in all_name if candidate in pdf_file), None)
    if name is None:
        continue
    # The name is recorded as submitted before the file is proven readable.
    submitted_name.append(name)
    try:
        with open(pdf_file, 'rb') as file:
            num_pages = len(PdfReader(file).pages)
            pdf_pages.append((pdf_file, num_pages))
    except Exception as e:
        # Best effort: an unreadable report is reported and left out.
        print(f"Error reading {pdf_file}: {e}")

# Sort the reports by page count in ascending order (shortest first).
pdf_pages.sort(key=lambda entry: entry[1])
# Register a font that can render Chinese; Microsoft YaHei is used as an example.
pdfmetrics.registerFont(TTFont('YaHei', 'msyh.ttc'))
# Draw the contents page into an in-memory buffer.
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=A4)
can.setFont("YaHei", 18)  # use the Chinese font

can.drawString(72, 720, "周报文件顺序（按页码递增）| 点击文字有跳转：")
# Displayed page numbers start at 2: the contents page is merged in first.
cur_page = 2
for i, (pdf_file, num_pages) in enumerate(pdf_pages, start=1):
    row_y = 720 - 60 * i  # one entry every 60 points below the heading
    # Entry label: running number plus file name without its 4-character ".pdf" suffix.
    label = f"{i}. {os.path.basename(pdf_file)}"[:-4]
    page_range = f" (Page {cur_page}-{cur_page + num_pages - 1})"
    # Right-align the page range, leaving a 72-point right margin.
    range_width = pdfmetrics.stringWidth(page_range, "YaHei", 18)
    can.drawString(72, row_y, label)
    can.drawString(A4[0] - range_width - 72, row_y, page_range)
    # Advance to the first page of the next report.
    cur_page += num_pages
can.save()
# Rewind the buffer and load the freshly drawn contents page as a PDF.
packet.seek(0)
toc_pdf = PdfReader(packet)
merger = PdfMerger()
# The contents page goes first, followed by every report in sorted order.
merger.append(toc_pdf)
for report_path, _page_count in pdf_pages:
    try:
        merger.append(report_path)
    except Exception as err:
        # A report that cannot be merged is reported and skipped.
        print(f"Error merging {report_path}: {err}")
merger.write('test.pdf')
merger.close()

# Load the merged file completely into memory. This closes the file handle
# right away and keeps the reader independent of 'test.pdf', which is
# overwritten at the end of this step.
with open('test.pdf', 'rb') as merged_file:
    reader = PdfReader(io.BytesIO(merged_file.read()))

# Copy every page into a writer so that annotations can be added.
writer = PdfWriter()
for page in reader.pages:
    writer.add_page(page)

# Third mediabox entry of the contents page; used for the right edge of each link.
page_right = writer.pages[0].mediabox[2]
# Zero-based index of the first report page (index 0 is the contents page).
cur_page = 1

for i, (_pdf_file, num_pages) in enumerate(pdf_pages, start=1):
    # Same vertical position formula as the one used to draw the i-th entry.
    row_y = 720 - 60 * i
    # Clickable area around the entry text, corners given in
    # lower-left / upper-right order.
    annotation = AnnotationBuilder.link(
        rect=(72, row_y - 10, page_right - 60, row_y + 20),
        target_page_index=cur_page,
    )
    writer.add_annotation(page_number=0, annotation=annotation)
    # Advance to the first page of the next report.
    cur_page += num_pages

with open('test.pdf', 'wb') as link_pdf:
    writer.write(link_pdf)

# Remove metadata

In [None]:
import PyPDF2
 
def remove_metadata(input_pdf_path, output_pdf_path):
    """Copy the pages of a PDF into a new file with a blank ``/Producer`` entry.

    The document information of the input file is printed before the copy
    is made. Only the pages are added to the new document; the only
    metadata set explicitly is an empty ``/Producer``.

    :param input_pdf_path: path of the PDF to read
    :param output_pdf_path: path of the PDF to write
    """
    with open(input_pdf_path, 'rb') as source_file:
        reader = PyPDF2.PdfReader(source_file)
        writer = PyPDF2.PdfWriter()
        print(reader.metadata)
        # Copy every page; add_page replaces the deprecated addPage spelling.
        for page in reader.pages:
            writer.add_page(page)

        # Blank out the producer entry of the new document.
        writer.add_metadata({'/Producer': ''})

        # Written while the source is still open because the pages are read from it.
        with open(output_pdf_path, 'wb') as output_file:
            writer.write(output_file)
 
# Example usage
input_path = '扫描件.pdf'  # path of the PDF to read
output_path = '扫描件-1464.pdf'  # path of the PDF to write (metadata removed)
remove_metadata(input_pdf_path=input_path, output_pdf_path=output_path)

# Remove watermark (use in PyPDF2 with some modified)

In [None]:
from pikepdf import Pdf
import pikepdf
from fpdf import FPDF
# Open the document to clean and create an empty document to receive its pages.
source_pdf = Pdf.open('测试练习：作答回顾.pdf')
cleaned_pdf = Pdf.new()
# The output is saved with at least the higher of the two PDF versions.
version = max(cleaned_pdf.pdf_version, source_pdf.pdf_version)

for page in source_pdf.pages:
    # Replace the page's annotations with an empty array and its XObject
    # resources with an empty dictionary. This drops every XObject the page
    # referenced, not only a watermark.
    page.Annots = pikepdf.Array()
    page.resources.XObject = pikepdf.Dictionary()
    cleaned_pdf.pages.append(page)

# NOTE: document metadata is not copied from the source to the new document.
cleaned_pdf.remove_unreferenced_resources()

cleaned_pdf.save('output.pdf', min_version=version)

# Font Embedding detection

Ghostscript is required; download it from https://www.ghostscript.com/releases/gsdnld.html

On Windows, use the absolute path to the Ghostscript executable.

In [47]:
import PyPDF2
import platform
import subprocess
import os
import tempfile

def embed_fonts(input_pdf, output_pdf, font_path):
    """
    Re-write a PDF through Ghostscript so that non-embedded fonts get embedded.

    The assembled command line and the outcome are printed.

    :param input_pdf: path of the PDF to read
    :param output_pdf: path of the PDF to write
    :param font_path: directory passed to Ghostscript as its font search path
    :return: True when Ghostscript exits with status 0, otherwise False
    """
    # Only Windows uses the absolute path of the console binary; every other
    # platform (Linux, macOS, ...) is expected to provide "gs" on PATH.
    if platform.system() == "Windows":
        executable = r"C:\Program Files\gs\gs10.03.0\bin\GSWIN64c"
    else:
        executable = "gs"

    command = [
        executable,
        "-o", output_pdf,
        "-sDEVICE=pdfwrite",
        "-dPDFSETTINGS=/prepress",
        "-dEmbedAllFonts=true",
        "-dSubsetFonts=true",
        f"-sFONTPATH={font_path}",
        input_pdf,
    ]
    print(' '.join(command))
    # Run Ghostscript and capture its output as text.
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Report the outcome based on the exit status.
    if result.returncode == 0:
        print("Embed fonts Successfully")
        return True
    print(f"Fail: {result.stderr}")
    return False

def walk(obj, fnt, emb, _seen=None):
    """
    Recursively collect the fonts used and the fonts embedded below *obj*.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    We create and add to two sets, fnt = fonts used and emb = fonts embedded.

    :param obj: PDF object to inspect (indirect reference, dictionary, array or other)
    :param fnt: set that receives the '/BaseFont' values found
    :param emb: set that receives the '/FontName' values that have a font file
    :param _seen: internal set of indirect references already visited; callers
        normally leave it unset
    """
    if _seen is None:
        _seen = set()

    if isinstance(obj, PyPDF2.generic.IndirectObject):
        # Visit every indirect object once only: this stops reference cycles
        # and avoids walking shared objects again.
        ref = (id(obj.pdf), obj.idnum, obj.generation)
        if ref in _seen:
            return
        _seen.add(ref)
        walk(obj.get_object(), fnt, emb, _seen)
        return

    if isinstance(obj, PyPDF2.generic.DictionaryObject):
        # Key lookups are only meaningful on dictionaries, not on arrays.
        if "/BaseFont" in obj:
            fnt.add(obj["/BaseFont"])
        if "/FontName" in obj and any(
            key in obj for key in ("/FontFile", "/FontFile2", "/FontFile3")
        ):
            emb.add(obj["/FontName"])

        # Recurse on the raw (unresolved) values so that indirect references
        # reach the visited check above.
        for key in obj:
            walk(obj.raw_get(key), fnt, emb, _seen)

    elif isinstance(obj, PyPDF2.generic.ArrayObject):
        # Recurse on every array element.
        for item in obj:
            walk(item, fnt, emb, _seen)

    # Any other object type has no properties to check.


def get_fonts(pdf):
    """Get all the fonts in the PDF and which are and are not embedded.

    :param pdf: reader object exposing ``pages``; each page must provide
        ``get_object()``
    :return: tuple ``(fonts, embedded, unembedded)`` of sets, where
        ``unembedded`` is ``fonts - embedded``
    """
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        # A page without a '/Resources' entry has no fonts to report; skip it
        # instead of failing with KeyError.
        resources = page.get_object().get("/Resources")
        if resources is not None:
            walk(resources, fonts, embedded)

    unembedded = fonts - embedded
    return fonts, embedded, unembedded

def print_pdf_version(pdf_path):
    """Print the version marker found in the header of the PDF at *pdf_path*.

    :param pdf_path: path of the PDF to inspect
    """
    with open(pdf_path, "rb") as file:
        pdf = PyPDF2.PdfReader(file)
        # pdf_header is the leading "%PDF-x.y" marker of the file; the
        # document information dictionary does not contain the version.
        print("PDF Version:", pdf.pdf_header)

def copy_pdf_metadata(source_pdf_path, target_pdf_path):
    """Rewrite the target PDF in place so that it carries the source's metadata.

    The pages of the target are copied into a new document, the document
    information of the source is added, and the result replaces the target
    file.

    :param source_pdf_path: PDF whose document information is read
    :param target_pdf_path: PDF that is rewritten with that information
    """
    # Read the source metadata and resolve every value while the source file
    # is still open, so nothing has to be read from it afterwards.
    with open(source_pdf_path, "rb") as source_file:
        source_info = PyPDF2.PdfReader(source_file).metadata
        # The metadata may be missing (None) when the source has no
        # document information.
        source_metadata = (
            {key: source_info[key] for key in source_info} if source_info else {}
        )

    temp_path = None
    try:
        with open(target_pdf_path, "rb") as target_file:
            target_pdf = PyPDF2.PdfReader(target_file)
            writer = PyPDF2.PdfWriter()

            # Copy every page of the target into the writer.
            for page in target_pdf.pages:
                writer.add_page(page)

            # Attach the metadata taken from the source, if there is any.
            if source_metadata:
                writer.add_metadata(source_metadata)

            # Create the temporary file next to the target so that the final
            # os.replace never has to cross a drive or filesystem boundary.
            target_dir = os.path.dirname(os.path.abspath(target_pdf_path))
            with tempfile.NamedTemporaryFile(
                delete=False, dir=target_dir, suffix=".pdf"
            ) as temp_file:
                temp_path = temp_file.name
                writer.write(temp_file)

        # Replace the original file only after the target handle is closed.
        os.replace(temp_path, target_pdf_path)
    except Exception:
        # Do not leave a stray temporary file behind when anything fails.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    print('Copy successfully')


In [48]:
# Example usage
input_pdf = "demo.pdf"
output_pdf = "demo-s.pdf"
font_path = "C:/Windows/Fonts/"

# Report which fonts the input uses and which of them are embedded. The file
# is closed again before Ghostscript reads it.
with open(input_pdf, "rb") as input_file:
    fonts, embedded, unembedded = get_fonts(PyPDF2.PdfReader(input_file))
print('fonts', fonts)
print('embedded', embedded)
print('unembedded', unembedded)

# Embed the fonts into a new file, then carry the original metadata over to it.
embed_fonts(input_pdf, output_pdf, font_path)
copy_pdf_metadata(input_pdf, output_pdf)


fonts {'/HJTVCQ+Times-Roman', '/QDTWCG+MSBM10', '/ZRGIGW+CMMI9', '/ARVNQA+CMSY10', '/BCDFEE+Calibri', '/ILNECK+CMMIB10', '/IDHDGX+NimbusRomNo9L-ReguItal', '/JKSUUE+CMSY6', '/HPRQEV+CMSY8', '/BCDEEE+CambriaMath', '/SDVUTG+CMSY7', '/BCDFEE+Calibri-Bold', '/JFFMXC+CMMI7', '/JAREGN+CMR6', '/TimesNewRomanPS-BoldMT', '/KRVLHQ+CMMI6', '/NBNCPI+CMR10', '/TWGGSQ+CMR7', '/YRWECK+CMMI10', '/VDKMLK+CMEX9', '/LREKMX+NimbusRomNo9L-MediItal', '/TimesNewRomanPSMT', '/VVGCDX+CMEX10', '/FOPJIV+CMSY9', '/OBXDCW+CMMI8', '/Times-Roman', '/MYWHRY+NimbusRomNo9L-Regu', '/ZGRWKB+CMR8', '/FJNIUZ+CMR9', '/BCDEEE+Calibri-Bold', '/AHBDAI+CMMI5', '/VYQISI+NimbusRomNo9L-Medi'}
embedded {'/HJTVCQ+Times-Roman', '/QDTWCG+MSBM10', '/ZRGIGW+CMMI9', '/ARVNQA+CMSY10', '/BCDFEE+Calibri', '/ILNECK+CMMIB10', '/IDHDGX+NimbusRomNo9L-ReguItal', '/JKSUUE+CMSY6', '/HPRQEV+CMSY8', '/BCDEEE+CambriaMath', '/SDVUTG+CMSY7', '/BCDFEE+Calibri-Bold', '/JFFMXC+CMMI7', '/JAREGN+CMR6', '/KRVLHQ+CMMI6', '/NBNCPI+CMR10', '/TWGGSQ+CMR7', '/YRWE