<a href="https://colab.research.google.com/github/Aruminium/pdfConsolidation/blob/main/pdfConsolidation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 上から順次実行 - Sequential execution from the top

#インストール - install
!pip install PyPDF2
!pip install natsort
!pip install pycryptodome

In [None]:
import PyPDF2
import subprocess
from typing import List
import glob
from natsort import natsorted
import os
import re
from Crypto.Cipher import AES
import unicodedata
from google.colab import drive
drive.mount('/content/drive')

In [31]:
def changeFilesName(files: List[str], old: str, new: str = "") -> List[str]:
    """Rename files on disk by replacing a substring of each file name.

    The replacement is applied to the base name only, never to the directory
    part of the path, so a directory whose name happens to contain ``old`` is
    left alone. Because the paths point into the mounted Google Drive, the
    rename is applied there as well.

    Args:
        files (list[str]): Paths of the files to rename.
        old (str): Required. Substring to be replaced. If empty, nothing is
            renamed and the paths are returned unchanged.
        new (str): Optional. String that replaces ``old``. Defaults to ""
            (i.e. delete ``old`` from the name).

    Returns:
        list[str]: The paths after renaming, in the same order as ``files``.

    Raises:
        OSError: Propagated from ``os.rename`` (e.g. a source file is missing).

    Examples:
        When you want to delete the string "hoge" from every file name
        files = changeFilesName(files, "hoge", "")
    """
    if not old:
        # str.replace("", new) would insert `new` between every character.
        return list(files)

    newFiles = []
    for pdfFile in files:
        baseName = os.path.basename(pdfFile)
        # Keep the directory prefix byte-for-byte; only the file name changes.
        dirPrefix = pdfFile[:len(pdfFile) - len(baseName)]
        newPdfFile = dirPrefix + baseName.replace(old, new)
        if newPdfFile != pdfFile:
            os.rename(pdfFile, newPdfFile)
        newFiles.append(newPdfFile)
    return newFiles

In [None]:
#@title 準備 - Preparation { vertical-output: true }

#@markdown **googleドライブのパス(基本的に変更しない)**
#@markdown google drive path (basically unchanged)
gdrivePath = '/content/drive/MyDrive/' #@param {type:"string"}

#@markdown **連結させたいpdfが在るディレクトリのパス**
#@markdown Path of the directory where the pdfs to be concatenated are located - 
dirPath = '' #@param {type:"string"}

# Glob pattern for the PDFs to merge. os.path.join supplies the separator, so
# dirPath works with or without a trailing "/". A leading "/" is stripped
# because os.path.join would otherwise discard gdrivePath.
pdfPattern = os.path.join(gdrivePath, dirPath.lstrip("/"), "*.pdf")

# Full-width -> half-width: NFKC-normalize the file names on disk.
# Only the base name is normalized; the directory part is left untouched so
# the rename never targets a directory that does not exist.
for pdfFile in glob.glob(pdfPattern):
    dirName, baseName = os.path.split(pdfFile)
    normalizedFile = os.path.join(dirName, unicodedata.normalize("NFKC", baseName))
    if normalizedFile != pdfFile:
        os.rename(pdfFile, normalizedFile)

# Re-read the directory so `files` reflects the normalized names.
files = glob.glob(pdfPattern)

# File names without the extension. splitext removes only the trailing
# ".pdf", not every ".pdf" occurrence inside the name.
fileNames = [os.path.splitext(os.path.basename(pdfFile))[0] for pdfFile in files]
fileNames

In [None]:
#@title 名前変更する場合,追記し実行 - If you want to change the name, add and execute { vertical-output: true }

old = "" #@param {type: "string"}
new = "" #@param {type: "string"}

# Rename only when a search string was given. With an empty `old`,
# str.replace would insert `new` between every character of each path.
if old:
    files = changeFilesName(files, old, new)

# File names without the extension. splitext removes only the trailing
# ".pdf", not every ".pdf" occurrence inside the name.
fileNames = [os.path.splitext(os.path.basename(pdfFile))[0] for pdfFile in files]
fileNames

In [None]:
#@title 並び替え - Sort { vertical-output: true }

sorting = "ascending" #@param ["unsort", "ascending", "descending", "reversion"]

# Compare with == (not `in`, which is a substring test on the option name).
# Every branch builds a new list: list.reverse() reverses in place and
# returns None, so assigning its result would set `files` to None.
if sorting == "ascending":
    files = natsorted(files)
elif sorting == "descending":
    files = natsorted(files, reverse=True)
elif sorting == "reversion":
    files = files[::-1]
# "unsort" keeps the current order.

# Check the resulting order (file names without the extension).
fileNames = [os.path.splitext(os.path.basename(pdfFile))[0] for pdfFile in files]
fileNames

In [None]:
#@title ページ数取得 - Page count acquisition { vertical-output: true }
# For each PDF, record how many pages precede it in `files` order (pdfPages)
# and print the running page total after adding it.
sumPages = 0
pdfPages = []
for sourcePath in files:
    pageCount = PyPDF2.PdfFileReader(sourcePath).getNumPages()
    # Running total *before* this file is added.
    pdfPages.append(sumPages)
    sumPages = sumPages + pageCount
    print(sumPages)
pdfPages

In [41]:

#@title 出力 - Output { vertical-output: true }

#@markdown **ファイル名** file-name
pdfName = '' #@param {type:"string"}
#@markdown **MyDrive/以下の出力path** MyDrive/ Output path under
exportPath = '' #@param {type:"string"}

# Output location: gdrivePath / exportPath / pdfName + ".pdf".
# os.path.join supplies the separator, so exportPath works with or without a
# trailing "/" (or as just "/"). A leading "/" is stripped because
# os.path.join would otherwise discard gdrivePath.
outDir = os.path.join(gdrivePath, exportPath.lstrip("/"))
outPath = os.path.join(outDir, pdfName + '.pdf')

# Create the output directory if it is missing. makedirs also creates any
# missing parent directories and does not fail when the directory exists.
os.makedirs(outDir, exist_ok=True)

# Concatenate the PDFs in `files` order and write the merged document.
pdf_writer = PyPDF2.PdfFileMerger()
try:
    for pdfFile in files:
        pdf_writer.append(pdfFile)
    pdf_writer.write(outPath)
finally:
    # Close the merger even if appending or writing fails.
    pdf_writer.close()

In [42]:
#@title しおり作成 & 圧縮 - Bookmark Creation & Compression

#@markdown うまくいかない時は1つ前のセルのpdf結合からやり直す

#@markdown If it doesn't work, start over from the pdf merge of the previous cell.

# Re-read the merged PDF at outPath, compress every page, add one bookmark per
# source file, and write the result back to the same path.
writer = PyPDF2.PdfFileWriter()
reader = PyPDF2.PdfFileReader(outPath)

totalPages = reader.getNumPages()
for pageIndex in range(totalPages):
    currentPage = reader.getPage(pageIndex)
    # Compression
    currentPage.compressContentStreams()
    writer.addPage(currentPage)

# One top-level bookmark per source file: title = file name, target = the
# page offset recorded for that file in pdfPages.
for title, startPage in zip(fileNames, pdfPages):
    writer.addBookmark(title, startPage, parent=None)

# This is what tells the PDF to open to bookmarks
writer.setPageMode("/UseOutlines")

with open(outPath, "wb") as outFile:
    writer.write(outFile)