In [36]:
from PyPDF2 import PdfWriter, PdfReader, PageObject
import re
import datetime as dt
from pydantic import BaseModel
from typing import Optional


In [42]:
# Input PDFs, processed in the order listed.
# NOTE: these are absolute paths on a local E: drive.
paths = [
    r"E:\Merit\pdfbuilder\IC-A-241010-Pre-Man-Int.pdf",
    r"E:\Merit\pdfbuilder\IC-A-241010.pdf",
]

class PdfPage(BaseModel):
    """One or more PDF pages stored together with a shared datetime.

    Despite the singular name, an instance can hold several pages: pages
    without a datetime of their own are kept in ``pdf_pages`` of an earlier
    entry.
    """

    # Usually a single page; holds several when pages without a datetime are grouped in.
    pdf_pages: list[PageObject]
    # Datetime associated with the entry; the annotation allows None.
    datetime: Optional[dt.datetime]
    # Manual-integration flag for the entry; the annotation allows None.
    is_manually_integrated: Optional[bool]
    # Path of the PDF file the entry was created from.
    original_pdf_path: str

    class Config:
        # Allows the PyPDF2 PageObject field, which is not a type pydantic validates natively.
        arbitrary_types_allowed = True



In [38]:
def get_datetime_from_text(text: str) -> Optional[dt.datetime]:
    """Extract the first ``DD-Mon-YYYY / HH:MM`` timestamp found in ``text``.

    Args:
        text: Text to search (e.g. the extracted text of one PDF page).

    Returns:
        The parsed timestamp, or ``None`` when ``text`` is empty/``None`` or
        contains nothing in the expected layout.

    Raises:
        ValueError: If a candidate matches the layout but cannot be parsed,
            e.g. the three-character month is not a valid abbreviation.
    """
    if not text:
        return None
    # Capture date and time separately so that whatever whitespace surrounds
    # the slash (spaces, tabs, line breaks) is discarded before parsing.
    match = re.search(r"(\d{2}-\w{3}-\d{4})\s+/\s+(\d{2}:\d{2})", text)
    if match is None:
        return None
    date_part, time_part = match.groups()
    return dt.datetime.strptime(f"{date_part}/{time_part}", "%d-%b-%Y/%H:%M")


In [39]:
def get_is_manually_integrated(text: str) -> bool:
    """Tell whether ``text`` contains a row flagged BMB, BM or MB with an asterisk.

    A row counts when, on a single line, a digit is followed by at least one
    character and then ``BMB``, ``BM`` or ``MB`` with an asterisk directly
    after it (one whitespace character in between is tolerated).

    Args:
        text: Text to search.

    Returns:
        ``True`` if at least one such row exists, otherwise ``False``.
    """
    flagged_row = re.search(r"\d.+(BMB\s?\*|BM\s?\*|MB\s?\*)", text)
    return flagged_row is not None

In [44]:
# Read every PDF and group its pages: a page with a parsed datetime starts a
# new PdfPage entry, and a page without one is attached to the entry started
# by the most recent dated page of the same file.
all_pages = []
for path in paths:
    pdf = PdfReader(path)
    # Reset per file so an undated page can never be attached to an entry
    # that was started by a different PDF.
    current_group = None
    for page_number, page in enumerate(pdf.pages, start=1):
        text = page.extract_text()
        datetime = get_datetime_from_text(text)
        is_manually_integrated = get_is_manually_integrated(text)
        if datetime is not None:
            current_group = PdfPage(pdf_pages=[page], datetime=datetime, is_manually_integrated=is_manually_integrated, original_pdf_path=path)
            all_pages.append(current_group)
        elif current_group is None:
            # Previously this case hit all_pages[-1] on an empty list (IndexError)
            # or silently extended the last entry of the previous file.
            raise ValueError(
                f"{path}: page {page_number} has no datetime and there is no "
                "earlier dated page in this file to attach it to"
            )
        else:
            current_group.pdf_pages.append(page)
            # The entry is flagged as soon as any of its pages is flagged.
            current_group.is_manually_integrated = is_manually_integrated or current_group.is_manually_integrated

# Back-fill: an entry with no datetime takes the datetime of the entry just
# before it. Entries are visited front to back, so a filled-in value carries
# forward through a run of undated entries.
# NOTE(review): the grouping loop above only appends entries that have a parsed
# datetime, so this pass looks like a no-op at the moment - confirm before removing.
for previous, current in zip(all_pages, all_pages[1:]):
    if current.datetime is None:
        current.datetime = previous.datetime


def _page_sort_key(entry):
    # Earliest datetime first; the negated flag (-1 for True, 0 for False) puts
    # manually integrated entries ahead of others sharing the same datetime.
    return entry.datetime, -int(entry.is_manually_integrated)


all_pages = sorted(all_pages, key=_page_sort_key)


def pages_are_equal(page1: PdfPage, page2: PdfPage) -> bool:
    """Compare two entries by datetime and manual-integration flag only.

    The pages themselves and ``original_pdf_path`` are not part of the comparison.

    Args:
        page1: First entry.
        page2: Second entry.

    Returns:
        ``True`` when both the datetime and the flag match.
    """
    identity1 = (page1.datetime, page1.is_manually_integrated)
    identity2 = (page2.datetime, page2.is_manually_integrated)
    return identity1 == identity2

# Indices of entries that repeat the entry directly before them
# (as judged by pages_are_equal); the first of each run is kept.
to_remove = [
    index
    for index in range(1, len(all_pages))
    if pages_are_equal(all_pages[index], all_pages[index - 1])
]

# Delete from the highest index down so the remaining indices stay valid.
for index in reversed(to_remove):
    del all_pages[index]

# Print a short summary of every remaining entry, each followed by a dashed rule.
divider = "-"*100
for page in all_pages:
    page_count = len(page.pdf_pages)
    print(page.datetime)
    print('pages:', page_count)
    print(page.is_manually_integrated)
    print(page.original_pdf_path)
    print(divider)


2024-09-24 12:17:00
pages: 2
False
E:\Merit\pdfbuilder\IC-A-241010-Pre-Man-Int.pdf
----------------------------------------------------------------------------------------------------
2024-10-10 18:41:00
pages: 1
False
E:\Merit\pdfbuilder\IC-A-241010-Pre-Man-Int.pdf
----------------------------------------------------------------------------------------------------
2024-10-10 18:54:00
pages: 1
True
E:\Merit\pdfbuilder\IC-A-241010.pdf
----------------------------------------------------------------------------------------------------
2024-10-10 18:54:00
pages: 1
False
E:\Merit\pdfbuilder\IC-A-241010-Pre-Man-Int.pdf
----------------------------------------------------------------------------------------------------
2024-10-10 19:04:00
pages: 1
True
E:\Merit\pdfbuilder\IC-A-241010.pdf
----------------------------------------------------------------------------------------------------
2024-10-10 19:04:00
pages: 1
False
E:\Merit\pdfbuilder\IC-A-241010-Pre-Man-Int.pdf
-----------------------

# TODO: test