In [10]:
import os
from PyPDF2 import PdfReader, PdfWriter 
import pandas as pd
import re
import shutil

# Regex used on PDF file names: matches the literal text "Site_01-" followed
# by exactly four digits, and captures those four digits as group 1
# (e.g. "Site_01-0130" -> "0130").
pattern = r"Site_01-(\d{4})"

def extract_pdf_pages(pdf_filepath, page_nos: list, output_folder: str) -> str:
    """
    Copy the requested pages of a PDF into a new PDF file.

    The new file is written inside ``output_folder`` and is named
    ``<basename of output_folder>-Contact-Information.pdf``.

    Parameters:
        pdf_filepath: str = path of the source PDF the pages are read from.
        page_nos: list = 1-based page numbers to extract, e.g. [37, 38].
            Pages are written in the order given.
        output_folder: str = existing folder that receives the new PDF.

    Returns:
        The path of the PDF that was written.

    Raises:
        ValueError: if any page number is smaller than 1.
        IndexError: if any page number is past the last page of the source.
    """
    # Reject non-positive page numbers before touching any file: after the
    # 1-based -> 0-based shift they would turn into negative indexes, which
    # select pages counted from the END of the document instead of failing.
    invalid = [page_no for page_no in page_nos if page_no < 1]
    if invalid:
        raise ValueError(f"Page numbers must be 1 or greater, got {invalid}")

    output_filename = f"{os.path.basename(output_folder)}-Contact-Information.pdf"
    output_filepath = os.path.join(output_folder, output_filename)
    writer = PdfWriter()
    with open(pdf_filepath, 'rb') as infile:
        reader = PdfReader(infile)
        total_pages = len(reader.pages)

        # Fail with a readable message (same exception type the page lookup
        # itself would raise) before anything is written to disk.
        too_large = [page_no for page_no in page_nos if page_no > total_pages]
        if too_large:
            raise IndexError(
                f"Page numbers {too_large} are out of range: "
                f"{os.path.basename(pdf_filepath)} has {total_pages} pages"
            )

        for page_no in page_nos:
            writer.add_page(reader.pages[page_no - 1])

        # Kept inside the input `with` block, as in the original layout.
        # NOTE(review): presumably the writer still needs the source stream
        # while writing - confirm before moving this out.
        with open(output_filepath, 'wb') as outfile:
            writer.write(outfile)

    return output_filepath


def delete_pdf_pages(pdf_filepath, page_nos: list, output_folder: str) -> str:
    """
    Write a copy of a PDF that leaves out the given pages.

    The copy is saved inside ``output_folder`` and is named
    ``<basename of output_folder>-Consent-Signature.pdf``.

    Parameters:
        pdf_filepath: str = path of the source PDF to copy pages from.
        page_nos: list = 1-based page numbers to leave out, e.g. [37, 38].
            Numbers that do not correspond to a page of the source are ignored.
        output_folder: str = folder that receives the new PDF.

    Returns:
        The path of the PDF that was written.
    """
    target_name = os.path.basename(output_folder) + "-Consent-Signature.pdf"
    target_path = os.path.join(output_folder, target_name)
    pdf_writer = PdfWriter()
    # Page numbers are 1-based for callers; the reader is indexed from 0.
    skipped_indexes = {number - 1 for number in page_nos}
    with open(pdf_filepath, 'rb') as source:
        pdf_reader = PdfReader(source)
        kept_indexes = [
            index
            for index in range(len(pdf_reader.pages))
            if index not in skipped_indexes
        ]
        for index in kept_indexes:
            pdf_writer.add_page(pdf_reader.pages[index])

        # The output is written while the source file is still open.
        with open(target_path, 'wb') as target:
            pdf_writer.write(target)

    return target_path

def get_medrio_id(name):
    """
    Pull the four-digit ID out of ``name`` using the module-level ``pattern``.

    Returns the captured digits as a string with a single leading "0" removed
    when present (e.g. "0130" -> "130"), or None when ``name`` does not match.
    """
    found = re.search(pattern, name)
    if found is None:
        return None

    digits = found.group(1)
    # NOTE(review): only ONE leading zero is dropped, so "0013" becomes "013".
    # Confirm this matches how the IDs are stored wherever they are looked up.
    return digits[1:] if digits.startswith("0") else digits
    
def replace_medrio_id(old_text, new_text):
    """
    Return ``new_text`` with every match of the module-level ``pattern``
    replaced by ``old_text``.

    Note the argument roles: despite the names, ``old_text`` is the
    replacement and ``new_text`` is the string that gets searched.
    """
    matcher = re.compile(pattern)
    return matcher.sub(old_text, new_text)

def copy_original_pdf(origial_filepath, new_filepath):
    """
    Copy the file at ``origial_filepath`` to ``new_filepath``.

    Thin wrapper around ``shutil.copy``; nothing is returned.
    """
    shutil.copy(src=origial_filepath, dst=new_filepath)

####

In [12]:
# Driver cell: for every file in the input folder, look up an e-mail address
# by the ID found in the file name, then write three PDFs into a per-user
# output folder (extracted pages, the document without those pages, and an
# unmodified copy).
excel_filepath = "email.xlsx"
input_pdf_folder = "input_pdfs"
output_dir = "output"

# 1-based page numbers handed to extract_pdf_pages / delete_pdf_pages.
extract_page_nos = [37, 38]
delete_page_nos = [37, 38]

email_df = pd.read_excel(excel_filepath)
# itertuples() puts the row index at position 0, so positions 3 and 4 are the
# third and fourth spreadsheet columns. Positional access is used because it
# does not depend on how the column headers are spelled.
# NOTE(review): presumably these columns hold the Medrio ID and the e-mail
# address - confirm against the workbook.
email_dict = {str(row[3]): row[4] for row in email_df.itertuples()}

os.makedirs(output_dir, exist_ok=True)

for input_file in os.listdir(input_pdf_folder):
    print("Processing ", input_file)
    input_filepath = os.path.join(input_pdf_folder, input_file)

    m_id = get_medrio_id(input_file)
    email = email_dict.get(m_id)
    # A blank spreadsheet cell is read by pandas as NaN (a float), which is
    # truthy and has no .split(); treat anything that is not a non-empty
    # string as "no e-mail available" instead of crashing the whole run.
    if not isinstance(email, str) or not email:
        print(f"Medrio Id {m_id} not found in excel.")
        continue

    # The part of the address before "@" names the per-user output folder.
    username = email.split("@")[0]
    output_folder = os.path.join(output_dir, username)
    os.makedirs(output_folder, exist_ok=True)

    try:
        extract_pdf_pages(pdf_filepath=input_filepath, page_nos=extract_page_nos, output_folder=output_folder)
        delete_pdf_pages(pdf_filepath=input_filepath, page_nos=delete_page_nos, output_folder=output_folder)
        shutil.copy(input_filepath, os.path.join(output_folder, f"{username}.pdf"))
    except Exception as e:
        # Best-effort batch: report the failing file and carry on with the
        # remaining ones rather than aborting the run.
        print(f"Failed to process {input_file}: {type(e).__name__}: {e}")


Processing  Flamingo_Site_01-0130-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
Processing  Flamingo_Site_01-0131-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
Processing  Flamingo_Site_01-0132-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
Processing  Flamingo_Site_01-0133-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
Processing  Flamingo_Site_01-0135-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
Processing  Flamingo_Site_01-0139-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
Processing  Flamingo_Site_01-0140-FLAMINGO_Informed_Consent_23-0322-ST3072(2).pdf
