In [5]:
import os

# Output folder (relative to the working directory) that receives every file
# generated by the examples in this notebook. The trailing slash matters: the
# examples build paths by plain concatenation (f"{ArquivosManipulados}name.pdf").
ArquivosManipulados = "Arquivo Manipulado/"
# Create the folder up front so the write calls do not fail with
# FileNotFoundError when the notebook runs in a clean directory.
os.makedirs(ArquivosManipulados, exist_ok=True)

## 1. Mesclar arquivos PDF

In [1]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was executed with.
%pip install PyPDF2==3.0.1

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [7]:
from PyPDF2 import PdfMerger

def merge_pdfs(pdf_list, output):
    """Concatenate several PDFs, in the given order, into a single file.

    Args:
        pdf_list: Iterable of inputs accepted by ``PdfMerger.append``
            (the usage example passes file paths).
        output: Path of the merged PDF to create. It is opened in ``'wb'``
            mode, so an existing file is overwritten.
    """
    merger = PdfMerger()
    try:
        for pdf in pdf_list:
            merger.append(pdf)

        with open(output, 'wb') as output_pdf:
            merger.write(output_pdf)
    finally:
        # Release the resources held by the merger (including the input
        # files it opened) even when appending or writing fails.
        merger.close()

# Example usage: merge three PDFs into "merged.pdf" inside the output folder.
pdf_files = ['file1.pdf', 'file2.pdf', 'file3.pdf']
merge_pdfs(pdf_files,f"{ArquivosManipulados}merged.pdf")


## 2. Extrair texto de um PDF


In [9]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: PDF to read, passed to ``PdfReader``.

    Returns:
        The per-page results of ``extract_text()`` concatenated in page
        order, with no separator inserted between pages.
    """
    document = PdfReader(pdf_path)
    return "".join(page.extract_text() for page in document.pages)

# Example usage: extract the text of file1.pdf and print it.
pdf_text = extract_text_from_pdf("file1.pdf")
print(pdf_text)


Dom Casmu rro
Machado de  Assi s
CAPÍTULO PRIM EIRO / DO TÍTULO 
Uma noite destas, vindo da  cidade para o Engenho N ovo, e ncontrei num  trem da Central um rapaz 
aqui do ba irro, que  eu conhe ço de vista e de chapéu. Cum primentou-m e, sentou-se ao pé de mim, 
falou da  lua e dos ministros, e acabou re citando-m e versos. A viagem era curta, e os versos pode  ser 
que não fos sem inteiramente maus. Sucedeu, poré m, que , como eu estava cansado, fe chei os olhos 
três ou qua tro ve zes; tanto bastou pa ra que ele interrom pesse a leitura e metesse os versos no bol so. 
-- Cont inue, disse eu acordando. 
-- Já acabei, murmurou e le. 
-- São muito boni tos. 
Vi-lhe fazer um gesto para tirá-los outra vez do bol so, mas não passou do ge sto; estava amuado. N o 
dia seguinte entrou a  dizer de mim nom es feios, e acabou a lcunha ndo-m e Dom Casmurro. O s 
vizinhos , que  não gos tam dos meus hábitos reclusos e calados, deram curso à alcunha , que  afinal 
pegou. N em por i sso me zangue i

## 3. Adicionar uma página de um PDF a outro


In [10]:
from PyPDF2 import PdfReader, PdfWriter

def add_page_to_pdf(input_pdf, page_number, output_pdf):
    """Write a new PDF that contains a single page taken from ``input_pdf``.

    Args:
        input_pdf: Source PDF, passed to ``PdfReader``.
        page_number: Index into the reader's ``pages`` sequence (the usage
            example passes 0 for the first page).
        output_pdf: Path of the PDF to create (opened in ``'wb'`` mode).
    """
    source = PdfReader(input_pdf)
    selected_page = source.pages[page_number]

    # The writer receives only the selected page.
    single_page_writer = PdfWriter()
    single_page_writer.add_page(selected_page)

    with open(output_pdf, 'wb') as destination:
        single_page_writer.write(destination)

# Example usage:
add_page_to_pdf('source.pdf', 0, f'{ArquivosManipulados}output.pdf')  # Copies the first page of source.pdf into output.pdf


## 4. Rotacionar páginas


In [12]:
from PyPDF2 import PdfReader, PdfWriter

def rotate_pdf(input_pdf, output_pdf, degrees):
    """Rotate every page of a PDF and save the result as a new file.

    Args:
        input_pdf: Source PDF, passed to ``PdfReader``.
        output_pdf: Path of the rotated PDF to create (opened in ``'wb'`` mode).
        degrees: Angle handed to each page's ``rotate`` method.
    """
    source = PdfReader(input_pdf)
    rotated = PdfWriter()

    for current_page in source.pages:
        # rotate_clockwise was removed in PyPDF2 3.0.0; rotate replaces it.
        current_page.rotate(degrees)
        rotated.add_page(current_page)

    with open(output_pdf, 'wb') as destination:
        rotated.write(destination)

# Example usage:
rotate_pdf('source.pdf', f'{ArquivosManipulados}rotated.pdf', 90)  # Rotates the pages 90 degrees clockwise


## 5. Adicionar um carimbo de texto ou imagem (watermark) em PDFs usando PyPDF2

In [13]:
from PyPDF2 import PdfReader, PdfWriter

def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Merge a watermark page into every page of a PDF and save the result.

    Args:
        input_pdf: PDF whose pages receive the watermark.
        watermark_pdf: PDF providing the watermark; only its first page is used.
        output_pdf: Path of the PDF to create (opened in ``'wb'`` mode).
    """
    source = PdfReader(input_pdf)
    stamp_source = PdfReader(watermark_pdf)
    stamp = stamp_source.pages[0]

    stamped = PdfWriter()
    for current_page in source.pages:
        # merge_page combines the watermark page into the current page.
        current_page.merge_page(stamp)
        stamped.add_page(current_page)

    with open(output_pdf, 'wb') as destination:
        stamped.write(destination)

# Example usage: merge the first page of watermark.pdf into every page of source.pdf.
add_watermark('source.pdf', 'watermark.pdf', f'{ArquivosManipulados}output_with_watermark.pdf')


## 6. Criar um PDF simples com texto e formas usando reportlab


In [15]:
from reportlab.pdfgen import canvas

def create_pdf(output_pdf):
    """Generate a PDF with a line of text, a straight line and a rectangle.

    Args:
        output_pdf: Destination handed to ``canvas.Canvas``.
    """
    drawing = canvas.Canvas(output_pdf)

    # Text label.
    drawing.drawString(100, 750, "Este é um PDF gerado com reportlab")
    # Line between (50, 700) and (500, 700).
    drawing.line(50, 700, 500, 700)
    # Rectangle.
    drawing.rect(100, 650, 200, 100)

    # Write the PDF out.
    drawing.save()

# Example usage: generate "created.pdf" inside the output folder.
create_pdf(f'{ArquivosManipulados}created.pdf')


## 7. Modificar metadados do PDF usando PyPDF2


In [16]:
from PyPDF2 import PdfReader, PdfWriter

def modify_metadata(input_pdf, output_pdf, metadata):
    """Copy a PDF's pages into a new file and attach the given metadata.

    Args:
        input_pdf: Source PDF, passed to ``PdfReader``.
        output_pdf: Path of the PDF to create (opened in ``'wb'`` mode).
        metadata: Mapping handed to ``PdfWriter.add_metadata``; the usage
            example uses keys such as ``'/Title'`` and ``'/Author'``.
    """
    source = PdfReader(input_pdf)
    updated = PdfWriter()

    # Carry over every page unchanged.
    for current_page in source.pages:
        updated.add_page(current_page)

    # Attach the caller-supplied metadata to the new document.
    updated.add_metadata(metadata)

    with open(output_pdf, 'wb') as destination:
        updated.write(destination)

# Example usage: write a copy of source.pdf carrying the title, author and subject below.
new_metadata = {
    '/Title': 'Novo Título',
    '/Author': 'Ana Marcacini',
    '/Subject': 'PDF Modificado'
}
modify_metadata('source.pdf', f'{ArquivosManipulados}output_with_metadata.pdf', new_metadata)


## 8. Criptografar um PDF usando pikepdf

In [18]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was executed with.
%pip install pikepdf==9.3.0

Defaulting to user installation because normal site-packages is not writeable
Collecting pikepdf
  Downloading pikepdf-9.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: pikepdf
Successfully installed pikepdf-9.3.0


In [19]:
import pikepdf

def encrypt_pdf(input_pdf, output_pdf, password):
    """Save a password-protected copy of a PDF using pikepdf.

    Args:
        input_pdf: PDF to open with ``pikepdf.open``.
        output_pdf: Destination of the encrypted copy.
        password: Used as both the user and the owner password.
    """
    with pikepdf.open(input_pdf) as document:
        protection = pikepdf.Encryption(user=password, owner=password, R=4)
        document.save(output_pdf, encryption=protection)

# Example usage: write an encrypted copy of source.pdf protected by the given password.
encrypt_pdf('source.pdf', f'{ArquivosManipulados}encrypted.pdf', 'minha_senha')


## 9. Extrair imagens de um PDF usando PyMuPDF (fitz)


In [22]:
# The PyPI package named "fitz" is unrelated to PyMuPDF and shadows the `fitz`
# module that PyMuPDF provides, so install PyMuPDF itself instead.
%pip install PyMuPDF==1.24.10

Defaulting to user installation because normal site-packages is not writeable


In [24]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was executed with.
%pip install PyMuPDF==1.24.10


Defaulting to user installation because normal site-packages is not writeable
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting PyMuPDFb==1.24.10
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m31m64.2 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10


In [27]:
import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_folder):
    """Save every image referenced by the pages of a PDF as a separate file.

    Files are named ``page<P>_img<I>.<ext>``, where both counters start at 1
    and ``<ext>`` is the extension reported by ``extract_image``.

    Args:
        pdf_path: PDF to scan, passed to ``fitz.open``.
        output_folder: Directory that receives the images; it is created
            when it does not exist yet.
    """
    import os  # local import keeps this cell self-contained

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf:
        # Without this the first open(..., "wb") fails when the folder is absent.
        os.makedirs(output_folder, exist_ok=True)

        for page_number in range(len(pdf)):
            page = pdf.load_page(page_number)
            images = page.get_images(full=True)

            for image_index, img in enumerate(images):
                xref = img[0]  # first item of each entry is used as the image xref
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_name = os.path.join(
                    output_folder,
                    f"page{page_number + 1}_img{image_index + 1}.{image_ext}",
                )

                with open(image_name, "wb") as img_file:
                    img_file.write(image_bytes)

# Example usage: save the images found in source.pdf under "output_images" in the output folder.
extract_images_from_pdf('source.pdf', f'{ArquivosManipulados}output_images')


## 10. Remover páginas específicas de um PDF usando PyPDF2


In [29]:
from PyPDF2 import PdfReader, PdfWriter

def remove_pages(input_pdf, output_pdf, pages_to_remove):
    """Write a copy of a PDF that omits the given pages.

    Args:
        input_pdf: Source PDF, passed to ``PdfReader``.
        output_pdf: Path of the PDF to create (opened in ``'wb'`` mode).
        pages_to_remove: Iterable of 0-based page indices to leave out.
            Indices that do not exist in the document are ignored.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Materialise once: constant-time membership tests, and a one-shot
    # iterable (e.g. a generator) is not consumed by the first `in` check.
    excluded = set(pages_to_remove)

    # Keep only the pages that were not marked for removal.
    for index, page in enumerate(reader.pages):
        if index not in excluded:
            writer.add_page(page)

    with open(output_pdf, 'wb') as output_file:
        writer.write(output_file)

# Example usage:
remove_pages('source.pdf', f'{ArquivosManipulados}output_without_pages.pdf', [0, 2])  # Removes pages 1 and 3 (indices are 0-based)
