# Remove background from images

In [14]:
import cv2
import os
import matplotlib.pyplot as plt
import tkinter as tk
from tkinter import filedialog

def remove_background():
    """Pick an image, black out everything below its Otsu threshold, and save it.

    Opens a file dialog, converts the chosen image to grayscale, builds a
    binary mask with Otsu's automatic threshold and keeps only the pixels
    under that mask. The result is written next to the original as
    ``<name>_foreground<ext>``.

    Prints a message and returns without writing anything when the dialog is
    cancelled or the file cannot be decoded as an image.
    """
    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    try:
        # Open file dialog to select image file
        file_path = filedialog.askopenfilename()
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    if not file_path:
        print("No file selected")
        return

    # cv2.imread reports failure by returning None instead of raising.
    img = cv2.imread(file_path)
    if img is None:
        print("Could not read image: " + file_path)
        return

    # Convert image to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Threshold to extract the foreground object (the hand). With THRESH_OTSU
    # the threshold is chosen automatically, so the 0 passed here is a placeholder.
    _, mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Remove the background: pixels outside the mask become black.
    foreground = cv2.bitwise_and(img, img, mask=mask)

    # Save the foreground image in the same directory as the original.
    file_name, file_extension = os.path.splitext(os.path.basename(file_path))
    directory = os.path.dirname(file_path)
    output_path = os.path.join(directory, file_name + "_foreground" + file_extension)

    # cv2.imwrite returns False when the file could not be written.
    if cv2.imwrite(output_path, foreground):
        print("Foreground image saved to: " + output_path)
    else:
        print("Could not save foreground image to: " + output_path)
    
# Run the tool once: prompts for an image and saves the masked copy beside it.
remove_background()

Foreground image saved to: C:/Users/Ahad/Desktop\lol_foreground.jpg


# Merge PDF files

In [25]:
import os
import tkinter as tk
from tkinter import filedialog

from PyPDF2 import PdfFileMerger

def merge_pdfs(output_file):
    """Ask the user for PDF files and concatenate them into ``output_file``.

    Args:
        output_file: Path the merged PDF is written to (overwritten if present).

    The files are merged in the order returned by the selection dialog.
    Prints "No files selected" and writes nothing when the dialog is cancelled.

    NOTE(review): ``PdfFileMerger`` is the legacy PyPDF2 class name; confirm
    the installed PyPDF2 version still provides it.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        #Select all the pdf files in the dialog to merge
        pdf_files = filedialog.askopenfilenames(title = "Select PDF files", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    if not pdf_files:
        print("No files selected")
        return

    # Create a pdf merger object
    merger = PdfFileMerger()
    try:
        for pdf_file in pdf_files:
            # Hand over the path rather than an open handle: the merger then
            # owns the file until close(), so nothing it still needs is closed
            # before write() runs.
            merger.append(pdf_file)

        # Write the merged pdf to the output file
        with open(output_file, 'wb') as file:
            merger.write(file)
    finally:
        # Always release the inputs, even if appending or writing failed.
        merger.close()

    print(f"PDFs merged successfully and saved as {output_file}")
        
# Write the merged document to the current user's Desktop instead of a path
# that only exists on one machine.
output_file = os.path.join(os.path.expanduser('~'), 'Desktop', 'merged_pdf.pdf')
merge_pdfs(output_file)

No files selected


# PDF to JPG

In [5]:
from pdf2image import convert_from_path
import tkinter as tk
from tkinter import filedialog
import os
import fitz

In [6]:
def pdf_to_jpg():
    """Render every page of a user-chosen PDF to a JPEG in a user-chosen folder.

    Pages are written as ``page_<index>.jpg`` with a zero-based index, using
    the pixmap PyMuPDF produces with its default settings. Prints a message
    and returns early when either dialog is cancelled.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        pdf_file = filedialog.askopenfilename(title = "Select PDF file", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return

        folder_path = filedialog.askdirectory(title = "Select output folder")
        if not folder_path:
            print("No folder selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_file) as doc:
        for i in range(doc.page_count):
            pix = doc[i].get_pixmap()
            pix.save(f'{folder_path}/page_{i}.jpg')

    print(f"PDF {pdf_file} converted to jpeg and saved at {folder_path}")
                    
# Run the converter once: prompts for the PDF and for the output folder.
pdf_to_jpg()

PDF C:/Users/Ahad/Desktop/Abdul_Ahad.pdf converted to jpeg and saved at C:/Users/Ahad/Desktop


# JPG to PDF

In [7]:
import tkinter as tk
from PIL import Image
from tkinter import filedialog

In [8]:
def image_to_pdf():
    """Convert one user-chosen image into a single-page PDF.

    The image is flattened onto a white canvas of the same size and saved to
    the path chosen in the "Save as" dialog at a resolution of 100.
    Transparent areas of the source come out white. Prints a message and
    returns early when either dialog is cancelled.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        # Tk takes a space-separated pattern list; the semicolon-joined form
        # is presumably only understood by the native Windows dialog.
        image_file = filedialog.askopenfilename(title = "Select image file", filetypes = (("image files", "*.jpg *.jpeg *.png"), ("all files", "*.*")))
        if not image_file:
            print("No file selected")
            return

        pdf_file = filedialog.asksaveasfilename(title = "Save as",defaultextension='.pdf',filetypes = (("pdf files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    # Open the image file
    with Image.open(image_file) as image:
        # Paste through the alpha channel so the white canvas shows wherever
        # the source is transparent; a plain paste would discard the alpha
        # and overwrite the canvas completely.
        rgba = image.convert("RGBA")
        pdf = Image.new("RGB", rgba.size, (255, 255, 255))
        pdf.paste(rgba, (0, 0), rgba)

        # Save the pdf
        pdf.save(pdf_file, "PDF", resolution = 100.0)

    print(f"Image {image_file} converted to pdf and saved as {pdf_file}")
                
# Run the converter once: prompts for the image and for the PDF to save.
image_to_pdf()

Image C:/Users/Ahad/Desktop/lol.png converted to pdf and saved as C:/Users/Ahad/Desktop/plew.pdf


# Compress PDF

In [1]:
import PyPDF2
import tkinter as tk
from tkinter import filedialog

In [2]:
def compress_pdf():
    """Write a copy of a user-chosen PDF with compressed page content streams.

    Every page is copied into a new document after its content streams have
    been compressed through PyPDF2, and the copy's Title and Author metadata
    are set to fixed values. The copy is saved to the path chosen in the
    "Save as" dialog, which must differ from the input file.

    Prints a message and returns early when either dialog is cancelled or
    when the output path is the input file itself.
    """
    import os  # local import: this cell does not import os at notebook level

    root = tk.Tk()
    root.withdraw()
    try:
        pdf_file = filedialog.askopenfilename(title = "Select PDF file", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return

        compressed_pdf_file = filedialog.asksaveasfilename(title = "Save as",defaultextension='.pdf',filetypes = (("pdf files", "*.pdf"), ("all files", "*.*")))
        if not compressed_pdf_file:
            print("No file selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    # Opening the output for writing truncates it, which would destroy the
    # input before its pages have been read.
    same_file = (os.path.normcase(os.path.abspath(compressed_pdf_file))
                 == os.path.normcase(os.path.abspath(pdf_file)))
    if same_file:
        print("Output file must be different from the input file")
        return

    # Open the pdf file
    with open(pdf_file, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        pdf_writer = PyPDF2.PdfFileWriter()

        for index in range(pdf_reader.numPages):
            page = pdf_reader.getPage(index)
            # Copying pages alone does not shrink anything; this is the step
            # that actually compresses the page content.
            page.compressContentStreams()
            pdf_writer.addPage(page)

        pdf_writer.addMetadata({
            '/Title': 'compressed pdf',
            '/Author': 'User'
        })

        with open(compressed_pdf_file, 'wb') as output:
            pdf_writer.write(output)

    print(f"PDF {pdf_file} is compressed and saved as {compressed_pdf_file}")
                
# Run the tool once: prompts for the source PDF and for the copy to save.
compress_pdf()

PDF C:/Users/Ahad/Desktop/Abdul_Ahad.pdf is compressed and saved as C:/Users/Ahad/Desktop/plew.pdf


# Split PDFs

In [3]:
import PyPDF2
import tkinter as tk
from tkinter import filedialog

In [5]:
def split_pdf():
    """Split a user-chosen PDF into files of a fixed number of pages.

    Asks for the source PDF, an output folder and the number of pages per
    file, then writes ``split_<start>-<stop>.pdf`` files to that folder.
    ``start`` is the zero-based index of the first page in the file and
    ``stop`` is one past the last page it contains; the final file may hold
    fewer pages than requested.

    Prints a message and returns early when a dialog is cancelled or the
    page count entered is not a positive whole number.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        pdf_file = filedialog.askopenfilename(title = "Select PDF file", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return

        folder_path = filedialog.askdirectory(title = "Select output folder")
        if not folder_path:
            print("No folder selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    try:
        num_of_pages = int(input("Enter the number of pages per split pdf file: "))
    except ValueError:
        num_of_pages = 0
    # Zero would be an invalid range step and a negative value would produce
    # no files at all, so both are rejected together with non-numeric input.
    if num_of_pages < 1:
        print("Number of pages per file must be a positive whole number")
        return

    # Open the pdf file
    with open(pdf_file, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        total_pages = pdf_reader.numPages

        for start in range(0, total_pages, num_of_pages):
            # Clamp the last chunk so neither the loop nor the file name
            # refers to pages past the end of the document.
            stop = min(start + num_of_pages, total_pages)
            pdf_writer = PyPDF2.PdfFileWriter()

            for page in range(start, stop):
                pdf_writer.addPage(pdf_reader.getPage(page))

            output_file = f'{folder_path}/split_{start}-{stop}.pdf'
            with open(output_file, 'wb') as output:
                pdf_writer.write(output)

    print(f"PDF {pdf_file} is splitted and saved at {folder_path}")
                
# Run the tool once: prompts for the PDF, the output folder and the pages per file.
split_pdf()

Enter the number of pages per split pdf file: 2
PDF C:/Users/Ahad/Desktop/Abdul_Ahad.pdf is splitted and saved at C:/Users/Ahad/Desktop


# PDF to Word

In [7]:
import PyPDF2
import docx
import tkinter as tk
from tkinter import filedialog

In [8]:
def pdf_to_docx():
    """Copy the extractable text of a user-chosen PDF into a new Word document.

    The text PyPDF2 extracts from every page is concatenated and stored as a
    single paragraph in the ``.docx`` file chosen in the "Save as" dialog;
    nothing other than that text is written to the document. Prints a
    message and returns early when either dialog is cancelled.

    Note: a later cell of this notebook defines an OCR-based function with
    the same name; whichever definition ran last is the one that gets called.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        pdf_file = filedialog.askopenfilename(title = "Select PDF file", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return

        docx_file = filedialog.asksaveasfilename(title = "Save as",defaultextension='.docx',filetypes = (("Word files", "*.docx"), ("all files", "*.*")))
        if not docx_file:
            print("No file selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    # Open the pdf file
    with open(pdf_file, 'rb') as file:
        pdf_reader = PyPDF2.PdfFileReader(file)
        text = "".join(
            pdf_reader.getPage(index).extractText() or ""
            for index in range(pdf_reader.numPages)
        )

    # A .docx is XML, which cannot hold control characters other than tab,
    # newline and carriage return; extracted PDF text can contain them and
    # the document library may refuse such strings.
    xml_unsafe = {code: None for code in range(32) if code not in (9, 10, 13)}

    doc = docx.Document()
    doc.add_paragraph(text.translate(xml_unsafe))
    doc.save(docx_file)
    print(f"PDF {pdf_file} is converted to docx and saved as {docx_file}")
                
# Run the converter once: prompts for the PDF and for the .docx file to save.
pdf_to_docx()

PDF C:/Users/Ahad/Desktop/Abdul_Ahad.pdf is converted to docx and saved as C:/Users/Ahad/Desktop/plew.docx


# Alternate PDF to Word using Tesseract for one page PDFs

In [1]:
from wand.image import Image as wi
from PIL import Image
import pytesseract
import docx
import tkinter as tk
from tkinter import filedialog
import os

def pdf_to_docx():
    """OCR a user-chosen PDF with Tesseract and save the text as a Word document.

    The PDF is rendered to JPEG at a resolution of 300 in a temporary
    directory, each resulting image is passed through ``pytesseract`` and the
    recognised text is added to the document as one paragraph per image. The
    temporary images are removed afterwards. Prints a message and returns
    early when either dialog is cancelled or no image was produced.

    Note: this reuses the name of the text-extraction ``pdf_to_docx`` defined
    in an earlier cell of this notebook; whichever definition ran last is the
    one that gets called.
    """
    import tempfile  # local import: not imported at notebook level

    root = tk.Tk()
    root.withdraw()
    try:
        pdf_file = filedialog.askopenfilename(title = "Select PDF file", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return

        docx_file = filedialog.asksaveasfilename(title = "Save as",defaultextension='.docx',filetypes = (("Word files", "*.docx"), ("all files", "*.*")))
        if not docx_file:
            print("No file selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    # A .docx is XML, which cannot hold control characters other than tab,
    # newline and carriage return; OCR output may contain e.g. a form feed.
    xml_unsafe = {code: None for code in range(32) if code not in (9, 10, 13)}

    doc = docx.Document()
    # A private scratch directory replaces the machine-specific Desktop path
    # and guarantees the intermediate images are cleaned up.
    with tempfile.TemporaryDirectory() as work_dir:
        with wi(filename=pdf_file, resolution=300) as rendered:
            rendered.save(filename=os.path.join(work_dir, "temp.jpg"))

        # Process whatever was written instead of assuming a single file name;
        # presumably a multi-page PDF yields several numbered images (TODO
        # confirm). Sorting by (length, name) keeps numbered files in page order.
        page_files = sorted(os.listdir(work_dir), key=lambda name: (len(name), name))
        if not page_files:
            print("No page image could be rendered from the PDF")
            return

        for page_file in page_files:
            with Image.open(os.path.join(work_dir, page_file)) as page_image:
                text = pytesseract.image_to_string(page_image)
            doc.add_paragraph(text.translate(xml_unsafe))

    doc.save(docx_file)
    print(f"PDF {pdf_file} is converted to docx and saved as {docx_file}")
                
# Run the OCR converter once: prompts for the PDF and for the .docx file to save.
pdf_to_docx()

PDF C:/Users/Ahad/Desktop/CC-73DE11A7F8.pdf is converted to docx and saved as C:/Users/Ahad/Desktop/vxc.docx


In [6]:
import tkinter as tk
from tkinter import filedialog
from PyPDF2 import PdfFileReader
from PIL import Image

In [7]:
def extract_images_from_pdf():
    """Save the image XObjects of a user-chosen PDF as JPEG files.

    Every page's ``/XObject`` resources are scanned for entries whose subtype
    is ``/Image``. The first image of page ``N`` (zero-based) is written as
    ``image_N.jpg``; further images on the same page get ``image_N_1.jpg``,
    ``image_N_2.jpg`` and so on, so they no longer overwrite each other.

    Pages without image resources are skipped, and an image that cannot be
    decoded is reported and skipped without stopping the run. Prints a
    message and returns early when either dialog is cancelled.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        pdf_file = filedialog.askopenfilename(title = "Select PDF file", filetypes = (("PDF files", "*.pdf"), ("all files", "*.*")))
        if not pdf_file:
            print("No file selected")
            return

        folder_path = filedialog.askdirectory(title = "Select folder to save images")
        if not folder_path:
            print("No folder selected")
            return
    finally:
        # Release the hidden root so repeated runs do not pile up Tk instances.
        root.destroy()

    # PDF colour space -> Pillow mode for raw samples (assumes 8 bits per
    # component — TODO confirm). Anything else keeps the previous RGB guess.
    pillow_modes = {'/DeviceRGB': 'RGB', '/DeviceGray': 'L', '/DeviceCMYK': 'CMYK'}

    try:
        with open(pdf_file, 'rb') as f:
            pdf = PdfFileReader(f)

            for pg in range(pdf.getNumPages()):
                page = pdf.getPage(pg)
                # A page without image resources is normal, not an error.
                if '/Resources' not in page or '/XObject' not in page['/Resources']:
                    continue
                xObject = page['/Resources']['/XObject'].getObject()
                saved_on_page = 0

                for obj in xObject:
                    entry = xObject[obj]
                    if entry['/Subtype'] != '/Image':
                        continue

                    try:
                        size = (entry['/Width'], entry['/Height'])
                        data = entry.getData()
                        colour_space = entry['/ColorSpace'] if '/ColorSpace' in entry else None
                        # Only plain names are looked up; array-valued colour
                        # spaces are not hashable and fall back to RGB.
                        if isinstance(colour_space, str):
                            mode = pillow_modes.get(colour_space, 'RGB')
                        else:
                            mode = 'RGB'
                        img = Image.frombytes(mode, size, data)
                        suffix = '' if saved_on_page == 0 else '_' + str(saved_on_page)
                        img_file = folder_path + '/image_' + str(pg) + suffix + '.jpg'
                        img.save(img_file)
                    except Exception as e:
                        # One undecodable image must not abort the others.
                        print(f"Skipped image {obj} on page {pg}: {e}")
                        continue

                    saved_on_page += 1
                    print(f"image saved as {img_file}")

    except Exception as e:
        print(f"Error Occured: {e}")
            
# Run the extractor once: prompts for the PDF and for the folder to save images in.
extract_images_from_pdf()