# PDF-Tabellen erkennen und extrahieren

In [6]:
# V3 
import os
import subprocess
import sys
import threading
import tkinter as tk
from tkinter import ttk, filedialog, messagebox

import pandas as pd
import pdfplumber

def select_file():
    """Ask the user for a PDF and start table extraction on a new thread."""
    chosen_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    if not chosen_path:
        # No path came back from the dialog, so there is nothing to process.
        return
    # Run the extraction on its own thread instead of inside this callback.
    worker = threading.Thread(target=extract_tables, args=(chosen_path,))
    worker.start()

def extract_tables(file_path):
    """Extract every table from a PDF and save them to an Excel workbook.

    Each table found by pdfplumber becomes one sheet (``Table_1``,
    ``Table_2``, ...) in a workbook named ``"<pdf name> (Tables
    extracted).xlsx"`` that is written next to the PDF. The first row of each
    table is used as the column header. Progress and status are reported
    through the module-level ``progress_bar`` and ``status_label`` widgets,
    and the finished workbook is opened with ``open_file``.

    Any exception is printed and shown in an error dialog rather than
    propagated to the caller.

    NOTE(review): this function updates Tk widgets directly and is started on
    a separate thread by ``select_file``; confirm that is acceptable for the
    Tcl/Tk build in use.

    Args:
        file_path: Path of the PDF file to read.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            progress_bar["maximum"] = total_pages
            progress_bar["value"] = 0
            all_tables = []

            for page_number, page in enumerate(pdf.pages, start=1):
                update_status(f"Processing page {page_number} of {total_pages}...")
                for table in page.extract_tables():
                    if not table:
                        # An empty table has no header row; indexing table[0]
                        # would raise and abort the whole extraction.
                        continue
                    # The first row supplies the column names.
                    all_tables.append(pd.DataFrame(table[1:], columns=table[0]))
                progress_bar["value"] = page_number
                root.update_idletasks()

            if not all_tables:
                messagebox.showerror("Error", "No tables found in the PDF.")
                return

            # The workbook is written to the same directory as the PDF.
            pdf_directory = os.path.dirname(file_path)

            # Derive the workbook name from the PDF's base name.
            base_name = os.path.splitext(os.path.basename(file_path))[0]
            excel_file_path = os.path.join(pdf_directory, f"{base_name} (Tables extracted).xlsx")

            # Write one sheet per extracted table.
            with pd.ExcelWriter(excel_file_path) as writer:
                for sheet_number, frame_data in enumerate(all_tables, start=1):
                    frame_data.to_excel(writer, sheet_name=f"Table_{sheet_number}", index=False)

        messagebox.showinfo("Success", f"Tables have been extracted and saved to '{excel_file_path}'.")

        # Open the workbook that was just created.
        open_file(excel_file_path)
    except Exception as e:
        # Top-level boundary for the extraction: report the failure to the
        # console and to the user instead of letting the exception escape.
        print(f"An error occurred: {str(e)}")
        messagebox.showerror("Error", f"An error occurred: {str(e)}")
    finally:
        progress_bar.stop()  # Stop the progress bar
        update_status("")  # Clear status message


def update_status(message):
    """Display *message* as the text of the module-level status label."""
    status_label.configure(text=message)

def open_file(file_path):
    """Open a file with the operating system's default application.

    On Windows the file is opened with ``os.startfile``. On POSIX systems the
    ``open`` command is used on macOS and ``xdg-open`` everywhere else. On any
    other ``os.name`` nothing happens. Failures are printed and shown in an
    error dialog rather than raised.

    Args:
        file_path: Path of the file to open.
    """
    try:
        if os.name == 'nt':  # Windows
            os.startfile(file_path)
        elif os.name == 'posix':  # macOS or Linux
            # sys.platform tells macOS apart from other POSIX systems, which
            # use different launcher commands.
            opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
            subprocess.call([opener, file_path])
    except Exception as e:
        print(f"Could not open the file: {str(e)}")
        messagebox.showerror("Error", f"Could not open the file: {str(e)}")

# Build the GUI. The widgets are packed top to bottom in the order they are
# created below: prompt label, file button, progress bar, status label.
root = tk.Tk()
root.title("PDF Table Extractor")

# Container frame with padding around all widgets.
frame = tk.Frame(root, padx=10, pady=10)
frame.pack(padx=10, pady=10)

label = tk.Label(frame, text="Select a PDF file to extract tables:")
label.pack(pady=5)

# Clicking the button calls select_file.
button = tk.Button(frame, text="Select PDF", command=select_file)
button.pack(pady=5)

# Read as a module-level name by extract_tables, which sets its "maximum"
# and "value".
progress_bar = ttk.Progressbar(frame, orient="horizontal", mode="determinate")
progress_bar.pack(pady=5, fill=tk.X)

# Read as a module-level name by update_status, which sets its text.
status_label = tk.Label(frame, text="")
status_label.pack(pady=5)

# Enter the Tk event loop.
root.mainloop()
