In [29]:
import csv
import io
import os
from collections import defaultdict

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
from PIL import Image

def extract_table_titles(pdf_path):
    """Collect candidate table titles from the text of a PDF.

    A line counts as a title when it consists of exactly one word whose first
    character is uppercase and it is either the first title found or is
    preceded by at least two consecutive blank lines. The blank-line counter
    is carried over from one page to the next.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A tuple ``(table_titles, title_frequencies)``: the titles in the
        order they were found, and a plain dict with one key per title.
        Every value in that dict is 0 -- no occurrences are counted here.
    """
    table_titles = []
    title_frequencies = defaultdict(int)

    # Number of consecutive blank lines seen since the last non-blank line
    blank_lines_count = 0

    # The context manager closes the document even if text extraction raises
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            lines = doc.load_page(page_num).get_text("text").split('\n')

            for line in lines:
                words = line.split()

                if not words:
                    blank_lines_count += 1
                    continue

                is_candidate = len(words) == 1 and words[0][0].isupper()
                # The very first title is accepted without the blank-line requirement
                if is_candidate and (not table_titles or blank_lines_count >= 2):
                    table_titles.append(words[0])
                    title_frequencies[words[0]] = 0

                # Any non-blank line, title or not, ends the current run of blank lines
                blank_lines_count = 0

    return table_titles, dict(title_frequencies)
    
def extract_images_from_pdf(pdf_path, output_folder):
    """Save every embedded image of a PDF as a file in ``output_folder``.

    Each image is decoded with Pillow and written as
    ``page_<page number>img<image number>.<format>``; both numbers are
    1-based and the extension is the lower-cased format Pillow detected.
    One line is printed per saved image and one more when the run finishes.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        output_folder: Directory that receives the image files; it is
            created when it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even if an image fails to decode
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # the first field of each entry is passed to extract_image
                base_image = pdf_document.extract_image(xref)

                image = Image.open(io.BytesIO(base_image["image"]))
                image_path = f"{output_folder}/page_{page_num + 1}img{img_index + 1}.{image.format.lower()}"
                image.save(image_path)

                print(f"Saved image: {image_path}")

    print("Image extraction complete.")


def normalize_header(header):
    """Return a header cell with line breaks removed and both ends trimmed.

    Newline characters are deleted everywhere in the text (so a header that
    was wrapped over two lines is joined back together), then leading and
    trailing whitespace is stripped. Spaces inside the text are kept.

    Args:
        header: The header text, or ``None`` for an empty cell.

    Returns:
        The cleaned text; an empty string when ``header`` is ``None``.
    """
    # Table extractors may report an empty cell as None instead of ""
    if header is None:
        return ""
    return header.replace('\n', "").strip()

def extract_tables_from_pdf(pdf_path, output_folder):
    """Extract the tables of a PDF into CSV files named after the table titles.

    Titles come from ``extract_table_titles`` and are consumed in order. A
    table whose first row contains at least one of the key column names
    starts a new CSV file; any other table is treated as the continuation of
    the most recent one (for example a table that spills onto the next page)
    and all of its rows, including the first, are appended to that CSV.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the CSV files; it is created
            when it does not exist yet.
    """
    table_titles, _ = extract_table_titles(pdf_path)

    # A first row holding ANY of these column names marks the start of a new table
    keywords = {"region_id", "country_id", "location_id", "job_id"}

    os.makedirs(output_folder, exist_ok=True)

    title_index = 0
    current_path = None     # CSV file of the table currently being filled
    current_columns = None  # column labels of that file, reused for continuation rows

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            for table_index, table in enumerate(page.extract_tables()):
                if not table:
                    continue  # nothing was extracted for this table

                first_row = table[0]
                # Empty cells are skipped so they cannot break the normalization
                headers = {normalize_header(cell) for cell in first_row if cell is not None}
                print(f"Page {page_num + 1}, Table {table_index + 1} headers: {headers}")

                ok = int(not keywords.isdisjoint(headers))

                if ok:
                    if title_index < len(table_titles):
                        title = table_titles[title_index]
                    else:
                        # More header tables than detected titles: use a generated name
                        title = f"table_{title_index + 1}"
                        print(f"No title left for table {title_index + 1}, using: {title}")
                    title_index += 1

                    current_path = os.path.join(output_folder, f"{title}.csv")
                    current_columns = list(first_row)
                    pd.DataFrame(table[1:], columns=current_columns).to_csv(current_path, index=False)
                elif current_path is None:
                    # A continuation needs a table to continue; none has been written yet
                    print(f"Skipped table: Page {page_num + 1}, Table {table_index + 1} has no preceding header table")
                    continue
                elif len(first_row) == len(current_columns):
                    # The first row is data, not a header: append every row under the
                    # existing columns without rewriting the file or its header line
                    pd.DataFrame(table, columns=current_columns).to_csv(
                        current_path, mode="a", header=False, index=False
                    )
                else:
                    # Column count differs from the current table, so the rows cannot be
                    # placed under its columns; merge by label with the first row as header
                    df = pd.DataFrame(table[1:], columns=first_row)
                    df_existing = pd.read_csv(current_path)
                    df_merged = pd.concat([df_existing, df], ignore_index=True)
                    df_merged.to_csv(current_path, index=False)
                    current_columns = list(df_merged.columns)

                print(f"Saved table: {current_path}, OK: {ok}")

    print("Table extraction complete.")
    
# Example usage
pdf_path = 'employee_details.pdf'  # Path to your PDF file
output_folder = 'extracted_content'  # Output folder to save images and tables

import os
# exist_ok=True creates the folder only when it is missing, without a separate existence check
os.makedirs(output_folder, exist_ok=True)

extract_images_from_pdf(pdf_path, output_folder)
extract_tables_from_pdf(pdf_path, output_folder)

Saved image: extracted_content/page_10img1.png
Image extraction complete.
Page 1, Table 1 headers: {'region_id', 'region_name'}
Saved table: extracted_content\Regions.csv, OK: 1
Page 1, Table 2 headers: {'country_id', 'region_id', 'country_name'}
Saved table: extracted_content\Countries.csv, OK: 1
Page 2, Table 1 headers: {'street_address', 'city', 'state_province', 'location_id', 'country_id', 'postal_code'}
Saved table: extracted_content\Locations.csv, OK: 1
Page 3, Table 1 headers: {'department_id', 'location_id', 'manager_id', 'department_name'}
Saved table: extracted_content\Departments.csv, OK: 1
Page 4, Table 1 headers: {'job_title', 'max_salary', 'job_id', 'min_salary'}
Saved table: extracted_content\Jobs.csv, OK: 1
Page 4, Table 2 headers: {'department_id', 'last_name', 'employee_id', 'phone_number', 'hire_date', 'email', 'salary', 'job_id', 'commission_pct', 'first_name', 'manager_id'}
Saved table: extracted_content\Employees.csv, OK: 1
Page 5, Table 1 headers: {'Bruce', 'BER

In [8]:
import os

import pytesseract
from PIL import Image

# Path of the Tesseract executable. Set the TESSERACT_CMD environment variable
# to override the default Windows install location used below.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

# Example usage: run OCR on one image and print the recognised text
with Image.open('img.jpg') as img:  # closes the image file once OCR is done
    text = pytesseract.image_to_string(img)
print(text)


About Me

Lorem ipsum dolor sit amet,
consectetur adipiscing elit.
Vestibulum sit amet quam
rhoncus, egestas dui eget,
malesuada justo. Ut aliquam
augue.

eg +123-456-7890
@ hello@reallygreatsite.com
6 123 Anywhere St., Any City

LANGUAGE

« English
« Germany (basic)

¢ Spain (basic)

EXPERTISE

* Management Skills
¢ Creativity

¢ Digital Marketing
* Negotiation
Critical Thinking
Leadership

RICHARD

SANCHEZ

Product Designer

EXPERIEN

Studio Showde

Canberra - Australia

2020 - 2022

Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vestibulum sit amet quam rhoncus, egestas dui eget,
malesuada justo. Ut aliquam augue.

Elsetown Cor.

Kota Baru - Singapore

2016 - 2020

Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vestibulum sit amet quam rhoncus, egestas dui eget,
malesuada justo. Ut aliquam augue.

Studio Showde

sydney - Australia

2010 - 2015

Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vestibulum sit amet quam rhoncus, egestas dui eget,
malesuada j