In [46]:
import fitz  # PyMuPDF
import io
import pytesseract
from PIL import Image
import pdfplumber
import csv
import pandas as pd
from collections import defaultdict


def extract_table_titles(pdf_path):
    # Deschide PDF-ul
    doc = fitz.open(pdf_path)
    table_titles = []
    title_frequencies = defaultdict(int)

    # Variabilă pentru a ține evidența rândurilor goale între titlurile de tabele
    blank_lines_count = 0
    
    # Parcurge fiecare pagină
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text("text")
        lines = text.split('\n')
        
        for line in lines:
            words = line.split()
            
            # Verifică dacă linia este goală
            if len(words) == 0:
                blank_lines_count += 1
            else:
                # Verifică dacă linia conține un singur cuvânt care începe cu literă mare
                if len(words) == 1 and words[0][0].isupper():
                    # Pentru primul titlu de tabel, nu este nevoie să verificăm numărul de rânduri goale
                    if not table_titles or blank_lines_count >= 2:
                        table_titles.append(words[0])
                        title_frequencies[words[0]] = 0
                    blank_lines_count = 0  # Resetează contorul de rânduri goale
                else:
                    # Resetează contorul de rânduri goale dacă întâlnește o linie care nu este goală sau nu este titlu de tabel
                    blank_lines_count = 0

    return table_titles, dict(title_frequencies)
    
def extract_images_from_pdf(pdf_path, output_folder):
    pdf_document = fitz.open(pdf_path)
    
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]

            image = Image.open(io.BytesIO(image_bytes))
            image_path = os.path.join(f"Imagine.jpg")
            image.save(image_path)

            print(f"Saved image: {image_path}")

    print("Image extraction complete.")
    img = Image.open('Imagine.jpg')
    text = pytesseract.image_to_string(img)
    print(text)


def normalize_header(header):
    """Normalizează header-ul eliminând spațiile și caracterele de nouă linie."""
   # header=header.rstrip('\n')
    # header.replace('\n',"").strip()
    for df in header.columns:
        df=df.replace('\n','').strip()
    return header#header.replace('\n','').strip()

def extract_tables_from_pdf(pdf_path, output_folder):
    table_titles, title_frequencies = extract_table_titles(pdf_path)
    index=0
    all_tables_df = pd.DataFrame()
    # Cuvinte cheie de verificat în antetul tabelului (normalizate)
    keywords = {"region_id", "country_id", "location_id", "job_id"}
    
    # Creează directorul de ieșire dacă nu există
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            tables = page.extract_tables()

            for table_index, table in enumerate(tables):
                # Creează DataFrame din tabel
                if len(table) > 1:
                    df = pd.DataFrame(table[1:], columns=table[0])
                else:
                    continue  # Sari peste tabelele fără date
                
                # Normalizează antetul
                headers = [header for header in df.columns] #{normalize_header(header) for header in df.columns} # [header for header in df.columns] 
                #df=normalize_header(df)
                #headers2=[header for header in df.columns] 
                print(f"Page {page_num + 1}, Table {table_index + 1} headers: {headers}")
                
                # Verifică dacă antetul conține toate cuvintele cheie
                ok = 0
                for keyword in keywords:
                    if keyword in headers:
                        ok = 1
                        break

                if ok == 1:
                    # Construiește calea fișierului folosind os.path.join
                    table_path = os.path.join(output_folder, f"{table_titles[index]}.csv")
                    index=index+1
                    #df2=df
                    df.to_csv(table_path, index=False)
                    if all_tables_df.empty:
                            all_tables_df = df
                    else:
                            all_tables_df = pd.concat([all_tables_df, df], ignore_index=True)
                    
                    
                else: 
                    previous_table_path = os.path.join(output_folder, f"{table_titles[index - 1]}.csv")
                    df_existent = pd.read_csv(previous_table_path)
                    df_existent = pd.concat([df_existent, df], ignore_index=True)
                    df_existent.to_csv(previous_table_path, index=False)
                    all_tables_df = df_existent
    

                
                print(f"Saved table: {table_path}, OK: {ok}")
   
    print("Table extraction complete.")
    
# Example usage
pdf_path = 'Employee-details-1.pdf'  # Path to your PDF file
output_folder = 'extracted_content'  # Output folder to save images and tables

import os
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
# ok
extract_images_from_pdf(pdf_path, output_folder)
extract_tables_from_pdf(pdf_path, output_folder)


Saved image: Imagine.jpg
Image extraction complete.

Page 1, Table 1 headers: ['region_id', 'region_name']
Saved table: extracted_content\Regions.csv, OK: 1
Page 1, Table 2 headers: ['country_id', 'country_name', 'region_id']
Saved table: extracted_content\Countries.csv, OK: 1
Page 2, Table 1 headers: ['location_id', 'street_address', 'postal_code', 'city', 'state_province', 'country_id']
Saved table: extracted_content\Locations.csv, OK: 1
Page 3, Table 1 headers: ['department_id', 'department_name', 'manager_id', 'location_id']
Saved table: extracted_content\Departments.csv, OK: 1
Page 4, Table 1 headers: ['job_id', 'job_title', 'min_salary', 'max_salary']
Saved table: extracted_content\Jobs.csv, OK: 1
Page 5, Table 1 headers: ['employee_id', 'first_name', 'last_name', 'email', 'phone_number', 'hire_date', 'job_id', 'salary', 'commission_pct', 'manager_id', 'department_id']
Saved table: extracted_content\Employees.csv, OK: 1
Page 6, Table 1 headers: ['132', 'TJ', 'Olson', 'TJOLSON', '

In [47]:
import pytesseract
from PIL import Image

# Set the tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Example usageemployee_details.pdf
img = Image.open('Imagine.jpg')
text = pytesseract.image_to_string(img)
print(text)





In [36]:
import pandas as pd

# Load the provided CSV file
csv_path = r"C:/Users/Talent2/Desktop/ness/extracted_content/Employees.csv"
df_existent = pd.read_csv(csv_path)

# Display the first few rows of the dataframe
print(df_existent.head())


   employee_id first_name last_name     email  phone_number    hire_date  \
0        100.0     Steven      King     SKING  515.123.4567  17-JUN-1987   
1        101.0      Neena   Kochhar  NKOCHHAR  515.123.4568  21-SEP-1989   
2        102.0        Lex   De Haan   LDEHAAN  515.123.4569  13-JAN-1993   
3        103.0  Alexander    Hunold   AHUNOLD  590.423.4567  03-JAN-1990   
4        104.0      Bruce     Ernst    BERNST  590.423.4568  21-MAY-1991   

    job_id   salary  commission_pct  manager_id  ...  Charles  Johnson  \
0  AD_PRES  24000.0             NaN         NaN  ...      NaN      NaN   
1    AD_VP  17000.0             NaN       100.0  ...      NaN      NaN   
2    AD_VP  17000.0             NaN       100.0  ...      NaN      NaN   
3  IT_PROG   9000.0             NaN       102.0  ...      NaN      NaN   
4  IT_PROG   6000.0             NaN       103.0  ...      NaN      NaN   

  CJOHNSON 011.44.1644.429262 04-JAN-2000 SA_REP 6200 .10  149  80  
0      NaN                NaN

In [None]:
cale_fisier_csv = os.path.join(output_folder, f"{table_titles[index-1]}.csv")
                    # Încărcați datele din fișierul CSV existent într-un DataFrame
                    df_existent = pd.read_csv(cale_fisier_csv)
                    # Adăugați tabelul extras la DataFrame-ul existent
                    df2 = pd.concat([df2, df], ignore_index=True)
                    # Salvați DataFrame-ul final în fișierul CSV existent
                    df_final=df2
                    df_final.to_csv(cale_fisier_csv, index=False)