In [1]:
# Use apt-get (apt's CLI is not meant for scripts) and -y so the install
# confirmation prompt cannot block a non-interactive notebook run.
!sudo apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,968 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [2]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was last executed with.
%pip install pdf2image==1.17.0

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [3]:
# %pip installs into the environment of the running kernel.
%pip install Pillow



In [4]:
# Explicit %pip magic (the bare `pip ...` form only works while IPython's
# automagic is enabled); pinned to the release this notebook was last run with.
%pip install pytesseract==0.3.10

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [5]:
# Installs the poppler command-line tools. Presumably required by pdf2image
# for rendering PDF pages (confirm against the pdf2image documentation).
!apt-get -qq install poppler-utils

Selecting previously unselected package poppler-utils.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 121796 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.3_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.3) ...
Setting up poppler-utils (22.02.0-2ubuntu0.3) ...
Processing triggers for man-db (2.10.2-1) ...


In [6]:
import pytesseract
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
import cv2
import pandas as pd
import re

# Function to convert a PDF to images
def pdf_to_images(pdf_path):
    """Render the PDF located at ``pdf_path`` into page images.

    Thin wrapper around pdf2image's ``convert_from_path`` called with its
    default settings.

    Args:
        pdf_path: Filesystem path of the PDF to render.

    Returns:
        Whatever ``convert_from_path`` returns for ``pdf_path``; the caller
        in this file iterates over it, one item per page.
    """
    pages = convert_from_path(pdf_path)
    return pages

# Function to preprocess an image for OCR
def preprocess_image_for_ocr(img):
    """Turn a colour image into a binarised image for Tesseract.

    Pipeline: BGR -> grayscale, bilateral filter, then mean-based adaptive
    thresholding to a 0/255 binary image.

    Args:
        img: Colour image array; it is converted with ``cv2.COLOR_BGR2GRAY``,
            so the channels are interpreted in BGR order.

    Returns:
        The thresholded single-channel image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Bilateral filter arguments: diameter 11, sigmaColor 17, sigmaSpace 17.
    denoised = cv2.bilateralFilter(grayscale, 11, 17, 17)
    # Adaptive threshold arguments: max value 255, block size 11, constant 2.
    return cv2.adaptiveThreshold(
        denoised,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

# Function to extract key-value pairs and table data from text
def extract_data_from_text(text):
    """Pull ``key: value`` pairs and table rows out of OCR text.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each stripped key to its stripped value, plus a
        ``'Table Data'`` entry: a dict mapping the first column of every
        matched table row to a tuple of that row's remaining five columns.
        When a key or row id occurs more than once, the last occurrence wins.
    """
    key_value_pattern = r"([A-Za-z ]+)\s*[:]\s*([A-Za-z0-9./ ]+)"
    table_pattern = r'(\b[A-Z0-9\.]+\b)\s+([A-Z ]+)\s+(\d+)\s+(\d+\.?\d*)\s+(-?\d+\.?\d*)\s+(\d+)'

    extracted = {}
    for raw_key, raw_value in re.findall(key_value_pattern, text):
        extracted[raw_key.strip()] = raw_value.strip()

    rows = {}
    for row_id, *columns in re.findall(table_pattern, text):
        rows[row_id] = tuple(columns)

    # Assigned last so the table entry always takes precedence over a
    # same-named key found in the text.
    extracted['Table Data'] = rows
    return extracted

# Function to extract phone numbers from text
def extract_phone_numbers(text):
    """Return the phone numbers found in ``text``.

    Recognises ten-digit numbers with an optional ``+<1-3 digit>`` country
    prefix, optional parentheses around the first three digits, and optional
    space/dash separators.

    Args:
        text: The text to scan.

    Returns:
        A list of matched phone-number strings in order of appearance, without
        any leading separator character.
    """
    phone_number_pattern = r"(?:\+\d{1,3})?[ -]?\(?\d{3}\)?[ -]?\d{3}[ -]?\d{4}"
    # The optional separator that follows the (optional) country prefix also
    # matches the space or dash in front of a number that has no prefix, so
    # raw matches can look like " 555-123-4567"; drop that leading separator.
    return [match.lstrip(" -") for match in re.findall(phone_number_pattern, text)]

# Function to extract full names from text
def extract_full_names(text):
    """Return every titled full name (e.g. ``"Mr. John Smith"``) in ``text``.

    A name is a title (``Mr``, ``Ms`` or ``Mrs``) followed by a period, a
    space, and two capitalised words separated by a single space.

    Args:
        text: The text to scan.

    Returns:
        A list of the complete matched names, in order of appearance.
    """
    # The title alternation must be non-capturing: with a capturing group,
    # re.findall returns only that group ("Mr") instead of the whole name.
    name_pattern = r"\b(?:Mr|Ms|Mrs)\. [A-Z][a-z]+ [A-Z][a-z]+\b"
    return re.findall(name_pattern, text)

# Function to extract highlighted values from the original image
def extract_highlighted_values(img):
    """OCR only the highlighted regions of ``img`` and read selected totals.

    Pixels whose HSV value lies between ``[0, 200, 200]`` and
    ``[10, 255, 255]`` are kept, everything else is masked out, and the
    masked image is passed to Tesseract.

    Args:
        img: Colour image array; it is converted with ``cv2.COLOR_BGR2HSV``,
            so the channels are interpreted in BGR order.

    Returns:
        A dict holding, for each of the labels ``"Discount"`` and
        ``"Total CHF"`` that appears in the OCR output, the last
        whitespace-separated token of the line containing it. When several
        lines carry the same label, the last such line wins.
    """
    hsv_image = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    highlight_mask = cv2.inRange(
        hsv_image,
        np.array([0, 200, 200]),
        np.array([10, 255, 255]),
    )
    highlighted_only = cv2.bitwise_and(img, img, mask=highlight_mask)
    ocr_text = pytesseract.image_to_string(highlighted_only)

    # Order matters: a line mentioning both labels is recorded as "Discount".
    labels = ("Discount", "Total CHF")
    found = {}
    for ocr_line in ocr_text.splitlines():
        label = next((candidate for candidate in labels if candidate in ocr_line), None)
        if label is not None:
            found[label] = ocr_line.split()[-1]

    return found

# Main processing function
def process_pdf_and_extract_data(pdf_path):
    """OCR every page of a PDF and merge the extracted fields into one dict.

    For each page the function runs a digits-only OCR pass (phone numbers)
    and a general OCR pass (names, key-value pairs, table rows), then reads
    the highlighted values from the colour image.

    Args:
        pdf_path: Filesystem path of the PDF to process.

    Returns:
        A dict containing the key-value pairs and highlighted values found on
        any page (later pages override earlier ones for the same key), plus:
          * ``'Table Data'``: table rows collected from all pages,
          * ``'Phone Numbers'``: phone numbers collected from all pages,
          * ``'Full Names'``: names collected from all pages.
        An empty dict is returned when the PDF yields no pages.
    """
    combined_data = {}
    # Cross-page accumulators: assigning the per-page result directly would
    # keep only the values found on the last page.
    table_rows = {}
    phone_numbers = []
    full_names = []

    for page in pdf_to_images(pdf_path):
        # pdf2image yields PIL images, whose arrays are in RGB channel order,
        # while the helpers below use cv2's BGR2GRAY / BGR2HSV conversions.
        # Convert once so grayscale weights and the highlight hue are right.
        page_bgr = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
        preprocessed_img = preprocess_image_for_ocr(page_bgr)

        # Digits-only pass: the whitelist keeps Tesseract from emitting letters.
        phone_text = pytesseract.image_to_string(preprocessed_img, config='--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789+()- ')
        phone_numbers.extend(extract_phone_numbers(phone_text))

        # General pass, shared by name and key-value/table extraction
        # (--oem 3 is Tesseract's default engine mode, so one pass suffices).
        page_text = pytesseract.image_to_string(preprocessed_img, config='--oem 3 --psm 6')
        full_names.extend(extract_full_names(page_text))

        page_data = extract_data_from_text(page_text)
        table_rows.update(page_data.pop('Table Data', {}))
        highlighted_values = extract_highlighted_values(page_bgr)

        combined_data.update(page_data)
        combined_data['Table Data'] = table_rows
        combined_data.update(highlighted_values)
        combined_data['Phone Numbers'] = phone_numbers
        combined_data['Full Names'] = full_names

    return combined_data

# Path to the PDF file
pdf_path = '/content/sample4.pdf'
# Destination CSV; kept in one place so the completion message cannot drift
# from the file that is actually written.
output_csv_path = 'extracted_data4.csv'

# Extract data
extracted_data = process_pdf_and_extract_data(pdf_path)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(list(extracted_data.items()), columns=['Key', 'Value'])
df.to_csv(output_csv_path, index=False)

print(f"Data extraction complete, saved to '{output_csv_path}'.")


Data extraction complete, saved to 'extracted_data.csv'.
