# PDF reading with poppler & pytesseract -nu merge :)

In [3]:
import io
import os
from typing import Any, List, Dict

import pandas as pd
import pytesseract
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Set the Tesseract command path explicitly
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Also expose the install *directory* on PATH for this process. In the recorded
# run the version check below succeeded, yet the cell still ended with
# TesseractNotFoundError, and the recorded PATH lists "...\tesseract.exe" file
# paths instead of the folder that contains the executable. PATH entries must be
# directories for a lookup of "tesseract" by name to succeed.
# NOTE(review): presumably partition_pdf's OCR step launches tesseract by name
# rather than through the tesseract_cmd set above - TODO confirm.
tesseract_dir = os.path.dirname(pytesseract.pytesseract.tesseract_cmd)
if tesseract_dir and tesseract_dir not in os.environ.get('PATH', '').split(os.pathsep):
    os.environ['PATH'] = tesseract_dir + os.pathsep + os.environ.get('PATH', '')

# Verify Tesseract installation
print(pytesseract.get_tesseract_version())

# Predicate: does this element's type look like an unstructured Table?
def is_table(element) -> bool:
    """Return True when the element's type name contains the Table class path.

    The check is a substring match on ``str(type(element))``, so any type whose
    qualified name starts with ``unstructured.documents.elements.Table`` is
    accepted as well.
    """
    table_marker = "unstructured.documents.elements.Table"
    type_description = str(type(element))
    return table_marker in type_description

# Group table elements under the most recent title that preceded them
def extract_tables_with_titles(elements: List[Any]) -> Dict[str, List[str]]:
    """Map each title to the string form of the tables that follow it.

    Elements are scanned in order. A Title element (detected by its type name)
    updates the "current" title to its stripped text; every table element seen
    afterwards is stored, via ``str(element)``, under that title.

    Tables that appear before any title, or after a title whose stripped text
    is empty, are skipped.

    Args:
        elements: Document elements in reading order.

    Returns:
        Dictionary of title text -> list of table strings, in encounter order.
    """
    title_marker = "unstructured.documents.elements.Title"
    grouped: Dict[str, List[str]] = {}
    latest_title = None

    for item in elements:
        if title_marker in str(type(item)):
            latest_title = item.text.strip()
            continue
        if not is_table(item):
            continue
        if not latest_title:
            # No usable title seen yet: the table is not recorded
            continue
        grouped.setdefault(latest_title, []).append(str(item))

    return grouped

# Get elements from the PDF
# A single partition_pdf call produces the element list used by the rest of this
# cell. The keyword arguments request image extraction, table-structure
# inference, and "by_title" chunking with the character limits given below.
# NOTE(review): with chunking enabled, the returned list may no longer contain
# standalone Title elements, which extract_tables_with_titles relies on to name
# tables - TODO confirm against the unstructured chunking documentation.
raw_pdf_elements = partition_pdf(
    filename="C:/Users/Talent2/Desktop/ness/employee_details.pdf",  # absolute, machine-specific path
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)

# Tally how many elements of each Python type the partitioner returned
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    # Missing keys start at zero, so a single expression handles both cases
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct type names are exactly the keys of the tally
unique_categories = set(category_counts)
print(category_counts)

class Element(BaseModel):
    """Pydantic model pairing a type label with a payload.

    NOTE(review): this model is not referenced anywhere else in the visible
    notebook; confirm whether it is still needed.
    """
    # Label describing what kind of element this is
    type: str
    # Payload of the element; left untyped (Any) by the model
    text: Any

# Extract tables with titles
tables_with_titles = extract_tables_with_titles(raw_pdf_elements)

# Space, "/" and the characters Windows forbids in file names all become "_",
# so a title such as "Q1: Sales" cannot produce an invalid path.
_FILENAME_TRANSLATION = str.maketrans({ch: '_' for ch in ' /\\:*?"<>|'})

# Save each table to a CSV file
for title, tables in tables_with_titles.items():
    # Parse every table string stored under this title.
    # io.StringIO replaces pd.compat.StringIO, which no longer exists in
    # current pandas releases and raised AttributeError here.
    # NOTE(review): the strings come from str(element); if that is the table's
    # plain text rather than comma-separated data, read_csv may not recover the
    # columns - verify on a real document.
    combined_tables = [pd.read_csv(io.StringIO(table)) for table in tables]

    # Combine all tables with the same title
    combined_table = pd.concat(combined_tables, ignore_index=True)

    # Clean the title for use as a filename
    filename = title.translate(_FILENAME_TRANSLATION) + '.csv'

    # Save to CSV
    combined_table.to_csv(filename, index=False)
    print(f"Saved table '{title}' to {filename}")


5.4.0.20240606


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [1]:
import os

# Show the PATH seen by this kernel process as one os.pathsep-separated string.
# NOTE(review): presumably used to debug the TesseractNotFoundError recorded
# above; each entry should be a directory, not a path to an executable.
print(os.environ['PATH'])


C:\WINDOWS\system32;C:\WINDOWS;C:\WINDOWS\System32\Wbem;C:\WINDOWS\System32\WindowsPowerShell\v1.0\;C:\WINDOWS\System32\OpenSSH\;C:\Program Files\Condusiv Technologies\ExpressCache\;C:\Program Files\PuTTY\;C:\Program Files\Git\cmd;C:\Program Files\TortoiseGit\bin;C:\MinGW\bin;C:\Program Files\nodejs\;C:\Users\Talent2\Desktop\python-3.12.3-amd64\;C:\;C:\Users\Talent2\Desktop\openai-env\Scripts;C:\Users\Talent2\Desktop\Scripts;C:\Users\Talent2\Desktop\poppler-24.02.0\Library\bin;C:\Users\Talent2\Desktop\tesseract-5.4.0;C:\Program Files (x86)\Tesseract-OCR;C:\Program Files (x86)\Tesseract-OCR\tesseract.exe;C:\Program Files\Tesseract-OCR\tesseract.exe;C:\Users\Talent2\Desktop\New folder (2)\Scripts\;C:\Users\Talent2\Desktop\New folder (2)\;C:\Users\Talent2\AppData\Local\Microsoft\WindowsApps;;C:\Users\Talent2\AppData\Local\Programs\Microsoft VS Code\bin;C:\Program Files\JetBrains\IntelliJ IDEA 2024.1\bin;


# image reading

In [13]:
import pytesseract
from PIL import Image

# Set the tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Example usage: run OCR on a single image file.
# The image is opened in a context manager so its file handle is released as
# soon as OCR finishes instead of staying open for the life of the kernel.
with Image.open('img.jpg') as img:
    text = pytesseract.image_to_string(img)
print(text)


About Me

Lorem ipsum dolor sit amet,
consectetur adipiscing elit.
Vestibulum sit amet quam
rhoncus, egestas dui eget,
malesuada justo. Ut aliquam
augue.

eg +123-456-7890
@ hello@reallygreatsite.com
6 123 Anywhere St., Any City

LANGUAGE

« English
« Germany (basic)

¢ Spain (basic)

EXPERTISE

* Management Skills
¢ Creativity

¢ Digital Marketing
* Negotiation
Critical Thinking
Leadership

RICHARD

SANCHEZ

Product Designer

EXPERIEN

Studio Showde

Canberra - Australia

2020 - 2022

Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vestibulum sit amet quam rhoncus, egestas dui eget,
malesuada justo. Ut aliquam augue.

Elsetown Cor.

Kota Baru - Singapore

2016 - 2020

Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vestibulum sit amet quam rhoncus, egestas dui eget,
malesuada justo. Ut aliquam augue.

Studio Showde

sydney - Australia

2010 - 2015

Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vestibulum sit amet quam rhoncus, egestas dui eget,
malesuada j

## Delete the existing collection before running the pdfplumber PDF-reading cell

In [17]:
import chromadb

# Initialize Chroma client
chroma_client = chromadb.Client()

# Specify the name of the collection you want to delete
collection_name = "pdf_tables"

# Delete the collection if it is there. On a fresh kernel the collection does
# not exist yet and delete_collection raises, which would stop "Run All" at this
# cell. The exception class for a missing collection differs between chromadb
# releases, so the failure is caught broadly and reported rather than hidden.
try:
    chroma_client.delete_collection(name=collection_name)
except Exception as exc:
    print(f"Collection '{collection_name}' was not deleted: {exc}")

# pdf reading with pdfplumber

In [18]:
import pdfplumber
import chromadb
import os
from langchain.vectorstores import Chroma
from langchain_openai.embeddings.base import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA


class CustomOpenAIEmbeddings(OpenAIEmbeddings):
    """OpenAIEmbeddings subclass whose constructor forwards ``embedding_size=384``.

    NOTE(review): this class is not instantiated anywhere in the visible
    notebook; the pipeline below builds a plain OpenAIEmbeddings instead.
    NOTE(review): ``embedding_size`` could not be confirmed as an option that
    OpenAIEmbeddings accepts - verify against the langchain-openai reference
    before relying on this class.
    """
    def __init__(self, api_key):
        # Pass the caller's key through and request a fixed size of 384
        super().__init__(api_key=api_key, embedding_size=384)
        
# Collect every table pdfplumber finds, page by page
def extract_tables_from_pdf(pdf_path):
    """Return all tables extracted from the PDF at ``pdf_path``.

    Pages are visited in document order and the result of each page's
    ``extract_tables()`` call is appended to one flat list.

    Args:
        pdf_path: Location of the PDF file handed to ``pdfplumber.open``.

    Returns:
        A single list holding the tables of all pages.
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            found_table
            for current_page in document.pages
            for found_table in current_page.extract_tables()
        ]

# Function to preprocess tables data
def preprocess_tables(tables):
    """Flatten tables into one record per row, ready for indexing.

    Each row becomes a dict with:
      * ``"text"``: the row's cells joined with ``" | "``
      * ``"metadata"``: ``{"table_index": ..., "row_index": ...}`` giving the
        position of the row within ``tables``.

    Cells that are ``None`` are rendered as empty strings and other cells are
    passed through ``str()``. Without this, a single missing cell made
    ``" | ".join(row)`` raise ``TypeError``.

    Args:
        tables: Iterable of tables, each an iterable of rows, each an iterable
            of cell values.

    Returns:
        List of row records in table order, then row order.
    """
    processed_data = []
    for table_idx, table in enumerate(tables):
        for row_idx, row in enumerate(table):
            # Keep the column position of empty cells by emitting "" for None
            cells = ("" if cell is None else str(cell) for cell in row)
            processed_data.append({
                "text": " | ".join(cells),
                "metadata": {"table_index": table_idx, "row_index": row_idx}
            })
    return processed_data

# Path to the PDF file
pdf_path = r"C:/Users/Talent2/Desktop/ness/employee_details.pdf"

# Extract tables from the PDF
tables = extract_tables_from_pdf(pdf_path)

# Preprocess tables data
processed_data = preprocess_tables(tables)

# Resolve the OpenAI key once and hand the same value to both the chat model and
# the embedding model. The standard OPENAI_API_KEY variable is preferred; the
# previously used OPENAPI_KEY name is still honoured as a fallback.
key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAPI_KEY')
if not key:
    # Fail before creating the collection or calling the API with no credentials
    raise RuntimeError(
        "OpenAI API key not found: set the OPENAI_API_KEY environment variable."
    )

# Initialize the chat model and the embedding model
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key=key)
embedding = OpenAIEmbeddings(api_key=key)

# Initialize Chroma client
chroma_client = chromadb.Client()

# Create the Chroma collection (raises if it already exists; run the delete cell first)
collection_name = "pdf_tables"
collection = chroma_client.create_collection(name=collection_name)

# Wrap the collection for LangChain, bound to the OpenAI embedding function
vectordb = Chroma(
    client=chroma_client,
    collection_name=collection_name,
    embedding_function=embedding
)

# Index the rows through the LangChain wrapper rather than collection.add().
# Adding documents directly lets Chroma embed them with the collection's own
# default embedding function, while retrieval embeds the query with
# OpenAIEmbeddings; vectors from two different models cannot be compared.
# Going through vectordb uses the same embedding model for rows and queries.
INSERT_BATCH_SIZE = 500  # keeps each insert well below Chroma's per-call limit
for batch_start in range(0, len(processed_data), INSERT_BATCH_SIZE):
    batch = processed_data[batch_start:batch_start + INSERT_BATCH_SIZE]
    vectordb.add_texts(
        texts=[entry['text'] for entry in batch],
        metadatas=[entry['metadata'] for entry in batch],
        # Same ids as before: the row's position in processed_data, as a string
        ids=[str(batch_start + offset) for offset in range(len(batch))],
    )

# Initialize RetrievalQA with LangChain
qa = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())

# Query prompts
prompts = [
    "What is the value in the second column of the first row in the first table?"
]

# Invoke queries
for prompt in prompts:
    response = qa.invoke(prompt)
    print(f"Prompt: {prompt}\nResponse: {response}\n")


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-HBK7j***************************************4YJs. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}