# **Extract data from PDF and PNG**

In [None]:
# System dependency: the Tesseract OCR engine binary, required by pytesseract.
!apt-get install -y tesseract-ocr
# Python dependencies imported by the code cell below.
# NOTE(review): versions are unpinned, so a fresh run installs whatever is
# latest; the `langchain.document_loaders` / `langchain.chat_models` import
# paths used below may not exist in newer LangChain releases — consider
# pinning known-good versions. TODO confirm.
!pip install langchain openai pypdf pytesseract Pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 3s (1,764 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123594 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [None]:
import getpass
import io
import os
import tempfile

import pytesseract
from google.colab import files
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from PIL import Image

# Function to extract text from PNG using OCR
def extract_text_from_png(file_content):
    """Run OCR over an in-memory image and return the recognised text.

    Args:
        file_content: Raw bytes of the image file.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the decoded image.
    """
    # Open the image as a context manager so Pillow releases it as soon as
    # OCR has finished instead of waiting for garbage collection.
    with Image.open(io.BytesIO(file_content)) as image:
        return pytesseract.image_to_string(image)

# Function to process a file (PDF or PNG)
def process_file(file_content, filename):
    """Turn one uploaded file into a list of LangChain ``Document`` objects.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Name of the upload. Its extension (case-insensitive) selects
            the parser, and the name is stored as ``metadata["source"]`` on
            every returned document.

    Returns:
        For ``.pdf``: the documents produced by
        ``PyPDFLoader(...).load_and_split()``.
        For ``.png``: a one-element list holding the OCR text of the image.

    Raises:
        ValueError: If ``filename`` ends with neither ``.pdf`` nor ``.png``.
    """
    lowered = filename.lower()

    if lowered.endswith('.pdf'):
        # PyPDFLoader is given a path, so the bytes have to be written to disk.
        # Use a private temporary directory instead of `filename` itself: that
        # avoids overwriting and then deleting a same-named file in the working
        # directory, and the scratch copy is removed even if loading fails.
        with tempfile.TemporaryDirectory() as scratch_dir:
            pdf_path = os.path.join(scratch_dir, 'upload.pdf')
            with open(pdf_path, 'wb') as f:
                f.write(file_content)
            pages = PyPDFLoader(pdf_path).load_and_split()

        # Report the original upload name rather than the throwaway temp path.
        for page in pages:
            page.metadata['source'] = filename
        return pages

    if lowered.endswith('.png'):
        text = extract_text_from_png(file_content)
        return [Document(page_content=text, metadata={"source": filename})]

    raise ValueError(
        f"Unsupported file type: {filename!r} (expected a .pdf or .png file)"
    )

# Ask the user to pick the invoice files (.pdf / .png) to process.
# `uploaded` is iterated below as {filename: raw file bytes}.
uploaded = files.upload()

# Your secret OpenAI API key.
# ChatOpenAI picks the key up from the OPENAI_API_KEY environment variable.
# When it is not already set, prompt for it without echoing so the key never
# ends up in the notebook source or its saved output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# temperature=0 is passed so the extraction is as repeatable as the model allows.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Prompt sent once per document chunk; {text} is replaced with the chunk's text.
prompt_template = """
Extract the following information from the given invoice text:
1. Customer details
2. Products (including quantity and price)
3. Total Amount

Invoice text:
{text}

Extracted Information:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
chain = LLMChain(llm=llm, prompt=prompt)

# Run every upload through the extraction chain, one document chunk at a time.
# A failure is reported for the affected file only; the remaining files still run.
for filename, file_content in uploaded.items():
    print(f"Processing file: {filename}")
    try:
        pages = process_file(file_content, filename)

        # Number the chunks from 1 for display.
        for page_number, page in enumerate(pages, start=1):
            print(f"Processing page {page_number}:")
            print(chain.run(text=page.page_content))
            print("-" * 50)
    except Exception as exc:
        print(f"Error processing {filename}: {exc}")
        print("-" * 50)

Saving c669abb4-f485-4880-8973-cc7fdfeee22e.pdf to c669abb4-f485-4880-8973-cc7fdfeee22e (1).pdf
Processing file: c669abb4-f485-4880-8973-cc7fdfeee22e (1).pdf
Processing page 1:
1. Customer details:
- Name: TEST
- Address: Hyderabad, TELANGANA, 500089
- Phone: 9108239284
- Email: test@gmail.com

2. Products:
- Product 1: WASTE AND SCRAP OF STAINLESS STEEL
  - HSN: 72042190
  - Rate: ₹5.00
  - Quantity: 6,790 KGS
  - Total Amount: ₹6,45,050.00

3. Total Amount:
- Taxable Amount: ₹6,45,050.00
- IGST 18.0%: ₹1,16,109.00
- TCS @ 1% 206C: ₹7,611.59
- Round Off: ₹0.41
- Total: ₹7,68,771.00
- Amount Payable: ₹7,68,771.00
--------------------------------------------------
