In [None]:
# Install pdfplumber for extracting text from PDFs
!pip install pdfplumber

# Install pdf2image and pytesseract for OCR (in case the PDF is scanned)
!pip install pdf2image pytesseract pillow

!pip install groq


import json
import os

import pdfplumber
import pytesseract
from groq import Groq
from pdf2image import convert_from_path

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, falling back to OCR.

    The embedded text layer is read with pdfplumber first.  When that
    produces no text for any page (e.g. a scanned document, or pdfplumber
    could not read the file), the pages are rendered to images with
    pdf2image and recognised with pytesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict mapping "Page N" (1-based) to the page's text, or to
        "No text found." when nothing could be extracted for that page.
        An "error" key holding the exception message is present when
        pdfplumber raised and its results were not replaced by OCR
        output, or when the OCR fallback itself raised.
    """
    output = {}
    # Tracks whether pdfplumber produced real text for at least one page.
    # Checking `not output` is not enough: placeholder entries and the
    # "error" entry also make the dict non-empty.
    found_text = False

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:  # If text extraction is successful
                    output[f"Page {i + 1}"] = text
                    found_text = True
                    print(f"Text extracted from page {i + 1}.")
                else:
                    output[f"Page {i + 1}"] = "No text found."
    except Exception as e:
        # Best-effort boundary: report the failure and let OCR have a go.
        print(f"Error reading PDF with pdfplumber: {e}")
        output["error"] = str(e)

    if found_text:
        return output

    # pdfplumber found no text at all, so try OCR on rendered page images.
    print("Trying to extract text using OCR...")
    try:
        images = convert_from_path(pdf_path)
        ocr_output = {}
        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            # Keep the same placeholder as the pdfplumber path for pages
            # on which OCR recognises nothing but whitespace.
            ocr_output[f"Page {i + 1}"] = text if text.strip() else "No text found."
    except Exception as e:
        # OCR is only a fallback; keep whatever pdfplumber produced rather
        # than crashing, and record the failure if none is recorded yet.
        print(f"Error extracting text with OCR: {e}")
        output.setdefault("error", str(e))
        return output

    # Prefer the OCR pages when there are any; otherwise keep the pdfplumber
    # result (placeholders and/or its error entry).
    return ocr_output if ocr_output else output

# Example usage: extract the text of one electricity bill
pdf_path = '/kaggle/input/electricity-bill-c/2809444438 01.07.24 - 31.07.24.pdf'
extracted_text = extract_text_from_pdf(pdf_path)

# Convert output to JSON format and persist it. The prompt-building step
# further down reads 'extracted_text.json', so the file must be written here;
# otherwise a fresh run fails with FileNotFoundError.
json_output = json.dumps(extracted_text, indent=4)
with open('extracted_text.json', 'w', encoding='utf-8') as file:
    file.write(json_output)


# Initialize the Groq client. The API key is read from the GROQ_API_KEY
# environment variable so that no secret is stored in the source. Any key
# that was previously committed here should be treated as leaked and revoked.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Read the per-page text back from the JSON file
with open('extracted_text.json', 'r') as file:
    json_data = json.loads(file.read())

# Assemble the prompt: the instruction (MPAN number, energy tariffs and total
# energy consumption) followed by one "<page>: <text>" paragraph per page
prompt_parts = ["From the following text, please extract the MPAN number, energy tariffs & the total energy consumption:\n\n"]
for page, text in json_data.items():
    prompt_parts.append(f"{page}: {text}\n\n")
prompt = "".join(prompt_parts)

# Send the prompt to the model as a single user message
user_message = {"role": "user", "content": prompt}
chat_completion = client.chat.completions.create(
    model="llama-3.2-90b-vision-preview",
    messages=[user_message],
)

# Print the extracted information (content of the first returned choice)
first_choice = chat_completion.choices[0]
print(first_choice.message.content)