**Assignment: Financial Data Extraction Using Open-Source LLMs**

**Name -** Anil Kushwaha

**Mob.no-** 9829159839

**Email -** kumaranil48309@gmail.com

In [52]:
from google.colab import files

# Collect both uploads in a single mapping. Assigning each files.upload()
# result to the same name would discard the first upload's entry, leaving
# `uploaded` with only the second file.
uploaded = {}
for ordinal in ("1st", "2nd"):
    print(f"Please upload {ordinal} PDF files :")
    uploaded.update(files.upload())  # Opens a file selection dialog


Please upload 1st PDF files :


Please upload 2nd PDF files :


In [51]:
"""
# Financial Data Extraction from PDFs

This notebook extracts financial details such as company name, report date,
and profit before tax from PDF documents (both normal and scanned) using OCR.

## Libraries Used:
- `pdfplumber` for extracting text from PDFs
- `pytesseract` for OCR on scanned PDFs
- `pdf2image` for converting PDF pages to images
- `re` for extracting structured financial data using regex
- `json` for saving extracted data
- `os` for file handling
- `google.colab.files` for uploading files in Google Colab
"""

import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import re
import os
import json

# Helper: pull the passage that follows a summary-style heading
def _extract_summary(text):
    """Return the passage after the first summary-style heading, or "".

    The passage starts at the first non-whitespace character after the
    heading ("Summary", "Overview" or "Executive Summary") and runs until the
    next line that begins with an uppercase letter, or to the end of the text
    when no such line exists.
    """
    # `\s*(\S...` forces the capture to begin on real content. With a bare lazy
    # `(.*?)` the capture is empty whenever the heading is directly followed
    # by a capitalised line, which is the usual layout. `|\Z` lets a summary
    # that is the last thing in the document still match.
    match = re.search(
        r"(Summary|Overview|Executive Summary)\s*(\S.*?)(?=\n[A-Z]|\Z)",
        text,
        re.DOTALL,
    )
    return match.group(2).strip() if match else ""


# Function to extract text from PDF (both normal & scanned)
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF and a best-effort summary passage.

    The text layer is read page by page with pdfplumber. If that yields no
    text (or pdfplumber raises, which is reported and otherwise ignored),
    every page is rendered with pdf2image and passed through pytesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(text, summary)`` tuple. ``text`` is the extracted text with one
        newline appended per pdfplumber page (or OCR page texts joined by
        newlines). ``summary`` is the passage found after a summary-style
        heading, or "" when none is found.

    Raises:
        Errors raised by the OCR fallback are not caught and propagate to the
        caller.
    """
    text = ""

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Pages without a text layer yield a falsy value; skip them.
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        # Best effort: report the failure and let the OCR fallback try.
        print(f"pdfplumber failed: {e}")

    # If no text was extracted, use OCR
    if not text.strip():
        print(f"Using OCR for {pdf_path} (possible scanned document)")
        images = convert_from_path(pdf_path)
        text = "\n".join(pytesseract.image_to_string(img) for img in images)

    return text, _extract_summary(text)

# Function to extract financial details using regex
def extract_financial_entities(text, summary):
    """Pull company name, report date and profit before tax out of ``text``.

    Each field is taken from the first regex match in ``text``; a field with
    no match is left as ``None``.

    Args:
        text: Full text of the report.
        summary: Summary passage, stored unchanged under "Summary".

    Returns:
        A dict with the keys "Company Name", "Report Date",
        "Profit Before Tax" and "Summary".
    """
    entities = {
        "Company Name": None,
        "Report Date": None,
        "Profit Before Tax": None,
        "Summary": summary
    }

    company_match = re.search(
        r"(Amara Raja Batteries Limited|Amaar Raja|[A-Z][a-z]+ [A-Z][a-z]+ (Ltd|Limited|Inc|Corp))",
        text,
    )
    # Accepts dd-Mon-yyyy or dd/mm/yyyy.
    date_match = re.search(r"(\d{2}-[A-Za-z]{3}-\d{4}|\d{2}/\d{2}/\d{4})", text)
    # The amount must contain digits: digit groups joined by single "." or ","
    # separators, optionally starting with ".". A plain `[\d,.]+` would accept
    # punctuation-only strings such as "." and would swallow the sentence
    # punctuation that follows a number ("1,234.56,").
    profit_match = re.search(
        r"(Profit before tax|PBT)[:\s]+(\.?\d+(?:[.,]\d+)*)",
        text,
        re.IGNORECASE,
    )

    # Extract values if found
    if company_match:
        entities["Company Name"] = company_match.group(1).strip()
    if date_match:
        entities["Report Date"] = date_match.group(1)
    if profit_match:
        entities["Profit Before Tax"] = profit_match.group(2)

    return entities

# List of PDF files
pdf_files = [
    "/content/Amaar raja Earnings Summary.pdf",
    "/content/1_FinancialResults_05022025142214.pdf"
]

# Process each PDF and store results, keyed by file name
financial_data = {}

for pdf_path in pdf_files:
    # A path that does not exist would fall through to the unguarded OCR
    # fallback in extract_text_from_pdf and abort the run before any results
    # are written, so skip it with a visible warning instead.
    if not os.path.isfile(pdf_path):
        print(f"Skipping {pdf_path}: file not found")
        continue

    text, summary = extract_text_from_pdf(pdf_path)
    financial_data[os.path.basename(pdf_path)] = extract_financial_entities(text, summary)

# Save output as JSON
output_file = "/content/financial_data.json"
with open(output_file, "w") as f:
    json.dump(financial_data, f, indent=4)

print(f"Extracted financial data saved to {output_file}")


Extracted financial data saved to /content/financial_data.json
