
# Customer Data Extraction from PDF/JSON Documents

## Introduction
This notebook demonstrates the extraction of structured customer data (such as names, addresses, invoice numbers, dates, and amounts) from semi-structured documents like PDFs and JSON files. 
We will use regular expressions, NLP models, logging, error handling, and visualization tools to enhance the extraction process.



## Setup
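Install the required packages by running the following in a notebook cell (the leading `!` is Jupyter's shell escape; drop it when running the command in a regular terminal):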

```bash
!pip install pdfplumber matplotlib
```


In [None]:

import pdfplumber
import re
import json
import logging
import matplotlib.pyplot as plt
from typing import List, Dict
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, one page per newline-terminated chunk.

    Errors are logged rather than raised. If reading fails part-way through,
    the text of the pages read so far is returned; if the file cannot be
    opened at all, the result is an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each followed by a newline.
    """
    page_texts: List[str] = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() can yield None for a page without extractable
                # text (e.g. a scanned image) in some pdfplumber releases;
                # treat that as an empty page instead of letting
                # `None + "\n"` abort the remaining pages.
                page_texts.append(page.extract_text() or "")
        logger.info(f"Successfully extracted text from {pdf_path}.")
    except Exception as e:
        logger.error(f"Error reading PDF {pdf_path}: {e}")
    return "".join(f"{page_text}\n" for page_text in page_texts)


In [None]:

def extract_from_multiple_pdfs(file_list: List[str]) -> Dict:
    """Extract text from each PDF path, keeping only files that produced text.

    Args:
        file_list: Paths of the PDF files to read.

    Returns:
        Dict mapping each path to its extracted text; paths whose extraction
        returned an empty string are left out.
    """
    texts_by_path = {}
    for path in file_list:
        content = extract_text_from_pdf(path)
        if not content:
            continue
        texts_by_path[path] = content
    return texts_by_path


In [None]:

def extract_fields_from_text(text: str) -> Dict:
    """Extract invoice fields from raw document text using regular expressions.

    A field is only present in the result when it was found:
        invoice_number: token following "Invoice Number" / "Invoice #".
        invoice_date: rest of the line following the first standalone
            "Date" label.
        total_amount: decimal amount following a standalone "Total" /
            "Total Due" label, returned with a leading "$".
        customer_name: first non-blank line after the "Bill To" line.
        customer_address: every remaining non-blank line after the name,
            joined with single spaces.

    Args:
        text: Plain text of one document.

    Returns:
        Dict mapping field names to the extracted strings. Errors are logged
        and whatever was extracted before the error is returned.
    """
    fields = {}
    try:
        invoice_number = re.search(
            r'Invoice\s*(?:Number|#)\s*[:\-]?\s*([\w-]+)', text, re.IGNORECASE
        )
        if invoice_number:
            fields["invoice_number"] = invoice_number.group(1)

        # The value class deliberately excludes newlines so the capture stops
        # at the end of the line instead of swallowing the following lines,
        # and includes "/" so dates such as 01/05/2024 are kept whole.
        # \b keeps words that merely contain "date" (e.g. "Updated") from
        # being treated as the label.
        date = re.search(
            r'\bDate\s*[:\-]?\s*([\w,/\- \t]+)', text, re.IGNORECASE
        )
        if date:
            date_value = date.group(1).strip()
            if date_value:
                fields["invoice_date"] = date_value

        # \b prevents "Subtotal" from being picked up as the invoice total.
        total_amount = re.search(
            r'\bTotal\s*(?:Due)?\s*[:\-]?\s*\$?([\d,]+\.\d+)', text, re.IGNORECASE
        )
        if total_amount:
            fields["total_amount"] = "$" + total_amount.group(1)

        idx = text.lower().find("bill to")
        if idx != -1:
            # Line 0 is the "Bill To" line itself; the name is expected on the
            # next non-blank line and the address on the lines after it.
            lines = [ln.strip() for ln in text[idx:].splitlines() if ln.strip()]
            if len(lines) > 1:
                fields["customer_name"] = lines[1]
                fields["customer_address"] = " ".join(lines[2:])
    except Exception as e:
        logger.error(f"Error extracting fields: {e}")
    return fields


In [None]:

def extract_data_from_texts(text_data: Dict) -> Dict:
    """Run field extraction over every document text.

    Args:
        text_data: Dict mapping a file name to that file's extracted text.

    Returns:
        Dict mapping each file name to the fields extracted from its text.
    """
    return {
        source_name: extract_fields_from_text(body)
        for source_name, body in text_data.items()
    }


In [None]:

def save_to_json(data: Dict, filename: str):
    """Write *data* to *filename* as JSON indented by two spaces.

    Failures (for example an unwritable path or a value that cannot be
    serialized) are logged and not raised.

    Args:
        data: The object to serialize.
        filename: Path of the output file; it is opened in write mode.
    """
    try:
        with open(filename, "w") as out_file:
            json.dump(data, out_file, indent=2)
        logger.info(f"Data saved to {filename}")
    except Exception as err:
        logger.error(f"Error saving data to file: {err}")


In [None]:

def visualize_data(data: Dict):
    """Show a bar chart of invoice totals labelled by invoice number.

    Args:
        data: Dict mapping a file name to its extracted fields. Each field
            dict may contain "invoice_number" and "total_amount" (a string
            such as "$1,234.56"). Missing invoice numbers are labelled
            "Unknown" and missing amounts are plotted as 0.
    """
    invoice_numbers = []
    amounts = []
    for file_name, details in data.items():
        invoice_numbers.append(details.get("invoice_number", "Unknown"))
        raw_amount = details.get("total_amount")
        if raw_amount is None:
            amounts.append(0)
            continue
        try:
            # Extracted totals may carry thousands separators ("1,234.56"),
            # which float() rejects, so strip them along with the "$".
            amounts.append(
                float(str(raw_amount).replace("$", "").replace(",", ""))
            )
        except ValueError:
            # One unparseable amount should not prevent the chart from
            # being drawn for the other invoices.
            logger.warning(
                f"Could not parse total amount {raw_amount!r} for {file_name}; plotting 0."
            )
            amounts.append(0)

    plt.figure(figsize=(12, 6))
    plt.bar(invoice_numbers, amounts, color='skyblue')
    plt.xlabel('Invoice Numbers')
    plt.ylabel('Amount in Dollars ($)')
    plt.title('Invoice Amounts Extracted from PDFs')
    plt.show()


In [None]:

# Run the PDF pipeline end to end: read the text of each PDF, extract the
# invoice fields, save them as JSON, and chart the totals.
# NOTE(review): the file names look like samples - replace with real paths.
pdf_files = ["invoice1.pdf", "invoice2.pdf", "invoice3.pdf"]
# Files that yield no text (including unreadable ones) are left out of the result.
extracted_texts = extract_from_multiple_pdfs(pdf_files)
extracted_data = extract_data_from_texts(extracted_texts)
save_to_json(extracted_data, "extracted_invoices.json")
visualize_data(extracted_data)


In [None]:

def parse_json_file(file_path: str) -> Dict:
    """Load and return the contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or an empty dict if the file could not be
        read or parsed (the error is logged, not raised).
    """
    result = {}
    try:
        with open(file_path, "r") as handle:
            loaded = json.load(handle)
        logger.info(f"JSON file {file_path} loaded successfully.")
        result = loaded
    except Exception as err:
        logger.error(f"Error reading JSON file {file_path}: {err}")
    return result


In [None]:

# Load a JSON document and pretty-print it. parse_json_file returns {} when
# the file cannot be read or parsed, in which case this prints "{}".
json_data = parse_json_file("sample_data.json")
print(json.dumps(json_data, indent=2))


In [None]:

def visualize_json_data(data: Dict):
    """Show a bar chart of invoice totals labelled by customer name.

    Args:
        data: Dict whose values are record dicts. Each record may contain
            "customer_name" and "total_amount"; the amount may be a number
            or a string such as "$1,234.56". Missing names are labelled
            "Unknown" and missing amounts are plotted as 0.
    """
    customer_names = []
    amounts = []
    for entry in data.values():
        customer_names.append(entry.get("customer_name", "Unknown"))
        raw_amount = entry.get("total_amount", "0")
        try:
            # str() lets numeric JSON values through (they have no .replace),
            # and commas are stripped because float() rejects "1,234.56".
            amounts.append(
                float(str(raw_amount).replace("$", "").replace(",", ""))
            )
        except ValueError:
            # One unparseable amount should not prevent the chart from
            # being drawn for the other records.
            logger.warning(
                f"Could not parse total amount {raw_amount!r}; plotting 0."
            )
            amounts.append(0.0)

    plt.figure(figsize=(10, 5))
    plt.bar(customer_names, amounts, color='orange')
    plt.xlabel('Customer Names')
    plt.ylabel('Amount ($)')
    plt.title('Invoice Amounts by Customer')
    plt.show()


In [None]:

# Pretty-print the fields extracted from the PDFs in the earlier pipeline cell.
print(json.dumps(extracted_data, indent=2))


## Named Entity Recognition (NER)

This section uses spaCy's Named Entity Recognition (NER) to identify entities such as names, locations, dates, and organizations in the text extracted from the PDFs.

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:

import spacy

# Load spaCy's en_core_web_sm pipeline once at module level so perform_ner can
# reuse it on every call. The model must already be downloaded (see the
# install cell above).
nlp = spacy.load("en_core_web_sm")

def perform_ner(text: str):
    """Run the loaded spaCy pipeline over *text* and collect its entities.

    Args:
        text: The text to analyse.

    Returns:
        A list of (entity text, entity label) tuples, in the order the
        pipeline reports them.
    """
    entity_pairs = []
    for span in nlp(text).ents:
        entity_pairs.append((span.text, span.label_))
    return entity_pairs
        

### Extract Entities from Parsed PDF Text

In [None]:

# Example usage, reusing extract_from_multiple_pdfs from the earlier cells.
# Note that this rebinds the module-level name pdf_files.
pdf_files = ["example1.pdf", "example2.pdf"]  # Replace with your actual PDF file paths
parsed_data = extract_from_multiple_pdfs(pdf_files)

# Perform NER on the text of each PDF; ner_results maps a file path to its
# list of (entity text, entity label) tuples.
ner_results = {}
for file, text in parsed_data.items():
    ner_results[file] = perform_ner(text)

# Display the extracted entities, one block per file
for file, entities in ner_results.items():
    print(f"Entities from {file}:\n", entities)
        

### Visualize Named Entities

In [None]:

from collections import Counter

def visualize_entities(entities):
    """Show a bar chart of how often each entity label occurs.

    Args:
        entities: Iterable of (entity text, entity label) pairs; only the
            label of each pair is used.
    """
    counts_by_label = Counter([kind for _text, kind in entities])

    plt.figure(figsize=(10, 6))
    plt.bar(counts_by_label.keys(), counts_by_label.values())
    plt.title('Distribution of Named Entity Types')
    plt.xlabel('Entity Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

# Example visualization: chart the entity types of the first entry in
# ner_results. Skipped when ner_results is empty.
if ner_results:
    visualize_entities(next(iter(ner_results.values())))
        

### Save Entities to JSON

In [None]:

import json

def save_entities_to_json(ner_results, output_file='entities.json'):
    """Write the NER results to a JSON file indented by four spaces.

    Unlike save_to_json, this function does not catch errors: a failure to
    open the file or serialize the data propagates to the caller.

    Args:
        ner_results: The object to serialize.
        output_file: Path of the output file; it is opened in write mode.
    """
    with open(output_file, 'w') as out_handle:
        json.dump(ner_results, out_handle, indent=4)
    logger.info(f"Entities saved to {output_file}")

# Save the NER results to the default output file, entities.json
save_entities_to_json(ner_results)
        