## Implementing Data Extraction Techniques

In [2]:
# Intro to Data Extraction
import pandas as pd

def convert(unstructured_data):
    """Turn a ``"Key: value, Key: value"`` string into a dictionary.

    The text is cut into pairs on ``", "`` and every pair is cut into a key
    and a value on its first ``": "``. Values are converted as follows:

    * plain non-negative decimals such as ``"42"`` or ``"3.14"`` become
      ``float``;
    * values starting with ``"$"`` become the ``float`` of what follows the
      sign (thousands separators such as ``"$1,234.56"`` are accepted); if
      that text is not a number the value is kept unchanged;
    * everything else is kept as ``str``.

    Args:
        unstructured_data: Text made of ``"key: value"`` pairs joined by
            ``", "``. An empty string (or ``None``) yields an empty dict.

    Returns:
        dict mapping each key to its ``float`` or ``str`` value.

    Raises:
        ValueError: If a pair does not contain the ``": "`` separator.
    """
    # Dictionary that will hold the structured data
    structured_data = {}

    # Nothing to parse: return the empty dict instead of failing on "".
    if not unstructured_data:
        return structured_data

    for pair in unstructured_data.split(", "):
        # Split on the first ': ' only, so a value may itself contain ': '.
        key, separator, value = pair.partition(": ")
        if not separator:
            raise ValueError("expected 'key: value' but got %r" % (pair,))

        if value.startswith('$'):
            # Currency: drop the sign and any thousands separators first.
            try:
                value = float(value[1:].replace(',', ''))
            except ValueError:
                pass  # not a number after all; keep the original text
        elif value.replace('.', '', 1).isdecimal():
            # isdecimal() (unlike isdigit()) rejects characters such as
            # superscript digits that float() cannot parse.
            value = float(value)

        structured_data[key] = value

    return structured_data

# Sample record: "Key: value" pairs joined by ", ", one of them a "$"-prefixed amount.
unstructured_data = "Date: 2024-01-22, Name: John Doe, Amount: $123.45, Purpose: Subscription"

# Parse the text into a dict keyed by field name.
structured_data = convert(unstructured_data)

# Wrapping the dict in a list yields a single-row DataFrame with one column per key.
df = pd.DataFrame([structured_data])
df.head()

Unnamed: 0,Date,Name,Amount,Purpose
0,2024-01-22,John Doe,123.45,Subscription


In [4]:
# Challenges and Ethics
import re

# Sample record that contains a value in SSN format (ddd-dd-dddd) to be masked.
unstructured_data = "Patient ID: 12345, Name: John Doe, Condition: Flu, SSN: 987-65-4321"

# Group 1 captures the first five digits; the last four are matched but not captured.
# The digit look-arounds stop the pattern from firing inside a longer digit
# run (e.g. "1234-56-78901"), which would otherwise be partially overwritten.
pattern = r'(?<!\d)(\d{3}-\d{2})-\d{4}(?!\d)'

# Keep group 1 and replace the last four digits with a fixed mask.
replacement = r'\1-xxxx'

anonymized_data = re.sub(pattern, replacement, unstructured_data)

anonymized_data

'Patient ID: 12345, Name: John Doe, Condition: Flu, SSN: 987-65-xxxx'

In [5]:
# Rule based Extraction Techniques
import re

document = "INV-123-456, Date: 2024-01-22, $123.45"

# One regex per field; group 1 of each pattern is the extracted value.
# The value classes exclude ',' (and the amount requires digits after any '.')
# so that the punctuation separating fields is not captured with the value.
patterns = {
    # First run of characters that are neither whitespace nor a comma.
    'Invoice Number': r'([^\s,]+)',
    'Date': r'Date: ([^\s,]+)',
    # Digits with an optional fractional part; a trailing '.' is left out.
    'Total Amount': r'\$([0-9]+(?:\.[0-9]+)?)'
}

# NOTE: re.search returns None when a pattern is absent from the document,
# in which case .group(1) raises AttributeError.
extracted_data = {
    key: re.search(pattern, document).group(1)
    for key, pattern in patterns.items()
}

extracted_data

{'Invoice Number': 'INV-123-456',
 'Date': '2024-01-22',
 'Total Amount': '123.45'}

In [6]:
# Advanced Rule-based Extraction
import re

document = "**** Invoice **** Number: INV-789-101, Date (Issued): 2024-01-23, Total: $421.37"

# One regex per field; group 1 of each pattern is the extracted value.
# '.*?' lazily skips any qualifier between the label and its ': ' separator
# (such as " (Issued)"). The value classes exclude ',' so the punctuation
# separating fields is not captured with the value.
patterns = {
    'Invoice Number': r'Number: ([^\s,]+)',
    'Date': r'Date.*?: ([^\s,]+)',
    # Digits with an optional fractional part; a trailing '.' is left out.
    'Total Amount': r'Total.*?: \$([0-9]+(?:\.[0-9]+)?)'
}

# NOTE: re.search returns None when a pattern is absent from the document,
# in which case .group(1) raises AttributeError.
extracted_data = {
    key: re.search(pattern, document).group(1)
    for key, pattern in patterns.items()
}

extracted_data

{'Invoice Number': 'INV-789-101',
 'Date': '2024-01-23',
 'Total Amount': '421.37'}

### Advanced Data Extraction Techniques

In [14]:
# Machine learning for data extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Tiny labelled corpus: every snippet is tagged with the kind of field it mentions.
docs = [
    "Invoice amount $123",
    "Date of transaction: 2024-01-22",
    "Payment: $456",
]
labels = ['amount', 'date', 'amount']

# Learn the vocabulary from the corpus and encode it as token-count features.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

# fit() returns the fitted estimator, so construction and training can be chained.
clf = MultinomialNB().fit(X, labels)

# Encode an unseen snippet with the same vocabulary and classify it.
print(clf.predict(vectorizer.transform(['Payment due: $123'])))

['amount']


In [16]:
# Deep Learning in data extraction
# !python -m spacy download en_core_web_sm

import spacy

# Load the pre-trained "en_core_web_sm" pipeline, used here for named-entity recognition (NER)
nlp = spacy.load("en_core_web_sm")

text = "This is a simple invoice text with an amount of $123.45 due by 2024-01-25."

# Run the pipeline on the text, then pair each recognised entity's text with its label.
doc = nlp(text)
entities = [(ent.text, ent.label_) for ent in doc.ents]

entities

[('123.45', 'MONEY'), ('2024-01-25', 'DATE')]

In [17]:
# Evaluating Data Extraction models
from sklearn.metrics import precision_score, recall_score, f1_score

# Toy ground truth and predictions for a binary task.
true_labels = [0, 1, 0, 1, 0]
predicted_labels = [0, 1, 0, 0, 1]

pr = precision_score(true_labels, predicted_labels)
# Named `rec`, not `re`: earlier cells import the regex module as `re`, and
# rebinding that name to a float would break any later `re.sub` / `re.search`.
rec = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(pr, rec, f1)

0.5 0.5 0.5


In [None]:
# Handling diverse documents and challenges
!pip install pytesseract

import pytesseract
from PIL import Image
import spacy

# pytesseract.image_to_string(image)