In [1]:
import glob
import os

import pdfminer
import PyPDF2
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFSyntaxError
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Data Preparation
# Raw string: the Windows path contains backslash sequences (\P, \d, \p) that
# are invalid escapes in a normal string literal and raise a
# DeprecationWarning / SyntaxWarning on recent Python versions.
pdf_directory = r'D:\Projects\Project Dataset\data hygeine data\pdf'
# Every *.pdf directly inside the directory (non-recursive). Sorted so the
# document order -- and therefore the seeded split later on -- does not depend
# on the order in which the file system happens to list the files.
pdf_files = sorted(glob.glob(os.path.join(pdf_directory, '*.pdf')))
# Filled by the extraction loop with one text string per readable PDF.
documents = []

In [3]:
# Extract the text of every PDF in `pdf_files`; files that cannot be parsed
# are reported and skipped so one corrupt PDF does not abort the whole run.
# The accumulator is reset here so that re-running this cell does not append
# every document a second time.
documents = []
for pdf_file in pdf_files:
    try:
        text = extract_text(pdf_file)
    except PDFSyntaxError as e:
        # Structurally invalid PDF (e.g. missing /Root object).
        print(f"Error extracting text from {pdf_file}: {e}")
    except Exception as e:
        # Deliberate best-effort: any other failure is logged and skipped.
        print(f"Error processing {pdf_file}: {e}")
    else:
        documents.append(text)

Error extracting text from D:\Projects\Project Dataset\data hygeine data\pdf\1124.pdf: No /Root object! - Is this really a PDF?
Error extracting text from D:\Projects\Project Dataset\data hygeine data\pdf\1648.pdf: No /Root object! - Is this really a PDF?
Error processing D:\Projects\Project Dataset\data hygeine data\pdf\3012.pdf: Unexpected EOF
Error processing D:\Projects\Project Dataset\data hygeine data\pdf\3013.pdf: Unexpected EOF


In [4]:
# Feature Extraction
# Fit a TF-IDF vocabulary on the extracted PDF texts and encode the documents
# as the feature matrix X (one row per entry of `documents`).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

In [5]:
# Unsupervised Learning - Automatic Labeling
# The cluster index assigned to each document is used as its label by the
# supervised cells further down.
num_clusters = 2  # Number of clusters
# n_init is passed explicitly: scikit-learn 1.2 emits a FutureWarning when it
# is omitted because the default changes from 10 to 'auto' in 1.4. The value
# 10 keeps the behavior this notebook was run with.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_



In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation (seeded for repeatability).
# Note that `labels` are the KMeans cluster assignments, not independent
# annotations.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [7]:
from sklearn.svm import SVC

# Train a support vector classifier, constructed with no arguments (library
# defaults), on the training split to predict each document's cluster label.
model = SVC()
model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import classification_report

# Score the classifier on the held-out split. The targets are KMeans cluster
# ids computed from the same TF-IDF matrix, so this report measures how well
# the SVM reproduces the clustering rather than agreement with independent
# ground-truth labels.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       365
           1       0.98      0.99      0.99       288

    accuracy                           0.99       653
   macro avg       0.99      0.99      0.99       653
weighted avg       0.99      0.99      0.99       653



In [9]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF file to read.

    Returns
    -------
    str
        The text of all pages joined in page order, or an empty string when
        the file cannot be opened or parsed. Failures are printed, not raised.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # join() builds the result once instead of re-copying the growing
            # string for every page.
            return "".join(page.extract_text() for page in pdf_reader.pages)
    except Exception as e:
        # Best-effort by design: callers receive "" for unreadable files.
        print(f"PDF Error: {pdf_path} - {e}")
        return ""

In [13]:
# Install EasyOCR into the running kernel's environment; the PII-detection
# cell imports it.
# NOTE(review): the version is not pinned -- consider pinning it so the
# notebook can be reproduced later.
%pip install easyocr

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
from fuzzywuzzy import fuzz
import re
import spacy
import fitz
import easyocr
from fuzzywuzzy import fuzz
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
import PyPDF2

# Field labels whose presence in a document is treated as a sign that it
# contains personally identifiable information. check_for_pii looks for each
# entry case-insensitively, delimited by word boundaries.
pii_keywords = [
    'SSN', 'Social Security Number',
    'DOB', 'Date of Birth',
    'Name', 'Address',
    'Phone Number', 'Phone', 'TelephoneNo',
    'E-mail', 'EmailAddress',
    'Credit Card Number',
    'FullNames', 'IDCardNo',
    'Contact', 'PostalAddress'
]

def check_for_pii(text, keywords=None):
    """Return the PII keywords that occur in ``text``.

    Each keyword is matched case-insensitively and must be delimited by word
    boundaries, so ``'Name'`` does not match inside ``'FullNames'``.

    Parameters
    ----------
    text : str or None
        Text to scan. ``None`` or an empty string yields an empty result.
    keywords : iterable of str, optional
        Keywords to look for. Defaults to the module-level ``pii_keywords``.

    Returns
    -------
    list of str
        The keywords found, in the order they appear in ``keywords``.
    """
    if keywords is None:
        # Resolved at call time so later edits to pii_keywords are honoured.
        keywords = pii_keywords
    if not text:
        # Nothing to scan; also avoids a TypeError from re on None.
        return []
    return [
        keyword
        for keyword in keywords
        if re.search(r'\b{}\b'.format(re.escape(keyword)), text, re.IGNORECASE)
    ]

# Initialize the easyocr reader
# Built once (English only) when the cell runs and reused by
# extract_text_from_image. Note it is created before the file type is
# checked, i.e. even when the input is a PDF and OCR is never used.
reader = easyocr.Reader(lang_list=['en'])

def extract_text_from_image(image_path):
    """OCR an image and return the recognized text as one string.

    Runs the module-level ``reader`` on ``image_path``, takes element 1 of
    every detection it returns, and joins those fragments with single spaces
    in the order the reader reported them.
    """
    detections = reader.readtext(image_path)
    fragments = [detection[1] for detection in detections]
    return ' '.join(fragments)

# Load the English language model for spaCy
# NOTE(review): `nlp` is not referenced anywhere else in the visible part of
# the notebook -- confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_sm")

# File path
# Document to scan; its extension selects the PDF branch or the image/OCR
# branch of the script that follows.
file_path = 'D:/Projects/Project Dataset/data classification data/confidential_data/1.pdf'  # Update with your file path

# Scan `file_path` for PII keywords and write an annotated copy.
#   * PDF   -> rectangles are drawn over keyword occurrences in a copy of the PDF.
#   * image -> the OCR text is written to a new PDF with keywords marked in red.
# OCR output is arbitrary text, so it must be XML-escaped before it is handed
# to reportlab's Paragraph, which parses its input as markup.
from xml.sax.saxutils import escape

if file_path.lower().endswith('.pdf'):
    # PDF handling logic
    # Step 1: read the text layer with PyPDF2 to decide whether the document
    # mentions any PII keyword at all.
    with open(file_path, 'rb') as pdf:
        pdf_reader = PyPDF2.PdfReader(pdf)
        text_content = ''.join(page.extract_text() for page in pdf_reader.pages)
    if text_content:
        pii_detected = check_for_pii(text_content)
        if pii_detected:
            print("PII keywords detected:", ', '.join(pii_detected))
            print("Potential Personally Identifiable Information found.")

            output_file = 'C:/Users/Ashfak/Downloads/highlighted.pdf'

            # Step 2: re-open with PyMuPDF to annotate. The context manager
            # closes the document even if annotating or saving fails.
            with fitz.open(file_path) as doc:
                for page in doc:
                    page_text = page.get_text()

                    for keyword in pii_detected:
                        # Only annotate pages where the keyword occurs as a
                        # whole word. The previous fuzzy score compared a regex
                        # match with its own keyword, so it was effectively
                        # always 100 and filtered nothing.
                        whole_word = re.compile(rf"\b{re.escape(keyword)}\b", flags=re.IGNORECASE)
                        if not whole_word.search(page_text):
                            continue

                        # Search once per keyword and page. Searching once per
                        # regex match returned every occurrence each time and
                        # stacked duplicate annotations on the same spot.
                        # NOTE(review): search_for looks for the string itself,
                        # so it may also hit the keyword inside longer words
                        # (as the original code did) -- confirm if acceptable.
                        for instance in page.search_for(keyword):
                            highlight = page.add_rect_annot(instance)
                            highlight.set_colors({"fill": (1, 0, 1)})
                            # Keep the underlying text readable through the fill.
                            highlight.set_opacity(0.4)
                            # Colour/opacity changes only show up in the saved
                            # file after the annotation appearance is updated.
                            highlight.update()

                # Save the modified PDF file
                doc.save(output_file)

            print("Highlighted PDF file created:", output_file)
        else:
            print("No PII keywords detected.")
    else:
        print("Error: Unable to extract text from the PDF.")

elif file_path.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
    # Image handling logic
    extracted_text = extract_text_from_image(file_path)
    if extracted_text:
        pii_detected = check_for_pii(extracted_text)
        if pii_detected:
            print("PII keywords detected:", ', '.join(pii_detected))
            print("Potential Personally Identifiable Information found.")

            # Create a PDF document
            output_pdf_path = 'C:/Users/Ashfak/Downloads/highlighted_text.pdf'
            doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)
            styles = getSampleStyleSheet()

            # Mark every detected keyword in ONE copy of the text. Building a
            # paragraph per keyword repeated the whole text once per keyword,
            # each copy with only a single keyword marked.
            # Longest keywords first so 'Phone Number' wins over 'Phone'.
            canonical = {keyword.lower(): keyword for keyword in pii_detected}
            alternation = '|'.join(
                re.escape(keyword)
                for keyword in sorted(pii_detected, key=len, reverse=True)
            )
            keyword_pattern = re.compile(rf'\b(?:{alternation})\b', flags=re.IGNORECASE)

            def _mark_keyword(match):
                # Show the canonical keyword spelling, as the original output did.
                label = canonical.get(match.group(0).lower(), match.group(0))
                return f'<font color="red">[{label}]</font>'

            # Escape &, < and > first so stray characters from OCR cannot be
            # misread as (broken) markup; the <font> tags are added afterwards.
            modified_text = keyword_pattern.sub(_mark_keyword, escape(extracted_text))
            paragraphs = [Paragraph(modified_text, style=styles["Normal"])]

            doc.build(paragraphs)

            print("Modified PDF file created:", output_pdf_path)
        else:
            print("No PII keywords detected in the image.")
    else:
        print("Error: Unable to extract text from the image.")
else:
    print("Unsupported file format.")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


PII keywords detected: Phone, TelephoneNo, EmailAddress, FullNames, IDCardNo, PostalAddress
Potential Personally Identifiable Information found.
Highlighted PDF file created: C:/Users/Ashfak/Downloads/highlighted.pdf


In [1]:
# Print the installed scikit-learn version for the record.
# NOTE(review): this cell carries execution count 1, i.e. it was run in a
# fresh kernel session rather than in sequence with the cells before it.
import sklearn
print(sklearn.__version__)

1.2.2
