In [1]:
import os
import fitz  # PyMuPDF
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Step 1: Data Collection
# Point these two folders at your own data: one holding documents that contain
# PII, the other holding documents that do not.
pii_directory = 'D:/Projects/Project Dataset/data hygeine data/pdf'
non_pii_directory = 'D:/Projects/Project Dataset/data classification data/non_confidential_data'


def _list_files(directory):
    """Return the paths of the regular files directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list of ``os.path.join(directory, name)`` paths, sorted by file name.

    Raises:
        OSError: If ``directory`` does not exist or cannot be read.
    """
    # os.listdir makes no ordering guarantee; sorting keeps the document order
    # (and therefore any seeded train/test split built on it) stable.
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    # Sub-directories cannot be opened as documents, so drop them here.
    return [path for path in candidates if os.path.isfile(path)]


pii_files = _list_files(pii_directory)
non_pii_files = _list_files(non_pii_directory)


In [2]:
# Step 2: Preprocessing
def preprocess_text(text):
    """Lowercase ``text`` and drop every character that is neither a word
    character (letters, digits, underscore) nor whitespace.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; whitespace and underscores are kept as they are.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

In [3]:
# Step 3: Feature Extraction
def extract_pdf_text(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    The file is read into memory and parsed strictly as a PDF
    (``filetype="pdf"``), so a file with different content raises instead of
    being opened as some other document type.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text of all pages joined in page order.

    Raises:
        Exception: Whatever ``open`` or PyMuPDF raises for an unreadable or
            non-PDF file; the caller decides how to report it.
    """
    with open(file_path, 'rb') as file:
        pdf = fitz.open(stream=file.read(), filetype="pdf")
    try:
        return ''.join(pdf[page_num].get_text() for page_num in range(pdf.page_count))
    finally:
        # Release the parsed document even if text extraction fails part-way.
        pdf.close()


vectorizer = TfidfVectorizer(preprocessor=preprocess_text)
all_texts = []
labels = []

# Label 1 marks PII documents, label 0 marks non-PII documents.
for file_group, label in ((pii_files, 1), (non_pii_files, 0)):
    for file_path in file_group:
        try:
            text = extract_pdf_text(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing {file_path}: {e}")
            continue
        # Text and label are appended together so the two lists stay aligned.
        all_texts.append(text)
        labels.append(label)

Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000000_in - Copy.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000000_in.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000001_in - Copy.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000001_in.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000002_in.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000003_in.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000004_in.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000005_in.jpg: cannot open broken document
Error processing D:/Projects/Project Dataset/data hygeine data/pdf\00000006_in.jpg: cannot

In [4]:
%pip install imbalanced-learn

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


# Hold out 20% of the documents for evaluation. The split is seeded so results
# can be reproduced, and stratified so both classes appear in train and test
# in the same proportion as in the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    all_texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Fit TF-IDF on the training documents only, then reuse that vocabulary for the
# held-out documents so nothing from the test set leaks into training.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Oversampling to balance the dataset (training split only).
# No undersampling step follows: once the minority class has been oversampled
# both classes already have the same size, so there is nothing left to remove.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_vectorized, y_train)

# Train and evaluate SVM model
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_resampled, y_resampled)
svm_accuracy = accuracy_score(y_test, svm_model.predict(X_test_vectorized))

# Train and evaluate Random Forest model (seeded so the forest is reproducible)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)
rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test_vectorized))

# Train and evaluate Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_resampled, y_resampled)
lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test_vectorized))

# NOTE(review): accuracy is the only metric reported; on an imbalanced test set
# it can look high even when the minority class is predicted poorly.
print(f"SVM Model Accuracy: {svm_accuracy}")
print(f"Random Forest Model Accuracy: {rf_accuracy}")
print(f"Logistic Regression Model Accuracy: {lr_accuracy}")

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


SVM Model Accuracy: 1.0
Random Forest Model Accuracy: 1.0
Logistic Regression Model Accuracy: 1.0


In [5]:
import joblib

# Save the trained SVM model
model_filename = 'svm_model.pkl'
joblib.dump(svm_model, model_filename)
print(f"Trained SVM model saved as {model_filename}")

# Save the fitted TF-IDF vectorizer as well: the SVM only accepts vectors
# produced by this exact vocabulary, so the model file alone cannot be used
# from a fresh kernel. The vectorizer refers to preprocess_text, so that
# function must be defined before the file is loaded again.
vectorizer_filename = 'tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_filename)
print(f"Fitted TF-IDF vectorizer saved as {vectorizer_filename}")

Trained SVM model saved as svm_model.pkl


In [7]:
# %pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.0-py3-none-any.whl (2.9 MB)
                                              0.0/2.9 MB ? eta -:--:--
                                              0.0/2.9 MB 1.3 MB/s eta 0:00:03
                                              0.1/2.9 MB 656.4 kB/s eta 0:00:05
     -                                        0.1/2.9 MB 930.9 kB/s eta 0:00:03
     --                                       0.2/2.9 MB 1.0 MB/s eta 0:00:03
     ---                                      0.2/2.9 MB 1.0 MB/s eta 0:00:03
     ----                                     0.3/2.9 MB 1.2 MB/s eta 0:00:03
     -----                                    0.4/2.9 MB 1.2 MB/s eta 0:00:03
     ------                                   0.5/2.9 MB 1.2 MB/s eta 0:00:02
     ---------                                0.7/2.9 MB 1.6 MB/s eta 0:00:02
     ---------                                0.7/2.9 MB 1.5 MB/s eta 0:00:02
     -----------                              0.9/2.9 MB 1.7 MB/s eta 0:


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import joblib
import fitz  # PyMuPDF
import re
from collections import defaultdict
import easyocr
from PIL import Image

# Restore the SVM classifier from the joblib file written by the training step
model_filename = 'svm_model.pkl'
loaded_svm_model = joblib.load(model_filename)

# English-only EasyOCR reader, used to pull text out of image inputs
reader = easyocr.Reader(['en'])

# Preprocessing function
def preprocess_text(text):
    """Lowercase ``text`` and drop every character that is neither a word
    character (letters, digits, underscore) nor whitespace.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; whitespace and underscores are kept as they are.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

# Two sample documents (one PDF, one image). Each path is listed twice, which
# gives the duplicate check further down something to report.
_sample_documents = (
    'D:/Projects/Project Dataset/data hygeine data/pdf/1.pdf',
    'D:/Documents/My CV/My CV.png',
)
test_file_paths = [path for path in _sample_documents for _ in range(2)]

def _read_document_text(path):
    """Extract the text of one test document.

    PDFs are read through PyMuPDF; PNG/JPG/JPEG images go through the
    module-level EasyOCR ``reader``.

    Args:
        path: Path of the document; the type is chosen from its extension.

    Returns:
        The extracted text, or ``None`` when the extension is not supported.
    """
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        with open(path, 'rb') as file:
            pdf = fitz.open(stream=file.read(), filetype="pdf")
        try:
            return ''.join(pdf[page_num].get_text() for page_num in range(pdf.page_count))
        finally:
            # Release the parsed document even if extraction fails part-way.
            pdf.close()
    if lowered.endswith(('.png', '.jpg', '.jpeg')):
        # Copy the pixels into an array before the image file is closed.
        with Image.open(path) as image:
            image_np = np.array(image)
        results = reader.readtext(image_np)
        # The recognised string is the second item of each OCR result.
        return ' '.join(result[1] for result in results)
    return None


# Dictionary to store preprocessed_text and their corresponding file paths
# (only for documents the model flags as PII)
file_path_mapping = defaultdict(list)

# Iterate through test files
for test_file_path in test_file_paths:
    text = _read_document_text(test_file_path)
    if text is None:
        # Nothing to classify; skipping also keeps unsupported files from
        # being grouped together as "duplicates" of one another.
        print(f"Skipping unsupported file type: {test_file_path}")
        continue

    preprocessed_text = preprocess_text(text)
    document_vec = vectorizer.transform([preprocessed_text])
    # predict() returns one label per input row; exactly one row was passed in.
    prediction = loaded_svm_model.predict(document_vec)

    if prediction[0] == 1:
        print(f"PII Data Detected in {test_file_path}")
        # Only PII-flagged documents with actual content take part in the
        # duplicate check; empty extractions would all collide on ''.
        if preprocessed_text.strip():
            file_path_mapping[preprocessed_text].append(test_file_path)

# Find duplicated preprocessed_text (PII data)
duplicated_pii_data = {
    pii_text: paths
    for pii_text, paths in file_path_mapping.items()
    if len(paths) > 1
}

# Create a text content for the TXT file
report_sections = []
for pii_text, paths in duplicated_pii_data.items():
    report_sections.append(
        "Duplicated PII Data:\n"
        f"PII Data: {pii_text}\n"
        "Duplicated in Files:\n"
        + ', '.join(paths) + '\n'
        + "=" * 40 + "\n\n"
    )
output_text = ''.join(report_sections)

# Save the text content to a TXT file
# NOTE(review): the report contains the extracted PII text in plain form and is
# written to a hard-coded user folder; confirm this is acceptable before
# sharing the notebook or its output.
output_file_path = 'C:/Users/Ashfak/Downloads/duplicate_pii_data.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write(output_text)

print(f"Duplicate PII data details saved as {output_file_path}")
print("Completed.")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


PII Data Detected in D:/Projects/Project Dataset/data hygeine data/pdf/1.pdf
PII Data Detected in D:/Projects/Project Dataset/data hygeine data/pdf/1.pdf
PII Data Detected in D:/Documents/My CV/My CV.png
PII Data Detected in D:/Documents/My CV/My CV.png
Duplicate PII data details saved as C:/Users/Ashfak/Downloads/duplicate_pii_data.txt
Completed.
