In [4]:
import os
import glob
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
pii_types = ['SSN', 'Social Security Number', 'DOB', 'Date of Birth', 'Name', 'Address', 'Phone Number', 'Phone', 'E-mail', 'EmailAddress', 'Credit Card Number', 'FullNames', 'IDCardNo', 'TelephoneNo', 'Contact', 'PostalAddress']

In [6]:
# Data Preparation
pdf_directory = 'D:/Projects/Project Dataset/data classification data/confidential_data'
pdf_files = glob.glob(os.path.join(pdf_directory, '*.pdf'))

In [7]:
# Text Preprocessing
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ashfak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ashfak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
preprocessed_data = []
labels = []

for pdf_file in pdf_files:
    with open(pdf_file, 'rb') as pdf:
        pdf_reader = PyPDF2.PdfReader(pdf)
        text_content = ''
        for page in pdf_reader.pages:
            text_content += page.extract_text()
        
        # Tokenize and preprocess text
        tokens = word_tokenize(text_content)
        tokens = [token.lower() for token in tokens if token.isalnum()]
        tokens = [token for token in tokens if token not in stop_words]
        
        preprocessed_text = ' '.join(tokens)
        preprocessed_data.append(preprocessed_text)
        
        # Determine if the PDF contains PII or not
        contains_pii = any(pii_type in text_content for pii_type in pii_types)
        labels.append(1 if contains_pii else 0)

In [9]:
# Print preprocessed data for the first PDF file
print("Preprocessed Data for the First PDF:")
print(preprocessed_data[0])

Preprocessed Data for the First PDF:
personal informationcurriculum vitae fullnames mike kisasatiwanaswa postaladdress box 85575 80100 mombasa 550926 emailaddress mikewanaswa mlanguages well spoken english swahilipurpose put use latest inventions telecommunication information technology positive impact individuals business enterprises corporate organizations work experience date april 2011 date position fixed data network technician employer ben electronics services ltd mombasa duties survey installation integration maintenance support decommissioning fixed data services using various access technologies wimax fiber microwaves safaricomltd survey installation support ceragon ip20 access technology airtel k survey installation support cambridge p2mp solutions safaricom fiber optics splicing terminations deployment support maintenance design installation technical support structured cabling installation support cctv ip cameras biometrics security controls installation support radwin p2p 

In [10]:
import warnings 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

warnings.filterwarnings("ignore")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, labels, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.89      1.00      0.94         8

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9



In [11]:
import joblib

model_filename = 'pii_encryption_model.pkl'
joblib.dump(classifier, model_filename)
print(f"Model saved as {model_filename}")

Model saved as pii_encryption_model.pkl


In [27]:
%pip install pyDes
%pip install fpdf


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40712 sha256=535a7b9555a0e2359aae86c8c2f983a5d0a81a2328b5dcf443bcff44214489de
  Stored in directory: c:\users\ashfak\appdata\local\pip\cache\wheels\65\4f\66\bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import joblib
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from pyDes import triple_des, PAD_PKCS5, CBC
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
import PyPDF2

# Load the trained model
loaded_classifier = joblib.load('pii_encryption_model.pkl')


pdf_path = 'D:/Projects/Project Dataset/data classification data/confidential_data/2.pdf'
with open(pdf_path, 'rb') as pdf:
    pdf_reader = PyPDF2.PdfReader(pdf)
    text_content = ''
    for page in pdf_reader.pages:
        text_content += page.extract_text()

# Preprocess the text content
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize and remove stopwords
    nltk.download('punkt')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

preprocessed_text = preprocess_text(text_content)

# Predict if the PDF contains PII data
tfidf_text = tfidf_vectorizer.transform([preprocessed_text])
prediction = loaded_classifier.predict(tfidf_text)

# Encrypt PII using cryptography library if detected
if prediction == 1:

    key = b"secretpassword1234567890"
    mode = CBC

    pii_mappings = {}

    # Regular expressions for PII types
    pii_patterns = {
        r'\bSSN\b': 'SSN',
        r'\bSocial Security Number\b': 'Social Security Number',
        r'\bDOB\b': 'DOB',
        r'\bDate of Birth\b': 'Date of Birth',
        r'\bName\b': 'Name',
        r'\bPhone Number\b': 'Phone Number',
        r'\bPhone\b': 'Phone',
        r'\bE-mail\b': 'E-mail',
        r'\bEmailAddress\b': 'EmailAddress',
        r'\bCredit Card Number\b': 'Credit Card Number',
        r'\bFullNames\b': 'FullNames',
        r'\bIDCardNo\b': 'IDCardNo',
        r'\bTelephoneNo\b': 'TelephoneNo',
        r'\bContact\b': 'Contact',
    }

     
    encrypted_pii_values = {}

    
    for pattern, pii_type in pii_patterns.items():
        encrypted_value = triple_des(key, mode, padmode=PAD_PKCS5).encrypt(pii_type.encode('utf-8'))
        encrypted_pii_values[pii_type] = encrypted_value



story = []

# styles for normal and highlighted text
styles = getSampleStyleSheet()
normal_style = styles["Normal"]
highlighted_style = normal_style.clone("highlighted")
highlighted_style.textColor = colors.red


highlighted_values = []

for line in text_content.split('\n'):
    for pattern, pii_type in pii_patterns.items():
        line = re.sub(pattern, f"[{pii_type}]", line)
        line = re.sub(rf"\[{pii_type}\]", f'<font color="red">[{pii_type}]</font>', line)

        # Check if a highlighted value is found
        if f'<font color="red">[{pii_type}]</font>' in line:
            highlighted_values.append(line)


doc.build(story)

print("Highlighted PII values:")
for value in highlighted_values:
    print(value)

    


Highlighted PII values:
<font color="red">[Phone]</font>: 9891423551  , 9650459770  
<font color="red">[E-mail]</font>: ardraprasad93@gmail.com  
<font color="red">[Name]</font>  : Ardra Prasad  
<font color="red">[Date of Birth]</font>  : 23-12-1993  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ashfak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ashfak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Encrypt and print the values
encrypted_values = {}

for value in highlighted_values:
    pii_type_start = value.find("[")
    pii_type_end = value.find("]") 
    pii_type = value[pii_type_start+1 : pii_type_end]
    
    # Find the start of the encryption
    encrypted_text_start = value.find("</font>") + len("</font>")
    
    
    text_to_encrypt = value[encrypted_text_start:]
    
    encrypted_text = triple_des(key, mode, padmode=PAD_PKCS5).encrypt(text_to_encrypt.encode('utf-8'))
    encrypted_values[pii_type] = encrypted_text
    
    # Replace the original text with the encrypted text
    encrypted_value = f'[{pii_type}: {encrypted_text}]'
    value = value[:encrypted_text_start] + encrypted_value
    
    print(value)

# Save as a file
output_txt_path = 'C:/Users/Ashfak/Downloads/encrypted_values.txt'
with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
    for pii_type, encrypted_text in encrypted_values.items():
        txt_file.write(f"{pii_type}: {encrypted_text}\n")

print(f"All encrypted values saved to {output_txt_path}")


<font color="red">[Phone]</font>[Phone: b'\xae\xf4P\xc0@\x9f\x0e\x00_\xea\xd9\xc9\x85z=<3\x84>5\x91\x13\xdf\xe9\xa8\xf5w3\xa6\x06#r']
<font color="red">[E-mail]</font>[E-mail: b'8\xa2&\x87\x88\xad\xfe"\xf3\xcf\xa1G\x02\xe2V\x82O\xafuK\t/\xdf\x86\x01\xc0s\xe8kF\x0co']
<font color="red">[Name]</font>[Name: b'O\xa5\xdd\x0c\x8f\xae\xdc\xd6\x95l\xe7\x8ea\xb9\x9f\x10\xc5:\x15Y\x9f3\xae\xae']
<font color="red">[Date of Birth]</font>[Date of Birth: b"\x99'\xd7\xc7\xbb\xad\xde\x11\xafL:\xfa\xea&w\xf5\xed\xa3\xafel\xd48t"]
All encrypted values saved to C:/Users/Ashfak/Downloads/encrypted_values.txt


In [33]:
# Detect PII values and save to a file
detected_pii_values = []

for line in text_content.split('\n'):
    for pattern, pii_type in pii_patterns.items():
        if re.search(pattern, line):
            detected_pii_values.append(f"{pii_type}: {line.strip()}")

# Save detected PII values to a file
decrypted_pii_output_path = 'C:/Users/Ashfak/Downloads/decrypted_pii_values.txt'
with open(decrypted_pii_output_path, 'w', encoding='utf-8') as txt_file:
    for detected_pii_value in detected_pii_values:
        txt_file.write(f"{detected_pii_value}\n")

print(f"All detected PII values saved to {decrypted_pii_output_path}")


All detected PII values saved to C:/Users/Ashfak/Downloads/decrypted_pii_values.txt
