In [1]:
import os
import pytesseract

#%pip install pdfplumber

import pdfplumber
import torch

#%pip install transformers

from transformers import BertTokenizer

model_path = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_path)

# Define paths to your PDF and image directories
pdf_directory = 'D:/Projects/Project Dataset/data hygeine data/pdf'
image_directory = 'D:/Projects/Project Dataset/CNH_Aberta'

data = []

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Process PDF files
for pdf_file in os.listdir(pdf_directory):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_directory, pdf_file)
        try:
            with pdfplumber.open(pdf_path) as pdf:
                text = ''
                for page in pdf.pages:
                    text += page.extract_text()
                data.append({"text": text})
        except Exception as e:
            print(f"Error while processing {pdf_file}: {e}")



Error while processing 1124.pdf: No /Root object! - Is this really a PDF?
Error while processing 1648.pdf: No /Root object! - Is this really a PDF?
Error while processing 3012.pdf: Unexpected EOF
Error while processing 3013.pdf: Unexpected EOF


In [3]:
# Process image files
for image_file in os.listdir(image_directory):
    if image_file.endswith((".jpg", ".png")):
        image_path = os.path.join(image_directory, image_file)
        try:
            image = Image.open(image_path)
            text = pytesseract.image_to_string(image)
            data.append({"text": text})
        except Exception as e:
            print(f"Error processing {image_file}: {e}")

Error processing 00000000_gt_segmentation.jpg: name 'Image' is not defined
Error processing 00000000_in - Copy.jpg: name 'Image' is not defined
Error processing 00000000_in.jpg: name 'Image' is not defined
Error processing 00000001_gt_segmentation.jpg: name 'Image' is not defined
Error processing 00000001_in - Copy.jpg: name 'Image' is not defined
Error processing 00000001_in.jpg: name 'Image' is not defined
Error processing 00000002_gt_segmentation.jpg: name 'Image' is not defined
Error processing 00000002_in.jpg: name 'Image' is not defined
Error processing 00000003_gt_segmentation.jpg: name 'Image' is not defined
Error processing 00000003_in.jpg: name 'Image' is not defined
Error processing 00000004_gt_segmentation.jpg: name 'Image' is not defined
Error processing 00000004_in.jpg: name 'Image' is not defined
Error processing 00000005_gt_segmentation.jpg: name 'Image' is not defined
Error processing 00000005_in.jpg: name 'Image' is not defined
Error processing 00000006_gt_segmentatio

Model Creation

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Define model parameters
vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_dim = 256
learning_rate = 0.001
batch_size = 32
epochs = 1

# Define your RedactionModel
class RedactionModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RedactionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, attention_mask, decoder_hidden=None):
        embedded_decoder_input = self.embedding(input_ids[:, :-1])
        if decoder_hidden is None:
            batch_size = input_ids.size(0)
            decoder_hidden = (
                torch.zeros(1, batch_size, self.decoder.hidden_size).to(input_ids.device),
                torch.zeros(1, batch_size, self.decoder.hidden_size).to(input_ids.device)
            )
        decoder_output, _ = self.decoder(embedded_decoder_input, decoder_hidden)
        output = self.output_layer(decoder_output)
        return output

# Create the model, loss function, and optimizer
model = RedactionModel(vocab_size, embedding_dim, hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [6]:
# Tokenize and encode the data
input_ids = []
attention_mask = []

max_length = 128 

for entry in data:
    encoding = tokenizer(entry['text'], padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    input_ids.append(encoding['input_ids'][0])  # Extract the tensor from the list
    attention_mask.append(encoding['attention_mask'][0])  # Extract the tensor from the list

input_ids = torch.stack(input_ids)
attention_mask = torch.stack(attention_mask)


In [7]:
# Split data into training and validation sets
from sklearn.model_selection import train_test_split

train_input_ids, val_input_ids, train_attention_mask, val_attention_mask = train_test_split(
    input_ids, attention_mask, test_size=0.2, random_state=42
)

# Create DataLoader for training
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(train_input_ids, train_attention_mask)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [34]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_input_ids, batch_attention_mask in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_input_ids, batch_attention_mask)
        
        target_ids = batch_input_ids[:, 1:].clone()
        loss = criterion(outputs.transpose(1, 2), target_ids)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{epochs}], Average Loss: {average_loss:.4f}")

Epoch [1/10], Average Loss: 6.9261
Epoch [2/10], Average Loss: 5.6258
Epoch [3/10], Average Loss: 4.9967
Epoch [4/10], Average Loss: 4.5927
Epoch [5/10], Average Loss: 4.2907
Epoch [6/10], Average Loss: 4.0463
Epoch [7/10], Average Loss: 3.8430
Epoch [8/10], Average Loss: 3.6650
Epoch [9/10], Average Loss: 3.5075
Epoch [10/10], Average Loss: 3.3671


Save the model

In [9]:
torch.save(model.state_dict(), 'redaction_model.pth')

Load the model

In [10]:
model = RedactionModel(vocab_size, embedding_dim, hidden_dim)
model.load_state_dict(torch.load('redaction_model.pth'))
model.eval()  # Set the model to evaluation mode

RedactionModel(
  (embedding): Embedding(30522, 128)
  (decoder): LSTM(128, 256, batch_first=True)
  (output_layer): Linear(in_features=256, out_features=30522, bias=True)
)

In [11]:
import pdfplumber

file_path = 'D:/Projects/Project Dataset/data hygeine data/pdf/1.pdf'
with pdfplumber.open(file_path) as pdf:
    text = ''
    for page in pdf.pages:
        text += page.extract_text()

# Tokenize and preprocess the text
encoding = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [12]:
with torch.no_grad():
    redacted_output = model(input_ids, attention_mask)
    redacted_tokens = torch.argmax(redacted_output, dim=-1)
    redacted_text = tokenizer.decode(redacted_tokens[0], skip_special_tokens=True)


In [13]:
print("Redacted Text:")
print(redacted_text)

# Or save the redacted text to a file
with open('redacted_output.txt', 'w') as f:
    f.write(redacted_text)

Redacted Text:
curriculum vitae, - -,,, -.,. -...... com.......... com. com.........................................................................................,...


In [14]:
from PIL import Image

# Load text from PDF or image file
def load_text_from_file(file_path):
    if file_path.endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text()
    elif file_path.endswith((".jpg", ".jpeg", ".png", ".bmp")):
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    else:
        raise ValueError("Unsupported file format")
    return text

# Replace redacted portions with placeholders
redacted_tokens = torch.argmax(redacted_output, dim=-1)
redacted_text = tokenizer.decode(redacted_tokens[0], skip_special_tokens=True)

# Get the original text from the PDF file
with open(file_path, 'rb') as pdf_file:
    original_pdf_text = load_text_from_file(file_path)

In [15]:
import spacy

# # Download and install the spaCy model
# !python -m spacy download en_core_web_sm

# Load the model
nlp = spacy.load("en_core_web_sm")


In [16]:
import pdfplumber
import spacy
from PIL import Image
import pytesseract

# Load spaCy NER model
nlp = spacy.load("en_core_web_sm")

def redact_pii_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Define a set of entity labels to redact
    pii_labels = {"PERSON", "GPE", "DATE", "PHONE", "EMAIL", "EmailAddress"}
    
    # Redact identified PII entities from the text
    redacted_text = text
    for ent in doc.ents:
        if ent.label_ in pii_labels:
            redacted_text = redacted_text.replace(ent.text, "[REDACTED]")
    
    return redacted_text

def process_file(file_path):
    if file_path.endswith((".pdf")):
        with pdfplumber.open(file_path) as pdf:
            original_text = ""
            for page in pdf.pages:
                original_text += page.extract_text()
        
    elif file_path.endswith((".jpg", ".jpeg", ".png", ".bmp")):
        image = Image.open(file_path)
        original_text = pytesseract.image_to_string(image)
        
    else:
        print("Unsupported file format")
        return
    
    redacted_text = redact_pii_text(original_text)
    return redacted_text

# Example usage
file_path = 'D:/Projects/Project Dataset/data hygeine data/pdf/1.pdf'
redacted_text = process_file(file_path)

# Save the redacted text to a new file
output_file_path = 'C:/Users/Ashfak/Downloads/non_redacted_text.txt'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(redacted_text)

print("Redacted text saved to:", output_file_path)

Redacted text saved to: C:/Users/Ashfak/Downloads/non_redacted_text.txt
