In [8]:
#!pip install -qr requirements.txt

In [9]:
#!pip install torch==2.5.1

In [10]:
#!pip install tensorflow==2.18.0

In [11]:
#!pip install pandas==2.2.3

# Personally Identifiable Information (PII) Detection using LSTM

This notebook demonstrates how to detect and mask sensitive personal data 
such as national ID numbers, phone numbers, and vehicle plate numbers using 
an LSTM-based deep learning model. The model is trained to classify whether 
a given text contains PII. Additionally, regex-based filtering is applied 
to enhance detection accuracy.

## Setup: Imports, Configurations, Regex, and Model

We start by importing the necessary libraries for data processing (`pandas`), deep learning (`torch`), and text preprocessing (`tensorflow.keras.preprocessing.text` and `tensorflow.keras.preprocessing.sequence`).  
The model is built from predefined hyperparameters and loaded from pretrained weights, and regex patterns are used to detect PII such as national ID numbers, phone numbers, and vehicle plate numbers.


In [6]:
import os
import re
import torch
import pandas as pd
import torch.nn as nn
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Central configuration for the notebook.
#   preprocessing: sequence padding settings, the label -> integer encoding,
#                  and the file name of the serialized tokenizer.
#   models:        checkpoint file name and the hyperparameters used to build
#                  the network before the weights are loaded into it.
MODEL_CONFIG = dict(
    preprocessing=dict(
        padding=dict(max_length=20, padding_mode="pre"),
        # Written as a literal because "non-pii" is not a valid keyword name.
        label_encoding={"pii": 1, "non-pii": 0},
        tokenizer=dict(flname="pii_tokenizer.json"),
    ),
    models=dict(
        name="model_hyperparams_epoch50_layer5_lr01.pth",
        hyperparams=dict(
            vocab_size=28229,
            output_dim=1,
            embedding_dim=128,
            hidden_dim=128,
            num_layers=5,
        ),
    ),
)

# Define regex patterns to detect sensitive data such as NIK (Indonesian ID number),
# mobile phone numbers, home phone numbers, and vehicle plate numbers.
# Every pattern is anchored with ^...$ so it only fires when the *whole* value
# has the expected shape, not when a value merely starts with something similar.

# NIK: exactly 16 digits, with a restricted 2-digit prefix and restricted
# day (01-31 or 41-71) and month (01-12) fields.
NIK_REGEX = re.compile(r"^(1[1-9]|21|[37][1-6]|5[1-3]|6[1-5]|[89][12])\d{2}\d{2}([04][1-9]|[1256][0-9]|[37][01])(0[1-9]|1[0-2])\d{2}\d{4}$")
# Mobile numbers: optional +62/62 country prefix, optional leading 0, then 8xx...
MOBILE_PHONE_REGEX = re.compile(r"^(\+62|62)?[\s-]?0?8[1-9]{1}\d{1}[\s-]?\d{4}[\s-]?\d{2,5}$")
# Plates: 1-2 letters, 1-4 digits, up to 3 trailing letters. At least one digit
# is required; allowing zero digits made every short all-caps value
# (e.g. "A", "AB", "O") look like a plate number.
PLATE_NUMBER_REGEX = re.compile(r"^[A-Z]{1,2}\s{0,1}\d{1,4}\s{0,1}[A-Z]{0,3}$")
# Landlines: optional +62/62 prefix, 2-3 digit area code, 7-8 digit subscriber
# number. The closing $ makes the {6,7} upper bound effective and keeps this
# pattern consistent with the others.
HOME_PHONE_REGEX = re.compile(r"^(\+62|62)?[\s-]?0?([2-7]|9)\d(\d)?[\s-]?[2-9](\d){6,7}$")

# Define an LSTM-based model for PII detection
class LSTIMPii(nn.Module):
    """Embedding -> stacked LSTM -> linear head.

    Each input sequence is scored from the LSTM output at its last time step.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size (input size of the linear head).
        output_dim: Number of values produced per sequence.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, laid out batch-first.

        Returns:
            The linear head applied to the last time step of the LSTM output,
            one row per sequence. No activation is applied.
        """
        sequence_output = self.lstm(self.embedding(x))[0]
        final_step = sequence_output[:, -1, :]
        return self.fc(final_step)

# Define a class for handling PII prediction
class PredictingPII:
    """Mask PII in CSV files using regex rules followed by the LSTM classifier.

    On construction the pretrained weights and the tokenizer named in
    ``MODEL_CONFIG`` are loaded; the model is placed on the GPU when one is
    available and on the CPU otherwise.

    Args:
        input_folder: Directory that ``predict`` reads CSV files from.
        output_folder: Directory that ``predict`` writes masked CSV files to.
    """

    # Placeholder written in place of every value flagged as PII.
    MASK_TOKEN = "*****"

    def __init__(self, input_folder, output_folder):
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.hyperparams = MODEL_CONFIG["models"]["hyperparams"]
        self.model = self._instantiate_model()
        self.tokenizer = self._load_tokenizer()

    # Instantiate and load the model with pretrained weights
    def _instantiate_model(self):
        """Build the network from the configured hyperparameters and load its weights."""
        model = LSTIMPii(
            vocab_size=self.hyperparams["vocab_size"],
            embedding_dim=self.hyperparams["embedding_dim"],
            hidden_dim=self.hyperparams["hidden_dim"],
            output_dim=self.hyperparams["output_dim"],
            num_layers=self.hyperparams["num_layers"],
        )
        # weights_only=True restricts unpickling to tensors and plain containers,
        # which is all a state_dict needs, and avoids executing arbitrary code
        # embedded in a tampered checkpoint file.
        state_dict = torch.load(
            MODEL_CONFIG["models"]["name"],
            map_location=self.device,
            weights_only=True,
        )
        model.load_state_dict(state_dict)
        return model.to(self.device)

    # Load the tokenizer from a JSON file
    def _load_tokenizer(self):
        """Read the tokenizer JSON file and rebuild the tokenizer from it."""
        tokenizer_path = MODEL_CONFIG["preprocessing"]["tokenizer"]["flname"]
        # Explicit encoding so loading does not depend on the platform default.
        with open(tokenizer_path, "r", encoding="utf-8") as file:
            return tokenizer_from_json(file.read())

    # Preprocess text data before feeding it to the model
    def _preprocess(self, data):
        """Tokenize and pad a pandas Series of values into a long tensor on ``self.device``."""
        padding_cfg = MODEL_CONFIG["preprocessing"]["padding"]
        sequences = self.tokenizer.texts_to_sequences(data.astype(str))
        padded = pad_sequences(
            sequences,
            maxlen=padding_cfg["max_length"],
            # Taken from the config instead of a second hard-coded "pre".
            padding=padding_cfg["padding_mode"],
        )
        return torch.tensor(padded, dtype=torch.long).to(self.device)

    @staticmethod
    def _matches_pii_pattern(text):
        """Return True when ``text`` matches any of the module-level PII regexes."""
        patterns = (NIK_REGEX, MOBILE_PHONE_REGEX, PLATE_NUMBER_REGEX, HOME_PHONE_REGEX)
        return any(pattern.match(text) for pattern in patterns)

    def _mask_column(self, column_values):
        """Return a string copy of a non-empty column with PII values masked.

        Values are first masked by the regex rules; the regex-masked values are
        then scored by the model and every row the model flags is masked too.
        """
        values = column_values.astype(str)
        regex_hits = values.apply(self._matches_pii_pattern)
        values = values.mask(regex_hits, self.MASK_TOKEN)

        input_data = self._preprocess(values)
        with torch.no_grad():
            # NOTE(review): the network ends in a plain nn.Linear, so this compares
            # the raw score (not a sigmoid probability) with 0.5 — confirm this
            # matches how the model was trained before changing it.
            flagged = self.model(input_data).squeeze(1) > 0.5

        # One decision per row. Previously only the first row's prediction was
        # consulted, which masked (or kept) the whole column at once.
        return values.mask(flagged.cpu().numpy(), self.MASK_TOKEN)

    # Predict and mask PII data in a CSV file
    def predict(self, file_name):
        """Mask PII in ``input_folder/file_name`` and write the result as CSV.

        The output is written to ``output_folder/predicted_<file_name>``; the
        output folder is created when it does not exist. All columns are
        converted to strings in the output.

        Args:
            file_name: Name of a CSV file inside ``input_folder``.
        """
        input_path = os.path.join(self.input_folder, file_name)
        output_path = os.path.join(self.output_folder, f"predicted_{file_name}")

        df = pd.read_csv(input_path)
        self.model.eval()

        # A file with no rows has nothing to mask; skip the model entirely
        # rather than feeding it an empty batch.
        if not df.empty:
            for column in df.columns:
                df[column] = self._mask_column(df[column])

        os.makedirs(self.output_folder, exist_ok=True)
        df.to_csv(output_path, index=False)
        print(f"Processed {file_name} -> {output_path}")

2025-02-27 16:38:32.536136: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Run PII Detection on CSV Files
This script will:

- Instantiate the prediction class
- Loop through all CSV files in the input/ folder
- Run the predict function on each file

**Note:** Make sure to put your CSV files in the `input/` folder before running the cell below.

In [7]:
# Run the PII detection process on all CSV files in the input folder
if __name__ == "__main__":
    predictor = PredictingPII(input_folder="input", output_folder="output")
    # List the folder the predictor actually reads from (instead of repeating
    # the "input" literal) and sort the names so the processing order is
    # reproducible across runs and platforms.
    for file in sorted(os.listdir(predictor.input_folder)):
        if file.endswith(".csv"):
            predictor.predict(file)

Processed dummy_medical_data.csv -> output/predicted_dummy_medical_data.csv
