In [1]:
#%%writefile requirements.txt
#torch==2.5.1
#tensorflow==2.18.0
#pandas==2.2.3

Overwriting requirements.txt


In [2]:
#!pip install -qr requirements.txt

In [None]:
!pip install torch==2.5.1

In [None]:
!pip install tensorflow==2.18.0

In [None]:
!pip install pandas==2.2.3

In [1]:
import os
import re
import torch
import pandas as pd
import torch.nn as nn
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model configuration: preprocessing settings plus the checkpoint file and the
# hyperparameters used to rebuild the network before loading its weights.
MODEL_CONFIG = {
    "preprocessing": {
        # max_length is the fixed sequence length handed to pad_sequences;
        # padding_mode names the side that receives padding ("pre" = front).
        "padding": {"max_length": 20, "padding_mode": "pre"},
        # Class-name -> integer label mapping.
        # NOTE(review): not read by the code visible in this file — presumably
        # mirrors the encoding used at training time; confirm.
        "label_encoding": {"pii": 1, "non-pii": 0},
        # JSON file that tokenizer_from_json reads to restore the tokenizer.
        "tokenizer": {"flname": "pii_tokenizer.json"},
    },
    "models": {
        # Path of the state-dict checkpoint passed to torch.load.
        "name": "model_hyperparams_epoch50_layer5_lr01.pth",
        # Constructor arguments for LSTIMPii; they have to agree with the
        # tensor shapes stored in the checkpoint for load_state_dict to succeed.
        "hyperparams": {
            "vocab_size": 28229,
            "output_dim": 1,
            "embedding_dim": 128,
            "hidden_dim": 128,
            "num_layers": 5,
        },
    },
}

# Heuristic ("dummy") regexes for detecting sensitive data. Every pattern is
# anchored at both ends so it only fires when the WHOLE cell value looks like
# the identifier, never when a longer string merely starts with one.

# 16-digit NIK-style number: 2-digit region prefix, 4 further digits, a day
# field (01-31, or 41-71), a month field (01-12), then 6 trailing digits.
NIK_REGEX = re.compile(r"^(1[1-9]|21|[37][1-6]|5[1-3]|6[1-5]|[89][12])\d{2}\d{2}([04][1-9]|[1256][0-9]|[37][01])(0[1-9]|1[0-2])\d{2}\d{4}$")
# Mobile number: optional +62/62 prefix, optional leading 0, then 8xx and
# 6-9 more digits with optional space/hyphen separators.
MOBILE_PHONE_REGEX = re.compile(r"^(\+62|62)?[\s-]?0?8[1-9]{1}\d{1}[\s-]?\d{4}[\s-]?\d{2,5}$")
# Plate number: 1-2 letters, 1-4 digits, up to 3 letters. At least one digit is
# required; otherwise any bare one- or two-letter uppercase value (e.g. "A",
# "AB", "O") would be treated as a plate and masked.
PLATE_NUMBER_REGEX = re.compile(r"^[A-Z]{1,2}\s{0,1}\d{1,4}\s{0,1}[A-Z]{0,3}$")
# Landline number: optional +62/62 prefix, optional leading 0, 2-3 digit area
# code, then a 7-8 digit subscriber number. The trailing "$" keeps this
# pattern consistent with the others and stops it matching arbitrary long
# digit strings by prefix.
HOME_PHONE_REGEX = re.compile(r"^(\+62|62)?[\s-]?0?([2-7]|9)\d(\d)?[\s-]?[2-9](\d){6,7}$")

class LSTIMPii(nn.Module):
    """Embedding -> stacked LSTM -> linear head over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector (LSTM input size).
        hidden_dim: Hidden-state width of every LSTM layer.
        output_dim: Number of values produced per input sequence.
        num_layers: How many LSTM layers are stacked.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        # Attribute names and creation order are part of the checkpoint
        # contract (state-dict keys), so they stay exactly as they were.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear head's output for the last time step of each row.

        Args:
            x: Integer tensor of token ids, batch dimension first.

        Returns:
            Tensor with one row per input sequence and ``output_dim`` columns.
        """
        sequence_states, _ = self.lstm(self.embedding(x))
        # Only the top layer's output at the final time step feeds the head.
        final_step = sequence_states[:, -1]
        return self.fc(final_step)

class PredictingPII:
    """Mask PII-looking values in CSV files with regex rules plus an LSTM model.

    Every cell is first checked against the module-level regexes; the
    (partially masked) column is then tokenized, padded and scored by the
    model, and each row the model flags is masked as well.

    Args:
        input_folder: Directory the CSV files are read from.
        output_folder: Directory the ``predicted_<name>`` files are written to.
    """

    # Replacement text written in place of any value flagged as PII.
    MASK = "*****"

    def __init__(self, input_folder, output_folder):
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.hyperparams = MODEL_CONFIG["models"]["hyperparams"]
        self.model = self._instantiate_model()
        self.tokenizer = self._load_tokenizer()

    def _instantiate_model(self):
        """Build the network from the configured hyperparameters and load its weights."""
        model = LSTIMPii(
            vocab_size=self.hyperparams["vocab_size"],
            embedding_dim=self.hyperparams["embedding_dim"],
            hidden_dim=self.hyperparams["hidden_dim"],
            output_dim=self.hyperparams["output_dim"],
            num_layers=self.hyperparams["num_layers"],
        )
        # The checkpoint is consumed by load_state_dict, i.e. it is a plain
        # tensor mapping; weights_only=True refuses to unpickle anything else.
        state_dict = torch.load(
            MODEL_CONFIG["models"]["name"],
            map_location=self.device,
            weights_only=True,
        )
        model.load_state_dict(state_dict)
        return model.to(self.device)

    def _load_tokenizer(self):
        """Restore the Keras tokenizer from the configured JSON file."""
        tokenizer_path = MODEL_CONFIG["preprocessing"]["tokenizer"]["flname"]
        with open(tokenizer_path, "r", encoding="utf-8") as file:
            return tokenizer_from_json(file.read())

    def _preprocess(self, data):
        """Turn a Series of values into a padded tensor of token ids on ``self.device``."""
        padding_cfg = MODEL_CONFIG["preprocessing"]["padding"]
        sequences = self.tokenizer.texts_to_sequences(data.astype(str))
        # Both the length and the padding side come from the config so that
        # inference cannot silently drift from the configured preprocessing.
        padded = pad_sequences(
            sequences,
            maxlen=padding_cfg["max_length"],
            padding=padding_cfg["padding_mode"],
        )
        return torch.tensor(padded, dtype=torch.long).to(self.device)

    @staticmethod
    def _matches_known_pattern(text):
        """Return True when *text* matches any of the module-level PII regexes."""
        patterns = (NIK_REGEX, MOBILE_PHONE_REGEX, PLATE_NUMBER_REGEX, HOME_PHONE_REGEX)
        return any(pattern.match(text) for pattern in patterns)

    def _mask_column(self, series):
        """Return a copy of *series* (as strings) with PII rows replaced by ``MASK``.

        The regex pass runs first; the model then scores the regex-masked
        values and its decision is applied row by row.
        """
        mask = self.MASK
        values = series.astype(str).apply(
            lambda text: mask if self._matches_known_pattern(text) else text
        )
        input_data = self._preprocess(values)
        with torch.no_grad():
            # NOTE(review): the model's forward returns the raw output of its
            # final linear layer; the 0.5 threshold is kept unchanged here —
            # confirm whether a sigmoid is expected before thresholding.
            flagged_rows = (self.model(input_data).squeeze(1) > 0.5).tolist()
        # One decision per row: each value is masked only by its own prediction.
        return pd.Series(
            [mask if flagged else text for text, flagged in zip(values, flagged_rows)],
            index=values.index,
            name=values.name,
        )

    def predict(self, file_name):
        """Mask PII in ``input_folder/file_name`` and write ``predicted_<file_name>``.

        Args:
            file_name: Name of a CSV file inside ``self.input_folder``.

        The result is written to ``self.output_folder`` (created if missing)
        and a one-line summary is printed.
        """
        input_path = os.path.join(self.input_folder, file_name)
        output_path = os.path.join(self.output_folder, f"predicted_{file_name}")

        df = pd.read_csv(input_path)
        self.model.eval()

        # A header-only file has no rows to score; skip straight to writing it.
        if not df.empty:
            for column in df.columns:
                df[column] = self._mask_column(df[column])

        if self.output_folder:
            os.makedirs(self.output_folder, exist_ok=True)
        df.to_csv(output_path, index=False)
        print(f"Processed {file_name} -> {output_path}")

if __name__ == "__main__":
    # Script entry point: run the predictor over every CSV in ./input and
    # write the masked copies to ./output.
    predictor = PredictingPII(input_folder="input", output_folder="output")
    csv_names = [entry for entry in os.listdir("input") if entry.endswith(".csv")]
    for csv_name in csv_names:
        predictor.predict(csv_name)


2025-02-24 08:25:24.737826: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Processed dummy_medical_data.csv -> output/predicted_dummy_medical_data.csv
