In [2]:
from transformers import pipeline
# Import necessary libraries
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate
import torch
from datasets import load_dataset

model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")


# Initialize a token classification pipeline
pii_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to apply PII masking to a specific text column
def redact_column(dataset, column_name):
    redacted_texts = []

    for text in dataset[column_name]:
        # Get model predictions for the text
        predictions = pii_pipeline(text)
        
        # Replace PII spans with placeholders
        redacted_text = text
        for entity in sorted(predictions, key=lambda x: x['start'], reverse=True):  # Sort in reverse to avoid shifting positions
            label = entity['entity_group']
            redacted_text = redacted_text[:entity['start']] + f"[{label}]" + redacted_text[entity['end']:]
        
        redacted_texts.append(redacted_text)
    
    return redacted_texts

df = pd.DataFrame({'source_text': ['hi my name is shubhangi, and my phone number is 239-123-1238 and I live in Los Angeles, California. My email is shubhangiwaldiya@gmail.com']})

# Display the redacted text
print(redact_column(df, 'source_text'))

['hi my name is[FIRSTNAME][FIRSTNAME][FIRSTNAME][FIRSTNAME], and my phone number is[PHONE_NUMBER] and I live in[STATE][STATE], California. My email is[EMAIL][EMAIL][EMAIL][EMAIL]']


In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate
import torch
from datasets import load_dataset
# Define and load the tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
train_set = load_dataset("ai4privacy/pii-masking-300k", split='train')
dataset = train_set.shuffle().select(range(20)) # selecting only 20
print(dataset)

# Define label mapping, might increase
label_map = {"O": 0, "TIME": 1, "DATE": 2, "LASTNAME1": 3, "LASTNAME2": 4, "EMAIL": 5, "SOCIALNUMBER": 6}

# Function to tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example['source_text'], truncation=True, padding='max_length', max_length=512)
    labels = ["O"] * len(tokenized_inputs["input_ids"])
    
    # Adjust indexing to fit the tokenization
    example['source_text'] = example['source_text'][:512]  # Ensure source_text matches max_length

    for span in example["privacy_mask"]:
        start, end, label_name = span["start"], span["end"], span["label"]
        label_id = label_map.get(label_name, 0)
        
        span_tokens = tokenizer(example["source_text"][start:end], add_special_tokens=False).tokens()
        # Properly create and align labels (assuming BERT tokenizer)
        for i, token in enumerate(tokenized_inputs.tokens()):
            if token in span_tokens:
                labels[i] = label_id

    numeric_labels = [label_map.get(label, -100) for label in labels]
    numeric_labels += [-100] * (len(tokenized_inputs["input_ids"]) - len(numeric_labels))
    
    tokenized_inputs["labels"] = numeric_labels
    return tokenized_inputs


# Apply function to dataset
encoded_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Convert to PyTorch tensors to feed into the model
input_ids = torch.tensor(encoded_dataset["input_ids"])
attention_mask = torch.tensor(encoded_dataset["attention_mask"])
labels = torch.tensor(encoded_dataset["labels"])

# Load model
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Define metric computation
metric = evaluate.load("accuracy")  # You may want to load relevant metrics for token classification

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)  # Note: axis=2 for token classification
    true_labels = labels != -100  # Masking out unnecessary labels
    
    # Flatten predictions and true_labels to compute accuracy
    flattened_predictions = predictions[true_labels]
    flattened_labels = labels[true_labels]
    
    results = metric.compute(references=flattened_labels, predictions=flattened_predictions)
    return results


# Tokenize validation set
val_set = load_dataset("ai4privacy/pii-masking-300k", split='validation')  # Ensure you have a validation set
small_val = val_set.shuffle().select(range(20))
encoded_small_val = small_val.map(tokenize_and_align_labels, batched=False)

# Training arguments
train_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    gradient_accumulation_steps=2,  # Adjust if needed
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_small_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 20
})


Map: 100%|██████████| 20/20 [00:00<00:00, 53.82 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 68.09 examples/s]


In [4]:
# Run training
trainer.train()

                                              
 33%|███▎      | 5/15 [05:12<08:02, 48.25s/it]

{'eval_loss': 0.21650974452495575, 'eval_accuracy': 0.9735880572084769, 'eval_runtime': 67.7228, 'eval_samples_per_second': 0.295, 'eval_steps_per_second': 0.044, 'epoch': 1.0}


 67%|██████▋   | 10/15 [08:13<03:30, 42.03s/it]

{'loss': 1.5672, 'grad_norm': 0.16207049787044525, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                               
 67%|██████▋   | 10/15 [09:29<03:30, 42.03s/it]

{'eval_loss': 0.01968233287334442, 'eval_accuracy': 0.9969725441069005, 'eval_runtime': 69.2365, 'eval_samples_per_second': 0.289, 'eval_steps_per_second': 0.043, 'epoch': 2.0}


                                               
100%|██████████| 15/15 [13:53<00:00, 55.54s/it]


{'eval_loss': 0.008220566436648369, 'eval_accuracy': 0.999791209938407, 'eval_runtime': 62.1872, 'eval_samples_per_second': 0.322, 'eval_steps_per_second': 0.048, 'epoch': 3.0}
{'train_runtime': 833.1094, 'train_samples_per_second': 0.072, 'train_steps_per_second': 0.018, 'train_loss': 1.0484985935191313, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=1.0484985935191313, metrics={'train_runtime': 833.1094, 'train_samples_per_second': 0.072, 'train_steps_per_second': 0.018, 'total_flos': 18303156633600.0, 'train_loss': 1.0484985935191313, 'epoch': 3.0})

In [5]:
trainer.evaluate()

100%|██████████| 3/3 [00:40<00:00, 13.48s/it]


{'eval_loss': 0.008220566436648369,
 'eval_accuracy': 0.999791209938407,
 'eval_runtime': 67.1656,
 'eval_samples_per_second': 0.298,
 'eval_steps_per_second': 0.045,
 'epoch': 3.0}

In [7]:
from transformers import pipeline

# Initialize a token classification pipeline
pii_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to apply PII masking to a specific text column
def redact_column(dataset, column_name):
    redacted_texts = []

    for text in dataset[column_name]:
        # Get model predictions for the text
        predictions = pii_pipeline(text)
        
        # Replace PII spans with placeholders
        redacted_text = text
        for entity in sorted(predictions, key=lambda x: x['start'], reverse=True):  # Sort in reverse to avoid shifting positions
            label = entity['entity_group']
            redacted_text = redacted_text[:entity['start']] + f"[{label}]" + redacted_text[entity['end']:]
        
        redacted_texts.append(redacted_text)
    
    return redacted_texts

df = pd.DataFrame({'source_text': ['hi my name is shubhangi, and my phone number is 239-123-1238 and I live in Los Angeles, California. My email is shubhangiwaldiya@gmail.com']})

# Display the redacted text
print(redact_column(df, 'source_text'))

['[PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PHONE_NUMBER][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX],[PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][USERNAME][PREFIX][PREFIX][PREFIX][PREFIX][USERNAME]']


In [8]:
df['source_text']

0    hi my name is shubhangi, and my phone number i...
Name: source_text, dtype: object