In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate
import torch
from datasets import load_dataset

# Define and load the tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
train_set = load_dataset("ai4privacy/pii-masking-300k", split='train')
dataset = train_set.shuffle().select(range(20))
print(dataset)

# Define label mapping
label_map = {"O": 0, "TIME": 1, "DATE": 2, "LASTNAME1": 3, "LASTNAME2": 4, "EMAIL": 5, "SOCIALNUMBER": 6}

# Function to tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example['source_text'], truncation=True, padding='max_length', max_length=512)
    labels = ["O"] * len(tokenized_inputs["input_ids"])
    
    # Adjust indexing to fit the tokenization
    example['source_text'] = example['source_text'][:512]  # Ensure source_text matches max_length

    for span in example["privacy_mask"]:
        start, end, label_name = span["start"], span["end"], span["label"]
        label_id = label_map.get(label_name, 0)
        
        span_tokens = tokenizer(example["source_text"][start:end], add_special_tokens=False).tokens()
        # Properly create and align labels (assuming BERT tokenizer)
        for i, token in enumerate(tokenized_inputs.tokens()):
            if token in span_tokens:
                labels[i] = label_id

    numeric_labels = [label_map.get(label, -100) for label in labels]
    numeric_labels += [-100] * (len(tokenized_inputs["input_ids"]) - len(numeric_labels))
    
    tokenized_inputs["labels"] = numeric_labels
    return tokenized_inputs


# Apply function to dataset
encoded_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Convert to PyTorch tensors
input_ids = torch.tensor(encoded_dataset["input_ids"])
attention_mask = torch.tensor(encoded_dataset["attention_mask"])
labels = torch.tensor(encoded_dataset["labels"])

# Load model
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Define metric computation
metric = evaluate.load("accuracy")  # You may want to load relevant metrics for token classification

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)  # Note: axis=2 for token classification
    true_labels = labels != -100  # Masking out unnecessary labels
    
    # Flatten predictions and true_labels to compute accuracy
    flattened_predictions = predictions[true_labels]
    flattened_labels = labels[true_labels]
    
    results = metric.compute(references=flattened_labels, predictions=flattened_predictions)
    return results


# Tokenize validation set
val_set = load_dataset("ai4privacy/pii-masking-300k", split='validation')  # Ensure you have a validation set
small_val = val_set.shuffle().select(range(20))
encoded_small_val = small_val.map(tokenize_and_align_labels, batched=False)

# Training arguments
train_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    gradient_accumulation_steps=2,  # Adjust if needed
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_small_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 20
})


Map: 100%|██████████| 20/20 [00:00<00:00, 155.69 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 214.53 examples/s]


In [7]:
# Run training
trainer.train()

                                              
 33%|███▎      | 5/15 [02:45<04:18, 25.85s/it]

{'eval_loss': 0.4130392074584961, 'eval_accuracy': 0.9286088139851355, 'eval_runtime': 35.6973, 'eval_samples_per_second': 0.56, 'eval_steps_per_second': 0.084, 'epoch': 1.0}


 67%|██████▋   | 10/15 [04:52<02:22, 28.54s/it]

{'loss': 2.0259, 'grad_norm': 0.8904018998146057, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                               
 67%|██████▋   | 10/15 [05:33<02:22, 28.54s/it]

{'eval_loss': 0.04530291631817818, 'eval_accuracy': 0.987438500994452, 'eval_runtime': 37.0441, 'eval_samples_per_second': 0.54, 'eval_steps_per_second': 0.081, 'epoch': 2.0}


                                               
100%|██████████| 15/15 [10:01<00:00, 40.07s/it]


{'eval_loss': 0.01584729738533497, 'eval_accuracy': 0.9995812833664818, 'eval_runtime': 62.3534, 'eval_samples_per_second': 0.321, 'eval_steps_per_second': 0.048, 'epoch': 3.0}
{'train_runtime': 600.9944, 'train_samples_per_second': 0.1, 'train_steps_per_second': 0.025, 'train_loss': 1.3604229042927425, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=1.3604229042927425, metrics={'train_runtime': 600.9944, 'train_samples_per_second': 0.1, 'train_steps_per_second': 0.025, 'total_flos': 18303156633600.0, 'train_loss': 1.3604229042927425, 'epoch': 3.0})

In [8]:
trainer.evaluate()

100%|██████████| 3/3 [00:37<00:00, 12.42s/it]


{'eval_loss': 0.01584729738533497,
 'eval_accuracy': 0.9995812833664818,
 'eval_runtime': 62.1961,
 'eval_samples_per_second': 0.322,
 'eval_steps_per_second': 0.048,
 'epoch': 3.0}

In [None]:
from transformers import pipeline

# Initialize a token classification pipeline
pii_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="")

# Function to apply PII masking to a specific text column
def redact_column(dataset, column_name):
    redacted_texts = []

    for text in dataset[column_name]:
        # Get model predictions for the text
        predictions = pii_pipeline(text)
        
        # Replace PII spans with placeholders
        redacted_text = text
        for entity in sorted(predictions, key=lambda x: x['start'], reverse=True):  # Sort in reverse to avoid shifting positions
            label = entity['entity_group']
            redacted_text = redacted_text[:entity['start']] + f"[{label}]" + redacted_text[entity['end']:]
        
        redacted_texts.append(redacted_text)
    
    return redacted_texts

df = pd.DataFrame({'source_text': ['hi my name is shubhangi, and my phone number is 239-123-1238 and I live in Los Angeles, California. My email is shubhangiwaldiya@gmail.com']})

# Display the redacted text
print(redact_column(df, 'source_text'))

['[PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][FIRSTNAME][FIRSTNAME][FIRSTNAME][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PHONE_NUMBER][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX],[PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][PREFIX][USERNAME][EMAIL][USERNAME]']


In [1]:
df['source_text']

NameError: name 'df' is not defined