In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Imported the dataset
from datasets import load_dataset
train_set=load_dataset("ai4privacy/pii-masking-300k", split='train')
val_set=load_dataset("ai4privacy/pii-masking-300k", split='validation')

In [15]:
# Inspect the dataset
print(type(train_set))
print(train_set.shape)
val_set.shape

<class 'datasets.arrow_dataset.Dataset'>
(177677, 9)


(47728, 9)

In [16]:
import random
# Select 100 random indices
random_indices = random.sample(range(len(train_set)), 10)

# Get the random samples
random_samples = train_set.select(random_indices)

print(random_samples)

Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 10
})


In [17]:
# Load model directly
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
tokenizer.vocab

{'Ġquarter': 297,
 'pend': 43238,
 'Ġwritings': 31757,
 'Ġframeworks': 32480,
 'ĠNeighbor': 36968,
 'Ġbrewers': 34176,
 'Ġlyrics': 11440,
 'SHARE': 44624,
 'âī': 48054,
 'Ġaid': 2887,
 'Ġrichness': 38857,
 'pee': 12084,
 'ĠEST': 12936,
 'ĠFILE': 9070,
 '048': 40976,
 'Ġregulatory': 4099,
 'Ġbring': 836,
 'eco': 30701,
 'ĠCertificate': 28364,
 'ĠSyndrome': 27854,
 'ĠRust': 23083,
 'flight': 15801,
 'ĠECB': 6899,
 'Ġelegance': 32595,
 'Tool': 46620,
 'HAHA': 48524,
 'Ġbeard': 21268,
 'Ġoverseeing': 14264,
 'Ġpsychiatrists': 40888,
 'ĠOK': 4954,
 'Ġdeterior': 26694,
 'ĠAirbus': 11016,
 'ĠUr': 9163,
 'ĠMeat': 25263,
 'ĠTrap': 37607,
 'Ġheated': 10819,
 'Ãĥ': 13572,
 'Ġ292': 37299,
 'LinkedIn': 48005,
 'ĠErik': 12214,
 'ĠNam': 8603,
 'ĠSecondary': 16021,
 'Ġcomparing': 12818,
 'Ġelectoral': 7169,
 'Table': 41836,
 'Ġ._': 49954,
 'ĠLCD': 20808,
 'Ġdeducted': 37728,
 'Ġelectrical': 8980,
 'Ġpass': 1323,
 'Ġreincarn': 43792,
 'ĠIts': 3139,
 'Ġdepartures': 25624,
 'ĠGA': 9575,
 'Ġneed': 240,
 '

In [18]:
# Import necessary libraries
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate
import torch
from datasets import load_dataset

# Define and load the tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
dataset = load_dataset("ai4privacy/pii-masking-300k", split='train')
dataset = dataset.select(range(5))

# Define label mapping
label_map = {"O": 0, "TIME": 1, "DATE": 2, "LASTNAME1": 3, "LASTNAME2": 4, "EMAIL": 5, "SOCIALNUMBER": 6}

# Function to tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example['source_text'], truncation=True, padding='max_length', max_length=512)
    labels = ["O"] * len(tokenized_inputs["input_ids"])
    
    # Adjust indexing to fit the tokenization
    example['source_text'] = example['source_text'][:512]  # Ensure source_text matches max_length

    for span in example["privacy_mask"]:
        start, end, label_name = span["start"], span["end"], span["label"]
        label_id = label_map.get(label_name, 0)
        
        span_tokens = tokenizer(example["source_text"][start:end], add_special_tokens=False).tokens()
        # Properly create and align labels (assuming BERT tokenizer)
        for i, token in enumerate(tokenized_inputs.tokens()):
            if token in span_tokens:
                labels[i] = label_id

    numeric_labels = [label_map.get(label, -100) for label in labels]
    numeric_labels += [-100] * (len(tokenized_inputs["input_ids"]) - len(numeric_labels))
    
    tokenized_inputs["labels"] = numeric_labels
    return tokenized_inputs


# Apply function to dataset
encoded_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Convert to PyTorch tensors
input_ids = torch.tensor(encoded_dataset["input_ids"])
attention_mask = torch.tensor(encoded_dataset["attention_mask"])
labels = torch.tensor(encoded_dataset["labels"])

# Load model
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Define metric computation
metric = evaluate.load("accuracy")  # You may want to load relevant metrics for token classification

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)  # Note: axis=2 for token classification
    true_labels = labels != -100  # Masking out unnecessary labels
    
    # Flatten predictions and true_labels to compute accuracy
    flattened_predictions = predictions[true_labels]
    flattened_labels = labels[true_labels]
    
    results = metric.compute(references=flattened_labels, predictions=flattened_predictions)
    return results


# Tokenize validation set
val_set = load_dataset("ai4privacy/pii-masking-300k", split='validation')  # Ensure you have a validation set
small_val = val_set.shuffle(seed=123).select(range(5))
encoded_small_val = small_val.map(tokenize_and_align_labels, batched=False)

# Training arguments
train_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-2,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    gradient_accumulation_steps=2,  # Adjust if needed
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_small_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



In [None]:
# Run training
trainer.train()

 33%|███▎      | 1/3 [00:42<01:24, 42.44s/it]
 33%|███▎      | 1/3 [00:56<01:24, 42.44s/it]

{'eval_loss': 11.556055068969727, 'eval_accuracy': 0.0, 'eval_runtime': 13.9542, 'eval_samples_per_second': 0.358, 'eval_steps_per_second': 0.072, 'epoch': 1.0}


In [13]:
trainer.evaluate()

100%|██████████| 2/2 [00:04<00:00,  2.25s/it]


{'eval_loss': 0.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 22.0797,
 'eval_samples_per_second': 0.453,
 'eval_steps_per_second': 0.091,
 'epoch': 3.0}