In [1]:
import os
import pandas as pd
import numpy as np
import re

In [2]:
# Imported the dataset
from datasets import load_dataset
from datasets import Dataset

train_set1=load_dataset('ai4privacy/pii-masking-300k', split='train')
val_set=load_dataset('ai4privacy/pii-masking-300k', split='validation')

filepath = os.path.join(os.getcwd(), '..', 'data', 'maker_day_shrieyaa_stella_mini200_df.csv')
train_df2 = pd.read_csv(filepath)
train_set2 = Dataset.from_pandas(train_df2)

In [3]:
# Inspect the dataset
print(type(train_set2))
print(train_set2.shape)
val_set.shape

<class 'datasets.arrow_dataset.Dataset'>
(200, 3)


(47728, 9)

In [4]:
import random
# Select 100 random indices
# random_indices = random.sample(range(len(train_set1)), 10)
random_indices = random.sample(range(len(train_set2)), 10)

# Get the random samples
# random_samples = train_set1.select(random_indices)
random_samples = train_set2.select(random_indices)

print(random_samples)

Dataset({
    features: ['Unnamed: 0', 'unmasked_text', 'masked_text'],
    num_rows: 10
})


In [5]:
import random
# Select 100 random indices
# random_indices = random.sample(range(len(train_set1)), 10)
random_indices = random.sample(range(len(train_set2)), 10)

# Get the random samples
random_samples = train_set2.select(random_indices)

print(random_samples)

Dataset({
    features: ['Unnamed: 0', 'unmasked_text', 'masked_text'],
    num_rows: 10
})


In [1]:
# Load model directly
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('lakshyakh93/deberta_finetuned_pii')
list(tokenizer.vocab.items())[:10]

[('inqu', 32223),
 ('ĠDarkness', 39824),
 ('Ġphilosophies', 40728),
 ('MB', 8651),
 ('ĠHugo', 18148),
 ('ibling', 46514),
 ('ĠSonia', 23961),
 ('Ġscent', 26431),
 ('Ġcrim', 29322),
 ('Ġlater', 423)]

In [7]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate
import torch

# Define and load the tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained('lakshyakh93/deberta_finetuned_pii')
train_set1_selected = train_set1.select(range(5))

# Define label mapping
# label_map = {'O': 0, 'TIME': 1, 'DATE': 2, 'LASTNAME1': 3, 'LASTNAME2': 4, 'EMAIL': 5, 'SOCIALNUMBER': 6}
label_map = {'O': 0, 'FIRSTNAME_1': 1, 'LASTNAME_1': 2, 'EMAIL_1': 3, 'PHONENUMBER_1': 4, 'SSN_1': 5, 'JOBTITLE_1': 6,
             'SEX_1': 7, 'BUILDINGNUMBER_1': 8, 'STREET_1': 9, 'DOB_1': 10, 'USERNAME_1': 11, 'AGE_1': 12,
             'PREFIX_1': 13, 'ACCOUNTNUMBER_1': 14}


# Function to tokenize and align labels
def tokenize_and_align_labels_train(example):
    # tokenized_inputs = tokenizer(example['source_text'], truncation=True, padding='max_length', max_length=512)
    tokenized_inputs = tokenizer(example['unmasked_text'], truncation=True, padding='max_length', max_length=512)
    labels = ['O'] * len(tokenized_inputs['input_ids'])
    
    # Adjust indexing to fit the tokenization
    # example['source_text'] = example['source_text'][:512]  # Ensure source_text matches max_length
    example['unmasked_text'] = example['unmasked_text'][:512]  # Ensure source_text matches max_length

    # Extract spans from masked_text
    masked_text = example['masked_text']
    # Find all labels in masked_text using regex
    matches = re.finditer(r'\[([A-Z_]+)\]', masked_text)

    # Create spans and assign labels
    for match in matches:
        label_name = match.group(1)  # Get the label from the match
        start = match.start()
        end = match.end()
        label_id = label_map.get(label_name, 0)

    # for span in example['privacy_mask']:
    #     start, end, label_name = span['start'], span['end'], span['label']
    #     label_id = label_map.get(label_name, 0)
        
        # span_tokens = tokenizer(example['source_text'][start:end], add_special_tokens=False).tokens()
        span_tokens = tokenizer(example['unmasked_text'][start:end], add_special_tokens=False).tokens()

        # Properly create and align labels (assuming BERT tokenizer)
        for i, token in enumerate(tokenized_inputs.tokens()):
            if token in span_tokens:
                labels[i] = label_id

    numeric_labels = [label_map.get(label, -100) for label in labels]
    numeric_labels += [-100] * (len(tokenized_inputs['input_ids']) - len(numeric_labels))
    
    tokenized_inputs['labels'] = numeric_labels
    return tokenized_inputs





In [8]:
# Function to tokenize and align labels
def tokenize_and_align_labels_val(example):
    tokenized_inputs = tokenizer(example['source_text'], truncation=True, padding='max_length', max_length=512)
    labels = ['O'] * len(tokenized_inputs['input_ids'])
    
    # Adjust indexing to fit the tokenization
    example['source_text'] = example['source_text'][:512]  # Ensure source_text matches max_length

    # Create spans and assign labels
    for span in example['privacy_mask']:
        start, end, label_name = span['start'], span['end'], span['label']
        label_id = label_map.get(label_name, 0)
        
        span_tokens = tokenizer(example['source_text'][start:end], add_special_tokens=False).tokens()

        # Properly create and align labels (assuming BERT tokenizer)
        for i, token in enumerate(tokenized_inputs.tokens()):
            if token in span_tokens:
                labels[i] = label_id

    numeric_labels = [label_map.get(label, -100) for label in labels]
    numeric_labels += [-100] * (len(tokenized_inputs['input_ids']) - len(numeric_labels))
    
    tokenized_inputs['labels'] = numeric_labels
    return tokenized_inputs

In [9]:
# Apply function to dataset
# encoded_dataset = train_set1_selected.map(tokenize_and_align_labels, batched=False)
encoded_dataset = train_set2.map(tokenize_and_align_labels_train, batched=False)

# Convert to PyTorch tensors
input_ids = torch.tensor(encoded_dataset['input_ids'])
attention_mask = torch.tensor(encoded_dataset['attention_mask'])
labels = torch.tensor(encoded_dataset['labels'])

# Load model
model = AutoModelForTokenClassification.from_pretrained('lakshyakh93/deberta_finetuned_pii')

# Define metric computation
metric = evaluate.load('accuracy')  # You may want to load relevant metrics for token classification

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)  # Note: axis=2 for token classification
    true_labels = labels != -100  # Masking out unnecessary labels
    
    # Flatten predictions and true_labels to compute accuracy
    flattened_predictions = predictions[true_labels]
    flattened_labels = labels[true_labels]
    
    results = metric.compute(references=flattened_labels, predictions=flattened_predictions)
    return results


# Tokenize validation set
small_val = val_set.shuffle(seed=123).select(range(5))
encoded_small_val = small_val.map(tokenize_and_align_labels_val, batched=False)

# Training arguments
train_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-2,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    gradient_accumulation_steps=2,  # Adjust if needed
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_small_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



In [10]:
# Run training
trainer.train()

  0%|          | 0/36 [00:00<?, ?it/s]

{'loss': 2.0466, 'grad_norm': 0.0, 'learning_rate': 0.036111111111111115, 'epoch': 0.8}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.0, 'eval_accuracy': 1.0, 'eval_runtime': 4.0968, 'eval_samples_per_second': 1.22, 'eval_steps_per_second': 0.244, 'epoch': 0.96}
{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.022222222222222223, 'epoch': 1.6}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.0, 'eval_accuracy': 1.0, 'eval_runtime': 3.9273, 'eval_samples_per_second': 1.273, 'eval_steps_per_second': 0.255, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.008333333333333333, 'epoch': 2.4}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.0, 'eval_accuracy': 1.0, 'eval_runtime': 3.5882, 'eval_samples_per_second': 1.393, 'eval_steps_per_second': 0.279, 'epoch': 2.88}
{'train_runtime': 2545.3469, 'train_samples_per_second': 0.236, 'train_steps_per_second': 0.014, 'train_loss': 0.5685017903645834, 'epoch': 2.88}


TrainOutput(global_step=36, training_loss=0.5685017903645834, metrics={'train_runtime': 2545.3469, 'train_samples_per_second': 0.236, 'train_steps_per_second': 0.014, 'total_flos': 175710303682560.0, 'train_loss': 0.5685017903645834, 'epoch': 2.88})

In [11]:
trainer.evaluate()

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 3.4,
 'eval_samples_per_second': 1.471,
 'eval_steps_per_second': 0.294,
 'epoch': 2.88}