In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [14]:
pip install chardet



In [10]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [None]:
# Load tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [17]:
# Load the data
import chardet

filepath = os.path.join(os.getcwd(), 'mini_df_200_row_biolabels.csv')

# Detect encoding
with open(filepath, 'rb') as f:
    result = chardet.detect(f.read())
    print("Detected file encoding:", result['encoding'])

# Read the file using the detected encoding
df = pd.read_csv(filepath, encoding=result['encoding'])

# Proportional split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Since you have 200 rows, no need for the 3,000 cap, as this won't be needed
train_size = int(len(df) * train_ratio)
val_size = int(len(df) * val_ratio)
test_size = len(df) - train_size - val_size  # Remaining rows for the test set

# Split the dataset into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=1 - train_ratio, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (test_ratio + val_ratio), random_state=42)

# Print dataset sizes to confirm
print(f"Training size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

Detected file encoding: Windows-1252
Training size: 134
Validation size: 29
Test size: 29


In [61]:
# Create a mapping for the bio_labels
data_label_list = []

# Extract unique labels from the 'bio_labels' column
def get_unique_labels(df):
    unique_labels = set()
    for labels in df['bio_labels']:  # Iterate through the 'bio_labels' column
        # Since bio_labels is a string representation of a list, we need to convert it back to a list
        labels = eval(labels)  # Convert the string to a list
        unique_labels.update(labels)  # Add all labels in the current row to the set
    return sorted(unique_labels)  # Return sorted labels for consistency

# Apply the function to the entire DataFrame
unique_labels = get_unique_labels(df)  # Run on the whole DataFrame

# Print the unique labels
print("Data's Unique BIO Labels:", unique_labels)

# Combine with the existing label_list (if applicable)
data_label_list = sorted(set(data_label_list + unique_labels))  # Ensure no duplicates
print("Data's Final Label List:", data_label_list)
print(len(data_label_list))  # Number of unique labels

data_label_to_id = {label: idx for idx, label in enumerate(data_label_list)}
data_id_to_label = {idx: label for label, idx in data_label_to_id.items()}

Data's Unique BIO Labels: ['B-ACCOUNTNAME_1', 'B-ACCOUNTNUMBER_1', 'B-AGE_1', 'B-AMOUNT_1', 'B-BIC_1', 'B-BITCOINADDRESS_1', 'B-BUILDINGNUMBER_1', 'B-CITY_1', 'B-COMPANYNAME_1', 'B-COUNTY_1', 'B-CREDITCARDCVV_1', 'B-CREDITCARDISSUER_1', 'B-CREDITCARDNUMBER_1', 'B-CURRENCYNAME_1', 'B-CURRENCYSYMBOL_1', 'B-CURRENCY_1', 'B-DATE_1', 'B-DOB_1', 'B-EMAIL_1', 'B-ETHEREUMADDRESS_1', 'B-EYECOLOR_1', 'B-FIRSTNAME_1', 'B-FIRSTNAME_2', 'B-GENDER_1', 'B-HEIGHT_1', 'B-IBAN_1', 'B-IPV4_1', 'B-IPV6_1', 'B-JOBAREA_1', 'B-JOBTITLE_1', 'B-JOBTYPE_1', 'B-LASTNAME_1', 'B-LITECOINADDRESS_1', 'B-MAC_1', 'B-MASKEDNUMBER_1', 'B-MEDICAL_1', 'B-MIDDLENAME_1', 'B-NEARBYGPSCOORDINATE_1', 'B-ORDINALDIRECTION_1', 'B-ORGANIZATION_1', 'B-PASSWORD_1', 'B-PHONEIMEI_1', 'B-PHONENUMBER_1', 'B-PIN_1', 'B-PREFIX_1', 'B-SECONDARYADDRESS_1', 'B-SEX_1', 'B-SSN_1', 'B-STATE_1', 'B-STREET_1', 'B-TIME_1', 'B-URL_1', 'B-USERNAME_1', 'B-VEHICLEVIN_1', 'B-VEHICLEVRM_1', 'B-ZIPCODE_1', 'I-ACCOUNTNAME_1', 'I-AGE_1', 'I-BUILDINGNUMBER_

In [60]:
model_label_list = model.config.id2label.values()  # Ensure this matches the fine-tuned model
model_id_to_label = {int(k): v for k, v in model.config.id2label.items()}
model_label_to_id = {v: int(k) for k, v in model_id_to_label.items()}

print("Model's Label to ID:", model_label_to_id)
print("Model's ID to Label:", model_id_to_label)
print(len(model_label_list))

Model's Label to ID: {'B-PREFIX': 0, 'I-PREFIX': 1, 'B-FIRSTNAME': 2, 'I-FIRSTNAME': 3, 'B-MIDDLENAME': 4, 'B-LASTNAME': 5, 'I-LASTNAME': 6, 'O': 7, 'B-JOBDESCRIPTOR': 8, 'B-JOBTITLE': 9, 'I-JOBTITLE': 10, 'B-COMPANY_NAME': 11, 'I-COMPANY_NAME': 12, 'B-JOBAREA': 13, 'B-EMAIL': 14, 'I-EMAIL': 15, 'B-TIME': 16, 'I-TIME': 17, 'B-DATE': 18, 'I-DATE': 19, 'B-URL': 20, 'I-URL': 21, 'B-BITCOINADDRESS': 22, 'I-BITCOINADDRESS': 23, 'B-ETHEREUMADDRESS': 24, 'I-ETHEREUMADDRESS': 25, 'B-ACCOUNTNAME': 26, 'I-ACCOUNTNAME': 27, 'B-IBAN': 28, 'I-IBAN': 29, 'B-ACCOUNTNUMBER': 30, 'I-ACCOUNTNUMBER': 31, 'B-BIC': 32, 'I-BIC': 33, 'B-IPV4': 34, 'I-IPV4': 35, 'B-STREETADDRESS': 36, 'I-STREETADDRESS': 37, 'B-CITY': 38, 'I-CITY': 39, 'B-ZIPCODE': 40, 'I-ZIPCODE': 41, 'B-USERNAME': 42, 'I-USERNAME': 43, 'B-IPV6': 44, 'I-IPV6': 45, 'B-CREDITCARDNUMBER': 46, 'I-CREDITCARDNUMBER': 47, 'B-VEHICLEVIN': 48, 'I-VEHICLEVIN': 49, 'B-SUFFIX': 50, 'I-SUFFIX': 51, 'B-AMOUNT': 52, 'I-AMOUNT': 53, 'B-CURRENCY': 54, 'I-CURR

In [54]:
# Define metrics calculation function
def compute_metrics(predictions, references):
    acc = accuracy_score(references, predictions)
    precision = precision_score(references, predictions, average="weighted", zero_division=1)
    recall = recall_score(references, predictions, average="weighted", zero_division=1)
    f1 = f1_score(references, predictions, average="weighted", zero_division=1)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

run inference and calculate the metrics on the 200-row dataset
don't need to train it

In [79]:
from transformers import DataCollatorForTokenClassification
from torch.utils.data import DataLoader
import numpy as np

# Prepare data for tokenization and create a PyTorch Dataset
def tokenize_and_align_labels(df, tokenizer, data_label_to_id, model_label_to_id):
    def normalize_label(label):
        # Remove suffixes like "_1", "_2"
        if "_" in label and label.split("_")[-1].isdigit():
            return "_".join(label.split("_")[:-1])
        return label

    def map_labels_to_model(data_label):
        normalized_label = normalize_label(data_label)  # Normalize dataset labels
        return model_label_to_id.get(normalized_label, model_label_to_id.get("O", -100))  # Default to "O"

    tokenized_inputs = tokenizer(
        list(df["unmasked_text"]),  # Convert Series to list of strings
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
        is_split_into_words=False,
    )

    labels = []
    for i, label_seq in enumerate(df["bio_labels"]):
        word_labels = eval(label_seq)  # Convert string back to Python list
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens back to words
        token_labels = []
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                token_labels.append(-100)
            elif word_id < len(word_labels):  # Ensure within bounds
                token_labels.append(map_labels_to_model(word_labels[word_id]))
            else:
                token_labels.append(-100)  # Ignore out-of-bounds tokens
        labels.append(token_labels)

    tokenized_inputs["labels"] = torch.tensor(labels)
    return tokenized_inputs

# Tokenize the test set
tokenized_test = tokenize_and_align_labels(df, tokenizer, data_label_to_id, model_label_to_id)

# Create a DataLoader for the test set
test_dataset = torch.utils.data.TensorDataset(tokenized_test["input_ids"], tokenized_test["attention_mask"], tokenized_test["labels"])

test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [80]:
for key, value in tokenized_test.items():
    print(f"{key}: {value.shape}")

input_ids: torch.Size([192, 113])
token_type_ids: torch.Size([192, 113])
attention_mask: torch.Size([192, 113])
labels: torch.Size([192, 113])


In [71]:
id_to_label = {v: k for k, v in model_label_to_id.items()}

In [81]:
# Inference loop
all_predictions, all_references = [], []
model.eval()

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = [t.to(device) for t in batch]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predictions
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = labels.cpu().numpy()

        # Align predictions and references to only include non-special tokens
        for pred, label in zip(predictions, labels):
            true_labels = [
                id_to_label[l] for l in label if l != -100
            ]  # Convert ground truth IDs to labels
            pred_labels = [
                id_to_label.get(p, "O") for p, l in zip(pred, label) if l != -100
            ]  # Convert predictions to labels, defaulting to "O" for unmapped IDs

            all_predictions.extend(pred_labels)
            all_references.extend(true_labels)

# Calculate metrics
metrics = compute_metrics(all_predictions, all_references)
print("Metrics:", metrics)

Metrics: {'accuracy': 0.6435177673211099, 'precision': 0.8197830856231896, 'recall': 0.6435177673211099, 'f1': 0.7183113567528646}


* deal with the redaction issue
  * 'biolabels' column
  * find the unique labels
  * add them to the label map

* don't need to finetune deBERTa anymore
  * just calculate its accuracy

* can also finetune a not-finetuned deBERTa model if you have time