In [1]:
# Import necessary packages
import torch
import pandas as pd

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="lakshyakh93/deberta_finetuned_pii")




In [3]:
result = pipe('John Doe lives at 123 Elm St.')
print(result)

[{'entity': 'I-FULLNAME', 'score': 0.46424457, 'index': 2, 'word': 'ĠDoe', 'start': 4, 'end': 8}, {'entity': 'B-STREETADDRESS', 'score': 0.88947034, 'index': 5, 'word': 'Ġ123', 'start': 17, 'end': 21}, {'entity': 'I-STREETADDRESS', 'score': 0.9810363, 'index': 6, 'word': 'ĠElm', 'start': 21, 'end': 25}, {'entity': 'I-STREETADDRESS', 'score': 0.97596735, 'index': 7, 'word': 'ĠSt', 'start': 25, 'end': 28}]


In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

In [5]:
inputs = tokenizer('John Doe lives at 123 Elm St.', return_tensors='pt')
outputs = model(**inputs)
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[ 0.0527,  0.3155,  0.4790,  ..., -0.0416, -0.3986, -1.0505],
         [ 2.1610,  0.5228,  6.3309,  ...,  2.1777, -0.5386, -2.6604],
         [ 0.5061,  0.7269,  3.7811,  ...,  2.9378, -0.1434, -2.4877],
         ...,
         [ 0.5294, -0.1343, -0.6034,  ..., -0.4739, -0.3561, -1.2317],
         [ 0.8860,  0.8328, -0.1042,  ...,  1.0760, -1.8587, -4.4006],
         [-0.1141,  0.4109,  0.5679,  ..., -0.1445, -0.4688, -1.5732]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [6]:
import torch

In [7]:
# Example logits tensor from your output

# Convert logits to probabilities
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Get predicted class indices
classes = torch.argmax(probs, dim=-1)

# Map the class indices to labels
labels = [[model.config.id2label[idx.item()] for idx in sequence] for sequence in classes]
labels

[['O',
  'O',
  'I-FULLNAME',
  'O',
  'O',
  'B-STREETADDRESS',
  'I-STREETADDRESS',
  'I-STREETADDRESS',
  'O',
  'O']]

O - Outside

In [8]:
# Try the model with text in the pii-masking-200k dataset
texts = [
    'Legal notice from Anahi8 claiming breach of contract received today. Please review facts on case. Reply to this Miller.White@hotmail.com or contact our office at 003.765 1989 for an urgent meeting.',
    'Medical intervention and occupational therapy can help Accountability workers significantly. For appointments call us at +40-724-062 4764.',
    'Legal notice from Anahi8 claiming breach of contract received today. Please review facts on case. Reply to this Miller.White@hotmail.com or contact our office at 003.765 1989 for an urgent meeting.'
]

# Tokenize the input text
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to predicted class indices
predicted_classes = torch.argmax(logits, dim=-1)

# Map indices to labels
mapped_labels = [[model.config.id2label[idx.item()] for idx in sequence] for sequence in predicted_classes]

# Prepare results
results = []
for i, text in enumerate(texts):
    result = {
        'text': text,
        'tokens': tokenizer.tokenize(text),
        'predicted_labels': mapped_labels[i]
    }
    results.append(result)

# Save results to CSV
df = pd.DataFrame(results)
df

Unnamed: 0,text,tokens,predicted_labels
0,Legal notice from Anahi8 claiming breach of co...,"[ĠLegal, Ġnotice, Ġfrom, ĠAn, ahi, 8, Ġclaimin...","[O, O, O, O, B-USERNAME, B-USERNAME, I-USERNAM..."
1,Medical intervention and occupational therapy ...,"[ĠMedical, Ġintervention, Ġand, Ġoccupational,...","[O, O, O, O, O, O, O, O, B-JOBAREA, O, O, O, O..."
2,Legal notice from Anahi8 claiming breach of co...,"[ĠLegal, Ġnotice, Ġfrom, ĠAn, ahi, 8, Ġclaimin...","[O, O, O, O, B-USERNAME, B-USERNAME, I-USERNAM..."
