<a href="https://colab.research.google.com/github/AdityaBhatt3010/PII_Tryy/blob/main/PII_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hugging Face Hub checkpoint used for both the tokenizer and the
# token-classification model.
model_name = "iiiorg/piiranha-v1-detect-personal-information"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def _collect_entity_spans(offset_mapping, labels):
    """Merge consecutive tokens that share a non-"O" label into spans.

    Args:
        offset_mapping: Per-token ``(start, end)`` character offsets.
        labels: Per-token label strings, aligned with ``offset_mapping``.

    Returns:
        A list of ``(start, end, label)`` tuples in order of appearance.
    """
    spans = []
    current_label = None
    span_start = span_end = 0

    for (start, end), label in zip(offset_mapping, labels):
        if start == end:
            continue  # Zero-width offsets (special tokens) carry no text.

        if label == current_label:
            # Same entity continues: only the right edge moves.
            span_end = end
            continue

        if current_label is not None:
            spans.append((span_start, span_end, current_label))

        if label == "O":
            current_label = None
        else:
            current_label, span_start, span_end = label, start, end

    if current_label is not None:
        spans.append((span_start, span_end, current_label))
    return spans


def _render_annotations(text, spans, trim_whitespace):
    """Return ``text`` with every span rewritten as ``[value | label]``."""
    pieces = []
    cursor = 0
    for start, end, label in spans:
        # Never step backwards, even if a tokenizer reports overlapping offsets.
        start = max(start, cursor)
        if trim_whitespace:
            while start < end and text[start].isspace():
                start += 1
            while end > start and text[end - 1].isspace():
                end -= 1
        if start >= end:
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{text[start:end]} | {label}]")
        cursor = end

    pieces.append(text[cursor:])
    return "".join(pieces)


def annotate_pii(text: str, *, trim_whitespace: bool = False) -> str:
    """Return ``text`` with each predicted PII span wrapped as ``[value | label]``.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Entity values are sliced straight from ``text`` using the first and last
    token offsets of each span, so the bracketed value always matches the
    source text exactly, even when token offsets are not contiguous.

    Args:
        text: The raw input string to scan.
        trim_whitespace: When true, leading/trailing whitespace of a span is
            left outside the brackets instead of inside them. Defaults to
            ``False``.

    Returns:
        The annotated string. Because the tokenizer is called with
        ``truncation=True``, any text past the truncation point is not
        scored and is copied through without annotations.
    """
    # Tokenize with character offsets; the offsets are not a model input,
    # so pop them before moving the remaining tensors to the device.
    encoding = tokenizer(text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
    offset_mapping = encoding.pop("offset_mapping")[0].tolist()
    model_inputs = {k: v.to(device) for k, v in encoding.items()}

    # Highest-scoring label id per token, for the single sequence in the batch.
    with torch.no_grad():
        logits = model(**model_inputs).logits
    label_ids = torch.argmax(logits, dim=-1)[0].tolist()

    id2label = model.config.id2label
    labels = [id2label[label_id] for label_id in label_ids]

    spans = _collect_entity_spans(offset_mapping, labels)
    return _render_annotations(text, spans, trim_whitespace)


In [10]:
# Demo: run the annotator on a paragraph that mixes several kinds of PII.
example_text = (
    "Yesterday, Aditya scheduled a team meeting at 9:00 AM to discuss the quarterly marketing strategies. "
    "She sent out invitations from her email emily.watson93@example.com and confirmed attendance over the phone at (415) 992-5587. "
    "The meeting was held at their San Francisco office, located at 220 Market Street, Suite 804, San Francisco, CA 94105. "
    "During the session, Daniel Lee mentioned integrating analytics from the new customer database, which is currently stored under Project Orion. "
    "Later in the day, Emily's package was delivered and signed for using her ID number A1234567, as registered under the California DMV. "
    "Meanwhile, the finance department cross-verified employee records using their SSNs, like 521-47-8912, and banking details such as Bank of America Account No. 004567891234. "
    "The internal Slack message also included a link to a private GitHub repo and a backup contact email: dlee.corp@protonmail.com."
)

# Header first, then the annotated paragraph.
print("Annotated PII Output:\n")
annotated_example = annotate_pii(example_text)
print(annotated_example)

Annotated PII Output:

Yesterday,[ Aditya | I-GIVENNAME] scheduled a team meeting at 9:00 AM to discuss the quarterly marketing strategies. She sent out invitations from her email[ emily.watson93@ | I-EMAIL]example.com and confirmed attendance over the phone at[ (415) 992-5587 | I-TELEPHONENUM]. The meeting was held at their[ San Francisco | I-CITY] office, located at[ 220 | I-BUILDINGNUM][ Market Street | I-STREET], Suite 804,[ San Francisco | I-CITY], CA[ 94105. | I-ZIPCODE] During the session, Daniel Lee mentioned integrating analytics from the new customer database, which is currently stored under Project Orion. Later in the day, Emily's package was delivered and signed for using her ID number[ A1234567 | I-IDCARDNUM], as registered under the California DMV. Meanwhile, the finance department cross-verified employee records using their SSNs, like[ 521-47-8912 | I-SOCIALNUM], and banking details such as Bank of America Account No.[ 004567891234 | I-ACCOUNTNUM]. The internal Slack mes