In [1]:
import pandas as pd
import re
import spacy
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [3]:
# Load the labelled e-mail dataset (e-mail bodies with embedded PII plus a
# ticket "type" label) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="combined_emails_with_natural_pii.csv")
df.head(n=5)


Unnamed: 0,email,type
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request
2,Subject: Data Analytics for Investment\n\nI am...,Request
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident
4,"Subject: Security\n\nDear Customer Support, I ...",Request


In [4]:
# Regular expressions used to detect PII, keyed by the entity label that is
# reported for (and substituted in place of) each match.
#
# Known looseness of these heuristics:
#   * full_name   - any run of two or more Capitalised words, so ordinary
#                   capitalised phrases (e.g. German nouns, subject lines)
#                   are flagged as names too.
#   * cvv_no      - any standalone 3-digit number.
#   * expiry_no   - the slash is optional, so standalone 4-6 digit numbers
#                   beginning with 01-12 also match.
#   * aadhar_num  - also matches the first 12 digits of a space-separated
#                   16-digit card number.
pii_patterns = {
    "full_name": r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)\b",
    # TLD class is [A-Za-z]; a "|" inside a character class is a literal pipe,
    # not alternation, so it must not appear here.
    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    "phone_number": r"\b\d{10}\b",
    "dob": r"\b\d{2}[/-]\d{2}[/-]\d{4}\b",
    "aadhar_num": r"\b\d{4} \d{4} \d{4}\b",
    "credit_debit_no": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b",
    "cvv_no": r"\b\d{3}\b",
    "expiry_no": r"\b(0[1-9]|1[0-2])/?[0-9]{2,4}\b"
}


In [5]:
def mask_pii(text, patterns=None):
    """Replace PII found in ``text`` with ``[label]`` placeholders.

    Every pattern is searched against the original ``text``. Masking is done
    by character span, so only the matched occurrence is replaced; an
    identical substring elsewhere (for example the same digits inside a
    longer number) is left alone. When matches from different patterns
    overlap, the one that starts first is kept; ties go to the longest match
    and then to the pattern listed first. Overlapped matches are dropped and
    are not reported.

    Parameters
    ----------
    text : str
        Raw text to scan.
    patterns : dict[str, str] | None, optional
        Mapping of entity label to regular expression. Defaults to the
        module-level ``pii_patterns``.

    Returns
    -------
    tuple[str, list[dict]]
        The masked text, and the masked entities ordered by position. Each
        entity has ``"position"`` (``[start, end]`` offsets into the original
        ``text``), ``"classification"`` (the label) and ``"entity"`` (the
        matched string).
    """
    if patterns is None:
        patterns = pii_patterns

    # Collect every match of every pattern against the untouched input so the
    # reported offsets always refer to the original text.
    candidates = []
    for rank, (label, pattern) in enumerate(patterns.items()):
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if start == end:
                continue  # an empty match carries nothing to mask
            candidates.append((start, end, rank, label, match.group()))

    # Leftmost first, then longest, then declaration order of the patterns.
    candidates.sort(key=lambda c: (c[0], c[0] - c[1], c[2]))

    entities = []
    pieces = []
    cursor = 0
    for start, end, _rank, label, value in candidates:
        if start < cursor:
            continue  # overlaps a span that has already been masked
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        entities.append({
            "position": [start, end],
            "classification": label,
            "entity": value
        })
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), entities


In [6]:
# Run the PII masker over every e-mail body once, then split the
# (masked_text, entities) pairs into two parallel lists.
masking_results = [mask_pii(body) for body in df["email"]]
masked_emails = [masked_body for masked_body, _ in masking_results]
entities_list = [found for _, found in masking_results]

# Store the masked text and the detected entities alongside the raw e-mails.
df["masked_email"] = masked_emails
df["masked_entities"] = entities_list


In [7]:
# Hold out 20% of the masked e-mails for evaluation; the fixed seed keeps the
# split reproducible across runs.
features = df["masked_email"]
labels = df["type"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Text classifier: TF-IDF features feeding a multinomial Naive Bayes model.
# The pipeline is built and fitted exactly once; fitting is deterministic, so
# repeating the construction and fit would only cost extra training time.
clf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

clf_pipeline.fit(X_train, y_train)


In [9]:
# Score the held-out e-mails and print per-class precision / recall / F1.
y_pred = clf_pipeline.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


              precision    recall  f1-score   support

      Change       0.98      0.09      0.16       479
    Incident       0.61      0.99      0.75      1920
     Problem       0.32      0.01      0.02      1009
     Request       0.78      0.91      0.84      1392

    accuracy                           0.67      4800
   macro avg       0.67      0.50      0.45      4800
weighted avg       0.64      0.67      0.57      4800



In [10]:
def classify_email(email_body):
    """Mask PII in an e-mail body and predict its category.

    The body is first passed through ``mask_pii``; the masked text is then
    classified with the fitted ``clf_pipeline``.

    Parameters
    ----------
    email_body : str
        Raw e-mail text.

    Returns
    -------
    dict
        ``input_email_body`` (the raw input), ``list_of_masked_entities``
        (entities reported by ``mask_pii``), ``masked_email`` (text with PII
        replaced) and ``category_of_the_email`` (the predicted label).
    """
    redacted_body, found_entities = mask_pii(email_body)

    # predict() expects an iterable of documents; wrap the single e-mail and
    # take the only prediction back out.
    predictions = clf_pipeline.predict([redacted_body])
    predicted_label = predictions[0]

    response = {}
    response["input_email_body"] = email_body
    response["list_of_masked_entities"] = found_entities
    response["masked_email"] = redacted_body
    response["category_of_the_email"] = predicted_label
    return response


In [11]:
# Smoke-test the end-to-end flow on the first e-mail in the dataset and
# pretty-print the resulting payload as JSON.
sample_email = df["email"].iat[0]
response = classify_email(sample_email)

response_json = json.dumps(response, indent=2)
print(response_json)


{
  "input_email_body": "Subject: Unvorhergesehener Absturz der Datenanalyse-Plattform\n\nDie Datenanalyse-Plattform brach unerwartet ab, da die Speicheroberfl\u00e4che zu gering war My name is Sophia Rossi.. Ich habe versucht, Laravel 8 und meinen MacBook Pro neu zu starten, aber das Problem beh\u00e4lt sich bei. Ich ben\u00f6tige Ihre Unterst\u00fctzung, um diesen Fehler zu beheben. You can reach me at janesmith@company.com.",
  "list_of_masked_entities": [
    {
      "position": [
        9,
        34
      ],
      "classification": "full_name",
      "entity": "Unvorhergesehener Absturz"
    },
    {
      "position": [
        63,
        79
      ],
      "classification": "full_name",
      "entity": "Die Datenanalyse"
    },
    {
      "position": [
        162,
        174
      ],
      "classification": "full_name",
      "entity": "Sophia Rossi"
    },
    {
      "position": [
        361,
        382
      ],
      "classification": "email",
      "entity": "janesmith