In [268]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from names_dataset import NameDataset
from tqdm import tqdm

### Pre-Note
Token labels are presented in BIO (Beginning, Inner, Outer) format. The PII type is prefixed with “B-” when it is the beginning of an entity. If the token is a continuation of an entity, it is prefixed with “I-”. Tokens that are not PII are labeled “O”.

**Model idea** Lets instead of using the predefined token space, we define a new one without BIO definition, but where we concat tokens that are part of the same entity. We can then afterwards use SpaCy tokenizer to reverse the changes we made and get the labels.

In [269]:
file_path = "../data/train.json"

with open(file_path, "r") as file:
    data = json.load(file)

df = pd.DataFrame(data)

In [270]:
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [271]:
labels = df.labels.explode().reset_index(drop=True)
labels_vc = labels.value_counts()
labels_vc

labels
O                   4989794
B-NAME_STUDENT         1365
I-NAME_STUDENT         1096
B-URL_PERSONAL          110
B-ID_NUM                 78
B-EMAIL                  39
I-STREET_ADDRESS         20
I-PHONE_NUM              15
B-USERNAME                6
B-PHONE_NUM               6
B-STREET_ADDRESS          2
I-URL_PERSONAL            1
I-ID_NUM                  1
Name: count, dtype: int64

In [272]:
# Create new tokens by concatinating items in the tokens list until a 0 label is found
# we do this for each row
new_labels_dict = {
    'O': 'O',
    'B-URL_PERSONAL': 'URL_PERSONAL',
    'I-URL_PERSONAL': 'URL_PERSONAL',
    'B-EMAIL': 'EMAIL',
    'I-EMAIL': 'EMAIL',
    'B-ID_NUM': 'ID_NUM',
    'I-ID_NUM': 'ID_NUM',
    'I-STREET_ADDRESS': 'STREET_ADDRESS',
    'B-STREET_ADDRESS': 'STREET_ADDRESS',
    'I-PHONE_NUM': 'PHONE_NUM',
    'B-PHONE_NUM': 'PHONE_NUM',
    'B-USERNAME': 'USERNAME',
    'I-USERNAME': 'USERNAME',
    'B-PHONE_NUM': 'PHONE_NUM',
    'I-PHONE_NUM': 'PHONE_NUM',
    'I-URL_PERSONAL': 'URL_PERSONAL',
    'B-URL_PERSONAL': 'URL_PERSONAL',
    'B-NAME_STUDENT': 'NAME_STUDENT',
    'I-NAME_STUDENT': 'NAME_STUDENT',
}

def create_new_tokens(row):
    tokens = row["tokens"]
    labels = row["labels"]
    new_tokens = []
    new_labels = []
    building_token, building_token_label = "", None
    for token, label in zip(tokens, labels):
        # If the token is in the new_labels_dict, we start building the new token until we find a 0 label
        if label in new_labels_dict and label != "O" and label != "I-NAME_STUDENT" and label != "B-NAME_STUDENT":
            building_token += token
            building_token_label = new_labels_dict[label]
        # Else the new token is not PII class so we append the building token and the token to the new tokens list
        else:
            if building_token:
                new_tokens.append(building_token)
                new_labels.append(building_token_label)
            
            new_tokens.append(token)
            new_labels.append(label)

            building_token = ""

    return new_tokens, new_labels

In [273]:
df[["new_tokens", "new_labels"]] = df.apply(create_new_tokens, axis=1, result_type="expand")

In [274]:
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,new_tokens,new_labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[Diego, Estrada, \n\n, Design, Thinking, Assig...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...","[Reporting, process, \n\n, by, Gilberto, Gambo...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...","[Design, Thinking, for, Innovation, \n\n, Sind...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...","[Assignment, :, , Visualization, , Reflecti...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [275]:
new_tokens_exploded = df.new_tokens.explode().reset_index(drop=True)
new_labels_exploded = df.new_labels.explode().reset_index(drop=True)
new_labels_exploded.value_counts()

new_labels
O                 4989794
B-NAME_STUDENT       1365
I-NAME_STUDENT       1096
URL_PERSONAL          110
ID_NUM                 78
EMAIL                  39
USERNAME                6
PHONE_NUM               6
STREET_ADDRESS          2
Name: count, dtype: int64

In [276]:
ids = new_labels_exploded[new_labels_exploded == 'ID_NUM'].index
new_tokens_exploded[ids]

24543      860632713425
24556      530670102508
24567      530670102508
24573      875673967537
24579      860632713425
               ...     
1842333       047378465
2840555         IV-8322
2841154         IV-8322
3889578    Z.S.30407059
4423790          V69230
Name: new_tokens, Length: 78, dtype: object

### ID_NUM

In [277]:
idx = new_labels_exploded[new_labels_exploded == "ID_NUM"].index
b_ids = new_tokens_exploded[idx]
b_ids.shape

(78,)

In [278]:
reg = r'\d{6,12}\b|\b[a-zA-Z]+\d{2,}'
specific_id = b_ids.str.contains(reg)
ids_in_tokens = new_tokens_exploded.str.contains(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  73
Regex finding Ids in all tokens:  248

Accuracy Regex finding Ids in label specific:  0.9358974358974359
Error of missclassification in whole 4.967455156699174e-05


### EMAIL

In [279]:
idx = new_labels_exploded[new_labels_exploded == "EMAIL"].index
b_email = new_tokens_exploded[idx]
b_email.shape

(39,)

In [280]:
reg = r'[\w\.-]+@[\w\.-]+'
specific_id = b_email.str.match(reg)
ids_in_tokens = new_tokens_exploded.str.match(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  39
Regex finding Ids in all tokens:  49

Accuracy Regex finding Ids in label specific:  1.0
Error of missclassification in whole 9.814729946704014e-06


### URL_PERSONAL

In [281]:
idx = new_labels_exploded[new_labels_exploded == "URL_PERSONAL"].index
b_url = new_tokens_exploded[idx]
b_url.shape

(110,)

In [282]:
reg = r'https?://\S+|www\.\S+'
specific_id = b_url.str.match(reg)
ids_in_tokens = new_tokens_exploded.str.match(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  109
Regex finding Ids in all tokens:  327

Accuracy Regex finding Ids in label specific:  0.990909090909091
Error of missclassification in whole 6.54982998484125e-05


### PHONE_NUM

In [283]:
idx = new_labels_exploded[new_labels_exploded == "PHONE_NUM"].index
b_phone = new_tokens_exploded[idx]
b_phone.shape

(6,)

In [284]:
reg = r'\(?(\d{3})\)?[-. ]?(\d{3})[-. ]?(\d{4})(x\d{2,5})?'
specific_id = b_phone.str.match(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  6
Regex finding Ids in all tokens:  327

Accuracy Regex finding Ids in label specific:  1.0
Error of missclassification in whole 6.54982998484125e-05


## Model Definition
We will attempt to create the simplest model possible. The model will evaluate each token and pass it through a couple functions to evaluate it. If one of the functions returns true, the token will be considered a keyword. The model will then return the keyword and the index of the token in the input string.

### Model Functions
- **email** This function will use regex to identify if the token is an email address.
- **phone_num** This function will use regex to identify if the token is a phone number.
- **address** This function will use regex to identify if the token is an address.
- **username** This function will use a word embedding model to identify if the token is a common word and if not then we flag it as a username.
- **personal_id** This function will use regex to identify if the token is a personal id.


In [285]:
names = NameDataset()

In [334]:
print(names.first_names.get("It"))

None


**To do**
Maybe we could make some more features for capitalised words etc.

In [335]:
import re

def tokenizer(text):
    # Remove excessive line breaks or spaces
    text = re.sub('(\r\n){2,}', ' ', text)
    
    # Convert the text to lowercase and split into tokens
    tokens = text.lower().split()
    
    return tokens

class PiiModel:
    def __init__(self, text: str = None, names_dataset = None, tokens: list = None) -> None:
        self.text = text
        self.names = names_dataset
        self.tokens = tokens
        self.tokenizer = tokenizer
        self.name_rank_threhsold = 120
        self.country_threshold = 0.41
        pass
    
    def get_firstname(self, name: str) -> bool:
        found_name = self.names.first_names.get(name)
        # we check if the max country is higher than a threshold then we say it is a valid name
        if found_name != None and max(found_name["country"].values()) > self.country_threshold:
            return True
        else:
            return False
    
    def get_lastname(self, lastname: str) -> bool:
        found_name = self.names.last_names.get(lastname)
        # we check if the max country is higher than a threshold then we say it is a valid name
        if found_name != None and max(found_name["country"].values()) > self.country_threshold:
            return True
        else:
            return False
    
    def get_email_regex(self):
        return r'[\w\.-]+@[\w\.-]+'
    
    def get_phone_number_regex(self):
        return r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'
    
    def get_address_regex(self):
        return r'\d{1,5}\s\w.\s(\b\w*\b\s){1,2}\w*\.'
    
    def get_personal_id_regex(self):
        # Any 
        return r'\d{6,12}\b|\b[a-zA-Z]+\d{2,}'
    
    def get_url_personal_regex(self):
        return r'https?://[\w\.-]+'

    def detect_pii(self, text: str = "", tokens: list = None) -> list[tuple[int, str]]:
        token_indices = self.tokens
        if not self.tokens and text != "":
            token_indices = self.tokenizer(text)
            self.tokens = token_indices
        if tokens:
            token_indices = tokens
        pii_indexes = []
        for i in range(len(token_indices)):
            if re.search(self.get_email_regex(), token_indices[i]):
                pii_indexes.append((token_indices[i], 'EMAIL'))
            elif re.search(self.get_phone_number_regex(), token_indices[i]):
                pii_indexes.append((token_indices[i], 'PHONE_NUM'))
            elif re.search(self.get_address_regex(), token_indices[i]):
                pii_indexes.append((token_indices[i], 'STREET_ADDRESS'))
            elif re.search(self.get_personal_id_regex(), token_indices[i]):
                pii_indexes.append((token_indices[i], 'ID_NUM'))
            elif re.search(self.get_url_personal_regex(), token_indices[i]):
                pii_indexes.append((token_indices[i], 'URL_PERSONAL'))
            elif self.get_firstname(token_indices[i]):
                pii_indexes.append((token_indices[i], 'B-NAME_STUDENT'))
            elif self.get_lastname(token_indices[i]):
                pii_indexes.append((token_indices[i], 'I-NAME_STUDENT'))
            else:
                pii_indexes.append((token_indices[i], 'O'))

        return pii_indexes


In [336]:
tokens = df.iloc[0].new_tokens
model = PiiModel(tokens=tokens, names_dataset=names)

In [337]:
pii_indexes = model.detect_pii(tokens=tokens)
print(pii_indexes)

[('Design', 'O'), ('Thinking', 'O'), ('for', 'O'), ('innovation', 'O'), ('reflexion', 'O'), ('-', 'O'), ('Avril', 'I-NAME_STUDENT'), ('2021', 'O'), ('-', 'O'), ('Nathalie', 'B-NAME_STUDENT'), ('Sylla', 'B-NAME_STUDENT'), ('\n\n', 'O'), ('Challenge', 'O'), ('&', 'O'), ('selection', 'O'), ('\n\n', 'O'), ('The', 'O'), ('tool', 'O'), ('I', 'O'), ('use', 'O'), ('to', 'O'), ('help', 'O'), ('all', 'O'), ('stakeholders', 'O'), ('finding', 'O'), ('their', 'O'), ('way', 'O'), ('through', 'O'), ('the', 'O'), ('complexity', 'O'), ('of', 'O'), ('a', 'O'), ('project', 'O'), ('is', 'O'), ('the', 'O'), (' ', 'O'), ('mind', 'O'), ('map', 'O'), ('.', 'O'), ('\n\n', 'O'), ('What', 'O'), ('exactly', 'O'), ('is', 'O'), ('a', 'O'), ('mind', 'O'), ('map', 'O'), ('?', 'O'), ('According', 'B-NAME_STUDENT'), ('to', 'O'), ('the', 'O'), ('definition', 'O'), ('of', 'O'), ('Buzan', 'O'), ('T.', 'O'), ('and', 'O'), ('Buzan', 'O'), ('B.', 'O'), ('(', 'O'), ('1999', 'O'), (',', 'O'), ('Dessine', 'O'), ('-', 'O'), ('mo

## Evaluate
We will now attempt to evaluate the model against the true labels.

In [340]:
tp = 0
fp = 0
failed = []
correct = []

for row in tqdm(df.iterrows()):
    # Make predictions using the model
    # Replace `model.predict()` with the actual prediction code
    predictions = model.detect_pii(tokens=row[1]["new_tokens"])
    
    # Compare the predictions with the ground truth labels
    for pred, label in zip(predictions, row[1]["new_labels"]):
        if pred[0] == "O":
            continue
        if pred[1] == label:
            tp += 1
            correct.append((pred[0], pred[1], label))
        else:
            fp += 1
            failed.append((pred[0], pred[1], label))

# Print the evaluation results
print("True Positives:", tp)
print("False Positives:", fp)
print("Precision:", tp / (tp + fp))
    


6807it [00:15, 448.78it/s]

True Positives: 4924168
False Positives: 68311
Precision: 0.9863172183598569



