In [52]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from names_dataset import NameDataset

### Pre-Note
Token labels are presented in BIO (Beginning, Inner, Outer) format. The PII type is prefixed with “B-” when it is the beginning of an entity. If the token is a continuation of an entity, it is prefixed with “I-”. Tokens that are not PII are labeled “O”.

**Model idea** Lets instead of using the predefined token space, we define a new one without BIO definition, but where we concat tokens that are part of the same entity. We can then afterwards use SpaCy tokenizer to reverse the changes we made and get the labels.

In [53]:
file_path = "../data/train.json"

with open(file_path, "r") as file:
    data = json.load(file)

df = pd.DataFrame(data)

In [54]:
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [55]:
labels = df.labels.explode().reset_index(drop=True)
labels_vc = labels.value_counts()
labels_vc

labels
O                   4989794
B-NAME_STUDENT         1365
I-NAME_STUDENT         1096
B-URL_PERSONAL          110
B-ID_NUM                 78
B-EMAIL                  39
I-STREET_ADDRESS         20
I-PHONE_NUM              15
B-USERNAME                6
B-PHONE_NUM               6
B-STREET_ADDRESS          2
I-URL_PERSONAL            1
I-ID_NUM                  1
Name: count, dtype: int64

In [106]:
# Create new tokens by concatinating items in the tokens list until a 0 label is found
# we do this for each row
new_labels_dict = {
    'B-URL_PERSONAL': 'URL_PERSONAL',
    'I-URL_PERSONAL': 'URL_PERSONAL',
    'B-EMAIL': 'EMAIL',
    'I-EMAIL': 'EMAIL',
    'B-ID_NUM': 'ID_NUM',
    'I-ID_NUM': 'ID_NUM',
    'I-STREET_ADDRESS': 'STREET_ADDRESS',
    'B-STREET_ADDRESS': 'STREET_ADDRESS',
    'I-PHONE_NUM': 'PHONE_NUM',
    'B-PHONE_NUM': 'PHONE_NUM',
    'B-USERNAME': 'USERNAME',
    'I-USERNAME': 'USERNAME',
    'B-PHONE_NUM': 'PHONE_NUM',
    'I-PHONE_NUM': 'PHONE_NUM',
    'I-URL_PERSONAL': 'URL_PERSONAL',
    'B-URL_PERSONAL': 'URL_PERSONAL'
}

def create_new_tokens(row):
    tokens = row["tokens"]
    labels = row["labels"]
    new_tokens = []
    new_labels = []
    building_token, building_token_label = "", None
    for token, label in zip(tokens, labels):
        # If the token is in the new_labels_dict, we start building the new token until we find a 0 label
        if label in new_labels_dict:
            building_token += token
            building_token_label = new_labels_dict[label]
        # Else the new token is not PII class so we append the building token and the token to the new tokens list
        else:
            if building_token:
                new_tokens.append(building_token)
                new_labels.append(building_token_label)
            
            new_tokens.append(token)
            new_labels.append(label)

            building_token = ""

    return new_tokens, new_labels

In [108]:
df[["new_tokens", "new_labels"]] = df.apply(create_new_tokens, axis=1, result_type="expand")

In [109]:
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,new_tokens,new_labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[Diego, Estrada, \n\n, Design, Thinking, Assig...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...","[Reporting, process, \n\n, by, Gilberto, Gambo...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...","[Design, Thinking, for, Innovation, \n\n, Sind...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...","[Assignment, :, , Visualization, , Reflecti...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [110]:
new_tokens_exploded = df.new_tokens.explode().reset_index(drop=True)
new_labels_exploded = df.new_labels.explode().reset_index(drop=True)
new_labels_exploded.value_counts()

new_labels
O                 4989794
B-NAME_STUDENT       1365
I-NAME_STUDENT       1096
URL_PERSONAL          110
ID_NUM                 78
EMAIL                  39
USERNAME                6
PHONE_NUM               6
STREET_ADDRESS          2
Name: count, dtype: int64

In [111]:
ids = new_labels_exploded[new_labels_exploded == 'ID_NUM'].index
new_tokens_exploded[ids]

24543      860632713425
24556      530670102508
24567      530670102508
24573      875673967537
24579      860632713425
               ...     
1842333       047378465
2840555         IV-8322
2841154         IV-8322
3889578    Z.S.30407059
4423790          V69230
Name: new_tokens, Length: 78, dtype: object

### ID_NUM

In [112]:
idx = new_labels_exploded[new_labels_exploded == "ID_NUM"].index
b_ids = new_tokens_exploded[idx]
b_ids.shape

(78,)

In [113]:
reg = r'\d{6,12}\b|\b[a-zA-Z]+\d{2,}'
specific_id = b_ids.str.contains(reg)
ids_in_tokens = new_tokens_exploded.str.contains(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  73
Regex finding Ids in all tokens:  248

Accuracy Regex finding Ids in label specific:  0.9358974358974359
Error of missclassification in whole 4.967455156699174e-05


### EMAIL

In [114]:
idx = new_labels_exploded[new_labels_exploded == "EMAIL"].index
b_email = new_tokens_exploded[idx]
b_email.shape

(39,)

In [115]:
reg = r'[\w\.-]+@[\w\.-]+'
specific_id = b_email.str.match(reg)
ids_in_tokens = new_tokens_exploded.str.match(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  39
Regex finding Ids in all tokens:  49

Accuracy Regex finding Ids in label specific:  1.0
Error of missclassification in whole 9.814729946704014e-06


### URL_PERSONAL

In [116]:
idx = new_labels_exploded[new_labels_exploded == "URL_PERSONAL"].index
b_url = new_tokens_exploded[idx]
b_url.shape

(110,)

In [117]:
b_url

15979       https://www.jackson.com/list/explorehomepage.htm
20981                  https://www.linkedin.com/in/mmartinez
23325                           https://youtu.be/rFD2lJuvace
39858               https://www.hall.biz/wp-contenthome.html
55536      http://www.burns-lopez.com/categories/appabout...
                                 ...                        
2060771               https://hernandez.com/exploremain.html
2060773            https://www.roman.info/list/appindex.html
2060965               https://hernandez.com/exploremain.html
2060967            https://www.roman.info/list/appindex.html
2139327            https://alvarado.com/categoriesindex.html
Name: new_tokens, Length: 110, dtype: object

In [118]:
reg = r'https?://\S+|www\.\S+'
specific_id = b_url.str.match(reg)
ids_in_tokens = new_tokens_exploded.str.match(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  109
Regex finding Ids in all tokens:  327

Accuracy Regex finding Ids in label specific:  0.990909090909091
Error of missclassification in whole 6.54982998484125e-05


### PHONE_NUM

In [124]:
idx = new_labels_exploded[new_labels_exploded == "PHONE_NUM"].index
b_phone = new_tokens_exploded[idx]
b_phone.shape

(6,)

In [125]:
reg = r'\(?(\d{3})\)?[-. ]?(\d{3})[-. ]?(\d{4})(x\d{2,5})?'
specific_id = b_phone.str.match(reg)

print('Regex finding Ids in label specific: ', specific_id.sum())
print('Regex finding Ids in all tokens: ', ids_in_tokens.sum())
print()
print('Accuracy Regex finding Ids in label specific: ', specific_id.sum() / len(specific_id))
# We calculate the percentage of ids we would have classified wrongly in the total amount of tokens
print('Error of missclassification in whole', ids_in_tokens.sum() / len(ids_in_tokens))

Regex finding Ids in label specific:  6
Regex finding Ids in all tokens:  327

Accuracy Regex finding Ids in label specific:  1.0
Error of missclassification in whole 6.54982998484125e-05


## Model Definition
We will attempt to create the simplest model possible. The model will evaluate each token and pass it through a couple functions to evaluate it. If one of the functions returns true, the token will be considered a keyword. The model will then return the keyword and the index of the token in the input string.

### Model Functions
- **email** This function will use regex to identify if the token is an email address.
- **phone_num** This function will use regex to identify if the token is a phone number.
- **address** This function will use regex to identify if the token is an address.
- **username** This function will use a word embedding model to identify if the token is a common word and if not then we flag it as a username.
- **personal_id** This function will use regex to identify if the token is a personal id.


In [17]:
class PiiModel:
    def __init__(self, train_set: pd.DataFrame) -> None:
        self.train_set = train_set
        pass

    def get_name_dataset(self):
        return NameDataset()
    
    def get_email_regex(self):
        return r'[\w\.-]+@[\w\.-]+'
    
    def get_phone_number_regex(self):
        return r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'
    
    def get_address_regex(self):
        return r'\d{1,5}\s\w.\s(\b\w*\b\s){1,2}\w*\.'
    
    def get_personal_id_regex(self):
        # Any 
        return r'\d{6,12}\b|\b[a-zA-Z]+\d{2,}'
    
    def get_url_personal_regex(self):
        return r'https?://[\w\.-]+'

