# Setup

## Imports

In [3]:
!pip install "presidio-analyzer[azure-ai-language]" --no-index --find-links=../input/pii-data-detection-dataset

Looking in links: ../input/pii-data-detection-dataset
Processing /kaggle/input/pii-data-detection-dataset/presidio_analyzer-2.2.353-py3-none-any.whl
Processing /kaggle/input/pii-data-detection-dataset/tldextract-5.1.1-py3-none-any.whl (from presidio-analyzer[azure-ai-language])
Processing /kaggle/input/pii-data-detection-dataset/phonenumbers-8.13.30-py2.py3-none-any.whl (from presidio-analyzer[azure-ai-language])
Processing /kaggle/input/pii-data-detection-dataset/azure_ai_textanalytics-5.3.0-py3-none-any.whl (from presidio-analyzer[azure-ai-language])
Processing /kaggle/input/pii-data-detection-dataset/azure_core-1.30.0-py3-none-any.whl (from presidio-analyzer[azure-ai-language])
Processing /kaggle/input/pii-data-detection-dataset/azure_common-1.1.28-py2.py3-none-any.whl (from azure-ai-textanalytics->presidio-analyzer[azure-ai-language])
Processing /kaggle/input/pii-data-detection-dataset/isodate-0.6.1-py2.py3-none-any.whl (from azure-ai-textanalytics->presidio-analyzer[azure-ai-langu

In [4]:
import pandas as pd
import json
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.predefined_recognizers import AzureAILanguageRecognizer
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from kaggle_secrets import UserSecretsClient

## Load data

In [5]:
# Load the competition's train and test documents.
# Context managers close each file handle as soon as its JSON has been parsed
# (a bare json.load(open(...)) leaves the handle open until garbage collection).
with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json') as train_file:
    train_data = json.load(train_file)
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json') as test_file:
    test_data = json.load(test_file)

In [6]:
# Transform into pandas dataframe
# (pd.json_normalize flattens the loaded JSON records into a table; train and test stay separate)
train_df = pd.json_normalize(train_data)
test_df = pd.json_normalize(test_data)

# Data exploration

## Dataset description

### PII Types
The competition asks competitors to assign labels to the following seven types of PII:

* **NAME_STUDENT** - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
* **EMAIL** - A student’s email address.
* **USERNAME** - A student's username on any platform.
* **ID_NUM** - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
* **PHONE_NUM** - A phone number associated with a student.
* **URL_PERSONAL** - A URL that might be used to identify a student.
* **STREET_ADDRESS** - A full or partial street address that is associated with the student, such as their home address.

### Field Information
* **(int)**: the index of the essay
* **document**: ID of the document
* **full_text**: text of the essay
* **tokens**: list of tokens to be labelled
* **trailing_whitespace**: a boolean value indicating whether each token is followed by whitespace
* **labels (list)**: one label per token in BIO format (B = beginning of an entity, I = inside an entity, i.e. a continuation of it, O = outside any entity) [training data only]

## Display data

In [16]:
# train_df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [25]:
# Display tokens and labels when PII are to be detected

# testing = pd.DataFrame(list(zip(train_df["tokens"][1], train_df["labels"][1])), columns=["tokens", "labels"])

# display(testing[testing["labels"] != "O"])

Unnamed: 0,tokens,labels
0,Diego,B-NAME_STUDENT
1,Estrada,I-NAME_STUDENT
464,Diego,B-NAME_STUDENT
465,Estrada,I-NAME_STUDENT


# Modeling

## Baseline model (Presidio)

### Helper functions

In [7]:
# Function to get token index and predicted label
def get_token_predictions(predictions: list, token_list: list, whitespace_list: list, label_list: list = None) -> dict:
    """Map character-level entity predictions onto token indexes.

    The function walks through the tokens while tracking the character offset
    each token starts at (token lengths plus one character per trailing
    whitespace), and compares that offset with each entity's ``start``/``end``.

    Args:
        predictions: objects exposing ``entity_type``, ``start`` and ``end``
            (character offsets into the analysed text).
        token_list: tokens of the document, in order.
        whitespace_list: for each token, whether it is followed by a whitespace.
        label_list: optional true labels, one per token. When omitted (or
            empty) the true-label slot of every entry is ``''``, which lets the
            function work on both train and test data.

    Returns:
        dict mapping token index -> ``[predicted label, true label, token,
        start offset]``. Tokens matching no entity are absent. When several
        entities match the same token, the last one in ``predictions`` wins.
    """
    # Model output entity type -> submission label.
    label_map = {
        "PERSON": "NAME_STUDENT",
        "PHONE_NUMBER": "PHONE_NUM",
        "URL": "URL_PERSONAL",
        "EMAIL_ADDRESS": "EMAIL",
    }

    tok_preds = {}  # dict of token indexes and labels
    offset = 0  # keep our position in the full_text by token

    for i, token in enumerate(token_list):
        # hardcoded fix for double spaces issue: "\n\n" tokens are never labelled,
        # but they still advance the offset below
        if token != "\n\n":
            for entity in predictions:
                label = label_map.get(entity.entity_type)
                if label is None:
                    # Unmapped entity types would otherwise yield a bare "B-"/"I-" label
                    continue

                if offset == entity.start:
                    prefix = "B-"  # token starts exactly where the entity starts
                elif entity.start < offset < entity.end:
                    prefix = "I-"  # token starts inside the entity span
                else:
                    continue

                # labels are optional so this works with both train and test datasets
                true_label = label_list[i] if label_list else ''
                tok_preds[i] = [prefix + label, true_label, token, offset]

        offset += len(token)  # increase by the length of the token
        if whitespace_list[i]:  # add whitespace if needed
            offset += 1

    return tok_preds

In [8]:
# Function to format submission
def get_submission(token_predictions: dict) -> pd.DataFrame:
    """Flatten per-document token predictions into one table.

    Args:
        token_predictions: dict mapping a document id to a dict of
            ``token index -> values``, where ``values[0]`` is the predicted
            label and ``values[1]`` the true label (may be ``''``).

    Returns:
        DataFrame with columns ``row_id`` (running counter starting at 0),
        ``document``, ``token``, ``label`` and ``true label``; one row per
        predicted token. An empty input yields an empty frame that still has
        these five columns.
    """
    columns = ['row_id', 'document', 'token', 'label', 'true label']

    rows = []
    for document, document_predictions in token_predictions.items():
        for token_index, values in document_predictions.items():
            # len(rows) is the running row id: it grows by one with every append
            rows.append([len(rows), document, token_index, values[0], values[1]])

    # Passing the columns to the constructor (instead of assigning them afterwards)
    # keeps this working when there are no predictions at all.
    return pd.DataFrame(rows, columns=columns)

### Apply model on formatted text

In [9]:
# Read the Azure endpoint and key from the notebook's secret store
# so that no credential is hardcoded in the notebook itself
user_secrets = UserSecretsClient()
azure_ai_endpoint = user_secrets.get_secret("AZURE_AI_ENDPOINT")
azure_ai_key = user_secrets.get_secret("AZURE_AI_KEY")

In [10]:
# Presidio recognizer for Azure AI Language, configured with the endpoint and key read from the secrets
azure_ai_language = AzureAILanguageRecognizer(azure_ai_key=azure_ai_key, azure_ai_endpoint=azure_ai_endpoint)

In [11]:
# Object needed to perform analysis
# The Azure recognizer is added to the engine's recognizer registry
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(azure_ai_language)

In [12]:
# Function to get predictions for each document
def get_total_predictions(data: pd.DataFrame, type: str = "test") -> dict:
    """Run the analyzer on every document and collect token-level predictions.

    Args:
        data: frame with ``document``, ``full_text``, ``tokens`` and
            ``trailing_whitespace`` columns (plus ``labels`` when
            ``type == "train"``).
        type: ``"train"`` to also pass the true labels to
            ``get_token_predictions``; any other value leaves them out.

    Returns:
        dict mapping ``str(document id)`` to the result of
        ``get_token_predictions`` for that document.
    """
    with_labels = type == "train"
    per_document = {}

    for row in data.index:  # loop over every document to get all predictions
        document_id = data["document"][row]
        # Each newline is swapped for a single space, so the text length is unchanged
        flat_text = data['full_text'][row].replace('\n', " ")
        doc_tokens = data['tokens'][row]
        doc_whitespace = data['trailing_whitespace'][row]
        # The labels column is only read when type == "train"
        optional_labels = (data['labels'][row],) if with_labels else ()

        detected = analyzer.analyze(
            text=flat_text,
            entities=["PHONE_NUMBER", "PERSON", "URL", "EMAIL_ADDRESS"],
            language='en',
        )

        per_document[str(document_id)] = get_token_predictions(
            detected, doc_tokens, doc_whitespace, *optional_labels
        )

    return per_document

In [13]:
# Run the baseline on the test documents (default type="test", so no true labels are passed along)
results = get_total_predictions(test_df)

In [41]:
# results.keys()

dict_keys(['7', '10', '16', '20', '56', '86', '93', '104', '112', '123'])

In [None]:
'''
# Print predictions
for entity in predictions:
    print(entity.entity_type, full_text[entity.start:entity.end], entity.start, entity.end)
'''

## Model evaluation

In [20]:
# sample_df = train_df.sample(n=500, random_state=42).sort_values("document", ignore_index=True)

### Make predictions on sample training data

In [21]:
# result_eval = get_total_predictions(sample_df, "train")

In [22]:
# eval_df = pd.DataFrame(get_submission(result_eval)).rename(columns={"label": "pred"})

In [23]:
# eval_df["document"] = eval_df["document"].astype(int)

In [24]:
# eval_df

Unnamed: 0,row_id,document,token,pred,true label
0,0,317,611,B-URL_PERSONAL,B-URL_PERSONAL
1,1,330,18,B-NAME_STUDENT,B-NAME_STUDENT
2,2,330,19,I-NAME_STUDENT,I-NAME_STUDENT
3,3,330,24,B-NAME_STUDENT,O
4,4,330,25,I-NAME_STUDENT,O
...,...,...,...,...,...
921,921,22435,701,B-NAME_STUDENT,O
922,922,22435,702,I-NAME_STUDENT,O
923,923,22456,307,B-NAME_STUDENT,O
924,924,22466,286,B-NAME_STUDENT,O


### True labels

In [25]:
# true_labels = sample_df[['document','tokens', 'labels']]

In [26]:
# true_labels.head()

Unnamed: 0,document,tokens,labels
0,317,"[Reflection, -, Storytelling, \n\n, Challenge,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,330,"[STORY, TELLING, AS, A, TOOL, FOR, KNOWLEDGE, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,375,"[Reflection, –, Mind, Mapping, –, Cesar, River...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
3,651,"[Wilson, Syafinaz, , Dec, 17, ,, 2020, \n\n...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
4,760,"[Reflection, –, The, story, of, African, river...","[O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAM..."


In [27]:
# labels_df = true_labels.apply(lambda x: x.explode()).reset_index(drop=True)

In [28]:
# labels_df['token_id'] = labels_df.groupby('document').cumcount()

In [29]:
# labels_df

Unnamed: 0,document,tokens,labels,token_id
0,317,Reflection,O,0
1,317,-,O,1
2,317,Storytelling,O,2
3,317,\n\n,O,3
4,317,Challenge,O,4
...,...,...,...,...
377968,22653,very,O,1183
377969,22653,effective,O,1184
377970,22653,way,O,1185
377971,22653,.,O,1186


### Merge both

In [30]:
# merged_df = pd.merge(labels_df, eval_df[['token', 'pred', 'document']], left_on=["document", "token_id"], right_on=["document", "token"], how='left').fillna({'pred': 'O'}).drop(["token", "document"], axis=1)

In [31]:
# merged_df

Unnamed: 0,tokens,labels,token_id,pred
0,Reflection,O,0,O
1,-,O,1,O
2,Storytelling,O,2,O
3,\n\n,O,3,O
4,Challenge,O,4,O
...,...,...,...,...
377968,very,O,1183,O
377969,effective,O,1184,O
377970,way,O,1185,O
377971,.,O,1186,O


### Metrics time

In [32]:
'''
# Compute rate
TP, FP, TN, FN = 0, 0, 0, 0
tp, fp, fn, missclass = [],[],[],[]

for index, row in merged_df.iterrows():
    if (merged_df["labels"][index] == 'O') & (merged_df["pred"][index] == 'O'):
        TN += 1
    if (merged_df["labels"][index] != 'O') & (merged_df["pred"][index] == 'O'):
        FN += 1
        fn.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])
    if (merged_df["labels"][index] == 'O') & (merged_df["pred"][index] != 'O'):
        FP += 1
        fp.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])
    if (merged_df["labels"][index] != 'O') & (merged_df["pred"][index] != 'O'):
        if merged_df["labels"][index] == merged_df["pred"][index]:
            TP += 1
            tp.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])
        else:
            FN, FP = FN + 1, FP + 1
            missclass.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])

print(TP, FP, TN, FN)
'''

175 751 377038 10


In [14]:
# F-beta score from raw confusion counts
def calculate_f_beta_score(TP, FP, FN, beta=5):
    """Compute the F-beta score from true positive, false positive and false negative counts.

    Precision is ``TP / (TP + FP)`` and recall is ``TP / (TP + FN)``; either is
    taken as 0 when its denominator is 0. The score is
    ``(1 + beta^2) * precision * recall / (beta^2 * precision + recall)``,
    or 0 when that denominator is 0.

    Args:
        TP: number of true positives.
        FP: number of false positives.
        FN: number of false negatives.
        beta: weight of recall relative to precision (default 5).

    Returns:
        The F-beta score, or 0 when it is undefined.
    """
    predicted_positives = TP + FP
    actual_positives = TP + FN

    precision = 0 if predicted_positives == 0 else TP / predicted_positives
    recall = 0 if actual_positives == 0 else TP / actual_positives

    beta_squared = beta ** 2
    denominator = (beta_squared * precision) + recall
    if denominator == 0:
        return 0

    return (1 + beta_squared) * (precision * recall) / denominator

In [34]:
# print(calculate_f_beta_score(TP, FP, FN))

0.819672131147541


## Error analysis

### Analyze errors for false negatives

In [25]:
#fn_df = pd.DataFrame(fn, columns=["tokens", "labels", "preds"])

In [26]:
#fn_counts = Counter(fn_df["labels"])

In [27]:
#fn_counts.most_common()

[('B-NAME_STUDENT', 7), ('I-NAME_STUDENT', 7), ('B-ID_NUM', 1)]

### Same for false positives

In [28]:
#fp_df = pd.DataFrame(fp, columns=["tokens", "labels", "preds"])

In [29]:
#fp_counts = Counter(fp_df["preds"])

In [30]:
#fp_counts.most_common()

[('B-NAME_STUDENT', 317),
 ('I-NAME_STUDENT', 217),
 ('I-URL_PERSONAL', 91),
 ('B-URL_PERSONAL', 73)]

### Confusion matrix

In [131]:
'''
y_true, y_pred = merged_df["labels"], merged_df["pred"]
label_names = list(set(merged_df["labels"].unique()).union(set(merged_df["pred"].unique())))
label_names.reverse()
'''

In [16]:
'''
cm = confusion_matrix(y_true, y_pred, labels=label_names)
'''

'\ncm = confusion_matrix(y_true, y_pred, labels=label_names)\n'

In [17]:
'''
sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=label_names,
            yticklabels=label_names)
plt.ylabel('True', fontsize=13)
plt.xlabel('Predicted', fontsize=13)
plt.title('Confusion Matrix', fontsize=17)
plt.show()
'''

"\nsns.heatmap(cm,\n            annot=True,\n            fmt='g',\n            xticklabels=label_names,\n            yticklabels=label_names)\nplt.ylabel('True', fontsize=13)\nplt.xlabel('Predicted', fontsize=13)\nplt.title('Confusion Matrix', fontsize=17)\nplt.show()\n"

## Submission

### Get submission

In [18]:
# Get submission data
# (one row per token that the baseline flagged in the test documents)
submission = get_submission(results)

### Verify formatting

In [19]:
# Compare submission with sample from Kaggle to verify formatting
# NOTE(review): get_submission is annotated as returning a DataFrame, so the pd.DataFrame(...) wrapper looks redundant
pd.DataFrame(submission)

Unnamed: 0,row_id,document,token,label,true label
0,0,7,9,B-NAME_STUDENT,
1,1,7,10,I-NAME_STUDENT,
2,2,7,52,B-NAME_STUDENT,
3,3,7,53,I-NAME_STUDENT,
4,4,7,55,B-NAME_STUDENT,
...,...,...,...,...,...
96,96,123,1656,B-NAME_STUDENT,
97,97,123,1657,I-NAME_STUDENT,
98,98,123,1658,I-NAME_STUDENT,
99,99,123,1690,B-URL_PERSONAL,


In [20]:
# Reference file provided with the competition data: shows the columns a submission is expected to have
pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv').head(10)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT
5,5,7,742,I-NAME_STUDENT
6,6,10,0,B-NAME_STUDENT
7,7,10,1,I-NAME_STUDENT
8,8,10,464,B-NAME_STUDENT
9,9,10,465,I-NAME_STUDENT


### Save as CSV and submit

In [21]:
# Write only the columns present in sample_submission.csv;
# 'true label' is an evaluation helper column and is not part of the submission format
submission[['row_id', 'document', 'token', 'label']].to_csv('submission.csv', index = False)