# Identifying PII in Student Essays
## Authored by: Pratik Chaudhari, Cody Ledford, Manu Achar
## Project Summary
The Kaggle competition we are participating in is the [PII Data Detection hosted by The Learning Agency Lab](https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/overview). The goal of this competition is to develop a model that detects sensitive personally identifiable information (PII) in student writing. This is necessary to screen and clean educational data so that, when it is released to the public for analysis and archival, the risk to students is mitigated.


## Python Libraries


In [1]:
# !python -m pip install --upgrade pip
# !pip install accelerate
# !pip install seqeval
# !pip install datasets

import pandas as pd
import numpy as np
# import spacy as sp
import re
import json
import math
from pathlib import Path
from datasets import Dataset
import os
import torch
import torch.nn as nn
from torch import cuda

import scipy.stats as stats
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import colormaps
from matplotlib.font_manager import FontProperties

# !pip install evaluate
# import evaluate

from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification


from seqeval.metrics import recall_score, precision_score
# from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

## Config

In [2]:
class Config():
    """Assemble the notebook's settings dictionary for one execution platform.

    The constructor resolves the platform-specific file locations, loads the
    pretrained token-classification model and its tokenizer, and stores
    everything in ``self.config``.
    """

    # Platforms for which file locations are defined below.
    SUPPORTED_PLATFORMS = ('kaggle', 'local')

    def __init__(self, platform, model_name, pretrained_model_name):
        """Build ``self.config``.

        Args:
            platform: ``'kaggle'`` or ``'local'`` (case-insensitive); selects
                the data, model and submission paths.
            model_name: name used for the output model directory and, on the
                local platform, for the submission file name.
            pretrained_model_name: checkpoint name; on Kaggle it is appended
                to the ``/kaggle/input/huggingface-bert/`` directory, locally
                it is passed to ``from_pretrained`` unchanged.

        Raises:
            ValueError: if ``platform`` is not one of the supported platforms.
        """
        # Normalise so that spellings such as 'Kaggle' select the same branch
        # as 'kaggle' instead of silently matching nothing.
        platform_key = str(platform).strip().lower()

        if platform_key == 'kaggle':
            pretrained_model_loc = '/kaggle/input/huggingface-bert/' + pretrained_model_name
            data_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
            test_data_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
            model_path_out = '/kaggle/working/models/' + model_name
            submission_path = '/kaggle/working/submission.csv'

        elif platform_key == 'local':
            pretrained_model_loc = pretrained_model_name
            model_path_out = '../models/bert_models/' + model_name
            data_path = "../Datasets/Official/train.json"
            submission_path = '../models/submission/'+ model_name + '-submission.csv'
            test_data_path = "../Datasets/Official/test.json"

        else:
            # Previously an unknown platform fell through both branches and
            # failed later with an UnboundLocalError on the path variables.
            raise ValueError(
                "Unknown platform %r; expected one of %s"
                % (platform, ', '.join(self.SUPPORTED_PLATFORMS))
            )

        self.config = {
            'MAX_LEN': 100,
#             'TRAIN_BATCH_SIZE': 4,
#             'VALID_BATCH_SIZE': 2,
            'EPOCHS': 5,
            'LEARNING_RATE':1e-5,
#             'MAX_GRAD_NORM': 10,
            'device': 'cuda' if cuda.is_available() else 'cpu',
            'data_path': data_path,
            'test_data_path': test_data_path,
            'model_path': model_path_out,
            # num_labels is hard-coded; it has to equal the number of entries
            # in the label list used when encoding the training labels.
            'pretrained_model': BertForTokenClassification.from_pretrained(pretrained_model_loc, num_labels = 13),
            'tokenizer': BertTokenizerFast.from_pretrained(pretrained_model_loc),
#             'threshold': 0.9,
            'return_entity_level_metrics': True,
            'ignore_subwords': True, # DO NOT CHANGE
            'subm_path': submission_path
        }

In [3]:
# Run settings: where the notebook executes and which checkpoint is fine-tuned.
platform = 'local'
pretrainend_model_name = 'bert-base-cased'
model_num = 1

# Output model name, e.g. 'model-1-bert-base-cased'.
model_name = f'model-{model_num}-{pretrainend_model_name}'

# Only the settings dictionary is kept; the Config instance itself is discarded.
config = Config(
    platform=platform,
    model_name=model_name,
    pretrained_model_name=pretrainend_model_name,
).config

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Loading Dataset


In [31]:
# Read the training set from the path selected in the config.
df_train = pd.read_json(config['data_path'])
# NOTE(review): `df` is a second name for the same DataFrame object, not a
# copy, so any in-place change made to `df_train` is also visible through `df`.
df = df_train
# Bare expression: the notebook renders the frame as the cell output.
df
# df_test = pd.read_json(config['test_data_path'])

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."
...,...,...,...,...,...
6802,22678,EXAMPLE – JOURNEY MAP\n\nTHE CHALLENGE My w...,"[EXAMPLE, –, JOURNEY, MAP, \n\n, THE, CHALLENG...","[True, True, True, False, False, True, True, F...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6803,22679,Why Mind Mapping?\n\nMind maps are graphical r...,"[Why, Mind, Mapping, ?, \n\n, Mind, maps, are,...","[True, True, False, False, False, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6804,22681,"Challenge\n\nSo, a few months back, I had chos...","[Challenge, \n\n, So, ,, a, few, months, back,...","[False, False, False, True, True, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6805,22684,Brainstorming\n\nChallenge & Selection\n\nBrai...,"[Brainstorming, \n\n, Challenge, &, Selection,...","[False, False, True, True, False, False, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Preprocessing

First, we cleaned the data with a regular expression to find the text that is usable with our pretrained model.

In [5]:
# Label vocabulary for the token classifier; a label's position in this list
# is its integer id.
labels = [
    "O",
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
]

# Forward (label -> id) and reverse (id -> label) lookup tables.
labels_to_ids = {label: idx for idx, label in enumerate(labels)}
ids_to_labels = dict(enumerate(labels))

In [6]:
# Characters treated as stray spacing: no-break space (U+00A0), the private-use
# code point U+F0B7 and the zero-width space (U+200B).
pattern_space = re.compile('\xa0|\uf0b7|\u200b')
# In the raw text these characters are replaced by a plain space ...
df_train.loc[:,'full_text'] = df_train.loc[:,'full_text'].replace(pattern_space, ' ')
# ... while any token containing one of them is dropped from the token list.
df_train.loc[:,'tokens'] = df_train.loc[:,'tokens'].apply(lambda line: [tok for tok in line if not pattern_space.search(tok)])
# [,.!?-]
# Labels are not filtered alongside the tokens, so a document that lost a token
# now has fewer tokens than labels; only documents whose lengths still agree are
# kept. A boolean mask is used here instead of `.iloc[<index labels>]`, which
# selected the right rows only while the index happened to be 0..n-1.
df_usable_train = df_train[df_train.tokens.apply(len) == df_train.labels.apply(len)]
# Fraction of documents discarded by the filter above.
1-(len(df_usable_train.document))/len(df.document)

0.06184809754664311

In [7]:
# df_test.loc[:,'full_text'] = df_test.loc[:,'full_text'].replace(pattern_space, ' ')
# df_test.loc[:,'tokens'] = df_test.loc[:,'tokens'].apply(lambda line: [tok for tok in line if not re.search(pattern_space,tok)])
# [,.!?-]
# df_usable_test = df_test.iloc[df_test[~(df_test.tokens.apply(len) != df_test.labels.apply(len))].index]
# 1-(len(df_usable_test.document))/len(df.document)


In [8]:
def make_smaller_inputs(dataframe, type, max_len=None):
    """Split every essay into consecutive chunks of at most ``max_len`` tokens.

    Each chunk becomes one output row that remembers the document it came from
    and its position inside that document, so chunks can be mapped back later.

    Args:
        dataframe: frame with ``tokens`` and ``document`` columns, plus a
            ``labels`` column when ``type == 'train'``.
        type: ``'train'`` slices the labels together with the tokens; for any
            other value the ``labels`` column of the result is left empty.
        max_len: maximum number of tokens per chunk. Defaults to
            ``config['MAX_LEN']`` when omitted.

    Returns:
        DataFrame with columns ``tokens``, ``labels``, ``document`` and
        ``document_location`` (0-based chunk number within the document),
        indexed 0..n-1.

    Raises:
        ValueError: if ``max_len`` is not a positive number.
    """
    if max_len is None:
        max_len = config['MAX_LEN']
    if max_len <= 0:
        raise ValueError("max_len must be positive, got %r" % (max_len,))

    with_labels = type == 'train'
    columns = ['tokens', 'labels', 'document', 'document_location']

    # Collect plain records and build the frame once at the end; writing cells
    # one at a time into a growing DataFrame is slow and fragile for list values.
    records = []
    for _, line in dataframe.iterrows():
        tokens = line.tokens
        labels = line.labels if with_labels else None
        document = line.document

        for location, start in enumerate(range(0, len(tokens), max_len)):
            record = {
                'tokens': tokens[start:start + max_len],
                'document': document,
                'document_location': location,
            }
            if with_labels:
                record['labels'] = labels[start:start + max_len]
            records.append(record)

    # `columns=` fixes the column order and creates an all-missing `labels`
    # column when no record carries labels (or when there are no records).
    return pd.DataFrame(records, columns=columns)

In [9]:
# Chunk every usable training essay; labels are sliced along with the tokens.
df_model_input_train = make_smaller_inputs(df_usable_train, 'train')
# Number of chunk rows produced.
print(df_model_input_train.shape[0])
df_model_input_train.head(2)

49199


Unnamed: 0,tokens,labels,document,document_location
0,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",7,0
1,"['s, , potential, to, be, released, ., Cf, An...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",7,1


In [10]:
# df_model_input_test = make_smaller_inputs(df_test,'predict')
# print(len(df_model_input_test.index))
# df_model_input_test.head(2)

In [11]:
def compare2dict(list1, list2, pii_threshold):
    """Tell whether the two collections share more than one distinct value.

    Duplicates are ignored: both inputs are reduced to sets first.
    ``pii_threshold`` is accepted but never read; the cut-off is fixed at
    "more than one shared value".
    """
    shared_values = set(list1) & set(list2)
    return len(shared_values) > 1

In [12]:
# Keep only the chunks whose labels contain more than one distinct known label.
df_trainable = df_model_input_train[
    df_model_input_train.apply(
        lambda line: compare2dict(line.labels, list(labels_to_ids.keys()), 0),
        axis=1,
    )
]

# Repeat the kept chunks 10 times, then shuffle. The fixed random_state makes
# the shuffled order identical on every Restart & Run All; previously the order
# changed from run to run.
df_trainable_input = (
    pd.concat([df_trainable] * 10)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .loc[:, ['tokens', 'labels']]
)
print(len(df_trainable_input))
df_trainable_input.head(2)

11560


Unnamed: 0,tokens,labels
0,"[need, them, to, , amplify, their, social, im...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[Reflection, |, Mind, Mapping, \n\n, Isabel, P...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."


In [13]:
# df_evaluation_input = df_model_input[['tokens','labels']]
# df_evaluation_input.head(2)

## PII Inference V2

### Prepping Trainer

Function for tokenizing datasets

In [15]:
def _align_labels_to_wordpieces(word_ids, word_label_ids, first_piece_only):
    """Spread word-level label ids over a word-piece sequence.

    Args:
        word_ids: for every word-piece position, the index of the word it was
            produced from, or ``None`` for positions that belong to no word.
        word_label_ids: one integer label id per word.
        first_piece_only: when true, only the first piece of each word gets
            the word's label; otherwise every piece of the word gets it.

    Returns:
        Integer numpy array of ``len(word_ids)`` entries holding the label id
        at labelled positions and ``-100`` everywhere else.
    """
    aligned = np.full(len(word_ids), -100, dtype=int)
    previous_word = None
    for position, word_index in enumerate(word_ids):
        if word_index is not None and (not first_piece_only or word_index != previous_word):
            aligned[position] = word_label_ids[word_index]
        previous_word = word_index
    return aligned


def tokenize(line, tokenizer, type):
    """Encode one chunk of word tokens and attach word-piece level labels.

    Labels are aligned through ``encoding.word_ids()``, i.e. each word-piece
    looks up the label of the word it actually came from. The previous version
    handed out labels by counting word-pieces whose offset starts at 0, so any
    word for which the tokenizer emits no pieces (whitespace-only tokens, for
    example) shifted the labels of every following word by one.

    Args:
        line: row with a ``tokens`` list; additionally ``labels`` for
            ``'train'``/``'eval'`` and ``document``/``document_location`` for
            ``'predict'``.
        tokenizer: fast tokenizer, called with ``is_split_into_words=True``;
            the returned encoding must provide ``word_ids()``.
        type: ``'train'``, ``'eval'`` or ``'predict'``.

    Returns:
        Dict of tensors built from the encoding plus a ``labels`` tensor
        (``-100`` at positions that carry no label). With
        ``config['ignore_subwords']`` set, only the first piece of each word is
        labelled; otherwise all pieces of a word share its label. In
        ``'predict'`` mode every labelled position holds 0 and the dict also
        carries ``document`` and ``location``.

    Raises:
        ValueError: if ``type`` is not one of the three supported modes.
    """
    if type not in ('train', 'eval', 'predict'):
        raise ValueError("type must be 'train', 'eval' or 'predict', got %r" % (type,))

    tokens = line.tokens
    ignore_subwords = config['ignore_subwords']

    # Leave 20% head-room for the extra word-pieces when sub-words are labelled too.
    if ignore_subwords:
        length = config['MAX_LEN']
    else:
        length = math.ceil(config['MAX_LEN'] * 1.2)

    # No truncation is requested, so `length` only sets the padding target.
    encoding = tokenizer(tokens,
                         is_split_into_words=True,
                         return_offsets_mapping=True,
                         padding='max_length',
                         max_length=length)

    item = {key: torch.as_tensor(val) for key, val in encoding.items()}

    if type == 'predict':
        # Placeholder label 0 for every word; only the -100 mask matters here.
        word_label_ids = [0] * len(tokens)
    else:
        word_label_ids = [labels_to_ids[label] for label in line.labels]
        # As before, a label list shorter than the token list is padded with 0
        # rather than raising.
        word_label_ids += [0] * (len(tokens) - len(word_label_ids))

    encoded_labels = _align_labels_to_wordpieces(
        encoding.word_ids(), word_label_ids, ignore_subwords
    )
    item['labels'] = torch.as_tensor(encoded_labels)

    if type == 'predict':
        return {**item, 'document': line.document, 'location': line.document_location}
    return {**item}

Create Datasets

In [16]:
# Encode every training chunk; each result is a dict keyed by feature name.
train_dataset_temp = [
    tokenize(row, config['tokenizer'], 'train')
    for _, row in df_trainable_input.iterrows()
]
# train_dataset_temp
# train_dataset = Dataset.from_list(pd.DataFrame(data=train_dataset_temp))

# from_list is not implemented in this notebook's version of datasets, so the
# list of per-sample dicts is pivoted into one dict of columns, which is what
# from_list would have created.
train_dataset = Dataset.from_dict({
    feature: [sample[feature] for sample in train_dataset_temp]
    for feature in train_dataset_temp[0]
})
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
    num_rows: 11560
})

In [17]:
# eval_dataset_temp = df_evaluation_input.apply(lambda line: tokenize(line, config['tokenizer'],'eval'), axis = 1)
# # # eval_dataset = Dataset.from_list(eval_dataset_temp)
# eval_dataset = Dataset.from_dict({k: [s[k] for s in eval_dataset_temp] for k in  eval_dataset_temp[0].keys()})
# eval_dataset

Initialize parts of trainer

In [19]:
# Model to fine-tune: the BertForTokenClassification instance created in Config.
model = config['pretrained_model']
# Batch collator built from the tokenizer stored in the config.
data_collator = DataCollatorForTokenClassification(config['tokenizer'])
# metric = evaluate.load("seqeval")

Function for evaluating prediction. Only available online

In [21]:
def compute_metrics(p):
    """Score token-classification output with recall, precision and F-beta (beta = 5).

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores with the class dimension last; ``labels`` holds the
            integer label ids, with ``-100`` at positions to ignore.

    Returns:
        Dict with ``'recall'``, ``'precision'`` and ``'f1'``. The ``'f1'``
        entry is the F-beta score with beta = 5 (recall weighted 25x); the key
        name is kept because the training arguments refer to it. It is 0.0
        when precision and recall are both 0.
    """
    scores, label_ids = p
    # Pick the highest-scoring class per token. The earlier `np.argmax(-1)`
    # ignored the scores entirely and always evaluated to the scalar 0.
    predicted_ids = np.argmax(scores, axis=-1)

    # Remove ignored index (special tokens)
    true_predictions = [
        [ids_to_labels[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, label_ids)
    ]
    true_labels = [
        [ids_to_labels[gold] for gold in gold_row if gold != -100]
        for gold_row in label_ids
    ]

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    beta_squared = 5 * 5
    denominator = beta_squared * precision + recall
    # Guard the division: with no correct prediction both terms are 0.
    if denominator:
        f_beta = (1 + beta_squared) * recall * precision / denominator
    else:
        f_beta = 0.0

    results = {
        'recall': recall,
        'precision': precision,
        'f1': f_beta
    }
    return results

### Training

Initialize Trainer

In [22]:
# Training hyper-parameters; epochs, learning rate and output directory come
# from the shared config dictionary.
training_args = TrainingArguments(
    output_dir= config['model_path'],
    # overwrite_output_dir = True,
    do_train = True,
    # do_eval = False,
    # per_device_eval_batch_size=1,
    auto_find_batch_size=True,
    report_to="none",
    num_train_epochs = config['EPOCHS'],
    learning_rate = config['LEARNING_RATE'],
    # No intermediate checkpoints; the model is saved explicitly after training.
    save_strategy = 'no',
    disable_tqdm= False,
    no_cuda = False,
    # NOTE(review): no eval_dataset is passed to the Trainer below, so this
    # metric is presumably never computed during training -- confirm.
    metric_for_best_model="f1",
)

# The Trainer is built with a training set only; the evaluation set is commented out.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_dataset,
    # eval_dataset = eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Start training

In [23]:
# Training: runs only when do_train was set in the training arguments.
if training_args.do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    # NOTE(review): the Trainer in this notebook is constructed without a
    # tokenizer argument, so this call presumably saves the model only and not
    # the tokenizer -- confirm before relying on the output directory alone.
    trainer.save_model()  # writes the model to the configured output directory

    # Record the number of training rows next to the metrics returned by train().
    metrics["train_samples"] = len(train_dataset)

    trainer.log_metrics("train", metrics)
    # trainer.save_metrics("train", metrics)
    # trainer.save_state()

  0%|          | 0/7225 [00:00<?, ?it/s]

{'loss': 0.1216, 'grad_norm': 0.6488198637962341, 'learning_rate': 9.307958477508652e-06, 'epoch': 0.35}
{'loss': 0.0312, 'grad_norm': 0.07275829464197159, 'learning_rate': 8.615916955017302e-06, 'epoch': 0.69}
{'loss': 0.0194, 'grad_norm': 1.0691379308700562, 'learning_rate': 7.923875432525952e-06, 'epoch': 1.04}
{'loss': 0.012, 'grad_norm': 1.118030309677124, 'learning_rate': 7.2318339100346025e-06, 'epoch': 1.38}
{'loss': 0.0071, 'grad_norm': 0.576580822467804, 'learning_rate': 6.539792387543253e-06, 'epoch': 1.73}
{'loss': 0.0046, 'grad_norm': 0.3281711935997009, 'learning_rate': 5.847750865051903e-06, 'epoch': 2.08}
{'loss': 0.0035, 'grad_norm': 0.0562983863055706, 'learning_rate': 5.155709342560554e-06, 'epoch': 2.42}
{'loss': 0.002, 'grad_norm': 2.3867671489715576, 'learning_rate': 4.463667820069205e-06, 'epoch': 2.77}
{'loss': 0.002, 'grad_norm': 0.7267292141914368, 'learning_rate': 3.7716262975778552e-06, 'epoch': 3.11}
{'loss': 0.0014, 'grad_norm': 0.1894765943288803, 'learni

### Evaluation

In [25]:
# # Evaluation
# if training_args.do_eval:
# #     logger.info("*** Evaluate ***")

#     metrics = trainer.evaluate()

#  #   max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
#     metrics["eval_samples"] = len(eval_dataset)

#     trainer.log_metrics("eval", metrics)
#     trainer.save_metrics("eval", metrics)