In [6]:
import numpy as np
import pandas as pd
import regex as re
import json
import random

import spacy
from spacy.training.example import Example

# Data Loading and Formatting:
Functions to load data from a JSON file and convert it into a format suitable for training a spaCy NER model, focusing on extracting text and BIO-tagged entities.

In [7]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def convert_to_spacy_format(docs):
    spacy_data = []
    for doc in docs:
        text = doc['full_text']
        entities = []
        current_pos = 0
        start_pos, end_pos, label = None, None, None

        for token, ws, bio_tag in zip(doc['tokens'], doc['trailing_whitespace'], doc['labels']):
            if bio_tag != 'O':
                bio_label = bio_tag.split('-')
                if bio_label[0] == 'B':
                    if start_pos is not None and label is not None:
                        entities.append((start_pos, end_pos, label))
                    start_pos = current_pos
                    label = bio_label[1]
                end_pos = current_pos + len(token)
            else:
                if start_pos is not None and label is not None:
                    entities.append((start_pos, end_pos, label))
                    start_pos, end_pos, label = None, None, None
            current_pos += len(token) + (1 if ws else 0)

        if start_pos is not None and label is not None:
            entities.append((start_pos, end_pos, label))
        
        spacy_data.append((text, {'entities': entities}))

    return spacy_data

train_data = load_data("C:\\Users\\nickw\\Documents\\train.json")
formatted_train_data = convert_to_spacy_format(train_data)

quick_test = False  # Set to True for a quick test run

# Use a subset of the data for a quick test
if quick_test:
    formatted_train_data = formatted_train_data[:50]  # Adjust the number for a quick test

# Model Initialization and Training:
Initializes a blank spaCy model, adds a Named Entity Recognition (NER) pipeline, defines custom PII labels, and trains the model using the formatted training data.

In [8]:
from tqdm import tqdm  # tqdm is a library that provides a progress bar

train_iterations = 10
batch_size = 5

# Initialize a blank spaCy model
nlp = spacy.blank('en')

# Create a new NER pipeline component
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)

# Add the new PII labels to NER
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
for label in pii_labels:
    ner.add_label(label)

# Train the model
optimizer = nlp.initialize()
losses = {}

for iteration in range(train_iterations):
    # Shuffle the training data
    random.shuffle(formatted_train_data)
    
    # Calculate total number of batches for the current iteration
    total_batches = len(formatted_train_data) // batch_size
    if len(formatted_train_data) % batch_size != 0:
        total_batches += 1

    # Progress bar for batches within the current iteration
    with tqdm(total=total_batches, desc=f"Iteration {iteration + 1}", unit="batch") as progress_bar_inner:
        for batch in spacy.util.minibatch(formatted_train_data, size=batch_size):
            for text, annotations in batch:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update([example], drop=0.0, losses=losses, sgd=optimizer)
            progress_bar_inner.update(1)

    print(f"Losses at iteration {iteration + 1}: {losses}")
    
# Save the model to disk
nlp.to_disk("spacy")

Iteration 1: 100%|██████████| 1362/1362 [15:09<00:00,  1.50batch/s]


Losses at iteration 1: {'ner': 7091.559806794224}


Iteration 2: 100%|██████████| 1362/1362 [16:26<00:00,  1.38batch/s]


Losses at iteration 2: {'ner': 8774.309531126153}


Iteration 3: 100%|██████████| 1362/1362 [16:03<00:00,  1.41batch/s]


Losses at iteration 3: {'ner': 10171.32480160933}


Iteration 4:  17%|█▋        | 234/1362 [02:48<13:31,  1.39batch/s]


KeyboardInterrupt: 

# Data Preparation and Prediction Function: 
Includes functions to load test data, reconstruct text from tokenized data, and predict PII labels using a trained spaCy model, outputting predictions for each token in the test documents.# Functions to evaluate model on test data

In [9]:
def load_test_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def reconstruct_text(tokens, trailing_whitespace):
    return ''.join([token + (' ' if ws else '') for token, ws in zip(tokens, trailing_whitespace)])

def predict_pii_labels(test_docs, model):
    predictions = []
    row_id = 0
    for doc_data in test_docs:
        doc_id = doc_data["document"]
        text = reconstruct_text(doc_data["tokens"], doc_data["trailing_whitespace"])
        doc = model(text)

        token_idx = 0
        for token in doc:
            if token.ent_iob_ != 'O' and token.ent_type_:
                predictions.append({
                    "row_id": row_id,
                    "document": doc_id,
                    "token": token_idx,
                    "label": f"{token.ent_iob_}-{token.ent_type_}"
                })
                row_id += 1
            token_idx += 1
    return predictions

# Prediction Application and Export: 
Loads test data, applies the trained spaCy model to make PII predictions, and exports these predictions to a CSV file for submission or further analysis.

In [10]:
# Load test data
test_file_name = "C:\\Users\\nickw\\Documents\\test.json"  # Replace with your test file name
test_docs = load_test_data(test_file_name)

# Predict PII labels
predictions = predict_pii_labels(test_docs, nlp)

submission_df = pd.DataFrame(predictions)
submission_df.to_csv("submission.csv", index=False)
