# PII Data Prep with W and B

## Reference
https://www.kaggle.com/code/thedrcat/pii-data-preparation-cv-stride-viz-and-w-b#Share-your-findings

# Config and Import

In [1]:
!pip install wandb -q

In [2]:
from pathlib import Path
import os

import json
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings

In [3]:
DATA_PATH = '../input/pii-detection-removal-from-educational-data'
OUTPUT_DIR = '/kaggle/working/'

# Helper functions

In [4]:
#split data into training and validation set
def PIId2_split_by_sampling(df, test_ratio):
    """Split PII data into training and test set. 

    Args:
        df (pd.DataFrame): The input DataFrame.
        test_ratio (float, default=0.2): 
            The proportion of the dataset to include in the test split.

    Returns:
        train_df, test_df: Training and test splits of the input DataFrame.
    """
    # Get the number of rows in the DataFrame
    num_rows = len(df)
    
    # Get the split index
    split_idx = int(num_rows * test_ratio)
    
    # Shuffle the DataFrame rows
    df_shuffled = df.sample(frac=1, random_state=42)
    
    # Split the DataFrame into train and test
    train_df = df_shuffled.iloc[split_idx:]
    test_df = df_shuffled.iloc[:split_idx]
    
#     #TEMP to test
#     # Add a new column 'valid' with boolean values - need to modify the move missing if want to keep
#     train_df['valid'] = False
#     test_df['valid'] = True
    
    return train_df, test_df

In [5]:
def move_missing_labels(to_df, from_df):
    """
    Move documents containing missing labels from the extra DataFrame to the desired DataFrame.

    Parameters:
    - to_df (DataFrame): DataFrame where the documents should be moved to.
    - from_df (DataFrame): DataFrame where the documents should be moved from.

    Returns:
    - desired_df, extra_df: Updated DataFrames.
    """
    # Find the labels that are missing from the to_df DataFrame
    missing_labels = set(from_df['labels'].explode().unique()) - set(to_df['labels'].explode().unique())

    # For each missing label, move a document from the extra DataFrame to the desired DataFrame
    for label in missing_labels:
        if label in from_df['labels'].explode().unique():
            document = from_df[from_df['labels'].apply(lambda x: label in x)].sample(n=1)
            to_df = pd.concat([to_df, document])
            from_df = from_df.drop(document.index)

    return to_df, from_df

# Validation set and Miniset

In [6]:
comp_train= json.load(open(DATA_PATH +'/train.json'))

#Convert to df for EDA
base_df= pd.DataFrame(comp_train)

In [7]:
#TODO see if can remove since only needed for stride, 
#note if remove adjust create_dataset in https://www.kaggle.com/code/jonathankasprisin/pii-wandb-training/edit
def add_token_indices(doc_tokens):
    token_indices = list(range(len(doc_tokens)))
    return token_indices

base_df['token_indices'] = base_df['tokens'].apply(add_token_indices)

In [8]:
train_df, val_df = PIId2_split_by_sampling(base_df, .1)
train_df_overlap = train_df

val_df, train_df = move_missing_labels(val_df, train_df)

In [9]:
# Save the DataFrame to a JSON file 
val_df.to_json(OUTPUT_DIR + "/val.json", orient="records")
train_df.to_json(OUTPUT_DIR + "/train_df_fromval.json", orient="records")
train_df_overlap.to_json(OUTPUT_DIR + "/train_df_overlap_fromval.json", orient="records")

print("size of base: ", len(base_df))
print("size of val_df: ", len(val_df))

size of base:  6807
size of val_df:  687


In [10]:
mini_overlap, overlap_extra = PIId2_split_by_sampling(train_df_overlap, .5)
mini_no_overlap, no_overlap_extra= PIId2_split_by_sampling(train_df, .5)


mini_overlap, overlap_extra = move_missing_labels(mini_overlap, overlap_extra)
mini_no_overlap, no_overlap_extra= move_missing_labels(mini_no_overlap, no_overlap_extra)

# Save the DataFrame to a JSON file 
mini_overlap.to_json(OUTPUT_DIR + "/mini_overlap.json", orient="records")
mini_no_overlap.to_json(OUTPUT_DIR + "/mini_no_overlap.json", orient="records")

print("size of base: ", len(base_df))
print("size of mini_overlap: ", len(mini_overlap))
print("size of mini_no_overlap: ", len(mini_no_overlap))

size of base:  6807
size of mini_overlap:  3066
size of mini_no_overlap:  3063


In [11]:
from collections import Counter
from itertools import chain
#Val set analysis
val_df.info()
df = val_df
total_count = df['labels'].apply(len).sum()
# Flatten the list of labels
all_labels_flat = list(chain(* val_df.labels.values))

# Count the occurrences of each unique label
label_counts = Counter(all_labels_flat)

entity_count = total_count - label_counts['O']

print(f'total labels {total_count} \n entity labels {entity_count}')
# Print the counts
for label, count in label_counts.items():
    print(f'{label}: {count}')

<class 'pandas.core.frame.DataFrame'>
Index: 687 entries, 4624 to 1103
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             687 non-null    int64 
 1   full_text            687 non-null    object
 2   tokens               687 non-null    object
 3   trailing_whitespace  687 non-null    object
 4   labels               687 non-null    object
 5   token_indices        687 non-null    object
dtypes: int64(1), object(5)
memory usage: 53.7+ KB
total labels 518432 
 entity labels 311
O: 518121
B-NAME_STUDENT: 135
I-NAME_STUDENT: 126
B-URL_PERSONAL: 15
B-ID_NUM: 2
B-PHONE_NUM: 2
I-PHONE_NUM: 3
B-EMAIL: 2
I-ID_NUM: 1
B-STREET_ADDRESS: 2
I-STREET_ADDRESS: 20
I-URL_PERSONAL: 1
B-USERNAME: 2


# Visualization of an Essay
credit : https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy

In [12]:
#https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy
import spacy
from spacy.tokens import Span
from spacy import displacy

nlp = spacy.blank("en")

options = {
    "colors": {
        "B-NAME_STUDENT": "aqua",
        "I-NAME_STUDENT": "skyblue",
        "B-EMAIL": "limegreen",
        "I-EMAIL": "lime",
        "B-USERNAME": "hotpink",
        "I-USERNAME": "lightpink",
        "B-ID_NUM": "purple",
        "I-ID_NUM": "rebeccapurple",
        "B-PHONE_NUM": "red",
        "I-PHONE_NUM": "salmon",
        "B-URL_PERSONAL": "silver",
        "I-URL_PERSONAL": "lightgray",
        "B-STREET_ADDRESS": "brown",
        "I-STREET_ADDRESS": "chocolate",
    }
}

def visualize(row):
    doc = nlp(row.full_text)
    doc.ents = [
        Span(doc, idx, idx + 1, label=label)
        for idx, label in enumerate(row.labels)
        if label != "O"
    ]
    html = displacy.render(doc, style="ent", jupyter=False, options=options)
    return html

In [13]:
from IPython.core.display import display, HTML
# html = visualize(base_df.loc[0])
# display(HTML(html))

  from IPython.core.display import display, HTML


# Save to W and B

Running code below needs to have WANDB_API_KEY secret in kaggle secrets. Access api token through add-ons in notebook



In [14]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")



import wandb
wandb.login(key=wandb_api_key)
wandb.init(project='pii', job_type='preprocessing')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkasprisi[0m ([33mcsci566sp24[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240412_162557-kh551j4l[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfine-shadow-49[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/csci566sp24/pii[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/csci566sp24/pii/runs/kh551j4l/workspace[0m


In [15]:
#log data as artifacts

#Parquet is a columnar storage file format that is optimized for use with big data processing frameworks.
#Steps
# Save the base dataframe 'df' to a parquet file named 'base_data.parquet'
# Create a new Weights & Biases artifact named 'base_data' of type 'dataset'
# Create a new Weights & Biases artifact named 'base_data' of type 'dataset'
# Add the 'base_data.parquet' file to the 'base_data' artifact
# Log the 'raw_data' artifact to Weights & Biases, this will upload the artifact to the Weights & Biases servers

#base data
base_df.to_parquet('base_data.parquet', index=False)
base_data = wandb.Artifact(name="base_data", type="dataset")
base_data.add_file('base_data.parquet')
wandb.log_artifact(base_data)

#miniset data dataframe 'mini_no_overlap' 
mini_no_overlap.to_parquet('mini_no_overlap.parquet', index=False)
mini_no_overlap_data = wandb.Artifact(name="mini_no_overlap_data", type="dataset")
mini_no_overlap_data.add_file('mini_no_overlap.parquet')
wandb.log_artifact(mini_no_overlap_data)

#validation set dataframe 'val_df'
val_df.to_parquet('val_data.parquet', index=False)
val_data = wandb.Artifact(name="val_data", type="dataset")
val_data.add_file('val_data.parquet')
wandb.log_artifact(val_data)

<Artifact val_data>

In [16]:
#TODO troubleshoot

# # We will generate html viz for every mini_no_overlap essay, wrap it up in `wandb.Html` and create a W&B table to inspect it
# df=mini_no_overlap

# wandb_htmls = [wandb.Html(visualize(row)) for _, row in df.iterrows()]
# df['visualization'] = wandb_htmls
# table = wandb.Table(dataframe=df)
# wandb.log({'original_dataset': table})

In [17]:
# Finish W&B run
wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 🚀 View run [33mfine-shadow-49[0m at: [34m[4mhttps://wandb.ai/csci566sp24/pii/runs/kh551j4l/workspace[0m
[34m[1mwandb[0m: Synced 4 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20240412_162557-kh551j4l/logs[0m


# TO DOs
- Truncation with Stride, tokenizers striding method?
- add hyperparameters to config wandb.config.update