# PII Data Prep with W and B

## Reference
https://www.kaggle.com/code/thedrcat/pii-data-preparation-cv-stride-viz-and-w-b#Share-your-findings

# Config and Import

In [1]:
!pip install wandb -q

In [2]:
from pathlib import Path
import os

import json
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings

In [3]:
DATA_PATH = '../input/pii-detection-removal-from-educational-data'
OUTPUT_DIR = '/kaggle/working/'

# Helper functions

In [4]:
#split data into training and validation set
def PIId2_split_by_sampling(df, test_ratio):
    """Split PII data into training and test set. 

    Args:
        df (pd.DataFrame): The input DataFrame.
        test_ratio (float, default=0.2): 
            The proportion of the dataset to include in the test split.

    Returns:
        train_df, test_df: Training and test splits of the input DataFrame.
    """
    # Get the number of rows in the DataFrame
    num_rows = len(df)
    
    # Get the split index
    split_idx = int(num_rows * test_ratio)
    
    # Shuffle the DataFrame rows
    df_shuffled = df.sample(frac=1, random_state=42)
    
    # Split the DataFrame into train and test
    train_df = df_shuffled.iloc[split_idx:]
    test_df = df_shuffled.iloc[:split_idx]
    
#     #TEMP to test
#     # Add a new column 'valid' with boolean values - need to modify the move missing if want to keep
#     train_df['valid'] = False
#     test_df['valid'] = True
    
    return train_df, test_df

In [5]:
def move_missing_labels(to_df, from_df):
    """
    Move documents containing missing labels from the extra DataFrame to the desired DataFrame.

    Parameters:
    - to_df (DataFrame): DataFrame where the documents should be moved to.
    - from_df (DataFrame): DataFrame where the documents should be moved from.

    Returns:
    - desired_df, extra_df: Updated DataFrames.
    """
    # Find the labels that are missing from the to_df DataFrame
    missing_labels = set(from_df['labels'].explode().unique()) - set(to_df['labels'].explode().unique())

    # For each missing label, move a document from the extra DataFrame to the desired DataFrame
    for label in missing_labels:
        if label in from_df['labels'].explode().unique():
            document = from_df[from_df['labels'].apply(lambda x: label in x)].sample(n=1)
            to_df = pd.concat([to_df, document])
            from_df = from_df.drop(document.index)

    return to_df, from_df

In [6]:
from collections import Counter
from itertools import chain

def analyze_labels(df):
    """
    Analyze the distribution of labels in a DataFrame.

    Parameters:
    - df (DataFrame): DataFrame with a 'labels' column that contains lists of labels.

    Returns:
    - None
    """
    df.info()
    # Count the number of documents that contain each label
    label_counts = df['labels'].apply(set).explode().value_counts()

    # Count the total number of documents
    total_count = len(df)

    # Count the number of documents that contain entities (i.e., labels other than 'O')
    entity_count = total_count - label_counts['O']

    print(f'total documents {total_count} \n documents with entities {entity_count}')

    # Print the counts
    for label, count in label_counts.items():
        print(f'{label}: {count} documents')

# Validation set and Miniset

In [7]:
comp_train= json.load(open(DATA_PATH +'/train.json'))

#Convert to df for EDA
base_df= pd.DataFrame(comp_train)

In [8]:
#TODO see if can remove since only needed for stride, 
#note if remove adjust create_dataset in https://www.kaggle.com/code/jonathankasprisin/pii-wandb-training/edit
def add_token_indices(doc_tokens):
    token_indices = list(range(len(doc_tokens)))
    return token_indices

base_df['token_indices'] = base_df['tokens'].apply(add_token_indices)

In [9]:
train_df, val_df = PIId2_split_by_sampling(base_df, .1)
train_df_overlap = train_df

val_df, train_df = move_missing_labels(val_df, train_df)

In [10]:
# Save the DataFrame to a JSON file 
val_df.to_json(OUTPUT_DIR + "/val.json", orient="records")
train_df.to_json(OUTPUT_DIR + "/train_df_fromval.json", orient="records")
train_df_overlap.to_json(OUTPUT_DIR + "/train_df_overlap_fromval.json", orient="records")

print("size of base: ", len(base_df))
print("size of val_df: ", len(val_df))

size of base:  6807
size of val_df:  688


In [11]:
mini_overlap, overlap_extra = PIId2_split_by_sampling(train_df_overlap, .5)
mini_no_overlap, no_overlap_extra= PIId2_split_by_sampling(train_df, .5)


mini_overlap, overlap_extra = move_missing_labels(mini_overlap, overlap_extra)
mini_no_overlap, no_overlap_extra= move_missing_labels(mini_no_overlap, no_overlap_extra)

# Save the DataFrame to a JSON file 
mini_overlap.to_json(OUTPUT_DIR + "/mini_overlap.json", orient="records")
mini_no_overlap.to_json(OUTPUT_DIR + "/mini_no_overlap.json", orient="records")

print("size of base: ", len(base_df))
print("size of mini_overlap: ", len(mini_overlap))
print("size of mini_no_overlap: ", len(mini_no_overlap))

size of base:  6807
size of mini_overlap:  3066
size of mini_no_overlap:  3061


In [12]:
analyze_labels(val_df)

<class 'pandas.core.frame.DataFrame'>
Index: 688 entries, 4624 to 1887
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             688 non-null    int64 
 1   full_text            688 non-null    object
 2   tokens               688 non-null    object
 3   trailing_whitespace  688 non-null    object
 4   labels               688 non-null    object
 5   token_indices        688 non-null    object
dtypes: int64(1), object(5)
memory usage: 53.8+ KB
total documents 688 
 documents with entities 0
O: 688 documents
B-NAME_STUDENT: 99 documents
I-NAME_STUDENT: 93 documents
B-URL_PERSONAL: 12 documents
B-PHONE_NUM: 3 documents
B-EMAIL: 3 documents
B-ID_NUM: 2 documents
I-PHONE_NUM: 2 documents
I-STREET_ADDRESS: 2 documents
B-STREET_ADDRESS: 2 documents
B-USERNAME: 1 documents
I-URL_PERSONAL: 1 documents
I-ID_NUM: 1 documents


# Val2
- val set with with custom split

In [13]:
def move_rare_labels(to_df, from_df):
    """
    Move documents containing missing labels from the extra DataFrame to the desired DataFrame.

    Parameters:
    - to_df (DataFrame): DataFrame where the documents should be moved to.
    - from_df (DataFrame): DataFrame where the documents should be moved from.

    Returns:
    - desired_df, extra_df: Updated DataFrames.
    """
    # Find the labels that are missing from the to_df DataFrame
    labels = set(from_df['labels'].explode().unique())

    # For each missing label, move documents from the extra DataFrame to the desired DataFrame
    for label in missing_labels:
        if label in from_df['labels'].explode().unique():
            # Get all documents with the current label
            documents = from_df[from_df['labels'].apply(lambda x: label in x)]
            
            # If there are less than 5 documents with the current label in to_df, move documents from from_df to to_df
            if len(to_df[to_df['labels'].apply(lambda x: label in x)]) < 5:
                # Calculate how many documents need to be moved
                num_docs_to_move = 5 - len(to_df[to_df['labels'].apply(lambda x: label in x)])
                
                # If there are enough documents in from_df, move the required number of documents
                if len(documents) >= num_docs_to_move:
                    documents_to_move = documents.sample(n=num_docs_to_move)
                    to_df = pd.concat([to_df, documents_to_move])
                    from_df = from_df.drop(documents_to_move.index)
                # If there are not enough documents in from_df, move all available documents
                else:
                    to_df = pd.concat([to_df, documents])
                    from_df = from_df.drop(documents.index)

    return to_df, from_df

In [14]:
train2_df, val2_df = PIId2_split_by_sampling(base_df, .5)

val2_df, train2_df = move_missing_labels(val2_df, train2_df)
analyze_labels(val2_df)

# Save the DataFrame to a JSON file 
val2_df.to_json(OUTPUT_DIR + "/val2.json", orient="records")
train2_df.to_json(OUTPUT_DIR + "/train2_fromval.json", orient="records")

<class 'pandas.core.frame.DataFrame'>
Index: 3403 entries, 4624 to 6149
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             3403 non-null   int64 
 1   full_text            3403 non-null   object
 2   tokens               3403 non-null   object
 3   trailing_whitespace  3403 non-null   object
 4   labels               3403 non-null   object
 5   token_indices        3403 non-null   object
dtypes: int64(1), object(5)
memory usage: 186.1+ KB
total documents 3403 
 documents with entities 0
O: 3403 documents
B-NAME_STUDENT: 453 documents
I-NAME_STUDENT: 413 documents
B-URL_PERSONAL: 47 documents
B-ID_NUM: 19 documents
B-EMAIL: 13 documents
B-PHONE_NUM: 4 documents
I-PHONE_NUM: 3 documents
B-USERNAME: 3 documents
I-STREET_ADDRESS: 1 documents
B-STREET_ADDRESS: 1 documents
I-URL_PERSONAL: 1 documents
I-ID_NUM: 1 documents


# More data
https://www.kaggle.com/code/valentinwerner/fix-punctuation-tokenization-external-dataset/output

valentinwerner fix of 
https://www.kaggle.com/datasets/alejopaullier/pii-external-dataset/data?select=pii_dataset.csv
https://www.kaggle.com/datasets/pjmathematician/pii-detection-dataset-gpt

thoughts- https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/472560

In [15]:
train_df

#TODO make into pipeline/function
moredata_dataset_fixed= json.load(open('/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json'))
pii_dataset_fixed= json.load(open('/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json'))

# Convert to df for EDA
moredata_dataset_fixed_df = pd.DataFrame(moredata_dataset_fixed)
pii_dataset_fixed_df = pd.DataFrame(pii_dataset_fixed)

# Convert and standardize documents and add indices
ext_dfs = [moredata_dataset_fixed_df, pii_dataset_fixed_df]
ext_doc_numbering_seed = [100000 * (i + 1) for i in range(len(ext_dfs))]
for df, seed in zip(ext_dfs, ext_doc_numbering_seed):
    # Add token indices
    df['token_indices'] = df['tokens'].apply(add_token_indices)
    # Convert documents to int
    df['document'] = range(seed, seed + len(df['document']))

In [16]:
# # Get the type of the first non-null element in the 'tokens' column
# first_non_null_element = next(item for item in pii_dataset_fixed_df['tokens'] if item is not None)
# print("token pii_dataset type", type(first_non_null_element[0]))

# first_non_null_element = next(item for item in moredata_dataset_fixed_df['tokens'] if item is not None)
# print("token moredata type", type(first_non_null_element[0]))

# first_non_null_element = next(item for item in train_df['tokens'] if item is not None)
# print("token train type", type(first_non_null_element[0]))


In [17]:
# # Get the type of the first non-null element in the 'tokens' column
# first_non_null_element = next(item for item in pii_dataset_fixed_df['tokens'] if item is not None)
# print("token pii_dataset type", type(first_non_null_element[0]))

# first_non_null_element = next(item for item in moredata_dataset_fixed_df['tokens'] if item is not None)
# print("token moredata type", type(first_non_null_element[0]))

# first_non_null_element = next(item for item in train_df['tokens'] if item is not None)
# print("token train type", type(first_non_null_element[0]))

In [18]:
print("train info", train_df.info())
print("piifixed info", pii_dataset_fixed_df.info())
print("moredata info", moredata_dataset_fixed_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6119 entries, 2447 to 860
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             6119 non-null   int64 
 1   full_text            6119 non-null   object
 2   tokens               6119 non-null   object
 3   trailing_whitespace  6119 non-null   object
 4   labels               6119 non-null   object
 5   token_indices        6119 non-null   object
dtypes: int64(1), object(5)
memory usage: 463.7+ KB
train info None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4434 entries, 0 to 4433
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             4434 non-null   int64 
 1   full_text            4434 non-null   object
 2   tokens               4434 non-null   object
 3   trailing_whitespace  4434 non-null   object
 4   labels               4434 non-nu

# Visualization of an Essay
credit : https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy

In [19]:
#https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy
import spacy
from spacy.tokens import Span
from spacy import displacy

nlp = spacy.blank("en")

options = {
    "colors": {
        "B-NAME_STUDENT": "aqua",
        "I-NAME_STUDENT": "skyblue",
        "B-EMAIL": "limegreen",
        "I-EMAIL": "lime",
        "B-USERNAME": "hotpink",
        "I-USERNAME": "lightpink",
        "B-ID_NUM": "purple",
        "I-ID_NUM": "rebeccapurple",
        "B-PHONE_NUM": "red",
        "I-PHONE_NUM": "salmon",
        "B-URL_PERSONAL": "silver",
        "I-URL_PERSONAL": "lightgray",
        "B-STREET_ADDRESS": "brown",
        "I-STREET_ADDRESS": "chocolate",
    }
}

def visualize(row):
    doc = nlp(row.full_text)
    doc.ents = [
        Span(doc, idx, idx + 1, label=label)
        for idx, label in enumerate(row.labels)
        if label != "O"
    ]
    html = displacy.render(doc, style="ent", jupyter=False, options=options)
    return html

In [20]:
from IPython.core.display import display, HTML
# html = visualize(base_df.loc[0])
# display(HTML(html))

  from IPython.core.display import display, HTML


# Save to W and B

Running code below needs to have WANDB_API_KEY secret in kaggle secrets. Access api token through add-ons in notebook



In [21]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")



import wandb
wandb.login(key=wandb_api_key)
wandb.init(project='pii', job_type='preprocessing')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkasprisi[0m ([33mcsci566sp24[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240421_224400-mgf0jrcd[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mroyal-bird-67[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/csci566sp24/pii[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/csci566sp24/pii/runs/mgf0jrcd/workspace[0m


In [22]:
#log data as artifacts

#Parquet is a columnar storage file format that is optimized for use with big data processing frameworks.
#Steps
# Save the base dataframe 'df' to a parquet file named 'base_data.parquet'
# Create a new Weights & Biases artifact named 'base_data' of type 'dataset'
# Create a new Weights & Biases artifact named 'base_data' of type 'dataset'
# Add the 'base_data.parquet' file to the 'base_data' artifact
# Log the 'raw_data' artifact to Weights & Biases, this will upload the artifact to the Weights & Biases servers

#base data
base_df.to_parquet('base_data.parquet', index=False)
base_data = wandb.Artifact(name="base_data", type="dataset")
base_data.add_file('base_data.parquet')
wandb.log_artifact(base_data)

#miniset data dataframe 'mini_no_overlap' 
mini_no_overlap.to_parquet('mini_no_overlap.parquet', index=False)
mini_no_overlap_data = wandb.Artifact(name="mini_no_overlap_data", type="dataset")
mini_no_overlap_data.add_file('mini_no_overlap.parquet')
wandb.log_artifact(mini_no_overlap_data)

#validation set dataframe 'val_df'
val_df.to_parquet('val_data.parquet', index=False)
val_data = wandb.Artifact(name="val_data", type="dataset")
val_data.add_file('val_data.parquet')
wandb.log_artifact(val_data)

<Artifact val_data>

In [23]:
#validation set dataframe 'val2_df'
val2_df.to_parquet('val2_data.parquet', index=False)
val2_data = wandb.Artifact(name="val2_data", type="dataset")
val2_data.add_file('val2_data.parquet')
wandb.log_artifact(val2_data)

# train_df - 100% comp data - val2 set
train2_df.to_parquet('train2_df.parquet', index=False)
train2_df_artifact = wandb.Artifact(name="train2_df", type="dataset")
train2_df_artifact.add_file('train2_df.parquet')
wandb.log_artifact(train2_df_artifact)

<Artifact train2_df>

In [24]:
#more data logging - moth or nbroad?
moredata_dataset_fixed_df.to_parquet('moredata_dataset_fixed.parquet', index=False)
moredata_dataset_fixed_artifact = wandb.Artifact(name="moredata_dataset_fixed", type="dataset")
moredata_dataset_fixed_artifact.add_file('moredata_dataset_fixed.parquet')
wandb.log_artifact(moredata_dataset_fixed_artifact)

# pii_dataset_fixed - moth or nbroad
pii_dataset_fixed_df.to_parquet('pii_dataset_fixed.parquet', index=False)
pii_dataset_fixed_artifact = wandb.Artifact(name="pii_dataset_fixed", type="dataset")
pii_dataset_fixed_artifact.add_file('pii_dataset_fixed.parquet')
wandb.log_artifact(pii_dataset_fixed_artifact)

# train_df - 100% comp data - val set
train_df.to_parquet('train_df.parquet', index=False)
train_df_artifact = wandb.Artifact(name="train_df", type="dataset")
train_df_artifact.add_file('train_df.parquet')
wandb.log_artifact(train_df_artifact)

<Artifact train_df>

In [25]:
#TODO troubleshoot

# # We will generate html viz for every mini_no_overlap essay, wrap it up in `wandb.Html` and create a W&B table to inspect it
# df=mini_no_overlap

# wandb_htmls = [wandb.Html(visualize(row)) for _, row in df.iterrows()]
# df['visualization'] = wandb_htmls
# table = wandb.Table(dataframe=df)
# wandb.log({'original_dataset': table})

In [26]:
# Finish W&B run
wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 🚀 View run [33mroyal-bird-67[0m at: [34m[4mhttps://wandb.ai/csci566sp24/pii/runs/mgf0jrcd/workspace[0m
[34m[1mwandb[0m: Synced 4 W&B file(s), 0 media file(s), 5 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20240421_224400-mgf0jrcd/logs[0m


# TO DOs
- Truncation with Stride, tokenizers striding method?
- add hyperparameters to config wandb.config.update