# PII W and B Training
-removed stride compared to reference
-raw data is full competition training comes from base_data artifact
Reference:
- https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b 
- https://colab.research.google.com/github/wandb/examples/blob/master/colabs/pytorch/Organizing_Hyperparameter_Sweeps_in_PyTorch_with_W%26B.ipynb#scrollTo=eFhyArSz826Q

# Run Configs

# Imports

In [1]:
!pip install seqeval evaluate transformers -q

In [2]:
!pip install --upgrade wandb -q
import wandb

In [3]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd

2024-04-18 02:17:54.343134: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 02:17:54.343229: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 02:17:54.497911: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Util Functions

In [4]:
# https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b/input?select=utils.py
# https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference?scriptVersionId=161126788
# https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy

import os
import json
import numpy as np
import pandas as pd
import wandb
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.auto import tqdm
import argparse
from ast import literal_eval
from transformers import Trainer
from torch.nn import CrossEntropyLoss
from scipy.special import softmax
from transformers import TrainerCallback

#call back to log loss for sweep analysis
class WandbLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, trainer=None, **kwargs):
        # Log metrics to wandb
        if trainer is not None:
            logs = {}
            # Log loss
            logs["loss"] = trainer.state.log_history[-1]["loss"]
            wandb.log(logs)
            
def do_downsample(train_df, ratio):
    '''
        Down sample negative examples
    '''
    # Separate positive and negative samples
    p = train_df[train_df['labels'].apply(lambda x: any(label != "O" for label in x))]
    n = train_df[train_df['labels'].apply(lambda x: all(label == "O" for label in x))]

    # Downsample negative samples
    n = n.sample(int(len(n) * ratio))

    # Combine positive and downsampled negative samples
    df = pd.concat([p, n], ignore_index=True)
    
    return df

def parse_predictions(predictions, id2label, ds, threshold=0.9):
    
    # Scale last dimension to probabilities for interpretability
    pred_softmax = softmax(predictions, axis=2)
    preds = predictions.argmax(-1)
    preds_without_O = pred_softmax[:,:,:12].argmax(-1)
    O_preds = pred_softmax[:,:,12]
    #preds_final = predictions.argmax(-1) #Choose label with max probability
    preds_final = np.where(O_preds < threshold, preds_without_O , preds)

    triplets = set()
    row, document, token, label, token_str = [], [], [], [], []
    for i, (p, token_map, offsets, tokens, doc, indices) in enumerate(zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"], ds["token_indices"])):

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[token_pred]

            if start_idx + end_idx == 0: continue

            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map): break

            #CHECK
            token_id = token_map[start_idx] #token ID at the start of the index
#             original_token_id = token_map[start_idx]
#             token_id = indices[original_token_id]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in triplets:
                    row.append(i)
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.add(triplet)

    df = pd.DataFrame({
        "eval_row": row,
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    df = df.drop_duplicates().reset_index(drop=True)

    df["row_id"] = list(range(len(df)))
    return df

#CHECK- modified from https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b/input
def get_reference_df(artifact, filename='val_data.parquet'): 
    raw_artifact = wandb.use_artifact(artifact)
    raw_artifact_dir = raw_artifact.download()
    raw_df = pd.read_parquet(raw_artifact_dir + f'/{filename}')
    
    ref_df = raw_df[['document', 'tokens', 'labels']].copy()
    ref_df = ref_df.explode(['tokens', 'labels']).reset_index(drop=True).rename(columns={'tokens': 'token', 'labels': 'label'})
    ref_df['token'] = ref_df.groupby('document').cumcount()
        
    reference_df = ref_df[ref_df['label'] != 'O'].copy()
    reference_df = reference_df.reset_index().rename(columns={'index': 'row_id'})
    reference_df = reference_df[['row_id', 'document', 'token', 'label']].copy()
    
    return reference_df



class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        #class_weights is a Tensor of weights for each class
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels
        labels = inputs.pop("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # Reshape for loss calculation
        loss_fct = CrossEntropyLoss(weight=self.class_weights)
        if self.label_smoother is not None and "labels" in inputs:
            loss = self.label_smoother(outputs, inputs)
        else:
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss
    

# data Functions

In [5]:
import numpy as np
from datasets import Dataset

#prep data for NER training by tokenize the text and align labels to tokens
def tokenize(example, tokenizer, label2id, max_length, stride):
    """This function ensures that the text is correctly tokenized and the labels 
    are correctly aligned with the tokens for NER training.

    Args:
        example (dict): The example containing the text and labels.
        tokenizer (Tokenizer): The tokenizer used to tokenize the text.
        label2id (dict): A dictionary mapping labels to their corresponding ids.
        max_length (int): The maximum length of the tokenized text.

    Returns:
        dict: The tokenized example with aligned labels.

    Reference: credit to https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """

    # rebuild text from tokens
    text = []
    labels = []
    token_map = [] 
    
    idx = 0

    #iterate through tokens, labels, and trailing whitespace using zip to create tuple from three lists
    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        
        text.append(t)
        token_map.extend([idx]*len(t)) 
        #extend so we can add multiple elements to end of list if ws
        labels.extend([l] * len(t))
        
        if ws:
            text.append(" ")
            labels.append("O")
            token_map.append(-1) #CHECK
            
        idx += 1

    #Tokenize text and return offsets for start and end character position. Limit length of tokenized text.
    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
        stride = stride,
    ) 

    #convert to np array for indexing
    labels = np.array(labels)

    # join text list into a single string 
    text = "".join(text)
    token_labels = []

    #iterate through each tolken
    for start_idx, end_idx in tokenized.offset_mapping:
        #if special tolken (CLS token) then append O
        #CLS : classification token added to the start of each sequence
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        #append orginal label to token_labels
        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length,"token_map": token_map, } 

#create dataset if using wandb
def create_dataset(data, tokenizer, max_length, label2id, stride):
    '''
    data(pandas.DataFrame): for wandb artifact
    '''
    # Convert data to Hugging Face Dataset object
    ds = Dataset.from_dict({
        "full_text": data.full_text.tolist(),
        "document": data.document.tolist(),
        "tokens": data.tokens.tolist(),
        "trailing_whitespace": data.trailing_whitespace.tolist(),
        "provided_labels": data.labels.tolist(),
        "token_indices": data.token_indices.tolist(),
    })

    # Map the tokenize function to the Dataset
    ds = ds.map(
        tokenize,
        fn_kwargs={      # pass keyword args
            "tokenizer": tokenizer,
            "label2id": label2id,
            "max_length": max_length,
            "stride": stride,
        }, 
        num_proc=3
    )

    return ds

# Metric Functions

In [6]:
# https://www.kaggle.com/code/conjuring92/pii-metric-fine-grained-eval

from collections import defaultdict
from typing import Dict
# from utils import parse_predictions #SCRIPT version

class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


def compute_metrics(p, id2label, valid_ds, valid_df, threshold=0.9):
    """
    Compute the LB metric (lb) and other auxiliary metrics
    """
    predictions, labels = p
    
    pred_df = parse_predictions(predictions, id2label, valid_ds, threshold=threshold)
    
    references = zip(valid_df.document, valid_df.token, valid_df.label)
    predictions = zip(pred_df.document, pred_df.token, pred_df.label)
    
    score_per_type = defaultdict(PRFScore)
    references = set(references)

    for ex in predictions:
        pred_type = ex[-1] # (document, token, label)
        if pred_type != 'O':
            pred_type = pred_type[2:] # avoid B- and I- prefix
            
        if pred_type not in score_per_type:
            score_per_type[pred_type] = PRFScore()

        if ex in references:
            score_per_type[pred_type].tp += 1
            references.remove(ex)
        else:
            score_per_type[pred_type].fp += 1

    for doc, tok, ref_type in references:
        if ref_type != 'O':
            ref_type = ref_type[2:] # avoid B- and I- prefix
        
        if ref_type not in score_per_type:
            score_per_type[ref_type] = PRFScore()
        score_per_type[ref_type].fn += 1

    totals = PRFScore()
    
    for prf in score_per_type.values():
        totals += prf

    results = {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
        "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items() if k!= 'O'},
    }
    
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                if isinstance(v, dict):
                    for n2, v2 in v.items():
                        final_results[f"{key}_{n}_{n2}"] = v2
                else:
                    final_results[f"{key}_{n}"] = v              
        else:
            final_results[key] = value
            
    return final_results

# Training Script


In [7]:
import os
from itertools import chain
from functools import partial
from transformers import AutoTokenizer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import pandas as pd
from types import SimpleNamespace
import torch
import wandb
import pickle


def train(config = None):
    # Initialize new wandb run to run without sweep agent
    with wandb.init(project='pii_compare', job_type='train', config=config):
        config = wandb.config
        
       # Load the training data
        train_artifact = wandb.use_artifact(config.train_artifact)
        train_artifact_dir = train_artifact.download()
        train_df = pd.read_parquet(train_artifact_dir + '/'+ config.train_artifact_name +'.parquet')

        # Load the validation data
        val_artifact = wandb.use_artifact(config.val_artifact)
        val_artifact_dir = val_artifact.download()
        val_df = pd.read_parquet(val_artifact_dir + '/' + config.val_artifact_name + '.parquet')
        eval_df = val_df.copy()
        
        # Load external data
        for art in [config.external_data_1, config.external_data_2, config.external_data_3, config.external_data_4, config.external_data_5]:
            if art != 'none':
                print(f'Loading external data {art}...')
                artifact = wandb.use_artifact(art)
                artifact_dir = artifact.download()
                ext_df = pd.read_parquet(artifact_dir + '/ext_data.parquet')
                train_df = pd.concat([train_df, ext_df], ignore_index=True)
        
        #down sample
        train_df = do_downsample(train_df, config.downsample_ratio)
        
        # Prepare references and labels from val set
        reference_df = get_reference_df(config.val_artifact)
        all_labels = sorted(list(set(chain(*[x.tolist() for x in val_df.labels.values])))) #get from val df
        label2id = {l: i for i,l in enumerate(all_labels)}
        id2label = {v:k for k,v in label2id.items()}

        # Create the training and validation datasets
        tokenizer = AutoTokenizer.from_pretrained(config.training_model_path)
        train_ds = create_dataset(train_df, tokenizer, config.training_max_length, label2id, config.stride)
        valid_ds = create_dataset(val_df, tokenizer, config.inference_max_length, label2id, config.stride)

        # Initialize the model and data collator
        model = AutoModelForTokenClassification.from_pretrained(
            config.training_model_path,
            num_labels=len(all_labels),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

        # Define the training arguments
        args = TrainingArguments(
            output_dir=config.output_dir, 
            fp16=config.fp16,
            learning_rate=config.learning_rate,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            report_to=config.report_to,
            evaluation_strategy=config.evaluation_strategy,
            eval_steps = config.eval_steps,
            save_strategy = config.evaluation_strategy, #these need to be the same
            do_eval=config.do_eval,
            save_total_limit=config.save_total_limit,
            logging_steps=config.logging_steps,
            lr_scheduler_type=config.lr_scheduler_type,
            warmup_ratio=config.warmup_ratio,
            weight_decay=config.weight_decay,
            load_best_model_at_end = config.load_best_model_at_end,
            metric_for_best_model = config.metric_for_best_model ,
            greater_is_better = config.greater_is_better,
        )

        #class weights based on dataset to go to CustomTrainer Class #TODO try without or move to config or make refactor
        class_weights = torch.tensor([1.]*12 + [config.o_weight]).to('cuda')

        # Initialize Trainer with custom class weights
        trainer = CustomTrainer(
            model=model, 
            args=args, 
            train_dataset=train_ds,
            eval_dataset=valid_ds,
            data_collator=collator, 
            tokenizer=tokenizer,
            compute_metrics=partial(compute_metrics, id2label=id2label, valid_ds=valid_ds, valid_df=reference_df, threshold=config.threshold),
            class_weights=class_weights,
            #callbacks=[WandbLoggingCallback], #added for wandb anaylsis
        )

        # Train the model
        trainer.train()    

        # Make predictions on the validation dataset
        preds = trainer.predict(valid_ds)

        #theshold tests
        print("doing threshold tests:")
        threshold_tests = [.7, .8, 0.9, 0.95, 0.99]
        scores =[]
        
        for threshold in threshold_tests:
            metrics = compute_metrics((preds.predictions, None), id2label, valid_ds, reference_df, threshold=threshold)
            f5_score = metrics['ents_f5']
            scores.append(f5_score)
            wandb.log({'threshold': threshold, 'final_f5': f5_score})
            print(f'threshold:f5 {threshold}: {f5_score}')

        best_threshold = 0.0  
        best_f5 = 0.0  
        for thresh, score in zip(threshold_tests, scores):
            if score > best_f5:
                best_threshold = thresh
                best_f5 = score
            
        wandb.config.best_threshold = best_threshold
        preds_df = parse_predictions(preds.predictions, id2label, valid_ds, threshold=best_threshold)
        
        # Save the model and upload it to Kaggle
        os.makedirs(config.experiment, exist_ok=True)
        trainer.save_model(config.experiment)
        tokenizer.save_pretrained(config.experiment)
        print('Experiment finished, test it out on the inference notebook!')
    
#         #Save variables to experiment in non GPU notebook    
#         with open('variables.pkl', 'wb') as f:
#             pickle.dump({
#                 'preds_df': preds_df,
#                 'eval_df': eval_df,
#                 'tokenizer': tokenizer,
#                 'predictions': preds.predictions,
#                 'id2label': id2label,
#                 'valid_ds': valid_ds,
#                 'best_threshold': best_threshold,
#                 'reference_df': reference_df,
#             }, f)
        
#         print("saved val_set_preds and valid_ds with pickle for loss testing")
    
    return best_threshold

# W and B 
- login
- default config
- update config
- train

In [8]:
# make sure to attach key from secrets in add-ons
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")

import wandb
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
#Default Config
sweep_config = {
    'method': 'bayes' #grid, random, bayes
    }

#metrics for evaluation
metric = {
    'name': 'loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

#intialize parameters 
parameters_dict = {
    'experiment': {'value': 'pii_00'},
    'threshold': {'value': 0.99},
    'o_weight': {'value': 0.05},  # set to 1 for equal weight for classes
    'downsample_ratio' : {'value': 1.0},  # set to 1 for no downsample
    'raw_artifact': {'value': 'csci566sp24/pii/base_data:v1'},
    'train_artifact': {'value': 'csci566sp24/pii/mini_no_overlap_data:v4'},
    'val_artifact': {'value': 'csci566sp24/pii/val_data:v2'},
    'train_artifact_name': {'value': 'mini_no_overlap'},
    'val_artifact_name': {'value': 'val_data'},
    'external_data_1': {'value': 'none'},
    'external_data_2': {'value': 'none'},
    'external_data_3': {'value': 'none'},
    'external_data_4': {'value': 'none'},
    'external_data_5': {'value': 'none'},
    'output_dir': {'value': 'output'},
    'inference_max_length': {'value': 1024},
    'training_max_length': {'value': 1024},
    'stride': {'value': 128}, # set to 0 for no effect
    'training_model_path': {'value': 'microsoft/deberta-v3-xsmall'},
    'fp16': {'value': True},
    'learning_rate': {'value': 1e-5},
    'num_train_epochs': {'value': .5},
    'per_device_train_batch_size': {'value': 4},
    'per_device_eval_batch_size': {'value': 4},
    'gradient_accumulation_steps': {'value': 2},
    'report_to': {'value': 'wandb'},
    'evaluation_strategy': {'value': 'epoch'},
    'eval_steps': {'value': 20},
    'do_eval': {'value': False},
    'save_total_limit': {'value': 2},
    'logging_steps': {'value': 10},
    'lr_scheduler_type': {'value': 'cosine'},
    'warmup_ratio': {'value': 0.1},
    'weight_decay': {'value': 0.01},
    'load_best_model_at_end': {'value': False},
    'metric_for_best_model': {'value': 'ents_f5'},
    'greater_is_better': {'value': True},
}
    
sweep_config['parameters'] = parameters_dict
train_config = parameters_dict



In [10]:
#update train parameters using dictionary so that it works with sweep

train_config.update({
    'experiment': {
        'value': 'pii010a'},
    'train_artifact': {'value': 'csci566sp24/pii/mini_no_overlap_data:v4'},
    'external_data_1': {'value': 'none'},
    'external_data_2': {'value': 'none'},
    'num_train_epochs': {
        'value': 3},
    'per_device_eval_batch_size': {
        'value': 2},
    'per_device_train_batch_size': {
        'value': 2},
    'gradient_accumulation_steps': {
        'value': 8},
    'learning_rate': {
        'value': 1e-4},
    'evaluation_strategy': {
        'value': 'epoch'}, 
    'o_weight': {
        'value': .76}, #set to 1 for equal weight for classes
    
    #gradient_checkpointing=True #slow down training by 20% but saves memory
    #optim="adafactor" change from adam for memory saving. convergence might be slower
    })

In [11]:
#nested dictionary of parameters interested in and method we are trying
import pprint
pprint.pprint(train_config)

{'do_eval': {'value': False},
 'downsample_ratio': {'value': 1.0},
 'eval_steps': {'value': 20},
 'evaluation_strategy': {'value': 'epoch'},
 'experiment': {'value': 'pii010a'},
 'external_data_1': {'value': 'none'},
 'external_data_2': {'value': 'none'},
 'external_data_3': {'value': 'none'},
 'external_data_4': {'value': 'none'},
 'external_data_5': {'value': 'none'},
 'fp16': {'value': True},
 'gradient_accumulation_steps': {'value': 8},
 'greater_is_better': {'value': True},
 'inference_max_length': {'value': 1024},
 'learning_rate': {'value': 0.0001},
 'load_best_model_at_end': {'value': False},
 'logging_steps': {'value': 10},
 'lr_scheduler_type': {'value': 'cosine'},
 'metric_for_best_model': {'value': 'ents_f5'},
 'num_train_epochs': {'value': 3},
 'o_weight': {'value': 0.76},
 'output_dir': {'value': 'output'},
 'per_device_eval_batch_size': {'value': 2},
 'per_device_train_batch_size': {'value': 2},
 'raw_artifact': {'value': 'csci566sp24/pii/base_data:v1'},
 'report_to': {'

In [12]:
# runs training script

# Extract inner values from the dictionary
config = {k: v['value'] for k, v in train_config.items()}

# Convert to SimpleNamespace
config = SimpleNamespace(**config)
best_threshold = train(config)
print(f'Best Threshold : {best_threshold}')

[34m[1mwandb[0m: Currently logged in as: [33mkasprisi[0m ([33mcsci566sp24[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240418_021805-jttxikku[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mtreasured-eon-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/csci566sp24/pii_compare[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/csci566sp24/pii_compare/runs/jttxikku[0m
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map (num_proc=3):   0%|          | 0/3061 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/688 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Ents P,Ents R,Ents F5,Ents Per Type Name Student P,Ents Per Type Name Student R,Ents Per Type Name Student F5,Ents Per Type Url Personal P,Ents Per Type Url Personal R,Ents Per Type Url Personal F5,Ents Per Type Email P,Ents Per Type Email R,Ents Per Type Email F5,Ents Per Type Street Address P,Ents Per Type Street Address R,Ents Per Type Street Address F5,Ents Per Type Phone Num P,Ents Per Type Phone Num R,Ents Per Type Phone Num F5,Ents Per Type Id Num P,Ents Per Type Id Num R,Ents Per Type Id Num F5,Ents Per Type Username P,Ents Per Type Username R,Ents Per Type Username F5
0,0.0162,0.011097,0.024469,0.361194,0.236186,0.024524,0.446494,0.268682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.004,0.004805,0.279476,0.764179,0.716392,0.30125,0.889299,0.827195,0.194805,0.9375,0.81761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0036,0.00378,0.440823,0.767164,0.745925,0.471735,0.892989,0.863337,0.357143,0.9375,0.882353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


doing threshold tests:
threshold:f5 0.7: 0.7444608567208272
threshold:f5 0.8: 0.7458049886621315
threshold:f5 0.9: 0.7490679019319851
threshold:f5 0.95: 0.749296408870877
threshold:f5 0.99: 0.7459254297834338
Experiment finished, test it out on the inference notebook!


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                         eval/ents_f5 ▁██
[34m[1mwandb[0m:                          eval/ents_p ▁▅█
[34m[1mwandb[0m:          eval/ents_per_type_EMAIL_f5 ▁▁▁
[34m[1mwandb[0m:           eval/ents_per_type_EMAIL_p ▁▁▁
[34m[1mwandb[0m:           eval/ents_per_type_EMAIL_r ▁▁▁
[34m[1mwandb[0m:         eval/ents_per_type_ID_NUM_f5 ▁▁▁
[34m[1mwandb[0m:          eval/ents_per_type_ID_NUM_p ▁▁▁
[34m[1mwandb[0m:          eval/ents_per_type_ID_NUM_r ▁▁▁
[34m[1mwandb[0m:   eval/ents_per_type_NAME_STUDENT_f5 ▁██
[34m[1mwandb[0m:    eval/ents_per_type_NAME_STUDENT_p ▁▅█
[34m[1mwandb[0m:    eval/ents_per_type_NAME_STUDENT_r ▁██
[34m[1mwandb[0m:      eval/ents_per_type_PHONE_NUM_f5 ▁▁▁
[34m[1mwandb[0m:       eval/ents_per_type_PHONE_NUM_p ▁▁▁
[34m[1mwandb[0m:       eval/ents_per_type_PHONE_NU

Best Threshold : 0.95


# TODO
- move helper functions to seperate script