# PII W and B Training
This notebook is used for experimenting with and training models for PII detection. Training is logged with W&B to monitor, analyze, and evaluate performance when formulating additional experiments. Selected iterations of previous experiments can be found under "Prior experiments". A sweep implementation for hyperparameter optimization is also present but is commented out for training.

Data artifacts are generated using this notebook: https://www.kaggle.com/code/jonathankasprisin/pii-prep-ens
Final inference post processing and submission is done using this notebook: https://www.kaggle.com/code/jonathankasprisin/pii-inference/


Reference:
1. https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b 
2. https://colab.research.google.com/github/wandb/examples/blob/master/colabs/pytorch/Organizing_Hyperparameter_Sweeps_in_PyTorch_with_W%26B.ipynb#scrollTo=eFhyArSz826Q

Additional references for specific functions are provided in their code blocks. 

# Run Configs

# Imports

In [None]:
# #if google colab
# from google.colab import drive
# drive.mount('/content/drive')

# !pip install -r requirements.txt

In [None]:
!pip install --upgrade wandb -q
import wandb

In [None]:
!pip freeze >requirements.txt

In [None]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features
import numpy as np
import pandas as pd

# Util Functions

In [None]:

# References:
# https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b/input?select=utils.py
# https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference?scriptVersionId=161126788
# https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy

import os
import json
import numpy as np
import pandas as pd
import wandb
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.auto import tqdm
import argparse
from ast import literal_eval
from transformers import Trainer
from torch.nn import CrossEntropyLoss
from scipy.special import softmax
from transformers import TrainerCallback

def identify_incorrect_labels(reference_df, pred_df):
    """
    Build a table of tokens whose predicted label disagrees with the reference.

    Both inputs list only non-"O" tokens, so a token that is missing from one
    side after the outer join is treated as "O" on that side.

    Parameters:
    - reference_df (DataFrame): reference labels; needs 'document', 'token' and 'label'.
    - pred_df (DataFrame): predicted labels; needs 'document', 'token', 'label',
      plus the 'eval_row' and 'row_id' bookkeeping columns (which are discarded).

    Returns:
    - DataFrame: one row per disagreeing (document, token) pair with
      'label_actual', 'label_pred' and an 'error' column set to 'FN' when the
      reference is a PII label, or 'FP' when only the prediction is.
    """
    # Bookkeeping columns of the prediction frame are not needed for the join
    predictions = pred_df.drop(columns=['eval_row', 'row_id'])

    # The outer join keeps tokens that appear on only one side
    joined = pd.merge(
        reference_df,
        predictions,
        on=['document', 'token'],
        how='outer',
        suffixes=('_actual', '_pred'),
    )

    # A missing label (NaN) never equals a real one, so one-sided rows are kept
    disagreement = joined['label_actual'] != joined['label_pred']
    errors = joined[disagreement].copy()

    # A label missing on either side means that side considered the token "O"
    for column in ('label_actual', 'label_pred'):
        errors[column] = errors[column].fillna('O')

    actual = errors['label_actual']
    predicted = errors['label_pred']

    # Missed or mislabelled PII counts as FN; PII predicted on an "O" token as FP
    missed = (actual != 'O') & ((predicted == 'O') | (actual != predicted))
    spurious = (actual == 'O') & (predicted != 'O')

    errors['error'] = np.select([missed, spurious], ['FN', 'FP'], default=None)

    return errors

            
def do_downsample(train_df, ratio, random_state=None):
    '''
    Down-sample documents that contain no PII (every label is "O").

    Documents with at least one non-"O" label are always kept; a random
    subset of the all-"O" documents is kept alongside them.

    Parameters:
    - train_df (DataFrame): needs a 'labels' column holding one iterable of
      label strings per document.
    - ratio (float): fraction of all-"O" documents to keep, between 0 and 1
      (1 keeps them all, 0 drops them all).
    - random_state: optional seed forwarded to ``DataFrame.sample`` so the
      selection can be reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns:
    - DataFrame: the PII documents followed by the sampled all-"O" documents,
      with a fresh 0..n-1 index.

    Raises:
    - ValueError: if ``ratio`` is outside [0, 1].
    '''
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    # Computed once: a document is either "has PII" or "all O", never both
    has_pii = train_df['labels'].apply(
        lambda doc_labels: any(label != "O" for label in doc_labels)
    ).astype(bool)

    positives = train_df[has_pii]
    negatives = train_df[~has_pii]

    # Keep only the requested share of the PII-free documents
    negatives = negatives.sample(int(len(negatives) * ratio), random_state=random_state)

    return pd.concat([positives, negatives], ignore_index=True)

def parse_predictions(predictions, id2label, ds, threshold=0.9):
    """
    Turn model logits into a table of predicted PII tokens.

    A sub-token is labelled "O" only when the softmax probability of "O" is at
    least ``threshold``; otherwise the most likely non-"O" label is used. Each
    sub-token prediction is then mapped back to the original token it starts in.

    Parameters:
    - predictions (ndarray): logits indexed as [row, sub-token, label].
    - id2label (dict): integer label id -> label string; must contain "O".
    - ds: dataset (or dict of lists) with 'token_map', 'offset_mapping',
      'tokens' and 'document' columns, one entry per row of ``predictions``.
    - threshold (float): minimum "O" probability needed to accept "O".

    Returns:
    - DataFrame with columns 'eval_row', 'document', 'token', 'label',
      'token_str' and 'row_id', one row per unique
      (document, token, label) prediction.

    Raises:
    - ValueError: if ``id2label`` has no "O" label.
    """
    # Locate the "O" class from the mapping instead of assuming it is index 12
    o_index = next((int(idx) for idx, name in id2label.items() if name == "O"), None)
    if o_index is None:
        raise ValueError('id2label must contain the "O" label')

    # Scale last dimension to probabilities for interpretability
    probs = softmax(predictions, axis=2)
    best_any = predictions.argmax(-1)
    o_probs = probs[:, :, o_index].copy()
    # Probabilities are >= 0, so -1 removes "O" from the argmax below.
    # `probs` is a fresh array from softmax, so editing it in place is safe.
    probs[:, :, o_index] = -1.0
    best_non_o = probs.argmax(-1)
    preds_final = np.where(o_probs < threshold, best_non_o, best_any)

    seen = set()
    row, document, token, label, token_str = [], [], [], [], []
    columns = zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"])
    for i, (row_preds, token_map, offsets, tokens, doc) in enumerate(columns):

        # zip stops at the shorter input, so padded positions beyond the
        # row's offsets are never visited
        for token_pred, (start_idx, end_idx) in zip(row_preds, offsets):
            # (0, 0) offsets mark special tokens
            if start_idx + end_idx == 0:
                continue

            # sub-token starts on the space inserted between tokens
            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]  # original token this sub-token starts in
            label_pred = id2label[token_pred]

            # ignore "O" predictions and whitespace preds
            if label_pred == "O" or token_id == -1:
                continue

            # The document is part of the key: the same token position and
            # text can legitimately be predicted in several documents.
            key = (doc, token_id, label_pred)
            if key in seen:
                continue
            seen.add(key)

            row.append(i)
            document.append(doc)
            token.append(token_id)
            label.append(label_pred)
            token_str.append(tokens[token_id])

    df = pd.DataFrame({
        "eval_row": row,
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    df = df.drop_duplicates().reset_index(drop=True)

    df["row_id"] = list(range(len(df)))
    return df

#modified from https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b/input
def get_reference_df(parquet_path):
    """
    Load a document-level parquet file and return its PII tokens in long form.

    Parameters:
    - parquet_path: path to a parquet file with 'document', 'tokens' and
      'labels' columns, where 'tokens' and 'labels' hold equal-length
      sequences per document.

    Returns:
    - DataFrame with columns 'row_id', 'document', 'token', 'label',
      'token_str' containing only tokens whose label is not 'O'. 'token' is the
      token's position within its document and 'row_id' is its position in the
      fully exploded (one row per token) table.
    """
    # One row per token, keeping each token paired with its own label
    per_token = (
        pd.read_parquet(parquet_path)[['document', 'tokens', 'labels']]
        .copy()
        .explode(['tokens', 'labels'])
        .reset_index(drop=True)
        .rename(columns={'tokens': 'token', 'labels': 'label'})
    )

    # Keep the text, then replace 'token' with its running index per document
    per_token['token_str'] = per_token['token']
    per_token['token'] = per_token.groupby('document').cumcount()

    # Only PII tokens are references; the exploded index becomes row_id
    pii_tokens = per_token.loc[per_token['label'] != 'O'].copy()
    pii_tokens = pii_tokens.reset_index().rename(columns={'index': 'row_id'})

    return pii_tokens[['row_id', 'document', 'token', 'label', 'token_str']].copy()



class CustomTrainer(Trainer):
    """
    Trainer whose loss is a class-weighted cross-entropy over token labels.

    Parameters (in addition to everything ``Trainer`` accepts):
    - class_weights (Tensor or None): one weight per label id, passed to
      ``CrossEntropyLoss``; ``None`` weights all classes equally.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        #class_weights is a Tensor of weights for each class
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the weighted cross-entropy loss for one batch.

        Parameters:
        - model: the model being trained; called as ``model(**inputs)``.
        - inputs (dict): batch tensors; the "labels" entry is removed from it
          before the forward pass.
        - return_outputs (bool): when True, return ``(loss, outputs)``.
        - num_items_in_batch: accepted because newer ``Trainer`` versions pass
          it to ``compute_loss``; unused here since the loss is a mean over tokens.

        Returns:
        - the loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        # Labels are removed so the model does not compute its own unweighted loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Keep the weights on the same device as the logits
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)

        # The old label-smoother branch could never run (labels were already
        # popped), so the configured smoothing factor is applied here instead;
        # the default factor of 0 leaves the loss unchanged.
        smoothing = getattr(self.args, "label_smoothing_factor", 0.0)
        loss_fct = CrossEntropyLoss(weight=weight, label_smoothing=smoothing)

        # Flatten to (tokens, labels) vs (tokens,) for the loss
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss
    

# data Functions

In [None]:
import numpy as np
from datasets import Dataset

#prep data for NER training by tokenize the text and align labels to tokens
def tokenize(example, tokenizer, label2id, max_length, stride):
    """Rebuild the document text from its tokens, tokenize it, and give every
    sub-token the label of the original token it starts in.

    Args:
        example (dict): needs "tokens", "provided_labels" and
            "trailing_whitespace", three parallel sequences.
        tokenizer (Tokenizer): called with the rebuilt text; must return
            offsets (``offset_mapping``) and ``input_ids``.
        label2id (dict): A dictionary mapping labels to their corresponding ids.
        max_length (int): The maximum length of the tokenized text.
        stride (int): forwarded to the tokenizer unchanged.
            NOTE(review): overflow windows are not requested here, so this
            presumably does not change the single returned sequence - confirm
            against the tokenizer documentation.

    Returns:
        dict: the tokenizer output plus "labels" (one label id per sub-token),
        "length" (number of input ids) and "token_map" (for each character of
        the rebuilt text, the index of its original token, or -1 for an
        inserted space).

    Reference: credit to https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """

    # rebuild text from tokens, tracking a label and a token index per character
    pieces = []
    char_labels = []
    token_map = []

    token_fields = zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    )
    for idx, (tok, tok_label, has_space) in enumerate(token_fields):
        pieces.append(tok)
        token_map.extend([idx] * len(tok))
        char_labels.extend([tok_label] * len(tok))

        if has_space:
            # the separating space belongs to no token and is never PII
            pieces.append(" ")
            char_labels.append("O")
            token_map.append(-1)

    text = "".join(pieces)

    # Tokenize text and return offsets for start and end character position. Limit length of tokenized text.
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
        stride=stride,
    )

    last_char = len(text) - 1
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # (0, 0) offsets mark special tokens (e.g. CLS), which are labelled "O"
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace: use the next character's label.
        # Clamped so a whitespace sub-token at the very end of the text cannot
        # index past the last character.
        if text[start_idx].isspace():
            start_idx = min(start_idx + 1, last_char)

        token_labels.append(label2id[char_labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length, "token_map": token_map}

#create dataset if using wandb
def create_dataset(data, tokenizer, max_length, label2id, stride):
    '''
    Convert a document DataFrame into a tokenized Hugging Face Dataset.

    data (pandas.DataFrame): needs the columns full_text, document, tokens,
        trailing_whitespace, labels and token_indices; `labels` is stored in
        the dataset under the name "provided_labels".
    tokenizer, max_length, label2id, stride: forwarded to `tokenize` for
        every example.

    Returns the Dataset after `tokenize` has been mapped over it.
    '''
    # dataset column name -> DataFrame column it is copied from
    column_sources = {
        "full_text": "full_text",
        "document": "document",
        "tokens": "tokens",
        "trailing_whitespace": "trailing_whitespace",
        "provided_labels": "labels",
        "token_indices": "token_indices",
    }
    raw_ds = Dataset.from_dict(
        {target: data[source].tolist() for target, source in column_sources.items()}
    )

    # Keyword arguments handed to `tokenize` alongside each example
    tokenize_kwargs = dict(
        tokenizer=tokenizer,
        label2id=label2id,
        max_length=max_length,
        stride=stride,
    )

    return raw_ds.map(tokenize, fn_kwargs=tokenize_kwargs, num_proc=2)

# train Functions

In [None]:
# https://www.kaggle.com/code/conjuring92/pii-metric-fine-grained-eval
from collections import defaultdict
from typing import Dict

class PRFScore:
    """Running tally of true positives, false positives and false negatives,
    with precision / recall / F-scores derived from it.

    A tiny epsilon (1e-100) in each denominator makes every score 0.0 instead
    of raising when the relevant counts are all zero.
    """

    def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None:
        self.tp, self.fp, self.fn = tp, fp, fn

    def __len__(self) -> int:
        # Total number of counted items
        return sum((self.tp, self.fp, self.fn))

    def __iadd__(self, other):  # in-place add
        for field in ("tp", "fp", "fn"):
            setattr(self, field, getattr(self, field) + getattr(other, field))
        return self

    def __add__(self, other):
        combined = PRFScore(tp=self.tp, fp=self.fp, fn=self.fn)
        combined += other
        return combined

    def score_set(self, cand: set, gold: set) -> None:
        """Add the counts obtained by comparing candidate and gold sets."""
        matched = cand & gold
        self.tp += len(matched)
        self.fp += len(cand) - len(matched)
        self.fn += len(gold) - len(matched)

    @property
    def precision(self) -> float:
        predicted = self.tp + self.fp + 1e-100
        return self.tp / predicted

    @property
    def recall(self) -> float:
        expected = self.tp + self.fn + 1e-100
        return self.tp / expected

    @property
    def f1(self) -> float:
        p, r = self.precision, self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        # F-beta with beta = 5
        beta = 5
        p, r = self.precision, self.recall
        numerator = (1 + (beta ** 2)) * p * r
        denominator = (beta ** 2) * p + r + 1e-100
        return numerator / denominator

    def to_dict(self) -> Dict[str, float]:
        return dict(p=self.precision, r=self.recall, f5=self.f5)


def compute_metrics(p, id2label, valid_ds, valid_df, threshold=0.9):
    """
    Score predictions against the reference PII tokens.

    p: pair of (predictions, labels); only the predictions are used.
    id2label, valid_ds, threshold: forwarded to `parse_predictions`.
    valid_df: reference table with `document`, `token` and `label` columns.

    A prediction is a true positive when its (document, token, label) triple is
    in the reference set, otherwise a false positive; reference triples that
    are never predicted are false negatives. Counts are grouped by entity type
    (the label without its "B-"/"I-" prefix).

    Returns a flat dict with the overall 'ents_p', 'ents_r' and 'ents_f5',
    plus 'ents_per_type_<type>_<p|r|f5>' for every entity type other than 'O'.
    """
    logits, _ = p
    pred_df = parse_predictions(logits, id2label, valid_ds, threshold=threshold)

    def entity_type(tag):
        # avoid B- and I- prefix; "O" has no prefix to strip
        return tag if tag == 'O' else tag[2:]

    # Reference triples not yet matched by a prediction
    unmatched = set(zip(valid_df.document, valid_df.token, valid_df.label))
    score_per_type = defaultdict(PRFScore)

    for candidate in zip(pred_df.document, pred_df.token, pred_df.label):
        bucket = score_per_type[entity_type(candidate[-1])]  # (document, token, label)
        if candidate in unmatched:
            bucket.tp += 1
            # each reference triple can be matched only once
            unmatched.remove(candidate)
        else:
            bucket.fp += 1

    # Whatever is left in the reference set was missed
    for _doc, _tok, ref_tag in unmatched:
        score_per_type[entity_type(ref_tag)].fn += 1

    totals = PRFScore()
    for type_score in score_per_type.values():
        totals += type_score

    final_results = {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
    }

    # Flatten the per-type scores into "ents_per_type_<type>_<metric>" keys
    for type_name, type_score in score_per_type.items():
        if type_name == 'O':
            continue
        for metric_name, metric_value in type_score.to_dict().items():
            final_results[f"ents_per_type_{type_name}_{metric_name}"] = metric_value

    return final_results

# Training Script


In [None]:
import os
from itertools import chain
from functools import partial
from transformers import AutoTokenizer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import pandas as pd
from types import SimpleNamespace
import torch
import wandb
import pickle
import re
import gc


def train(config = None):
    """
    Run one training experiment inside a W&B run and return the best threshold.

    Steps: load the train/validation parquet files (plus any external
    parquet files whose config value is not 'none'), down-sample documents
    without PII, tokenize, fine-tune a token-classification model with a
    class-weighted loss, evaluate several "O" thresholds on the validation set,
    log an error table, and save the model and tokenizer to
    ``config.experiment``.

    Parameters:
    - config: initial configuration handed to ``wandb.init``; inside the run
      all values are read back from ``wandb.config``. ``None`` lets a sweep
      agent supply the configuration.

    Returns:
    - float: the tested threshold with the highest validation F5
      (0.0 if no threshold scored above 0).
    """
    gc.collect()
    torch.cuda.empty_cache() #free up memory that isnt in use

    # Initialize new wandb run to run without sweep agent
    with wandb.init(project='pii_compare2', job_type='train', config=config):
        config = wandb.config

        # Load the training data
        train_df = pd.read_parquet(config.train_artifact_path)

        # Load the validation data
        val_df = pd.read_parquet(config.val_artifact_path)

        # Load external data
        external_paths = [
            config.external_data_1,
            config.external_data_2,
            config.external_data_3,
            config.external_data_4,
            config.external_data_5,
        ]
        for parquet_path in external_paths:
            if parquet_path != 'none':
                print(f'Loading external data...')
                ext_df = pd.read_parquet(parquet_path)
                train_df = pd.concat([train_df, ext_df], ignore_index=True)
                del ext_df

        wandb.log({'num_docs_train_raw': len(train_df)})
        #down sample
        train_df = do_downsample(train_df, config.downsample_ratio)
        wandb.log({'num_docs_train': len(train_df)})

        # Prepare references and labels from val set
        reference_df = get_reference_df(config.val_artifact_path)
        all_labels = sorted(list(set(chain(*[x.tolist() for x in val_df.labels.values])))) #get from val df
        label2id = {l: i for i,l in enumerate(all_labels)}
        id2label = {v:k for k,v in label2id.items()}

        # Create the training and validation datasets
        tokenizer = AutoTokenizer.from_pretrained(config.training_model_path)
        train_ds = create_dataset(train_df, tokenizer, config.training_max_length, label2id, config.stride)
        valid_ds = create_dataset(val_df, tokenizer, config.inference_max_length, label2id, config.stride)
        del train_df
        del val_df
        gc.collect()

        # Initialize the model and data collator
        model = AutoModelForTokenClassification.from_pretrained(
            config.training_model_path,
            num_labels=len(all_labels),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)

        # Define the training arguments
        args = TrainingArguments(
            output_dir=config.output_dir,
            fp16=config.fp16,
            learning_rate=config.learning_rate,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            # Previously configured but never passed on; 8 is the
            # TrainingArguments default, used when the config has no such key
            per_device_eval_batch_size=getattr(config, 'per_device_eval_batch_size', 8),
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            report_to=config.report_to,
            evaluation_strategy=config.evaluation_strategy,
            eval_steps = config.eval_steps,
            save_strategy = config.evaluation_strategy, #these need to be the same
            do_eval=config.do_eval,
            save_total_limit=config.save_total_limit,
            logging_steps=config.logging_steps,
            lr_scheduler_type=config.lr_scheduler_type,
            warmup_ratio=config.warmup_ratio,
            weight_decay=config.weight_decay,
            load_best_model_at_end = config.load_best_model_at_end,
            metric_for_best_model = config.metric_for_best_model ,
            greater_is_better = config.greater_is_better,
        )

        # Class weights for CustomTrainer: 1.0 for every label except "O".
        # Sized and indexed from the label mapping (rather than assuming 13
        # labels with "O" last) and placed on the device the Trainer will use.
        class_weights = torch.ones(len(all_labels))
        class_weights[label2id['O']] = config.o_weight
        class_weights = class_weights.to(args.device)

        # Initialize Trainer with custom class weights
        trainer = CustomTrainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=valid_ds,
            data_collator=collator,
            tokenizer=tokenizer,
            compute_metrics=partial(compute_metrics, id2label=id2label, valid_ds=valid_ds, valid_df=reference_df, threshold=config.threshold),
            class_weights=class_weights,
        )

        # Train the model
        trainer.train()

        del train_ds
        gc.collect()
        torch.cuda.empty_cache() #free up memory that isnt in use

        # Make predictions on the validation dataset
        preds = trainer.predict(valid_ds)

        # Score the same predictions at several "O" thresholds
        print("doing threshold tests:")
        threshold_tests = [.6,.7,.8,.9,.99]
        scores =[]

        for threshold in threshold_tests:
            metrics = compute_metrics((preds.predictions, None), id2label, valid_ds, reference_df, threshold=threshold)
            f5_score = metrics['ents_f5']
            scores.append(f5_score)
            wandb.log({'threshold': threshold, 'final_f5': f5_score})
            print(f'threshold:f5 {threshold}: {f5_score}')

        # Keep the first threshold that reaches the highest F5
        best_threshold = 0.0
        best_f5 = 0.0
        for thresh, score in zip(threshold_tests, scores):
            if score > best_f5:
                best_threshold = thresh
                best_f5 = score

        wandb.config.best_threshold = best_threshold
        wandb.log({'val_f5': best_f5})
        preds_df = parse_predictions(preds.predictions, id2label, valid_ds, threshold=best_threshold)

        #make DF of errors and save to wandb
        incorrectly_labeled = identify_incorrect_labels(reference_df, preds_df)
        errors_table = wandb.Table(dataframe=incorrectly_labeled)
        wandb.log({'errors_table': errors_table})

        # Save the model and tokenizer under the experiment name
        os.makedirs(config.experiment, exist_ok=True)
        trainer.save_model(config.experiment)
        tokenizer.save_pretrained(config.experiment)

    print('Experiment finished, test it out on the inference notebook!')

    return best_threshold

# W and B 


In [None]:
# Log in to W&B with an API key read from Kaggle secrets.
# Make sure a secret named "WANDB_API_KEY" is attached to the notebook
# (Add-ons -> Secrets); the key itself is never written into the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")

# Google Colab alternative for reading the same key:
# from google.colab import userdata
# wandb_api_key = userdata.get('WANDB_API_KEY')

import wandb
wandb.login(key=wandb_api_key)

In [None]:
# Default configuration shared by single training runs and W&B sweeps.

# Metric the sweep optimises
metric = {
    'name': 'val_f5',
    'goal': 'maximize',
}

sweep_config = {
    'method': 'bayes',  # grid, random, bayes
    'metric': metric,
}

# Every default is a fixed value, stored in the sweep format {'value': ...}
parameters_dict = {
    name: {'value': value}
    for name, value in {
        'experiment': 'pii_00',
        'threshold': 0.99,
        'o_weight': 0.05,  # set to 1 for equal weight for classes
        'downsample_ratio': 1.0,  # set to 1 for no downsample
        'raw_artifact_path': '/kaggle/input/pii-bagging-datasets/artifacts/val2.parquet',
        'train_artifact_path': '/kaggle/input/pii-bagging-datasets/artifacts/train2_fromval.parquet',
        'val_artifact_path': '/kaggle/input/pii-bagging-datasets/artifacts/val2.parquet',
        'external_data_1': 'none',
        'external_data_2': 'none',
        'external_data_3': 'none',
        'external_data_4': 'none',
        'external_data_5': 'none',
        'output_dir': 'output',
        'inference_max_length': 1024,
        'training_max_length': 1024,
        'stride': 0,  # set to 0 for no effect
        'training_model_path': 'microsoft/deberta-v3-xsmall',
        'fp16': True,
        'learning_rate': 1e-5,
        'num_train_epochs': .2,
        'per_device_train_batch_size': 1,
        'per_device_eval_batch_size': 1,
        'gradient_accumulation_steps': 1,
        'report_to': 'wandb',
        'evaluation_strategy': 'no',
        'eval_steps': 20,
        'do_eval': False,
        'save_total_limit': 1,
        'logging_steps': 10,
        'lr_scheduler_type': 'cosine',
        'warmup_ratio': 0.1,
        'weight_decay': 0.01,
        'load_best_model_at_end': False,
        'metric_for_best_model': 'ents_f5',
        'greater_is_better': True,
    }.items()
}

sweep_config['parameters'] = parameters_dict
# Same dict object on purpose: updating train_config also updates the sweep parameters
train_config = parameters_dict

# Best single model 

In [None]:
# Best single model: deberta-v3-base trained on total_train.parquet with
# val2.parquet added as external training data.
# NOTE: val2.parquet is also the default validation file, so the validation
# scores and the selected threshold of this run are measured on data the model
# was trained on.
train_config.update({
    name: {'value': value}
    for name, value in (
        ('experiment', 'pii_ens1_base_full'),
        ('training_model_path', 'microsoft/deberta-v3-base'),
        ('train_artifact_path', '/kaggle/input/pii-bagging-datasets/artifacts/total_train.parquet'),
        ('external_data_1', '/kaggle/input/pii-bagging-datasets/artifacts/val2.parquet'),
        ('learning_rate', 5e-5),
        ('num_train_epochs', 3),
        ('gradient_accumulation_steps', 1),
        ('per_device_train_batch_size', 1),
        ('stride', 256),
        ('o_weight', .76),  # set to 1 for equal weight for classes
        ('downsample_ratio', .75),  # set to 1 for no downsample
        ('training_max_length', 1500),
    )
})

# Strip the sweep-style {'value': ...} wrappers and expose the settings as attributes
config = SimpleNamespace(**{name: spec['value'] for name, spec in train_config.items()})

best_threshold = train(config)
print(f'Best Threshold : {best_threshold}')
    

# Prior experiments


In [None]:
# #Ensemble loop for bags train_bag_0.parquet to train_bag_4.parquet

# for i in range(1):
    
#     train_config.update({
#     'experiment': {
#         'value': 'pii_ens1_small_val'},#f'pii_ens1v2_bag{i}'},
#     'training_model_path': {'value': 'microsoft/deberta-v3-small'},
#     'train_artifact_path': {'value': '/kaggle/input/pii-bagging-datasets/artifacts/docs_over_3000.parquet'}, #f'/kaggle/input/pii-bagging-datasets/artifacts/train_bag{i}.parquet'},
#     'external_data_1': {'value': '/kaggle/input/pii-bagging-datasets/artifacts/val2.parquet'},
#     'learning_rate': {'value': 5e-5},
#     'num_train_epochs': {'value': 3},
#     'gradient_accumulation_steps': {'value': 2},
#     'stride': {'value': 256},
#     'o_weight': {'value': .76}, #set to 1 for equal weight for classes
#     'downsample_ratio' : {'value': .75},  # set to 1 for no downsample
#     'training_max_length': {'value': 3000},
#     })

#     # Extract inner values from the dictionary
#     config = {k: v['value'] for k, v in train_config.items()}

#     # Convert to SimpleNamespace
#     config = SimpleNamespace(**config)
#     best_threshold = train(config)
#     print(f'Best Threshold : {best_threshold}')
    

# base

In [None]:
# #base

# id2label ={'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL',} # '12': 'O'}

# for label in id2label.values():
    
#     train_config.update({
#     'experiment': {
#         'value': f'pii_ens1_{label}'},
#     'training_model_path': {'value': 'microsoft/deberta-v3-base'},
#     'train_artifact_path': {'value': f'/kaggle/input/pii-bagging-datasets/artifacts/by_class_{label}.parquet'},
#     'external_data_1': {'value': 'none'},
#     'learning_rate': {'value': 5e-5},
#     'num_train_epochs': {'value': 2},
#     'gradient_accumulation_steps': {'value': 1},
#     'per_device_train_batch_size': {'value': 2},
#     'stride': {'value': 256},
#     'o_weight': {'value': .76}, #set to 1 for equal weight for classes
#     'downsample_ratio' : {'value': .75},  # set to 1 for no downsample
#     'training_max_length': {'value': 1500},
#     })

#     # Extract inner values from the dictionary
#     config = {k: v['value'] for k, v in train_config.items()}

#     # Convert to SimpleNamespace
#     config = SimpleNamespace(**config)
#     best_threshold = train(config)
#     print(f'Best Threshold : {best_threshold}')
    

# large

In [None]:
# #Ensemble loop for bags train_bag_0.parquet to train_bag_4.parquet

# for i in range(1):
    
#     train_config.update({
#     'experiment': {
#         'value': 'pii_ens1_large_val'},#f'pii_ens1v2_bag{i}'},
#     'training_model_path': {'value': 'microsoft/deberta-v3-large'},
#     'train_artifact_path': {'value': '/kaggle/input/pii-bagging-datasets/artifacts/docs_over_3000.parquet'}, #f'/kaggle/input/pii-bagging-datasets/artifacts/train_bag{i}.parquet'},
#     'external_data_1': {'value': '/kaggle/input/pii-bagging-datasets/artifacts/val2.parquet'},
#     'learning_rate': {'value': 1e-5},
#     'num_train_epochs': {'value': 3},
#     'gradient_accumulation_steps': {'value': 1},
#     'stride': {'value': 0},
#     'o_weight': {'value': .76}, #set to 1 for equal weight for classes
#     'downsample_ratio' : {'value': .75},  # set to 1 for no downsample
#     'training_max_length': {'value': 1000},
#     })

#     # Extract inner values from the dictionary
#     config = {k: v['value'] for k, v in train_config.items()}

#     # Convert to SimpleNamespace
#     config = SimpleNamespace(**config)
#     best_threshold = train(config)
#     print(f'Best Threshold : {best_threshold}')
    

# lakshyak

In [None]:
# #lakshyak

# for i in range(1):
    
#     train_config.update({
#     'experiment': {
#         'value': 'pii_ens1_large_val'},#f'pii_ens1v2_bag{i}'},
#     'training_model_path': {'value': 'microsoft/deberta-v3-large'},
#     'train_artifact_path': {'value': '/kaggle/input/pii-prep-ens1/artifacts/docs_over_3000.parquet'}, #f'/kaggle/input/pii-bagging-datasets/artifacts/train_bag{i}.parquet'},
#     'external_data_1': {'value': '/kaggle/input/pii-prep-ens1/artifacts/val2.parquet'},
#     'learning_rate': {'value': 5e-5},
#     'num_train_epochs': {'value': 3},
#     'stride': {'value': 256},
#     'o_weight': {'value': .76}, #set to 1 for equal weight for classes
#     'downsample_ratio' : {'value': .75},  # set to 1 for no downsample
#     'training_max_length': {'value': 2000},
#     })

#     # Extract inner values from the dictionary
#     config = {k: v['value'] for k, v in train_config.items()}

#     # Convert to SimpleNamespace
#     config = SimpleNamespace(**config)
#     best_threshold = train(config)
#     print(f'Best Threshold : {best_threshold}')
    

# Sweep

Note: when running a sweep, it is recommended not to compute and log the error table in the training script.

In [None]:
# #update train parameters using dictionary so that it works with sweep

# # Update 'method' in sweep_config
# sweep_config['method'] = 'grid'

# train_config.update({
#     'experiment': {
#         'value': 'pii_train2'},
#     'training_model_path': {'value': 'microsoft/deberta-v3-xsmall'},
#     'train_artifact_path': {'value': '/kaggle/input/pii-bagging-datasets/artifacts/train2_fromval.parquet'},
#     'learning_rate': {'value': 1e-4},
#     'stride': {'value': 128},
#     'o_weight': {'value': .76}, #set to 1 for equal weight for classes
#     'downsample_ratio' : {'value': 1},  # set to 1 for no downsample
#     'inference_max_length': {'value': 1024},
#     })

# parameters_dict.update({
#     'training_max_length': {'values': [512, 1024, 2000, 3000]},
# })

In [None]:
# # runs training script
# sweep_id = wandb.sweep(sweep_config, project= 'PII_sweep2')
# wandb.agent(sweep_id, train, count=4)