# PII W and B Training
-removed stride compared to reference
-raw data is full competition training comes from base_data artifact
Reference:
- https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b 
- https://colab.research.google.com/github/wandb/examples/blob/master/colabs/pytorch/Organizing_Hyperparameter_Sweeps_in_PyTorch_with_W%26B.ipynb#scrollTo=eFhyArSz826Q

# Imports

In [1]:
!pip install seqeval evaluate transformers -q

In [2]:
!pip install --upgrade wandb -q
import wandb

In [3]:
from pathlib import Path
import os

import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd

2024-04-16 17:38:27.082475: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 17:38:27.082646: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 17:38:27.254846: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Util Functions

In [4]:
# https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b/input?select=utils.py
# https://www.kaggle.com/code/valentinwerner/915-deberta3base-inference?scriptVersionId=161126788
# https://www.kaggle.com/code/sinchir0/visualization-code-using-displacy

import os
import json
import numpy as np
import pandas as pd
import wandb
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.auto import tqdm
import argparse
from ast import literal_eval
from transformers import Trainer
from torch.nn import CrossEntropyLoss
from scipy.special import softmax
from transformers import TrainerCallback

#call back to log loss for sweep analysis
class WandbLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, trainer=None, **kwargs):
        # Log metrics to wandb
        if trainer is not None:
            logs = {}
            # Log loss
            logs["loss"] = trainer.state.log_history[-1]["loss"]
            wandb.log(logs)

def parse_predictions(predictions, id2label, ds, threshold=0.9):
    
    # Scale last dimension to probabilities for interpretability
    pred_softmax = softmax(predictions, axis=2)
    preds = predictions.argmax(-1)
    preds_without_O = pred_softmax[:,:,:12].argmax(-1)
    O_preds = pred_softmax[:,:,12]
    #preds_final = predictions.argmax(-1) #Choose label with max probability
    preds_final = np.where(O_preds < threshold, preds_without_O , preds)

    triplets = set()
    row, document, token, label, token_str = [], [], [], [], []
    for i, (p, token_map, offsets, tokens, doc, indices) in enumerate(zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"], ds["token_indices"])):

        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[token_pred]

            if start_idx + end_idx == 0: continue

            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map): break

            #CHECK
            token_id = token_map[start_idx] #token ID at the start of the index
#             original_token_id = token_map[start_idx]
#             token_id = indices[original_token_id]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                triplet = (label_pred, token_id, tokens[token_id])

                if triplet not in triplets:
                    row.append(i)
                    document.append(doc)
                    token.append(token_id)
                    label.append(label_pred)
                    token_str.append(tokens[token_id])
                    triplets.add(triplet)

    df = pd.DataFrame({
        "eval_row": row,
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    df = df.drop_duplicates().reset_index(drop=True)

    df["row_id"] = list(range(len(df)))
    return df

#CHECK- modified from https://www.kaggle.com/code/thedrcat/pii-data-detection-train-with-w-b/input
def get_reference_df(artifact, filename='val_data.parquet'): 
    raw_artifact = wandb.use_artifact(artifact)
    raw_artifact_dir = raw_artifact.download()
    raw_df = pd.read_parquet(raw_artifact_dir + f'/{filename}')
    
    ref_df = raw_df[['document', 'tokens', 'labels']].copy()
    ref_df = ref_df.explode(['tokens', 'labels']).reset_index(drop=True).rename(columns={'tokens': 'token', 'labels': 'label'})
    ref_df['token'] = ref_df.groupby('document').cumcount()
        
    reference_df = ref_df[ref_df['label'] != 'O'].copy()
    reference_df = reference_df.reset_index().rename(columns={'index': 'row_id'})
    reference_df = reference_df[['row_id', 'document', 'token', 'label']].copy()
    
    return reference_df



class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Assuming class_weights is a Tensor of weights for each class
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels
        labels = inputs.pop("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # Reshape for loss calculation
        loss_fct = CrossEntropyLoss(weight=self.class_weights)
        if self.label_smoother is not None and "labels" in inputs:
            loss = self.label_smoother(outputs, inputs)
        else:
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss
    

# data Functions

In [5]:
import numpy as np
from datasets import Dataset

#prep data for NER training by tokenize the text and align labels to tokens
def tokenize(example, tokenizer, label2id, max_length):
    """This function ensures that the text is correctly tokenized and the labels 
    are correctly aligned with the tokens for NER training.

    Args:
        example (dict): The example containing the text and labels.
        tokenizer (Tokenizer): The tokenizer used to tokenize the text.
        label2id (dict): A dictionary mapping labels to their corresponding ids.
        max_length (int): The maximum length of the tokenized text.

    Returns:
        dict: The tokenized example with aligned labels.

    Reference: credit to https://www.kaggle.com/code/valentinwerner/915-deberta3base-training/notebook
    """

    # rebuild text from tokens
    text = []
    labels = []
    token_map = [] 
    
    idx = 0

    #iterate through tokens, labels, and trailing whitespace using zip to create tuple from three lists
    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        
        text.append(t)
        token_map.extend([idx]*len(t)) 
        #extend so we can add multiple elements to end of list if ws
        labels.extend([l] * len(t))
        
        if ws:
            text.append(" ")
            labels.append("O")
            token_map.append(-1) #CHECK
            
        idx += 1

    #Tokenize text and return offsets for start and end character position. Limit length of tokenized text.
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=max_length, truncation=True) #TODO check truncation

    #convert to np array for indexing
    labels = np.array(labels)

    # join text list into a single string 
    text = "".join(text)
    token_labels = []

    #iterate through each tolken
    for start_idx, end_idx in tokenized.offset_mapping:
        #if special tolken (CLS token) then append O
        #CLS : classification token added to the start of each sequence
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        #append orginal label to token_labels
        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length,"token_map": token_map, } 

#create dataset if using wandb
def create_dataset(data, tokenizer, max_length, label2id):
    '''
    data(pandas.DataFrame): for wandb artifact
    '''
    # Convert data to Hugging Face Dataset object
    ds = Dataset.from_dict({
        "full_text": data.full_text.tolist(),
        "document": data.document.tolist(),
        "tokens": data.tokens.tolist(),
        "trailing_whitespace": data.trailing_whitespace.tolist(),
        "provided_labels": data.labels.tolist(),
        "token_indices": data.token_indices.tolist(),
    })

    # Map the tokenize function to the Dataset
    ds = ds.map(
        tokenize,
        fn_kwargs={      # pass keyword args
            "tokenizer": tokenizer,
            "label2id": label2id,
            "max_length": max_length
        }, 
        num_proc=3
    )

    return ds

# Metric Functions

In [6]:
# https://www.kaggle.com/code/conjuring92/pii-metric-fine-grained-eval

from collections import defaultdict
from typing import Dict
# from utils import parse_predictions #SCRIPT version

class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


def compute_metrics(p, id2label, valid_ds, valid_df, threshold=0.9):
    """
    Compute the LB metric (lb) and other auxiliary metrics
    """
    predictions, labels = p
    
    pred_df = parse_predictions(predictions, id2label, valid_ds, threshold=threshold)
    
    references = zip(valid_df.document, valid_df.token, valid_df.label)
    predictions = zip(pred_df.document, pred_df.token, pred_df.label)
    
    score_per_type = defaultdict(PRFScore)
    references = set(references)

    for ex in predictions:
        pred_type = ex[-1] # (document, token, label)
        if pred_type != 'O':
            pred_type = pred_type[2:] # avoid B- and I- prefix
            
        if pred_type not in score_per_type:
            score_per_type[pred_type] = PRFScore()

        if ex in references:
            score_per_type[pred_type].tp += 1
            references.remove(ex)
        else:
            score_per_type[pred_type].fp += 1

    for doc, tok, ref_type in references:
        if ref_type != 'O':
            ref_type = ref_type[2:] # avoid B- and I- prefix
        
        if ref_type not in score_per_type:
            score_per_type[ref_type] = PRFScore()
        score_per_type[ref_type].fn += 1

    totals = PRFScore()
    
    for prf in score_per_type.values():
        totals += prf

    results = {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
        "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items() if k!= 'O'},
    }
    
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                if isinstance(v, dict):
                    for n2, v2 in v.items():
                        final_results[f"{key}_{n}_{n2}"] = v2
                else:
                    final_results[f"{key}_{n}"] = v              
        else:
            final_results[key] = value
            
    return final_results

# Training Script


In [7]:
import os
from itertools import chain
from functools import partial
from transformers import AutoTokenizer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import pandas as pd
from types import SimpleNamespace
import torch
import wandb


def train(config = None):
    # Initialize new wandb run
    with wandb.init(config=config):
    #if called by wandb.agent this will be set by sweep controller
        config = wandb.config
        
       # Load the training data
        train_artifact = wandb.use_artifact(config.train_artifact)
        train_artifact_dir = train_artifact.download()
        print(f'train art dir: {train_artifact_dir}')
        print(f'train_artifact_name: {config.train_artifact_name}')
        train_df = pd.read_parquet(train_artifact_dir + '/'+ config.train_artifact_name +'.parquet')

        # Load the validation data
        val_artifact = wandb.use_artifact(config.val_artifact)
        val_artifact_dir = val_artifact.download()
        val_df = pd.read_parquet(val_artifact_dir + '/' + config.val_artifact_name + '.parquet')
        eval_df = val_df.copy()
        
        # Load external data
        for art in [config.external_data_1, config.external_data_2, config.external_data_3, config.external_data_4, config.external_data_5]:
            if art != 'none':
                print(f'Loading external data {art}...')
                artifact = wandb.use_artifact(art)
                artifact_dir = artifact.download()
                ext_df = pd.read_parquet(artifact_dir + '/ext_data.parquet')
                train_df = pd.concat([train_df, ext_df], ignore_index=True)

        # Prepare references and labels from val set
        reference_df = get_reference_df(config.val_artifact)
        all_labels = sorted(list(set(chain(*[x.tolist() for x in val_df.labels.values])))) #get from val df
        label2id = {l: i for i,l in enumerate(all_labels)}
        id2label = {v:k for k,v in label2id.items()}

        # Create the training and validation datasets
        tokenizer = AutoTokenizer.from_pretrained(config.training_model_path)
        train_ds = create_dataset(train_df, tokenizer, config.training_max_length, label2id)
        valid_ds = create_dataset(val_df, tokenizer, config.inference_max_length, label2id)

        # Initialize the model and data collator
        model = AutoModelForTokenClassification.from_pretrained(
            config.training_model_path,
            num_labels=len(all_labels),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

        # Define the training arguments
        args = TrainingArguments(
            output_dir=config.output_dir, 
            fp16=config.fp16,
            learning_rate=config.learning_rate,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            report_to=config.report_to,
            evaluation_strategy=config.evaluation_strategy,
            eval_steps = config.eval_steps,
            save_strategy = config.evaluation_strategy, #these need to be the same
            do_eval=config.do_eval,
            save_total_limit=config.save_total_limit,
            logging_steps=config.logging_steps,
            lr_scheduler_type=config.lr_scheduler_type,
            warmup_ratio=config.warmup_ratio,
            weight_decay=config.weight_decay,
            load_best_model_at_end = config.load_best_model_at_end,
            metric_for_best_model = config.metric_for_best_model ,
            greater_is_better = config.greater_is_better,
        )

        #class weights based on dataset to go to CustomTrainer Class #TODO try without or move to config or make refactor
        class_weights = torch.tensor([1.]*12 + [config.o_weight]).to('cuda')

        # Initialize Trainer with custom class weights
        trainer = CustomTrainer(
            model=model, 
            args=args, 
            train_dataset=train_ds,
            eval_dataset=valid_ds,
            data_collator=collator, 
            tokenizer=tokenizer,
            compute_metrics=partial(compute_metrics, id2label=id2label, valid_ds=valid_ds, valid_df=reference_df, threshold=config.threshold),
            class_weights=class_weights,
            #callbacks=[WandbLoggingCallback], #added for wandb anaylsis
        )

        # Train the model
        trainer.train()    

        # Make predictions on the validation dataset
        preds = trainer.predict(valid_ds)

        #theshold tests
        print("doing threshold tests:")
        threshold_tests = [.7, .8, 0.9, 0.95, 0.99]
        scores =[]
        
        for threshold in threshold_tests:
            metrics = compute_metrics((preds.predictions, None), id2label, valid_ds, reference_df, threshold=threshold)
            f5_score = metrics['ents_f5']
            scores.append(f5_score)
            wandb.log({'threshold': threshold, 'final_f5': f5_score})
#             print(f'threshold:f5 {threshold}: {final_f5})

        best_threshold = 0.0  
        best_f5 = 0.0  
        for thresh, score in zip(threshold_tests, scores):
            if score > best_f5:
                best_threshold = thresh
                best_f5 = score
            
        wandb.config.best_threshold = best_threshold
        preds_df = parse_predictions(preds.predictions, id2label, valid_ds, threshold=best_threshold)
        
        # Save the model and upload it to Kaggle
        os.makedirs(config.experiment, exist_ok=True)
        trainer.save_model(config.experiment)
        tokenizer.save_pretrained(config.experiment)
        print('Experiment finished, test it out on the inference notebook!')
    
    

# W and B 

In [8]:
# make sure to attach key from secrets in add-ons
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")

import wandb
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
# #stop previous sweeps that might be running
# # Replace 'entity/project/your_sweep_id' with your actual sweep ID
# sweep = api.sweep("csci566sp24/PII_sweep_0/your_sweep_id")

# # Stop all runs in the sweep
# for run in sweep.runs:
#     run.finish()

In [10]:
#Sweep config
#search type
sweep_config = {
    'method': 'bayes' #grid, random, bayes
    }

#metrics for evaluation
metric = {
    'name': 'loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

#intialize parameters 
parameters_dict = {
    'experiment': {'value': 'pii_sweep'},
    'threshold': {'value': 0.99},
    'o_weight': {'value': 0.05},  # set to 1 for equal weight for classes
    'raw_artifact': {'value': 'csci566sp24/pii/base_data:v1'},
    'train_artifact': {'value': 'csci566sp24/pii/mini_no_overlap_data:v4'},
    'val_artifact': {'value': 'csci566sp24/pii/val_data:v2'},
    'train_artifact_name': {'value': 'mini_no_overlap'},
    'val_artifact_name': {'value': 'val_data'},
    'external_data_1': {'value': 'none'},
    'external_data_2': {'value': 'none'},
    'external_data_3': {'value': 'none'},
    'external_data_4': {'value': 'none'},
    'external_data_5': {'value': 'none'},
    'output_dir': {'value': 'output'},
    'inference_max_length': {'value': 1024},
    'training_max_length': {'value': 1024},
    'training_model_path': {'value': 'microsoft/deberta-v3-xsmall'},
    'fp16': {'value': True},
    'learning_rate': {'value': 1e-5},
    'num_train_epochs': {'value': .5},
    'per_device_train_batch_size': {'value': 4},
    'per_device_eval_batch_size': {'value': 4},
    'gradient_accumulation_steps': {'value': 2},
    'report_to': {'value': 'wandb'},
    'evaluation_strategy': {'value': 'epoch'},
    'eval_steps': {'value': 20},
    'do_eval': {'value': False},
    'save_total_limit': {'value': 2},
    'logging_steps': {'value': 10},
    'lr_scheduler_type': {'value': 'cosine'},
    'warmup_ratio': {'value': 0.1},
    'weight_decay': {'value': 0.01},
    'load_best_model_at_end': {'value': False},
    'metric_for_best_model': {'value': 'ents_f5'},
    'greater_is_better': {'value': True},
}

sweep_config['parameters'] = parameters_dict



In [11]:
# udpates for this sweep
#update values we dont want to vary during the sweep

# # Update method to  from bayes
# sweep_config['method'] = 'random'

parameters_dict.update({
    'num_train_epochs': {
        'value': 3},
    'per_device_eval_batch_size': {
        'value': 2},
    'per_device_train_batch_size': {
        'value': 2},
    'gradient_accumulation_steps': {
        'value': 2},
    'learning_rate': {
        'value': 1e-4},
    'evaluation_strategy': {
        'value': 'epoch'}, 
    })

parameters_dict.update({
    'o_weight': {
        'distribution': 'uniform',
        'min': 0.01,
        'max': 1
    },
})


In [12]:
#nested dictionary of parameters interested in and method we are trying
import pprint

pprint.pprint(sweep_config)

{'method': 'bayes',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'do_eval': {'value': False},
                'eval_steps': {'value': 20},
                'evaluation_strategy': {'value': 'epoch'},
                'experiment': {'value': 'pii_sweep'},
                'external_data_1': {'value': 'none'},
                'external_data_2': {'value': 'none'},
                'external_data_3': {'value': 'none'},
                'external_data_4': {'value': 'none'},
                'external_data_5': {'value': 'none'},
                'fp16': {'value': True},
                'gradient_accumulation_steps': {'value': 2},
                'greater_is_better': {'value': True},
                'inference_max_length': {'value': 1024},
                'learning_rate': {'value': 0.0001},
                'load_best_model_at_end': {'value': False},
                'logging_steps': {'value': 10},
                'lr_scheduler_type': {'value': 'cosine'},
                'metric_for

In [13]:
# runs training script
sweep_id = wandb.sweep(sweep_config, project= 'PII_sweep')
wandb.agent(sweep_id, train, count=3)

Create sweep with ID: kyq1jsxg
Sweep URL: https://wandb.ai/csci566sp24/PII_sweep/sweeps/kyq1jsxg


[34m[1mwandb[0m: Agent Starting Run: 3wmqoy7m with config:
[34m[1mwandb[0m: 	do_eval: False
[34m[1mwandb[0m: 	eval_steps: 20
[34m[1mwandb[0m: 	evaluation_strategy: epoch
[34m[1mwandb[0m: 	experiment: pii_sweep
[34m[1mwandb[0m: 	external_data_1: none
[34m[1mwandb[0m: 	external_data_2: none
[34m[1mwandb[0m: 	external_data_3: none
[34m[1mwandb[0m: 	external_data_4: none
[34m[1mwandb[0m: 	external_data_5: none
[34m[1mwandb[0m: 	fp16: True
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	greater_is_better: True
[34m[1mwandb[0m: 	inference_max_length: 1024
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	load_best_model_at_end: False
[34m[1mwandb[0m: 	logging_steps: 10
[34m[1mwandb[0m: 	lr_scheduler_type: cosine
[34m[1mwandb[0m: 	metric_for_best_model: ents_f5
[34m[1mwandb[0m: 	num_train_epochs: 3
[34m[1mwandb[0m: 	o_weight: 0.3930790882801976
[34m[1mwandb[0m: 	output_dir: output
[34m[1mwandb[0m: 

train art dir: /kaggle/working/artifacts/mini_no_overlap_data:v4
train_artifact_name: mini_no_overlap


[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/3061 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/688 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Traceback (most recent call last):
  File "/tmp/ipykernel_18/2921415804.py", line 62, in train
    args = TrainingArguments(
  File "<string>", line 124, in __init__
  File "/opt/conda/lib/python3.10/site-packages/transformers/training_args.py", line 1557, in __post_init__
    raise ValueError(
ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 🚀 View run [33mfallen-sweep-1

train art dir: /kaggle/working/artifacts/mini_no_overlap_data:v4
train_artifact_name: mini_no_overlap


[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/3061 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/688 [00:00<?, ? examples/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Traceback (most recent call last):
  File "/tmp/ipykernel_18/2921415804.py", line 62, in train
    args = TrainingArguments(
  File "<string>", line 124, in __init__
  File "/opt/conda/lib/python3.10/site-packages/transformers/training_args.py", line 1557, in __post_init__
    raise ValueError(
ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 🚀 View run [33melated-sweep-2[0m at: [34m[4mhttps://wandb.ai/csci566sp24/

train art dir: /kaggle/working/artifacts/mini_no_overlap_data:v4
train_artifact_name: mini_no_overlap


[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/3061 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/688 [00:00<?, ? examples/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Traceback (most recent call last):
  File "/tmp/ipykernel_18/2921415804.py", line 62, in train
    args = TrainingArguments(
  File "<string>", line 124, in __init__
  File "/opt/conda/lib/python3.10/site-packages/transformers/training_args.py", line 1557, in __post_init__
    raise ValueError(
ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 🚀 View run [33mdevoted-sweep-3[0m at: [34m[4mhttps://wandb.ai/csci566sp24

# TODO
- move helper functions to seperate script