# Setting up and experimenting with training pipeline

For this task I plan to set up a token classification pipeline with the dataset created at the previous step.

Inspiration comes from this tutorial on HF: https://huggingface.co/docs/peft/task_guides/token-classification-lora

My training data is structured the same way as in the tutorial, except for the token classes - I have just 2.
I suppose almost all of the code will be very similar to the code in the provided guide, and I can then iterate from this starting point.

In [1]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np
import os

## Set up WandB

In [2]:
# Configure Weights & Biases through environment variables.
os.environ.update(
    {
        "WANDB_PROJECT": "wiki-comma-placement",  # name your W&B project
        "WANDB_LOG_MODEL": "checkpoint",  # log all model checkpoints
    }
)

In [3]:
# Authenticate this session with Weights & Biases (runs after the WANDB_* variables are set above).
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtemnov-dmitry[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Sequence-labelling metric; compute_metrics calls seqeval.compute(...) on the decoded tags.
seqeval = evaluate.load("seqeval")

## Some common params and config variables.

These will go into a separate file once I finish the setup.

In [5]:
# --- Model and data identifiers ---
base_model = "roberta-base"  # backbone checkpoint
dataset_path = "just097/wiki-comma-placement"  # my formatted dataset
model_name = "roberta-base-lora-comma-placement"
checkpoints_path = f"../models/{model_name}"

# --- Training hyperparameters ---
lr = 1e-3
batch_size = 32
num_epochs = 1

### Our labels

In [6]:
# Class id -> tag name. The reverse mapping is derived from it so the two cannot drift apart.
ID2LABEL = {0: "O", 1: "B-COMMA"}
LABEL2ID = {tag: class_id for class_id, tag in ID2LABEL.items()}

## Set up metrics

In [7]:
label_list = ["O", "B-COMMA"]

def compute_metrics(p):
    """Turn raw model outputs into seqeval precision/recall/F1/accuracy.

    :param p: A ``(predictions, labels)`` pair. The predictions are arg-maxed over
    axis 2 (assumed to be the class axis - TODO confirm against the Trainer output).
    :return: Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labelled -100 are excluded from both the predictions and the references.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    for metric in ("precision", "recall", "f1", "accuracy"):
        summary[metric] = results[f"overall_{metric}"]
    return summary

In [8]:
# Load the dataset identified by dataset_path; its "train" and "validation" splits are used for training below.
wiki_comma_placement = load_dataset(dataset_path)

In [9]:
# add_prefix_space=True: the inputs are pre-split words (tokenize_and_align_labels passes
# is_split_into_words=True), which the RoBERTa fast tokenizer only accepts with this flag.
tokenizer = AutoTokenizer.from_pretrained(base_model, add_prefix_space=True)

In [10]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-word tokens.

    Only the first sub-token of every word receives that word's tag. Special tokens
    (word id ``None``) and the remaining sub-tokens of a word are labelled ``-100``,
    the value that ``compute_metrics`` filters out.

    :param examples: Batch with a ``"tokens"`` column (list of words per sentence) and a
    ``"tags"`` column (one tag id per word).
    :return: The tokenizer output with an added ``"labels"`` entry, one list per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation sub-token of the word already labelled.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries the word-level tag.
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
# batched=True: tokenize_and_align_labels receives a batch of examples per call and adds the "labels" column.
tokenized_wiki = wiki_comma_placement.map(tokenize_and_align_labels, batched=True)

In [12]:
# Batch collator for token classification; passed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Set up the model

In [13]:
# Base model with a two-class token-classification head. The head is newly initialised
# (see the warning printed below), so it has to be trained.
model = AutoModelForTokenClassification.from_pretrained(
    base_model, num_labels=2, id2label=ID2LABEL, label2id=LABEL2ID
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# LoRA adapter settings for token classification. The classifier head is listed in
# modules_to_save so it is trained and stored alongside the adapter weights.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    modules_to_save=["classifier"],
)

In [15]:
# Wrap the base model with the LoRA config and report how many parameters remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 591,362 || all params: 124,647,940 || trainable%: 0.47442581080762347


In [16]:
training_args = TrainingArguments(
    # What to train and where checkpoints are written
    output_dir=f"{model_name}-finetuned",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Optimisation
    learning_rate=lr,
    weight_decay=0.01,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best F1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Logging to W&B
    report_to="wandb",
    run_name=model_name,
    logging_steps=1,
)

In [17]:
# Train on the "train" split and evaluate on "validation" with the seqeval-based compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wiki["train"],
    eval_dataset=tokenized_wiki["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# Close the W&B run once training has finished.
wandb.finish()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0264,0.04143,0.831474,0.850108,0.840688,0.983288


[34m[1mwandb[0m: Adding directory to artifact (./roberta-base-lora-comma-placement-finetuned/checkpoint-2582)... Done. 0.0s
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.




0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
eval/accuracy,0.98329
eval/f1,0.84069
eval/loss,0.04143
eval/precision,0.83147
eval/recall,0.85011
eval/runtime,26.1903
eval/samples_per_second,788.497
eval/steps_per_second,24.666
train/epoch,1.0
train/global_step,2582.0


## Test eval and Inference example

In [None]:
# Reload the fine-tuned LoRA adapter on top of a freshly loaded base model for inference.
# Note that this cell rebinds the `tokenizer` and `model` names used during training.
peft_model_id = "just097/roberta-base-lora-comma-placement-finetuned"
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=ID2LABEL, label2id=LABEL2ID
)
# NOTE(review): the training tokenizer was created with add_prefix_space=True, this one is not -
# confirm the difference is intended for raw-text inference.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, peft_model_id)
# Use the GPU when one is available; fall back to CPU instead of failing in model.cuda().
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

In [76]:
# Comma-free inputs used to eyeball the model's predictions.
sample_sentences = [
    "one two three.",
    "Hey Mark how are you?",
    "This sentence shouldn't have any commas.",
    "You have to buy milk bread and coffee.",
    "This sentence shoud have comma here here and here however it doesn't.",
]

In [65]:
def infer(text):
    """Run the comma model on a single raw string.

    :param text: The input string to tokenize and classify.
    :return: A 3-tuple of the token strings, the predicted label name for every token,
    and the offset mapping of the first (only) sequence as a NumPy array.
    """
    encoding = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, return_length=True)
    # Move the encoded inputs to wherever the model lives.
    encoding.to(model.device)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    with torch.inference_mode():
        logits = model(input_ids, attention_mask).logits

    # Highest-scoring class per token, for the first sequence in the batch.
    predicted_ids = torch.argmax(logits, dim=2).detach().cpu()[0].numpy()
    labels = [model.config.id2label[class_id] for class_id in predicted_ids]
    offsets = encoding['offset_mapping'][0].detach().cpu().numpy()
    return encoding.tokens(), labels, offsets

In [68]:
def _should_insert_comma(label, result, current_offset) -> bool:
    # Only insert commas for the final token of a word, that is, if next word starts with a space.
    # TODO perhaps for low confidence tokens, we should use the original decision of the user in the input?
    return label == 'B-COMMA'

def fix_commas_based_on_labels_and_offsets(
        labels: list[str],
        original_s: str,
        offset_map: list[tuple[int, int]]
) -> str:
    """
    This function returns the original string with only commas fixed, based on the predicted labels from the main
    model and the offsets from the tokenizer.
    :param labels: Predicted labels for the tokens.
    Should already be converted to string, since we will look for B-COMMA tags.
    :param original_s: The original string, used to preserve original spacing and punctuation.
    :param offset_map: List of offsets in the original string, we will only use the second integer of each pair
    indicating where the token ended originally in the string.
    :return: The string with commas fixed, and everything else intact.
    """
    result = original_s
    commas_inserted = 0

    for i, label in enumerate(labels):
        current_offset = offset_map[i][1] + commas_inserted
        if _should_insert_comma(label, result, current_offset):
            result = result[:current_offset] + ',' + result[current_offset:]
            commas_inserted += 1
    return result



In [77]:
def convert_to_text(text: str) -> str:
    """Predict comma positions for ``text`` and return it with the commas inserted."""
    # The token strings returned by infer() are not needed here, only labels and offsets.
    _, predicted_labels, offsets = infer(text)
    return fix_commas_based_on_labels_and_offsets(predicted_labels, text, offsets)

In [78]:
# Print the model's comma-corrected version of every sample sentence.
for sentence in sample_sentences:
    print(convert_to_text(sentence))

one, two, three.
Hey Mark, how are you?
This sentence shouldn't have any commas.
You have to buy milk, bread, and coffee.
This sentence shoud have comma here, here and here, however, it doesn't.
