# Setting up and experimenting with training pipeline

For this task I plan to set up a token classification pipeline with the dataset created at the previous step.

Inspiration comes from this tutorial on HF: https://huggingface.co/docs/peft/task_guides/token-classification-lora

My training data is structured the same way as in the tutorial, except for the token classes - I have just 2.
I suppose almost all of the code will be very similar to the code in the provided guide, and I can then iterate from this starting point.

In [1]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np
import os

## Set up WandB

In [None]:
# Configure Weights & Biases through environment variables before any run is started.
os.environ["WANDB_PROJECT"] = "wiki-comma-placement" # name of the W&B project the runs are logged to
os.environ["WANDB_LOG_MODEL"] = "checkpoint" # log all model checkpoints

In [None]:
import wandb
# Authenticate with W&B; the training run reports to it (report_to="wandb").
wandb.login()

In [None]:
# Sequence-labelling metric used by compute_metrics to score predicted tags.
seqeval = evaluate.load("seqeval")

## Some common params and config variables.

They will go into a separate file once I finish the setup.

In [None]:
# Base checkpoint that is loaded for both the tokenizer and the model.
model_checkpoint = "roberta-large"
# Optimisation hyperparameters handed to TrainingArguments.
lr = 1e-3
batch_size = 16  # used for both the train and the eval batch size
num_epochs = 10

dataset_path = "just097/wiki-comma-placement" # My formatted dataset
# Used as the W&B run name and as the Hub repository name of the trained model.
model_name = "roberta-large-lora-comma-placement"
# Local directory intended for this model's training checkpoints.
checkpoints_path = f"../models/{model_name}"

### Our labels

In [None]:
# Two token classes: "O" (id 0) and "B-COMMA" (id 1). A token predicted as
# B-COMMA gets a comma appended after it when the text is rebuilt.
ID2LABEL = {0: "O", 1: "B-COMMA"}
# Inverse mapping, derived from ID2LABEL so the two can never disagree.
LABEL2ID = {tag: idx for idx, tag in ID2LABEL.items()}

## Set up metrics

In [None]:
label_list = ["O", "B-COMMA"]


def compute_metrics(p):
    """Score token-level predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2; ``labels``
            holds the reference class ids, with -100 marking positions that
            must be ignored.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        # Keep only the positions that carry a real label (-100 means "ignore").
        kept = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[reference_id] for _, reference_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    # seqeval prefixes the aggregate scores with "overall_"; expose them unprefixed.
    return {
        metric: results[f"overall_{metric}"]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Load the comma-placement dataset; its "train" and "validation" splits are used for training.
wiki_comma_placement = load_dataset(dataset_path)

In [None]:
# Tokenizer of the base checkpoint. The inputs are tokenized pre-split into words
# (is_split_into_words=True), which is presumably why add_prefix_space=True is set here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Each example's ``tokens`` are tokenized with the module-level ``tokenizer``.
    The first sub-token of every word receives that word's tag from ``tags``.
    Every other position (no word id, or a repeated word id) gets -100.

    Args:
        examples: A batch with ``"tokens"`` (lists of words) and ``"tags"``
            (lists of per-word class ids).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of aligned label ids per example.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a word is labelled; positions without
            # a word id and continuation sub-tokens are masked with -100.
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Tokenize the dataset; batched=True because tokenize_and_align_labels works on whole batches.
tokenized_wiki = wiki_comma_placement.map(tokenize_and_align_labels, batched=True)

In [None]:
# Collator that assembles tokenized examples into batches for the Trainer
# (presumably padding inputs and labels per batch - see the HF docs).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Set up the model

In [None]:
# Base model with a token-classification head sized to the label set.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

In [None]:
# LoRA adapter configuration for token classification: rank 16, alpha 16,
# dropout 0.1, no bias training. inference_mode=False because the adapter is trained here.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="none"
)

In [None]:
# Wrap the base model with the LoRA configuration and report how many parameters are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Training configuration. Evaluation and checkpointing both happen once per
# epoch, which load_best_model_at_end needs so it can restore the best checkpoint.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # Write checkpoints to the model-specific directory from the config cell
    # instead of the directory name hard-coded in the tutorial.
    output_dir=checkpoints_path,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",
    run_name=model_name,
    logging_steps=1,  # log the training loss at every step
    load_best_model_at_end=True,
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the tokenized train/validation
# splits, the collator and the seqeval-based metric function, then start training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wiki["train"],
    eval_dataset=tokenized_wiki["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Close the W&B run now that training is done.
wandb.finish()

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub so the trained model can be pushed.
notebook_login()

In [None]:
# Upload the trained model to the Hub under model_name
# (for a PEFT-wrapped model this is presumably just the adapter weights - confirm).
model.push_to_hub(model_name)

## Test eval and Inference example

In [None]:
# Rebuild the fine-tuned model for inference: base checkpoint + LoRA adapter from the Hub.
peft_model_id = f"just097/{model_name}"
config = PeftConfig.from_pretrained(peft_model_id)
# The classification head must match the two classes used for training; the
# tutorial's num_labels=11 contradicted the 2-entry label maps passed alongside it.
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=len(ID2LABEL), id2label=ID2LABEL, label2id=LABEL2ID
)
# Use the same add_prefix_space option as the training tokenizer so that the
# first word of a sentence is encoded the same way as during training.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, add_prefix_space=True)
model = PeftModel.from_pretrained(inference_model, peft_model_id)
# Switch to evaluation mode so dropout layers are inactive while predicting.
model.eval()

In [None]:
# Comma-free example sentences used to eyeball the model's predictions.
sample_sentences = ["one two three.", "Hey Mark how are you?", "You have to buy milk bread and coffee."]

In [None]:
def infer(text):
    """Predict a class id for every token of ``text`` and print the result.

    Uses the module-level ``tokenizer`` and ``model``. Each
    ``(token, label name)`` pair is printed as it is produced.

    Args:
        text: The raw sentence to tag.

    Returns:
        A ``(tokens, predictions)`` pair: the token strings of the encoded
        input and a NumPy array with the predicted class id per token.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.inference_mode():
        outputs = model(**encoded)

    token_strings = encoded.tokens()
    # Highest-scoring class per position; [0] selects the single sentence in the batch.
    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].numpy()

    id2label = model.config.id2label
    for token_string, predicted_id in zip(token_strings, predicted_ids):
        print((token_string, id2label[predicted_id]))
    return token_strings, predicted_ids

In [None]:
def convert_to_text(tokens, predictions, special_tokens=("<s>", "</s>", "<pad>"), space_marker="\u0120"):
    """Rebuild readable text from tokens, inserting the predicted commas.

    The tokens are raw tokenizer pieces. Joining them verbatim leaves
    sentence-boundary tokens and the tokenizer's space marker in the output,
    so both are handled here: special tokens are dropped and the marker is
    turned back into a space.

    Args:
        tokens: Token strings, as returned by ``infer``.
        predictions: Predicted class id per token; 1 means "comma after this
            token".
        special_tokens: Token strings that are not part of the text and are
            skipped (defaults to the RoBERTa special tokens).
        space_marker: Character the tokenizer uses for a leading space
            (default U+0120, the byte-level BPE marker).

    Returns:
        The reconstructed sentence, stripped of surrounding whitespace.
        NOTE(review): only the space marker is decoded; non-ASCII characters
        stay in their byte-level form.
    """
    pieces = []
    for token, prediction in zip(tokens, predictions):
        if token in special_tokens:
            continue
        piece = token.replace(space_marker, " ")
        if prediction == 1:
            piece += ","
        pieces.append(piece)
    return "".join(pieces).strip()

In [None]:
# Run every sample through the model and print the sentence with predicted commas inserted.
for sample in sample_sentences:
    tokens, predictions = infer(sample)
    print(convert_to_text(tokens, predictions))