# FOSSistant difficulty prediction model v0.3.0 training

## Preparation

### Setup

In [None]:
%pip install -U -q datasets evaluate "huggingface_hub[hf_xet]" "huggingface_hub[hf_transfer]"

In [None]:
# Root of the workspace; dataset and model paths in this notebook are built
# from it by plain string concatenation, so it must end with a slash.
# GDRIVE_DIR = r"/content/drive/"
ROOT_DIR = r"/teamspace/studios/this_studio/"

# Hub id of the checkpoint that gets fine-tuned.
# Alternative: MODEL_PATH = r"answerdotai/ModernBERT-base"
MODEL_PATH = "answerdotai/ModernBERT-large"

# Directory that receives the fine-tuned model, tokenizer and model card.
OUTPUT_DIR = f"{ROOT_DIR}models/FOSSistant-Difficulty-Prediction-v0.3.0"

In [None]:
import os
import random

import numpy as np
import torch

# Opt in to hf_transfer for Hub transfers. The variable has to be set through
# os.environ: a `!export ...` shell magic only affects a throwaway subshell and
# never reaches the kernel process, so it is intentionally not used here.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Weights & Biases project where this run will be logged.
os.environ["WANDB_PROJECT"] = "FOSSistant"

# Do NOT upload trained model checkpoints to W&B.
os.environ["WANDB_LOG_MODEL"] = "false"

# Turn off watch to log faster.
os.environ["WANDB_WATCH"] = "false"

# To disable W&B reporting entirely, uncomment:
# os.environ["WANDB_DISABLED"] = "true"

# Seed every RNG used by the notebook. torch is seeded too so that weights
# `from_pretrained` has to initialise from scratch (the new classification
# head) are reproducible, not only the Python/NumPy draws.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# from google.colab import drive
# drive.mount(GDRIVE_DIR)

## Training

In [None]:
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from datasets import load_from_disk
import evaluate
from huggingface_hub import login

import wandb

# login("")

In [None]:
# Load the tokenized GitHub-issues dataset saved under ROOT_DIR; the trailing
# bare expression makes the notebook display its structure.
tokenized_ds = load_from_disk(f"{ROOT_DIR}datasets/fossistant/github_issues_tokenized")
tokenized_ds

In [None]:
# Class names are read from the dataset's `labels` feature; a name's position
# in this list is used as its class id.
labels = tokenized_ds["train"].features["labels"].names
num_labels = len(labels)

# Transformers documents `id2label` as int -> name and `label2id` as
# name -> int. Keep the ids as integers: stringified ids would be persisted
# as strings in the saved config's `label2id` and could not be used as class
# indices by downstream code without conversion.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

In [None]:
import functools


@functools.lru_cache(maxsize=None)
def _load_metric(name):
    """Load an `evaluate` metric once and reuse it on later evaluation rounds."""
    return evaluate.load(name)


def compute_metrics(eval_pred):
    """Score one evaluation round for the Trainer.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (or a tuple whose first element does) and
            ``labels`` the reference class ids.

    Returns:
        dict with ``accuracy`` plus macro-averaged ``f1``, ``precision`` and
        ``recall``.
    """
    # Alternatives: "weighted", "micro".
    average = "macro"

    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=1)

    scores = {
        "accuracy": _load_metric("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }
    for name in ("f1", "precision", "recall"):
        scores[name] = _load_metric(name).compute(
            predictions=predictions, references=labels, average=average
        )[name]
    return scores

In [None]:
training_args = TrainingArguments(
    # --- Output location ---
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,

    # --- Logging: emit training logs every 100 steps ---
    logging_strategy="steps",
    logging_steps=100,

    # --- Evaluation and checkpointing: both once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",  # use "no" to skip checkpoints
    save_total_limit=1,

    # --- Model selection: reload the checkpoint with the best "f1", i.e. the
    # macro F1 reported by compute_metrics ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",

    # --- Optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    optim="adamw_torch_fused",
    bf16=True,

    # --- Optional reporting / Hub upload (disabled) ---
    # report_to="tensorboard",
    # report_to="wandb",
    # push_to_hub=True,
    # hub_strategy="every_save",
    # hub_token=HfFolder.get_token(),
)

# The "test" split is used as the evaluation set during training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
)

In [None]:
# Run fine-tuning with the Trainer and TrainingArguments configured above.
trainer.train()

In [None]:
# Persist the final artifacts: tokenizer files, a generated model card, and
# the model currently held by the trainer. OUTPUT_DIR is also the trainer's
# `output_dir`.
tokenizer.save_pretrained(OUTPUT_DIR)
trainer.create_model_card()
trainer.save_model(OUTPUT_DIR)
# trainer.push_to_hub()

# Delete the intermediate checkpoint* directories now that the final model has
# been saved. IPython substitutes $OUTPUT_DIR with the Python variable.
!rm -r "$OUTPUT_DIR"/checkpoint*

In [None]:
# Mark the Weights & Biases run as finished.
wandb.finish()

## Evaluation

In [None]:
from transformers import pipeline
from datasets import load_from_disk
import evaluate

In [None]:
# Load the GitHub-issues dataset from its directory under ROOT_DIR (a different
# directory from the tokenized copy); the trailing bare expression makes the
# notebook display its structure.
ds = load_from_disk(f"{ROOT_DIR}datasets/fossistant/github_issues")
ds

In [None]:
# Text-classification pipeline over the `-bak` copy of the v0.3.0 model
# directory, running on device index 0.
# To score a freshly trained model instead, pass model=OUTPUT_DIR.
pipe = pipeline(
    task="text-classification",
    model=f"{ROOT_DIR}models/FOSSistant-Difficulty-Prediction-v0.3.0-bak",
    device=0,
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1, precision and recall.

    Args:
        eval_pred: ``(predictions, labels)`` pair of predicted and reference
            class ids; the predictions are scored exactly as given.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    averaged = ("f1", "precision", "recall")
    scorers = {name: evaluate.load(name) for name in ("accuracy", *averaged)}

    # Alternatives: "weighted", "micro".
    average = "macro"

    predictions, labels = eval_pred

    results = {
        "accuracy": scorers["accuracy"].compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }
    for name in averaged:
        results[name] = scorers[name].compute(
            predictions=predictions, references=labels, average=average
        )[name]
    return results

In [None]:
# Classify every text of the test split; each result is read through its
# "label" key when the metrics are computed.
predictions = pipe(ds["test"]["text"])

In [None]:
# Turn predicted label names back into class ids by inverting the model's own
# `id2label` table (the one the pipeline used to produce those names), instead
# of a hand-written mapping that could silently drift from the label order the
# model was trained with. int() guards against ids stored as strings.
label_mapping = {
    label: int(idx) for idx, label in pipe.model.config.id2label.items()
}
predicted_labels = [label_mapping[p["label"]] for p in predictions]
true_labels = ds["test"]["labels"]
metrics = compute_metrics((predicted_labels, true_labels))
metrics