In [1]:
import os

import evaluate
import datasets
from transformers import (
    AutoTokenizer,
    EvalPrediction,
    TrainingArguments,
    AutoModelForSequenceClassification,
    Trainer
)
from tqdm import tqdm
from transformers.adapters import  AdapterTrainer, AutoAdapterModel, AdapterConfig

from personalized_nlp.models.adapters.heads import RegressionHead
from personalized_nlp.datasets.doccano.doccano import DoccanoDataModule
from personalized_nlp.utils import seed_everything
from settings import STORAGE_DIR


# Per-device training batch size used by every TrainingArguments in this notebook.
BATCHSIZE = 16

# Artefact locations below the shared storage root:
#   PRETRAINED_DIR - per-fold fine-tuned models and their test predictions
#   OUTPUT_DIR     - per-annotator adapter predictions
PRETRAINED_DIR = STORAGE_DIR / "adapter_finetuned" / 'pretrained_models'
OUTPUT_DIR = STORAGE_DIR / "adapter_finetuned" / 'results'

# Create both directories up front so later saves never hit a missing path.
for _artefact_dir in (OUTPUT_DIR, PRETRAINED_DIR):
    os.makedirs(_artefact_dir, exist_ok=True)
del _artefact_dir  # keep the notebook namespace free of the loop variable

In [None]:
def tokenize_function(examples):  # type: ignore
    """Tokenize the ``"text"`` entries of ``examples``.

    Relies on the module-level ``tokenizer`` (it is not a parameter), so that
    name must be assigned before this function is called, e.g. before it is
    handed to ``Dataset.map``.

    Every text is padded to ``max_length`` and truncated to 128 tokens.

    Args:
        examples: Mapping that provides the texts under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    texts = examples["text"]
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(texts, **encoding_options)

def compute_metrics(prediction: EvalPrediction) -> dict:
    """Compute the mean squared error over all predicted values.

    Labels and predictions are flattened first, so the error is averaged over
    every individual value rather than reported per output dimension.

    Args:
        prediction: Evaluation output; ``label_ids`` and ``predictions`` are
            read and must both support ``.flatten()``.

    Returns:
        ``{"mse": <float>}`` - a flat mapping with a scalar value.
    """
    labels = prediction.label_ids
    predictions = prediction.predictions

    mse = evaluate.load("mse")
    # `compute` itself returns a dict ({"mse": value}); unwrap it so the
    # reported metric is a scalar instead of a nested {"mse": {"mse": value}}.
    mse_result = mse.compute(
        references=labels.flatten(), predictions=predictions.flatten()
    )

    return {
        "mse": float(mse_result["mse"]),
    }

In [None]:
# Stage 1: fine-tune the whole LaBSE model once per cross-validation fold and
# store both the test-split predictions and the fine-tuned weights.

# `tokenize_function` reads the module-level `tokenizer`, so it has to exist
# before the first `Dataset.map(tokenize_function)` call. It does not depend
# on the fold, hence it is loaded once outside the loop.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

for fold in range(5):
    seed_everything()

    experiment_file_path = PRETRAINED_DIR / f"fold_{fold}.pkl"

    dm = DoccanoDataModule(
        regression=True,
        stratify_folds_by='texts',
        min_annotations_per_text=20,
        use_cuda=False,
        folds_num=5,
        test_fold=fold,
        empty_annotations_strategy="drop",
        randomize_questionnaries=False,
    )

    data = dm.annotations.merge(dm.data)

    # One row per merged annotation; 'labels' holds that row's vector of
    # annotation-column values.
    full_data = data.loc[:, ['text', 'split']].copy()
    vals_arr = data[dm.annotation_columns].values
    full_data['labels'] = list(vals_arr)

    # Explicit copies: `test_data` gets a new column below, which must not be
    # written into a slice of `full_data`.
    train_data = full_data.loc[full_data.split == 'train'].copy()
    val_data = full_data.loc[full_data.split == 'val'].copy()
    test_data = full_data.loc[full_data.split == 'test'].copy()

    train_dataset = datasets.Dataset.from_pandas(train_data.loc[:, ["text", "labels"]])
    train_dataset = train_dataset.map(tokenize_function, batched=True)

    val_dataset = datasets.Dataset.from_pandas(val_data.loc[:, ["text", "labels"]])
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # The test dataset carries no labels; it is only used for prediction.
    test_dataset = datasets.Dataset.from_pandas(test_data.loc[:, ["text"]])
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        "sentence-transformers/LaBSE",
        num_labels=vals_arr.shape[1],
        problem_type='regression',
    )

    training_args = TrainingArguments(
        output_dir=str("./tmp/test_trainer"),
        evaluation_strategy="epoch",  # type: ignore
        save_strategy="epoch",  # type: ignore
        per_device_train_batch_size=BATCHSIZE,
        num_train_epochs=6,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    preds = trainer.predict(test_dataset=test_dataset)  # type: ignore

    # Store one prediction entry per test row next to its text.
    test_data['preds'] = list(preds.predictions)

    test_data.to_pickle(experiment_file_path)
    model.save_pretrained(PRETRAINED_DIR / f"fold_{fold}")

In [None]:
# Stage 2: for every fold, load the model saved for that fold and train one
# adapter plus regression head per annotator on that annotator's data only.

# The tokenizer is fold-independent, so load it once; `tokenize_function`
# picks it up from the module namespace.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

for fold in range(5):
    seed_everything()

    dm = DoccanoDataModule(
        regression=True,
        stratify_folds_by='texts',
        min_annotations_per_text=20,
        use_cuda=False,
        folds_num=5,
        test_fold=fold,
        empty_annotations_strategy="drop",
        randomize_questionnaries=False,
    )

    # Derive the number of regression targets from the data module itself so
    # this cell does not depend on variables left behind by another cell and
    # the model and its prediction heads always agree on the output size.
    num_labels = len(dm.annotation_columns)

    model = AutoAdapterModel.from_pretrained(
        PRETRAINED_DIR / f"fold_{fold}",
        num_labels=num_labels,
        problem_type='regression',
    )

    # Annotator ids 0..39 are processed; results already on disk are
    # overwritten when the cell is re-run.
    for annotator_id in tqdm(range(40)):
        experiment_file_path = OUTPUT_DIR / f"annotator_{annotator_id}_fold_{fold}.pkl"

        data = dm.annotations.loc[dm.annotations.user_id == annotator_id]

        # Merge once and reuse the result for both the text/split columns and
        # the label matrix.
        merged = dm.data.merge(data)
        full_data = merged.loc[:, ['text', 'split']].copy()
        vals_arr = merged[dm.annotation_columns].values
        full_data['labels'] = list(vals_arr)

        # Explicit copies: `test_data` gets a new column below, which must
        # not be written into a slice of `full_data`.
        train_data = full_data.loc[full_data.split == 'train'].copy()
        val_data = full_data.loc[full_data.split == 'val'].copy()
        test_data = full_data.loc[full_data.split == 'test'].copy()

        train_dataset = datasets.Dataset.from_pandas(train_data.loc[:, ["text", "labels"]])
        train_dataset = train_dataset.map(tokenize_function, batched=True)

        val_dataset = datasets.Dataset.from_pandas(val_data.loc[:, ["text", "labels"]])
        val_dataset = val_dataset.map(tokenize_function, batched=True)

        # The test dataset carries no labels; it is only used for prediction.
        test_dataset = datasets.Dataset.from_pandas(test_data.loc[:, ["text"]])
        test_dataset = test_dataset.map(tokenize_function, batched=True)

        task_name = f"annotator_{annotator_id}"

        head = RegressionHead(model, task_name, num_labels=num_labels)
        model.add_prediction_head(head, overwrite_ok=True)

        # Only register the adapter if the loaded model does not have it yet.
        if task_name not in model.config.adapters:
            adapter_config = AdapterConfig.load("pfeiffer")
            model.add_adapter(task_name, config=adapter_config)

        model.train_adapter(task_name)
        model.set_active_adapters(task_name)

        training_args = TrainingArguments(
            output_dir=str("./tmp/test_trainer"),
            evaluation_strategy="epoch",  # type: ignore
            save_strategy="epoch",  # type: ignore
            per_device_train_batch_size=BATCHSIZE,
            num_train_epochs=6,
            load_best_model_at_end=True,
        )

        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        preds = trainer.predict(test_dataset=test_dataset)  # type: ignore

        # Store one prediction entry per test row next to its text.
        test_data['preds'] = list(preds.predictions)
        test_data.to_pickle(experiment_file_path)