In [1]:
# NCCL environment configuration for the multi-GPU training run further down.
# NOTE(review): the NCCL docs describe NCCL_SHM_DISABLE as a numeric flag (1 = disable
# shared-memory transport) — confirm that the value "TRUE" is actually honoured.
%env NCCL_SHM_DISABLE=TRUE
# Verbose NCCL logging; this is what produces the "NCCL INFO" lines in the training output.
%env NCCL_DEBUG=INFO

env: NCCL_SHM_DISABLE=TRUE
env: NCCL_DEBUG=INFO


In [2]:
from transformers import set_seed, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, default_data_collator, TrainerCallback
from reduced_encoders import MPNetCompressedForSequenceClassification, MPNetReducedConfig
from datasets import load_dataset
import evaluate
import numpy as np
import argparse
import os

2024-07-11 14:02:17.804518: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Maps each GLUE task name to the dataset column(s) that hold its text input(s).
# The second entry is None for tasks that take a single sentence.
task_to_keys = dict(
    cola=("sentence", None),
    mnli=("premise", "hypothesis"),
    mrpc=("sentence1", "sentence2"),
    qnli=("question", "sentence"),
    qqp=("question1", "question2"),
    rte=("sentence1", "sentence2"),
    sst2=("sentence", None),
    stsb=("sentence1", "sentence2"),
    wnli=("sentence1", "sentence2"),
)

## Load data (GLUE task)

In [4]:
# GLUE task to fine-tune and evaluate on; must be one of the keys of task_to_keys.
task_name = "cola"

In [5]:
# Load the GLUE dataset for the selected task; the result is indexed by split name later on.
# is_regression is derived from task_name in the following cell, so no placeholder value is
# assigned here (a hard-coded False would be wrong for "stsb" if that cell were skipped).
data = load_dataset("glue", task_name)

In [6]:
# "stsb" is the one task handled as regression here: it gets a single output unit.
# Every other task is classification, with one output per class name declared on the
# training split's "label" feature.
is_regression = (task_name == "stsb")
if is_regression:
    num_labels = 1
else:
    label_list = data["train"].features["label"].names
    num_labels = len(label_list)

## Load Model

In [7]:
# Model identifier and revision passed to every from_pretrained(...) call below
# (config, tokenizer and model weights all come from this same checkpoint/revision).
checkpoint = "cayjobla/all-mpnet-base-v2-compressed"
revision = "unnormalized"

In [8]:
# Seed the RNGs *before* the model is built: the classifier/layernorm weights are not part of
# the checkpoint and are freshly initialized, so without a seed every run would start from
# different head weights. 42 matches the default TrainingArguments seed.
set_seed(42)

# num_labels sizes the classification head (1 for regression, else the number of classes).
config = MPNetReducedConfig.from_pretrained(checkpoint, revision=revision, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, revision=revision)
model = MPNetCompressedForSequenceClassification.from_pretrained(
    checkpoint, revision=revision, config=config
)

Some weights of MPNetCompressedForSequenceClassification were not initialized from the model checkpoint at cayjobla/all-mpnet-base-v2-compressed and are newly initialized: ['classifier.bias', 'classifier.weight', 'layernorm.bias', 'layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation metric and training arguments

In [9]:
# Load the GLUE metric for the selected task from the `evaluate` library;
# compute_metrics below calls metric.compute(...) on it.
metric = evaluate.load("glue", task_name)

def compute_metrics(p):
    """Turn raw model outputs into the task's metric dictionary.

    Args:
        p: object exposing ``predictions`` and ``label_ids`` (presumably the
            EvalPrediction that Trainer passes — TODO confirm). ``predictions``
            may be a tuple, in which case only its first element is used.

    Returns:
        The dict produced by ``metric.compute``. When it holds more than one
        value, a ``"combined_score"`` entry with their mean is added.
    """
    raw = p.predictions
    if isinstance(raw, tuple):
        raw = raw[0]

    if is_regression:
        # Regression: drop singleton dimensions to get plain scores.
        preds = np.squeeze(raw)
    else:
        # Classification: pick the highest-scoring class per row.
        preds = np.argmax(raw, axis=1)

    result = metric.compute(predictions=preds, references=p.label_ids)
    if len(result) > 1:
        result["combined_score"] = np.mean(list(result.values())).item()
    return result

In [10]:
# Fine-tuning configuration for the selected GLUE task.
training_args = TrainingArguments(
    # Output location and run label, both derived from the task name.
    output_dir=f"test-eval-glue-{task_name}",
    run_name=f"test-eval-glue-{task_name}",
    overwrite_output_dir=True,
    # Persistence / publishing settings.
    save_strategy="no",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and logging cadence.
    evaluation_strategy="epoch",
    logging_steps=50,
)



## Data preprocessing (tokenization)

In [11]:
# Column name(s) holding the text for the selected task; sentence2_key is None
# for single-sentence tasks.
sentence1_key, sentence2_key = task_to_keys[task_name]
# Forwarded to the tokenizer's `padding` argument in preprocess_function.
padding = "max_length"

In [12]:
def preprocess_function(examples):
    """Tokenize a batch of examples for the selected task.

    Reads the column named by ``sentence1_key`` and, when ``sentence2_key`` is
    not None, the second column as well, then passes them to the tokenizer
    with the module-level ``padding`` setting, ``max_length=128`` and
    truncation enabled.

    Args:
        examples: mapping from column name to the batch's values for that column.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])
    return tokenizer(*texts, padding=padding, max_length=128, truncation=True)

In [13]:
# Tokenize the dataset with preprocess_function in batched mode.
# The map runs inside training_args.main_process_first(...) — presumably so that, in a
# distributed launch, the main process does the work first; confirm against the
# transformers documentation.
# Note: `data` is rebound to the tokenized dataset from here on.
with training_args.main_process_first(desc="dataset map pre-processing"):
    data = data.map(
        preprocess_function,
        batched=True,
        desc="Running tokenizer on dataset",
    )

In [14]:
# Training always uses the "train" split. Evaluation uses "validation_matched" for MNLI
# and "validation" for every other task.
train_dataset = data["train"]
eval_dataset = data["validation_matched" if task_name == "mnli" else "validation"]

## Training

In [15]:
# Assemble the Trainer from the model, training arguments, tokenized splits and metric function.
# default_data_collator is used as the collator; the examples were already padded during
# tokenization (padding="max_length").
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

class EvaluateFirstStepCallback(TrainerCallback):
    """Callback that requests an evaluation at the very first training step."""

    def on_step_begin(self, args, state, control, **kwargs):
        """Set ``control.should_evaluate`` when ``state.global_step`` is still 0."""
        if state.global_step != 0:
            return
        control.should_evaluate = True

# Register the first-step evaluation callback on the already-constructed trainer.
trainer.add_callback(EvaluateFirstStepCallback())

In [16]:
# Run fine-tuning.
# NOTE(review): the output recorded for this cell ends in "RuntimeError: NCCL Error 2" right
# after "nvmlInit_v2() failed: Driver/library version mismatch" — this looks like an
# environment problem rather than a notebook bug; confirm and re-run.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcayjobla[0m. Use [1m`wandb login --relogin`[0m to force relogin


vega:470945:470945 [0] NCCL INFO cudaDriverVersion 12020
vega:470945:470945 [0] NCCL INFO Bootstrap : Using eno3:192.168.173.17<0>
vega:470945:470945 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
NCCL version 2.19.3+cuda12.3
vega:470945:471339 [0] NCCL INFO Failed to open libibverbs.so[.1]
vega:470945:471339 [0] NCCL INFO NET/Socket : Using [0]eno3:192.168.173.17<0>
vega:470945:471339 [0] NCCL INFO Using non-device net plugin version 0
vega:470945:471339 [0] NCCL INFO Using network Socket
vega:470945:471341 [2] NCCL INFO Using non-device net plugin version 0
vega:470945:471341 [2] NCCL INFO Using network Socket
vega:470945:471340 [1] NCCL INFO Using non-device net plugin version 0
vega:470945:471340 [1] NCCL INFO Using network Socket

vega:470945:471339 [0] misc/nvmlwrap.cc:100 NCCL WARN nvmlInit_v2() failed: Driver/library version mismatch
vega:470945:471339 [

RuntimeError: NCCL Error 2: unhandled system error (run with NCCL_DEBUG=INFO for details)