In [1]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from accelerate.utils import release_memory
import gc
import pickle
import pandas as pd
import random
import datasets
from datetime import datetime
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report


SEED = 44

def main():
    model_name="MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33"
    force_cpu = False
    device = "cuda" if torch.cuda.is_available() and not force_cpu else "cpu"
    print(f"Device: {device}")
    max_length = 512
    label2id = {"entailment": 0, "not_entailment": 1}  #{"entailment": 0, "neutral": 1, "contradiction": 2}
    id2label = {0: "entailment", 1: "not_entailment"}  #{0: "entailment", 1: "neutral", 2: "contradiction"}
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, model_max_length=max_length)  # model_max_length=512
    label_text_unique = list(label2id.keys())
    #model = pipeline("zero-shot-classification", model=model_name, device = device)
    if device == "cuda":
        # free memory
        flush()

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, label2id=label2id, id2label=id2label
    ).to(device)
    
    def tokenize_func(examples):
        return tokenizer(examples["text"], examples["hypothesis"], truncation=True)


    train_test = get_training_formatted().sample(frac = 0.5)
    limit = int(len(train_test)/3)
    training = train_test.iloc[limit:3*limit, :]
    test = train_test.iloc[:limit, :]

    encoded_dataset_train = datasets.Dataset.from_pandas(training).map(tokenize_func, batched=True)
    print(len(encoded_dataset_train))
    # testing during training loop on aggregated testset:
    encoded_dataset_test = datasets.Dataset.from_pandas(test).map(tokenize_func, batched=True)
    print(len(encoded_dataset_test))

    # remove columns the library does not expect
    encoded_dataset_train = encoded_dataset_train.remove_columns(["hypothesis", "text"])
    encoded_dataset_test = encoded_dataset_test.remove_columns(["hypothesis", "text"])

    now = datetime.now().strftime("%Y-%m-%d-%H-%M")
    run_name = f"{model_name.split('/')[-1]}-zeroshot-retrained-{now}"
    training_directory = f'data/train/{run_name}'
    fp16_bool = True if torch.cuda.is_available() else False
    if "mDeBERTa" in model_name: fp16_bool = False  # mDeBERTa does not support FP16 yet

    # https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
    eval_batch = 64 if "large" in model_name else 64*2
    per_device_train_batch_size = 8 if "large" in model_name else 32
    gradient_accumulation_steps = 4 if "large" in model_name else 1

  
    train_args = TrainingArguments(
        output_dir=training_directory,
        logging_dir=f'{training_directory}/logs',
        #deepspeed="ds_config_zero3.json",  # if using deepspeed
        lr_scheduler_type= "linear",
        group_by_length=False,  # can increase speed with dynamic padding, by grouping similar length texts https://huggingface.co/transformers/main_classes/trainer.html
        learning_rate=9e-6, #if "large" in model_name else 2e-5,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=eval_batch,
        gradient_accumulation_steps=gradient_accumulation_steps,  # (!adapt/halve batch size accordingly). accumulates gradients over X steps, only then backward/update. decreases memory usage, but also slightly speed
        #eval_accumulation_steps=2,
        num_train_epochs=3,
        #max_steps=400,
        #warmup_steps=0,  # 1000,
        warmup_ratio=0.06,  #0.1, 0.06
        weight_decay=0.01,  #0.1,
        fp16=fp16_bool,   # ! only makes sense at batch-size > 8. loads two copies of model weights, which creates overhead. https://huggingface.co/transformers/performance.html?#fp16
        fp16_full_eval=fp16_bool,
        eval_strategy="epoch",
        seed=SEED,
        #load_best_model_at_end=True,
        #metric_for_best_model="accuracy",
        #eval_steps=300,  # evaluate after n steps if evaluation_strategy!='steps'. defaults to logging_steps
        save_strategy="no",  # options: "no"/"steps"/"epoch"
        #save_steps=1_000_000,  # Number of updates steps before two checkpoint saves.
        save_total_limit=1,  # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir
        #logging_strategy="epoch",
        report_to="all",  # "all"
        run_name=run_name,
        #push_to_hub=True,  # does not seem to work if save_strategy="no"
        #hub_model_id=hub_model_id,
        #hub_token=config.HF_ACCESS_TOKEN,
        #hub_strategy="end",
        #hub_private_repo=True,
    )
    trainer = Trainer(
        model=model,
        #model_init=model_init,
        tokenizer=tokenizer,
        args=train_args,
        train_dataset=encoded_dataset_train,  #.shard(index=1, num_shards=200),  # https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
        eval_dataset=encoded_dataset_test,  #.shard(index=1, num_shards=20),
        compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=label_text_unique)  #compute_metrics,
        #data_collator=data_collator,  # for weighted sampling per dataset; for dynamic padding probably not necessary because done by default  https://huggingface.co/course/chapter3/3?fw=pt
    )
    print("Training")
    try:
        trainer.train()
        print("Trained")
    finally:
        if device == "cuda":
            flush()
            release_memory(model)

    model_path = f'data/{model_name.split("/")[-1]}-zeroshot-retrained'


    trainer.save_model(output_dir=model_path)



def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
def compute_metrics_standard(eval_pred, label_text_alphabetical=None):
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)  # argmax on each row (axis=1) in the tensor

    # metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)

    metrics = {'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            #'label_gold_raw': labels,
            #'label_predicted_raw': preds_max
            }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )  # print metrics but without label lists
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

def get_training_formatted():
    with open("data/datasets-training-test.pickle", "rb") as f:
        d = pickle.load(f)
    
    training = d["baseterm"]["training"]
    codes =  [*training.category]
    random.seed(SEED)
    def other_codes(code):
        return [*{*random.sample(codes, 1)} - {code}]
    training["other_code"] = training.apply(lambda r: other_codes(r["category"]), axis = 1)
    isa = training[["text", "hierarchy", "category"]]
    isa["labels"] = 0
    isnot =  training.explode("other_code")[["text", "hierarchy", "other_code"]].rename(columns = {"other_code":"category"})
    isnot["labels"] = 1
    training = pd.concat([isa, isnot], ignore_index = True).sort_values(["text", "labels"])
    terms = pd.read_pickle("data/terms.pickle").set_index(["hierarchyCode", "termCode"])
    joined = training.join(terms, on=["hierarchy", "category"], how='inner', lsuffix='', rsuffix='', sort=False, validate="many_to_one")
    #assert len(training) == len(joined), f"{len(training)} != {len(joined)}"
    joined = joined[["text", "termExtendedName", "labels"]].rename(columns = {"termExtendedName": "hypothesis"})
    joined["hypothesis"] = "This food is a: "+ joined["hypothesis"].map(str)
    joined["task_name"] = "food classification"
    return joined.reset_index()
    
main()

  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17782/17782 [00:01<00:00, 10860.50 examples/s]


17782


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8891/8891 [00:00<00:00, 12000.79 examples/s]
  trainer = Trainer(


8891
Training


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy Balanced,Accuracy,Precision Macro,Recall Macro,Precision Micro,Recall Micro
1,0.2653,0.157583,0.945888,0.9459,0.945844,0.9459,0.946089,0.945844,0.9459,0.9459
2,0.1487,0.149296,0.954325,0.954336,0.954277,0.954336,0.954552,0.954277,0.954336,0.954336
3,0.1171,0.15527,0.956581,0.956585,0.956562,0.956585,0.956625,0.956562,0.956585,0.956585


Aggregate metrics:  {'f1_macro': 0.9458880572629351, 'f1_micro': 0.9459003486671915, 'accuracy_balanced': np.float64(0.945844116781454), 'accuracy': 0.9459003486671915, 'precision_macro': 0.9460891278246668, 'recall_macro': 0.945844116781454, 'precision_micro': 0.9459003486671915, 'recall_micro': 0.9459003486671915}
Detailed metrics:  {'entailment': {'precision': 0.9382824511311223, 'recall': 0.9552772808586762, 'f1-score': 0.9467036011080332, 'support': 4472.0}, 'not_entailment': {'precision': 0.9538958045182112, 'recall': 0.9364109527042317, 'f1-score': 0.9450725134178372, 'support': 4419.0}, 'accuracy': 0.9459003486671915, 'macro avg': {'precision': 0.9460891278246668, 'recall': 0.945844116781454, 'f1-score': 0.9458880572629351, 'support': 8891.0}, 'weighted avg': {'precision': 0.9460425915672426, 'recall': 0.9459003486671915, 'f1-score': 0.9458929187884993, 'support': 8891.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Aggregate metrics:  {'f1_macro': 0.9543246814902995, 'f1_micro': 0.9543358452367563, 'accuracy_balanced': np.float64(0.9542765606801982), 'accuracy': 0.9543358452367563, 'precision_macro': 0.9545515213459732, 'recall_macro': 0.9542765606801982, 'precision_micro': 0.9543358452367563, 'recall_micro': 0.9543358452367563}
Detailed metrics:  {'entailment': {'precision': 0.9460289600702062, 'recall': 0.964221824686941, 'f1-score': 0.9550387596899225, 'support': 4472.0}, 'not_entailment': {'precision': 0.9630740826217401, 'recall': 0.9443312966734555, 'f1-score': 0.9536106032906764, 'support': 4419.0}, 'accuracy': 0.9543358452367563, 'macro avg': {'precision': 0.9545515213459732, 'recall': 0.9542765606801982, 'f1-score': 0.9543246814902995, 'support': 8891.0}, 'weighted avg': {'precision': 0.9545007176402466, 'recall': 0.9543358452367563, 'f1-score': 0.9543289381706032, 'support': 8891.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Aggregate metrics:  {'f1_macro': 0.9565813426107543, 'f1_micro': 0.9565853109886402, 'accuracy_balanced': np.float64(0.9565623126432817), 'accuracy': 0.9565853109886402, 'precision_macro': 0.9566251623053904, 'recall_macro': 0.9565623126432817, 'precision_micro': 0.9565853109886402, 'recall_micro': 0.9565853109886402}
Detailed metrics:  {'entailment': {'precision': 0.9535968028419183, 'recall': 0.9604203935599285, 'f1-score': 0.9569964349376114, 'support': 4472.0}, 'not_entailment': {'precision': 0.9596535217688625, 'recall': 0.952704231726635, 'f1-score': 0.9561662502838973, 'support': 4419.0}, 'accuracy': 0.9565853109886402, 'macro avg': {'precision': 0.9566251623053904, 'recall': 0.9565623126432817, 'f1-score': 0.9565813426107543, 'support': 8891.0}, 'weighted avg': {'precision': 0.9566071099995123, 'recall': 0.9565853109886402, 'f1-score': 0.9565838170110831, 'support': 8891.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Trained


In [31]:
#from datasets import load_dataset
#ds = load_dataset("MoritzLaurer/dataset_train_nli")["train"]
#ds = None
ds.to_pandas()

Unnamed: 0,text,hypothesis,labels,task_name,label_text
0,"Know anyone for hire, Callie?""","Callie, know anyone that's for hire?",0,mnli,
1,The First Word in Protective & Marine Coatings...,This text is about: protective coatings,0,mixtral_small_zeroshot,protective coatings
2,It was in 1904 he and I worked together ”the A...,I met him for the first time in 1915.,1,mnli,
3,Mike Pence is a conservative Republican.,Mike Pence . He served as the chairman of the ...,0,fevernli,
4,Management of Human Capital,There is no way to account for human capital.,1,mnli,
...,...,...,...,...,...
1018728,"Since 1931, when gambling was officially legal...",Gambling is seen as a blight on Nevada and is ...,1,mnli,
1018729,I have no doubt that several entries were part...,I have doubts that good examples existed.,1,mnli,
1018730,"As a result, it is important to determine the ...",Heating water helps people.,1,wanli,
1018731,The river forms a natural line between the nor...,The north and south divide the city into secti...,0,mnli,


In [102]:
flush()
