In [1]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from accelerate.utils import release_memory
import gc
import pickle
import pandas as pd
import random
import datasets
from datetime import datetime
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report


SEED = 44

def main():
    model_name="MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33"
    mode = "trained-with-parents" #"retrained"
    force_cpu = False
    device = "cuda" if torch.cuda.is_available() and not force_cpu else "cpu"
    print(f"Device: {device}")
    max_length = 512
    label2id = {"entailment": 0, "not_entailment": 1}  #{"entailment": 0, "neutral": 1, "contradiction": 2}
    id2label = {0: "entailment", 1: "not_entailment"}  #{0: "entailment", 1: "neutral", 2: "contradiction"}
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, model_max_length=max_length)  # model_max_length=512
    label_text_unique = list(label2id.keys())
    #model = pipeline("zero-shot-classification", model=model_name, device = device)
    if device == "cuda":
        # free memory
        flush()

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, label2id=label2id, id2label=id2label
    ).to(device)
    
    def tokenize_func(examples):
        return tokenizer(examples["text"], examples["hypothesis"], truncation=True)


    training, test = get_training_formatted(mode = mode)

    encoded_dataset_train = datasets.Dataset.from_pandas(training).map(tokenize_func, batched=True)
    print(len(encoded_dataset_train))
    # testing during training loop on aggregated testset:
    encoded_dataset_test = datasets.Dataset.from_pandas(test).map(tokenize_func, batched=True)
    print(len(encoded_dataset_test))

    # remove columns the library does not expect
    encoded_dataset_train = encoded_dataset_train.remove_columns(["hypothesis", "text"])
    encoded_dataset_test = encoded_dataset_test.remove_columns(["hypothesis", "text"])

    now = datetime.now().strftime("%Y-%m-%d-%H-%M")
    run_name = f"{model_name.split('/')[-1]}-zeroshot-retrained-{now}"
    training_directory = f'data/train/{run_name}'
    fp16_bool = True if torch.cuda.is_available() else False
    if "mDeBERTa" in model_name: fp16_bool = False  # mDeBERTa does not support FP16 yet

    # https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
    eval_batch = 64 if "large" in model_name else 64*2
    per_device_train_batch_size = 8 if "large" in model_name else 32
    gradient_accumulation_steps = 4 if "large" in model_name else 1

  
    train_args = TrainingArguments(
        output_dir=training_directory,
        logging_dir=f'{training_directory}/logs',
        #deepspeed="ds_config_zero3.json",  # if using deepspeed
        lr_scheduler_type= "linear",
        group_by_length=False,  # can increase speed with dynamic padding, by grouping similar length texts https://huggingface.co/transformers/main_classes/trainer.html
        learning_rate=9e-6, #if "large" in model_name else 2e-5,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=eval_batch,
        gradient_accumulation_steps=gradient_accumulation_steps,  # (!adapt/halve batch size accordingly). accumulates gradients over X steps, only then backward/update. decreases memory usage, but also slightly speed
        #eval_accumulation_steps=2,
        num_train_epochs=3,
        #max_steps=400,
        #warmup_steps=0,  # 1000,
        warmup_ratio=0.06,  #0.1, 0.06
        weight_decay=0.01,  #0.1,
        fp16=fp16_bool,   # ! only makes sense at batch-size > 8. loads two copies of model weights, which creates overhead. https://huggingface.co/transformers/performance.html?#fp16
        fp16_full_eval=fp16_bool,
        eval_strategy="epoch",
        seed=SEED,
        #load_best_model_at_end=True,
        #metric_for_best_model="accuracy",
        #eval_steps=300,  # evaluate after n steps if evaluation_strategy!='steps'. defaults to logging_steps
        save_strategy="no",  # options: "no"/"steps"/"epoch"
        #save_steps=1_000_000,  # Number of updates steps before two checkpoint saves.
        save_total_limit=1,  # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir
        #logging_strategy="epoch",
        report_to="all",  # "all"
        run_name=run_name,
        #push_to_hub=True,  # does not seem to work if save_strategy="no"
        #hub_model_id=hub_model_id,
        #hub_token=config.HF_ACCESS_TOKEN,
        #hub_strategy="end",
        #hub_private_repo=True,
    )
    trainer = Trainer(
        model=model,
        #model_init=model_init,
        tokenizer=tokenizer,
        args=train_args,
        train_dataset=encoded_dataset_train,  #.shard(index=1, num_shards=200),  # https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
        eval_dataset=encoded_dataset_test,  #.shard(index=1, num_shards=20),
        compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=label_text_unique)  #compute_metrics,
        #data_collator=data_collator,  # for weighted sampling per dataset; for dynamic padding probably not necessary because done by default  https://huggingface.co/course/chapter3/3?fw=pt
    )
    print("Training")
    try:
        trainer.train()
        print("Trained")
    finally:
        if device == "cuda":
            flush()
            release_memory(model)

    model_path = f'data/{model_name.split("/")[-1]}-zeroshot-{mode}'


    trainer.save_model(output_dir=model_path)



def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
def compute_metrics_standard(eval_pred, label_text_alphabetical=None):
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)  # argmax on each row (axis=1) in the tensor

    # metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)

    metrics = {'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            #'label_gold_raw': labels,
            #'label_predicted_raw': preds_max
            }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )  # print metrics but without label lists
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

def get_training_formatted(mode):
    with open("data/datasets-training-test.pickle", "rb") as f:
        d = pickle.load(f)
    
    training = d["baseterm"]["training"]
    codes =  [*training.category]
    random.seed(SEED)
    def other_codes(code):
        return [*{*random.sample(codes, 1)} - {code}]
    training["other_code"] = training.apply(lambda r: other_codes(r["category"]), axis = 1)
    isa = training[["text", "hierarchy", "category"]]
    isa["labels"] = 0
    isnot =  training.explode("other_code")[["text", "hierarchy", "other_code"]].rename(columns = {"other_code":"category"})
    isnot["labels"] = 1
    training = pd.concat([isa, isnot], ignore_index = True).sort_values(["text", "labels"])
    terms = pd.read_pickle("data/terms.pickle").set_index(["hierarchyCode", "termCode"])
    joined = training.join(terms, on=["hierarchy", "category"], how='inner', lsuffix='', rsuffix='', sort=False, validate="many_to_one")
    #assert len(training) == len(joined), f"{len(training)} != {len(joined)}"
    joined = joined.reset_index()
    limit = int(len(joined)/3)
    joined = joined.sample(frac=1, random_state=SEED)
    train = joined.iloc[limit:3*limit, :]
    test = joined.iloc[:limit, :]
    dfs = [train, test]
    if mode == "trained-with-parents":
        hCode = d["baseterm"]["training"].hierarchy.unique().tolist()
        assert len(hCode) == 1
        hCode = hCode[0]
        terms = pd.read_pickle("data/terms.pickle")
        parents =  {a:b for a,b in terms[terms.hierarchyCode == hCode][["termCode", "parentCode"]].values}
        descs = {a:b for a,b in terms[terms.hierarchyCode == hCode][["termCode", "termExtendedName"]].values}
        for i in range(0, len(dfs)):
            current = {(text, code):(desc, label) for code,text,desc, label in dfs[i][["category", "text", "termExtendedName", "labels"]].values}
            labels = [0, 1] #start with 0 positive and then 1 negative
            for lab in labels:
                l = 0
                while l < len(current):
                    l = len(current)
                    toadd = {(text, parents[code]): (descs[parents[code]], label) for (text, code), (desc, label) in current.items() if parents[code] != "root" and label == lab and (text, parents[code]) not in current } 
                    #print (lab, len(train), len(toadd))
                    current = {**current, **toadd} 
            dfs[i] =  pd.DataFrame([{"text":text,"termExtendedName":desc, "labels":label } for (text, code),(desc, label) in current.items()]).drop_duplicates()
            dfs[i] = dfs[i].rename(columns = {"termExtendedName": "hypothesis"})
            dfs[i]["hypothesis"] = "This food is a: "+ dfs[i]["hypothesis"].map(str)
            dfs[i]["task_name"] = "food classification"
            # balancing classes
    

    cols = ["text", "hypothesis", "labels", "task_name"]
    #limiyting class umbalance
    tr = dfs[0].drop_duplicates().groupby(["hypothesis", "labels"]).head(40)[cols]
    te = dfs[1].drop_duplicates().groupby(["hypothesis", "labels"]).head(10)[cols]
    tr.to_csv(f"data/training-{mode}.csv")
    te.to_csv(f"data/validate-{mode}.csv")
    return tr,te 
   
main()



  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42009/42009 [00:03<00:00, 11918.59 examples/s]


42009


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14112/14112 [00:01<00:00, 11202.52 examples/s]
  trainer = Trainer(


14112
Training


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy Balanced,Accuracy,Precision Macro,Recall Macro,Precision Micro,Recall Micro
1,0.2066,0.155817,0.942311,0.942319,0.94229,0.942319,0.942347,0.94229,0.942319,0.942319
2,0.132,0.156961,0.950292,0.950326,0.950172,0.950326,0.950987,0.950172,0.950326,0.950326
3,0.1025,0.167324,0.953007,0.953019,0.952952,0.953019,0.953163,0.952952,0.953019,0.953019


Aggregate metrics:  {'f1_macro': 0.9423109917874593, 'f1_micro': 0.9423185941043084, 'accuracy_balanced': np.float64(0.9422897766254084), 'accuracy': 0.9423185941043084, 'precision_macro': 0.9423473542515224, 'recall_macro': 0.9422897766254084, 'precision_micro': 0.9423185941043084, 'recall_micro': 0.9423185941043084}
Detailed metrics:  {'entailment': {'precision': 0.9403381305016069, 'recall': 0.9456231558240832, 'f1-score': 0.9429732380552053, 'support': 7117.0}, 'not_entailment': {'precision': 0.9443565780014378, 'recall': 0.9389563974267334, 'f1-score': 0.9416487455197132, 'support': 6995.0}, 'accuracy': 0.9423185941043084, 'macro avg': {'precision': 0.9423473542515224, 'recall': 0.9422897766254084, 'f1-score': 0.9423109917874593, 'support': 14112.0}, 'weighted avg': {'precision': 0.942329984261621, 'recall': 0.9423185941043084, 'f1-score': 0.942316716989037, 'support': 14112.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Aggregate metrics:  {'f1_macro': 0.9502923452239042, 'f1_micro': 0.9503259637188208, 'accuracy_balanced': np.float64(0.9501721507052097), 'accuracy': 0.9503259637188208, 'precision_macro': 0.9509866883998913, 'recall_macro': 0.9501721507052097, 'precision_micro': 0.9503259637188208, 'recall_micro': 0.9503259637188208}
Detailed metrics:  {'entailment': {'precision': 0.9357511545775604, 'recall': 0.9679640297878319, 'f1-score': 0.9515850542164515, 'support': 7117.0}, 'not_entailment': {'precision': 0.9662222222222222, 'recall': 0.9323802716225875, 'f1-score': 0.9489996362313569, 'support': 6995.0}, 'accuracy': 0.9503259637188208, 'macro avg': {'precision': 0.9509866883998913, 'recall': 0.9501721507052097, 'f1-score': 0.9502923452239042, 'support': 14112.0}, 'weighted avg': {'precision': 0.9508549753098741, 'recall': 0.9503259637188208, 'f1-score': 0.9503035208543669, 'support': 14112.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Aggregate metrics:  {'f1_macro': 0.9530067614619318, 'f1_micro': 0.9530187074829932, 'accuracy_balanced': np.float64(0.9529520925794263), 'accuracy': 0.9530187074829932, 'precision_macro': 0.9531630474451088, 'recall_macro': 0.9529520925794263, 'precision_micro': 0.9530187074829932, 'recall_micro': 0.9530187074829932}
Detailed metrics:  {'entailment': {'precision': 0.9469529085872577, 'recall': 0.9606575804411971, 'f1-score': 0.9537560159029086, 'support': 7117.0}, 'not_entailment': {'precision': 0.95937318630296, 'recall': 0.9452466047176554, 'f1-score': 0.9522575070209548, 'support': 6995.0}, 'accuracy': 0.9530187074829932, 'macro avg': {'precision': 0.9531630474451088, 'recall': 0.9529520925794263, 'f1-score': 0.9530067614619318, 'support': 14112.0}, 'weighted avg': {'precision': 0.9531093600201755, 'recall': 0.9530187074829932, 'f1-score': 0.9530132388600183, 'support': 14112.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Trained


In [31]:
#from datasets import load_dataset
#ds = load_dataset("MoritzLaurer/dataset_train_nli")["train"]
#ds = None
ds.to_pandas()

Unnamed: 0,text,hypothesis,labels,task_name,label_text
0,"Know anyone for hire, Callie?""","Callie, know anyone that's for hire?",0,mnli,
1,The First Word in Protective & Marine Coatings...,This text is about: protective coatings,0,mixtral_small_zeroshot,protective coatings
2,It was in 1904 he and I worked together ”the A...,I met him for the first time in 1915.,1,mnli,
3,Mike Pence is a conservative Republican.,Mike Pence . He served as the chairman of the ...,0,fevernli,
4,Management of Human Capital,There is no way to account for human capital.,1,mnli,
...,...,...,...,...,...
1018728,"Since 1931, when gambling was officially legal...",Gambling is seen as a blight on Nevada and is ...,1,mnli,
1018729,I have no doubt that several entries were part...,I have doubts that good examples existed.,1,mnli,
1018730,"As a result, it is important to determine the ...",Heating water helps people.,1,wanli,
1018731,The river forms a natural line between the nor...,The north and south divide the city into secti...,0,mnli,


In [55]:
import pandas as pd
import pickle
def do():
    terms = pd.read_pickle("data/terms.pickle")
    with open("data/datasets-training-test.pickle", "rb") as f:
        d = pickle.load(f)
    hCode = d["baseterm"]["training"].hierarchy.unique().tolist()
    assert len(hCode) == 1
    hCode = hCode[0]
    training = d["baseterm"]["training"]
    

    
do()

Unnamed: 0,text,hierarchy,category
6703,"Pork, shoulder with rind,reheated n.s.,canned,...",expo,A0EYQ
6704,"Pork, shoulder with rind,reheated n.s.,chilled...",expo,A01RG
6705,"Pork, shoulder with rind,roasted/baked in oven...",expo,A01RG
6706,"Pork, shoulder with rind,roasted/baked in oven...",expo,A01RG
6707,"Pork, shoulder with rind,stewed n.s.,chilled/r...",expo,A01RG
...,...,...,...
33511,ZZZ-OUD VEGETABLES FOR NASI/BAMI/ETC.-FACETS_D...,expo,A00ZQ
33512,ZZZ-OUD VEGETABLES FOR NASI/BAMI/ETC.-FACETS_D...,expo,A00ZQ
33513,"Βaby food, cereal cream in a jar",expo,A03RJ
33514,Βroccoli,expo,A00FN
