In [None]:
import io
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torch
import torch.nn as nn
from transformers import AutoImageProcessor, ViTForImageClassification, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset, Image, ClassLabel
import evaluate
import wandb
from sklearn.metrics import confusion_matrix
from copy import deepcopy


wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcebangu[0m ([33mmegdecoding[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Locate the label CSV and the image directory relative to the current working directory.
label_path = os.path.join(os.getcwd(), "scalograms_test/average_labels.csv")
image_path = os.path.join(os.getcwd(), "scalograms_test/images_average")
# Load the label table; the code below reads its 'FileName' and 'Label' columns.
df = pd.read_csv(label_path)
# Turn each file name into a full path inside the image directory.
df['image'] = df['FileName'].apply(lambda x: os.path.join(image_path, x))
df["label"] = df["Label"].astype("category")
# Keep only the derived 'image' and 'label' columns.
df = df.drop(columns=["Label", "FileName"])
# Build the ClassLabel feature from the distinct labels. They are sorted so that the
# class-name order is identical on every run (iteration order of a set is not guaranteed).
# TODO: might be a good idea to add a label2id mapping and change the labels from ints.
unique_labels = sorted(set(df["label"]))
class_label = ClassLabel(names=unique_labels)

In [None]:
# Build the Hugging Face dataset from the dataframe, then cast the 'image' column
# to the Image feature and the 'label' column to the ClassLabel feature.
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("image", Image())
dataset = dataset.cast_column("label", class_label)


Casting the dataset:   0%|          | 0/324 [00:00<?, ? examples/s]

In [None]:
# Image processor that prepares inputs for the ViT checkpoint.
processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224",
    use_fast=True,
)
# Fine-tuning at 384x384 is recommended, so override the processor's target size.
processor.size = dict(height=384, width=384)

In [None]:
def transform_for_model(example):
    """Add the model-ready 'pixel_values' entry to a batch of examples.

    Every image in ``example['image']`` is converted to RGB, the whole list is run
    through the module-level ``processor`` (asking for PyTorch tensors), and the
    processor's "pixel_values" output is stored on the batch, which is returned.
    """
    rgb_images = [img.convert("RGB") for img in example['image']]
    processed = processor(rgb_images, return_tensors='pt')
    example['pixel_values'] = processed["pixel_values"]
    return example

# Apply the preprocessing to the whole dataset in batches; this adds the 'pixel_values' column.
dataset = dataset.map(transform_for_model, batched=True)

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

In [6]:
dataset

Dataset({
    features: ['image', 'label', 'pixel_values'],
    num_rows: 324
})

In [None]:
# 90/10 train/test split, stratified on the label column, with a fixed seed.
# Note: this rebinds `dataset` to the split result (indexed later as dataset['train']).
dataset = dataset.train_test_split(test_size=0.1, stratify_by_column="label", seed=42)

### Ok it's probably best to just wrap the model in a custom class

The original model was trained on 224x224 images, but best practice is to fine-tune at 384x384, so we need to at least interpolate the positional encodings.

In [None]:
class MEGVisionTransformer(ViTForImageClassification):
    """ViT classifier that always interpolates its positional encodings.

    Inherits from ``ViTForImageClassification`` and takes the same constructor
    arguments. Two things are added on top of the parent class:

    * ``forward`` always calls the parent with ``interpolate_pos_encoding=True``.
    * ``freeze_type`` selects which parameters stay trainable:
        - ``'classifier'``: the classifier only (around 2k trainable params)
        - ``'attention'``: the classifier and the attention heads
          (around 28m trainable params), following Touvron et al.
        - ``'all'``: the entire model (around 80m trainable params)
    """

    def forward(self, pixel_values=None, labels=None, **kwargs):
        """Forward pass with positional-encoding interpolation forced on.

        Args:
            pixel_values: forwarded unchanged to the parent ``forward``.
            labels: forwarded unchanged to the parent ``forward``.
            **kwargs: any further keyword arguments for the parent ``forward``.

        Returns:
            Whatever the parent ``forward`` returns.
        """
        # Write the flag into kwargs instead of passing it as a second explicit keyword:
        # a caller that also supplies interpolate_pos_encoding would otherwise trigger
        # "got multiple values for keyword argument".
        kwargs["interpolate_pos_encoding"] = True
        return super().forward(pixel_values=pixel_values, labels=labels, **kwargs)

    def freeze_type(self, freeze_type=None):
        """Apply one of the freezing presets.

        Args:
            freeze_type: ``'classifier'``, ``'attention'`` or ``'all'``.

        Raises:
            ValueError: if ``freeze_type`` is anything else (including ``None``).
        """
        if freeze_type not in ["classifier", "attention", "all"]:
            raise ValueError('freeze_type is either classifier, attention, or all')

        if freeze_type == "classifier":
            self.classifier_only()
        elif freeze_type == "attention":
            self.mha_training()
        elif freeze_type == 'all':
            self.train_all()

    ### layer freezing methods
    def _set_trainable(self, keywords=None):
        """Set ``requires_grad`` on every parameter and print what was done.

        Args:
            keywords: substrings to look for in the parameter name; a parameter is
                trainable when its name contains at least one of them. ``None``
                makes every parameter trainable.
        """
        for name, param in self.named_parameters():
            trainable = keywords is None or any(keyword in name for keyword in keywords)
            param.requires_grad = trainable
            print(f"Unfreezing {name}" if trainable else f"Freezing {name}")

    def classifier_only(self):
        """Keep only the parameters whose name contains 'classifier' open to training."""
        self._set_trainable(("classifier",))

    def mha_training(self):
        """Keep the classifier and attention parameters open to training, following
        Touvron et al. (matched by 'classifier' or 'attention' in the parameter name)."""
        self._set_trainable(("classifier", "attention"))

    def train_all(self):
        """Full model training: every parameter is trainable."""
        self._set_trainable(None)

In [None]:
# Identifier of the pretrained checkpoint that the model is loaded from.
model_checkpoint = "google/vit-base-patch16-224"
# num_classes = len(unique_labels)
# model = MEGVisionTransformer.from_pretrained(
#     model_checkpoint,
#     ignore_mismatched_sizes=True,
#     num_labels=num_classes
# )

Which parameters should we fine-tune? Touvron et al. recommend fine-tuning the multi-head attention (MHA) layers.

In [10]:
# model.freeze_type("attention")

In [11]:
# num_params = sum([p.numel() for p in model.parameters()])
# trainable_params = sum([p.numel() for p in model.parameters() if p.requires_grad])

# print(f"{num_params = :,} | {trainable_params = :,}")

In [None]:
def collate_fn(examples):
    """Collate dataset rows into the batch dictionary passed to the model.

    Args:
        examples: sequence of rows, each providing "pixel_values" (nested list,
            array or tensor) and "label".

    Returns:
        dict with "pixel_values" (the per-row values stacked along a new leading
        batch dimension) and "labels" (a 1-D tensor of the row labels).
    """
    # as_tensor accepts nested lists, arrays and tensors alike; unlike torch.tensor it
    # neither copies nor warns when a row already holds a tensor.
    pixel_values = torch.stack([torch.as_tensor(example["pixel_values"]) for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

In [None]:
# Load the `evaluate` metrics once at module level; compute_metrics below reuses them.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred, num_classes=None):
    """Compute the evaluation metrics for the trainer.

    Args:
        eval_pred: pair ``(logits, labels)``.
        num_classes: number of classes used for the confusion matrix. When left as
            ``None`` it is taken from the last dimension of the logits, so the
            function no longer has to be edited when the number of classes changes.

    Returns:
        dict with "accuracy", "f1", "precision", "recall" (the last three
        weighted-averaged) and "specificity" (mean of the per-class specificities).
        The same values are also logged to wandb when a run is active.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # assumes the first element holds the class logits when a tuple is passed
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    if num_classes is None:
        num_classes = logits.shape[-1]

    predictions = np.argmax(logits, axis=-1)  # logits to class
    accuracy = accuracy_metric.compute(predictions=predictions,
                                       references=labels)
    f1 = f1_metric.compute(predictions=predictions,
                           references=labels,
                           average='weighted',
                           )
    precision = precision_metric.compute(predictions=predictions,
                                         references=labels,
                                         average='weighted',
                                         zero_division=0.0  # control null prediction
                                         )
    recall = recall_metric.compute(predictions=predictions,
                                   references=labels,
                                   average='weighted',
                                   zero_division=0.0  # control null prediction
                                   )

    # Specificity has to be computed manually from the confusion matrix
    # (rows: true class, columns: predicted class).
    cm = np.asarray(confusion_matrix(labels, predictions, labels=list(range(num_classes))))
    true_pos = np.diag(cm)
    false_pos = cm.sum(axis=0) - true_pos
    false_neg = cm.sum(axis=1) - true_pos
    true_neg = cm.sum() - (true_pos + false_pos + false_neg)
    negatives = true_neg + false_pos
    # Per-class TN / (TN + FP); classes without any negatives get 0 instead of dividing by 0.
    specificity_per_class = np.where(negatives > 0, true_neg / np.maximum(negatives, 1), 0.0)
    avg_specificity = float(np.mean(specificity_per_class))

    metrics = {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "specificity": avg_specificity,
    }

    # Only log when a wandb run exists, so the function can also be called on its own.
    # A copy is logged so later changes to the returned dict cannot affect it.
    if wandb.run is not None:
        wandb.log(dict(metrics))

    return metrics

In [None]:
class FoldEvalTrackingCallback(TrainerCallback):
    """Mirror the trainer's log entries to wandb under a per-fold prefix.

    The optimizer wants all folds to be part of the same run, but we still want the
    results (e.g. the loss) of each fold separately, so every logged key is re-logged
    as ``fold_<fold_number>/<key>``.
    """
    def __init__(self, fold_number):
        self.fold_number = fold_number

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Nothing to mirror when the trainer passes no (or empty) logs.
        if not logs:
            return
        prefix = f"fold_{self.fold_number}/"
        prefixed_logs = {}
        for key, value in logs.items():
            prefixed_logs[f"{prefix}{key}"] = value
        wandb.log(prefixed_logs, commit=False)


class TrainMetricsCallback(TrainerCallback): #creds to sid8491 from Transformers forum
    """Track the evaluation metrics on the train set, to make sure the model is learning.

    Unfortunately this involves doing ANOTHER forward pass over the data, rather than
    capturing the logits and evaluating on the fly. For 10k samples this is probably
    not a problem, but it is NOT a scalable solution.
    """
    def __init__(self, trainer):
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        # Only act on epochs for which an evaluation is scheduled.
        if not control.should_evaluate:
            return None
        # Snapshot the control object before the extra evaluate() call and hand the
        # snapshot back (presumably so that call cannot alter the control flags).
        control_snapshot = deepcopy(control)
        train_set = self._trainer.train_dataset
        self._trainer.evaluate(eval_dataset=train_set, metric_key_prefix="train")
        return control_snapshot
        

        

In [26]:
from sklearn.model_selection import KFold
def cross_validate_kfold(train_dataset, model_class, model_name, num_classes=3, config=None, freeze_type=None, k=10):
    """Fine-tune with k-fold cross-validation inside a single wandb run.

    Args:
        train_dataset: the training split (has to be ``dataset["train"]``).
        model_class: class providing ``from_pretrained`` and ``freeze_type``.
        model_name: checkpoint name passed to ``from_pretrained``; also used as the
            prefix of the trainer output directory.
        num_classes: number of labels for the classifier head and for the metrics.
        config: object exposing ``learning_rate``, ``optim``, ``lr_scheduler_type``
            and ``gradient_accumulation_steps``; defaults to ``wandb.config``.
        freeze_type: preset handed to ``model.freeze_type``; ``None`` skips the
            freezing step and leaves the model as ``from_pretrained`` returned it.
        k: number of folds.

    Returns:
        The mean of the per-fold final ``eval_loss`` values (also logged to wandb
        as ``avg_eval_loss``).

    Raises:
        Any exception raised while training is re-raised after the wandb run has
        been closed with a non-zero exit code.
    """
    run = wandb.init(project="VIT-KFold-HyperSweep")
    try:
        config = wandb.config if config is None else config
        group_name = f"ViT_lr:{config.learning_rate}_optim:{config.optim}_sched:{config.lr_scheduler_type}_grads:{config.gradient_accumulation_steps}"
        # Name the run after its hyperparameters so the runs are easy to tell apart.
        run.name = group_name

        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        fold_results = []

        def fold_compute_metrics(eval_pred):
            # The trainer calls compute_metrics with a single argument, so bind
            # num_classes here; otherwise the value given to this function is ignored.
            return compute_metrics(eval_pred, num_classes=num_classes)

        for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
            print(f"Training Fold {fold + 1}/{k}...")

            # Re-instantiate the model for every fold so no weights carry over.
            model = model_class.from_pretrained(model_name,
                                                ignore_mismatched_sizes=True,
                                                num_labels=num_classes
                                                )
            # The trainer handles device placement.
            # Apply the freezing preset; None means "do not freeze anything".
            if freeze_type is not None:
                model.freeze_type(freeze_type)

            # Split into the train and validation subsets of this fold.
            train_subset = train_dataset.select(train_idx.tolist())
            val_subset = train_dataset.select(val_idx.tolist())
            training_args = TrainingArguments(
                f"{model_name}-finetune_test",
                seed=42,
                remove_unused_columns=False,
                eval_strategy="epoch",
                save_strategy="epoch",
                learning_rate=config.learning_rate,  # taken from the wandb config
                lr_scheduler_type=config.lr_scheduler_type,  # taken from the wandb config
                optim=config.optim,  # tuned optimizer
                gradient_accumulation_steps=config.gradient_accumulation_steps,  # tuned gradient accumulation
                per_device_train_batch_size=1,
                per_device_eval_batch_size=1,
                num_train_epochs=5,  # SUPER IMPORTANT
                warmup_ratio=0.1,
                logging_steps=1,  # change to more later
                metric_for_best_model='eval_loss',
                report_to="wandb",
                push_to_hub=False,
            )
            # NOTE(review): newer transformers releases may expect `processing_class`
            # instead of `tokenizer` - confirm against the installed version.
            trainer = Trainer(
                model=model,
                args=training_args,
                tokenizer=processor,
                train_dataset=train_subset,
                eval_dataset=val_subset,
                data_collator=collate_fn,
                compute_metrics=fold_compute_metrics,
                callbacks=[FoldEvalTrackingCallback(fold + 1)]
            )

            # Needs the trainer instance, so it can only be added after construction.
            trainer.add_callback(TrainMetricsCallback(trainer=trainer))
            trainer.train()
            eval_results = trainer.evaluate()

            wandb.log({f"fold_{fold+1}_eval_loss": eval_results["eval_loss"]})
            fold_results.append(eval_results["eval_loss"])  # is this really what we want to track?

        avg_eval_loss = np.mean(fold_results)
        wandb.log({"avg_eval_loss": avg_eval_loss})
    except Exception:
        # Close the run as failed so it is not left open when a fold crashes.
        run.finish(exit_code=1)
        raise

    run.finish()
    return avg_eval_loss

    

In [17]:
# Keep only the first 20 rows of the training split.
# NOTE(review): presumably a quick smoke-test subset - confirm it should be removed for a real run.
dataset['train'] = dataset['train'].select(list(range(20)))

In [18]:
len(dataset['train'])

20

In [None]:
# Short model name: the part of the checkpoint identifier after the last "/".
# NOTE(review): the sweep call below passes model_checkpoint rather than this value - confirm it is still needed.
model_name = model_checkpoint.split("/")[-1]


# Configuration for the wandb hyperparameter sweep.
sweep_config = dict(
    method="bayes",
    # we want to optimize the average eval loss across folds
    metric=dict(name="avg_eval_loss", goal="minimize"),
    parameters=dict(
        # sweep learning rates (we'll see how many we can do)
        learning_rate=dict(values=[1e-5, 3e-5, 5e-5, 1e-4]),
        lr_scheduler_type=dict(values=["linear", "cosine", "constant"]),
        # have to consider this some more
        optim=dict(values=["adamw_torch", "adamw_hf", "adafactor"]),
        # does this really matter?
        gradient_accumulation_steps=dict(values=[1, 4, 8]),
    ),
    # stop runs early (top-level key, next to "parameters")
    early_terminate=dict(type="hyperband", min_iter=15),
)

In [27]:
# Register the sweep, then let an agent execute two trials of it.
sweep_id = wandb.sweep(sweep_config, project="VIT-KFold-HyperSweep")


def _run_sweep_trial():
    """One sweep trial: 2-fold cross-validation with only the classifier trainable."""
    return cross_validate_kfold(
        train_dataset=dataset['train'],
        model_class=MEGVisionTransformer,
        model_name=model_checkpoint,
        num_classes=3,
        freeze_type="classifier",
        k=2,
    )


wandb.agent(sweep_id, _run_sweep_trial, count=2)

Create sweep with ID: 7kgx687p
Sweep URL: https://wandb.ai/megdecoding/VIT-KFold-HyperSweep/sweeps/7kgx687p


[34m[1mwandb[0m: Agent Starting Run: 5zdj9hwc with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 1
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	lr_scheduler_type: linear
[34m[1mwandb[0m: 	optim: adamw_hf


Training Fold 1/2...


Some weights of MEGVisionTransformer were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing vit.embeddings.cls_token
Freezing vit.embeddings.position_embeddings
Freezing vit.embeddings.patch_embeddings.projection.weight
Freezing vit.embeddings.patch_embeddings.projection.bias
Freezing vit.encoder.layer.0.attention.attention.query.weight
Freezing vit.encoder.layer.0.attention.attention.query.bias
Freezing vit.encoder.layer.0.attention.attention.key.weight
Freezing vit.encoder.layer.0.attention.attention.key.bias
Freezing vit.encoder.layer.0.attention.attention.value.weight
Freezing vit.encoder.layer.0.attention.attention.value.bias
Freezing vit.encoder.layer.0.attention.output.dense.weight
Freezing vit.encoder.layer.0.attention.output.dense.bias
Freezing vit.encoder.layer.0.intermediate.dense.weight
Freezing vit.encoder.layer.0.intermediate.dense.bias
Freezing vit.encoder.layer.0.output.dense.weight
Freezing vit.encoder.layer.0.output.dense.bias
Freezing vit.encoder.layer.0.layernorm_before.weight
Freezing vit.encoder.layer.0.layernorm_before.bias
Freezing vit.encoder



  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 0.6489, 'grad_norm': 15.175420761108398, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.1}
{'loss': 0.8265, 'grad_norm': 17.016061782836914, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.2}
{'loss': 0.7092, 'grad_norm': 16.111312866210938, 'learning_rate': 6e-06, 'epoch': 0.3}
{'loss': 0.921, 'grad_norm': 18.984773635864258, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.639, 'grad_norm': 14.324466705322266, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 1.2964, 'grad_norm': 24.309703826904297, 'learning_rate': 9.777777777777779e-06, 'epoch': 0.6}
{'loss': 0.607, 'grad_norm': 14.573837280273438, 'learning_rate': 9.555555555555556e-06, 'epoch': 0.7}
{'loss': 1.9719, 'grad_norm': 27.325204849243164, 'learning_rate': 9.333333333333334e-06, 'epoch': 0.8}
{'loss': 1.8825, 'grad_norm': 25.389753341674805, 'learning_rate': 9.111111111111112e-06, 'epoch': 0.9}
{'loss': 1.7113, 'grad_norm': 25.66259002685547, 'learning_rate': 8.888888888888888e-06, 'epoch': 

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.1115542650222778, 'train_accuracy': 0.5, 'train_f1': 0.4, 'train_precision': 0.33333333333333337, 'train_recall': 0.5, 'train_specificity': 0.6296296296296297, 'train_runtime': 1.7549, 'train_samples_per_second': 5.698, 'train_steps_per_second': 5.698, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.4181541204452515, 'eval_accuracy': 0.5, 'eval_f1': 0.34615384615384615, 'eval_precision': 0.2777777777777778, 'eval_recall': 0.5, 'eval_specificity': 0.7222222222222223, 'eval_runtime': 1.7972, 'eval_samples_per_second': 5.564, 'eval_steps_per_second': 5.564, 'epoch': 1.0}
{'loss': 0.8994, 'grad_norm': 18.70414924621582, 'learning_rate': 8.666666666666668e-06, 'epoch': 1.1}
{'loss': 0.6237, 'grad_norm': 14.074369430541992, 'learning_rate': 8.444444444444446e-06, 'epoch': 1.2}
{'loss': 1.9665, 'grad_norm': 27.318836212158203, 'learning_rate': 8.222222222222222e-06, 'epoch': 1.3}
{'loss': 0.5969, 'grad_norm': 14.38106632232666, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.4}
{'loss': 1.3148, 'grad_norm': 24.492656707763672, 'learning_rate': 7.77777777777778e-06, 'epoch': 1.5}
{'loss': 0.6814, 'grad_norm': 15.653271675109863, 'learning_rate': 7.555555555555556e-06, 'epoch': 1.6}
{'loss': 0.8006, 'grad_norm': 16.660770416259766, 'learning_rate': 7.333333333333333e-06,

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.1030337810516357, 'train_accuracy': 0.5, 'train_f1': 0.4, 'train_precision': 0.33333333333333337, 'train_recall': 0.5, 'train_specificity': 0.6296296296296297, 'train_runtime': 1.7196, 'train_samples_per_second': 5.815, 'train_steps_per_second': 5.815, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.411024808883667, 'eval_accuracy': 0.5, 'eval_f1': 0.34615384615384615, 'eval_precision': 0.2777777777777778, 'eval_recall': 0.5, 'eval_specificity': 0.7222222222222223, 'eval_runtime': 1.7204, 'eval_samples_per_second': 5.812, 'eval_steps_per_second': 5.812, 'epoch': 2.0}
{'loss': 0.6207, 'grad_norm': 14.672813415527344, 'learning_rate': 6.444444444444445e-06, 'epoch': 2.1}
{'loss': 0.7956, 'grad_norm': 16.588382720947266, 'learning_rate': 6.222222222222223e-06, 'epoch': 2.2}
{'loss': 1.8696, 'grad_norm': 25.353952407836914, 'learning_rate': 6e-06, 'epoch': 2.3}
{'loss': 0.8774, 'grad_norm': 18.381887435913086, 'learning_rate': 5.777777777777778e-06, 'epoch': 2.4}
{'loss': 1.6993, 'grad_norm': 25.64452362060547, 'learning_rate': 5.555555555555557e-06, 'epoch': 2.5}
{'loss': 1.9562, 'grad_norm': 27.31292152404785, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.6}
{'loss': 0.6681, 'grad_norm': 15.41782283782959, 'learning_rate': 5.1111111111111115e-06, 'epoch': 2.7}
{

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.096724033355713, 'train_accuracy': 0.6, 'train_f1': 0.45, 'train_precision': 0.36, 'train_recall': 0.6, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7406, 'train_samples_per_second': 5.745, 'train_steps_per_second': 5.745, 'epoch': 3.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.4052608013153076, 'eval_accuracy': 0.5, 'eval_f1': 0.34615384615384615, 'eval_precision': 0.2777777777777778, 'eval_recall': 0.5, 'eval_specificity': 0.7222222222222223, 'eval_runtime': 1.7173, 'eval_samples_per_second': 5.823, 'eval_steps_per_second': 5.823, 'epoch': 3.0}
{'loss': 1.6945, 'grad_norm': 25.631290435791016, 'learning_rate': 4.222222222222223e-06, 'epoch': 3.1}
{'loss': 0.788, 'grad_norm': 16.478614807128906, 'learning_rate': 4.000000000000001e-06, 'epoch': 3.2}
{'loss': 1.3411, 'grad_norm': 24.728364944458008, 'learning_rate': 3.777777777777778e-06, 'epoch': 3.3}
{'loss': 1.8608, 'grad_norm': 25.326915740966797, 'learning_rate': 3.555555555555556e-06, 'epoch': 3.4}
{'loss': 0.6027, 'grad_norm': 13.70849609375, 'learning_rate': 3.3333333333333333e-06, 'epoch': 3.5}
{'loss': 1.9499, 'grad_norm': 27.302040100097656, 'learning_rate': 3.1111111111111116e-06, 'epoch': 3.6}
{'loss': 0.5789, 'grad_norm': 14.024446487426758, 'learning_rate': 2.888888888888889e-06,

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0935015678405762, 'train_accuracy': 0.6, 'train_f1': 0.45, 'train_precision': 0.36, 'train_recall': 0.6, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7216, 'train_samples_per_second': 5.809, 'train_steps_per_second': 5.809, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.4025793075561523, 'eval_accuracy': 0.5, 'eval_f1': 0.34615384615384615, 'eval_precision': 0.2777777777777778, 'eval_recall': 0.5, 'eval_specificity': 0.7222222222222223, 'eval_runtime': 1.7461, 'eval_samples_per_second': 5.727, 'eval_steps_per_second': 5.727, 'epoch': 4.0}
{'loss': 1.858, 'grad_norm': 25.31893539428711, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.1}
{'loss': 0.6075, 'grad_norm': 14.419218063354492, 'learning_rate': 1.777777777777778e-06, 'epoch': 4.2}
{'loss': 0.6585, 'grad_norm': 15.245996475219727, 'learning_rate': 1.5555555555555558e-06, 'epoch': 4.3}
{'loss': 1.3472, 'grad_norm': 24.78129005432129, 'learning_rate': 1.3333333333333334e-06, 'epoch': 4.4}
{'loss': 0.7837, 'grad_norm': 16.416072845458984, 'learning_rate': 1.111111111111111e-06, 'epoch': 4.5}
{'loss': 0.5988, 'grad_norm': 13.640379905700684, 'learning_rate': 8.88888888888889e-07, 'epoch': 4.6}
{'loss': 0.8609, 'grad_norm': 18.13213539123535, 'learning_rate': 6.666666666666667e-07

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0922694206237793, 'train_accuracy': 0.6, 'train_f1': 0.45, 'train_precision': 0.36, 'train_recall': 0.6, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7404, 'train_samples_per_second': 5.746, 'train_steps_per_second': 5.746, 'epoch': 5.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.4017517566680908, 'eval_accuracy': 0.5, 'eval_f1': 0.34615384615384615, 'eval_precision': 0.2777777777777778, 'eval_recall': 0.5, 'eval_specificity': 0.7222222222222223, 'eval_runtime': 1.7799, 'eval_samples_per_second': 5.618, 'eval_steps_per_second': 5.618, 'epoch': 5.0}
{'train_runtime': 28.2532, 'train_samples_per_second': 1.77, 'train_steps_per_second': 1.77, 'train_loss': 1.1037298560142517, 'epoch': 5.0}


  0%|          | 0/10 [00:00<?, ?it/s]

Training Fold 2/2...


Some weights of MEGVisionTransformer were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing vit.embeddings.cls_token
Freezing vit.embeddings.position_embeddings
Freezing vit.embeddings.patch_embeddings.projection.weight
Freezing vit.embeddings.patch_embeddings.projection.bias
Freezing vit.encoder.layer.0.attention.attention.query.weight
Freezing vit.encoder.layer.0.attention.attention.query.bias
Freezing vit.encoder.layer.0.attention.attention.key.weight
Freezing vit.encoder.layer.0.attention.attention.key.bias
Freezing vit.encoder.layer.0.attention.attention.value.weight
Freezing vit.encoder.layer.0.attention.attention.value.bias
Freezing vit.encoder.layer.0.attention.output.dense.weight
Freezing vit.encoder.layer.0.attention.output.dense.bias
Freezing vit.encoder.layer.0.intermediate.dense.weight
Freezing vit.encoder.layer.0.intermediate.dense.bias
Freezing vit.encoder.layer.0.output.dense.weight
Freezing vit.encoder.layer.0.output.dense.bias
Freezing vit.encoder.layer.0.layernorm_before.weight
Freezing vit.encoder.layer.0.layernorm_before.bias
Freezing vit.encoder



  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 0.8807, 'grad_norm': 18.228017807006836, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.1}
{'loss': 1.0062, 'grad_norm': 19.848224639892578, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.2}
{'loss': 1.2905, 'grad_norm': 24.113126754760742, 'learning_rate': 6e-06, 'epoch': 0.3}
{'loss': 0.5356, 'grad_norm': 12.705487251281738, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.4}
{'loss': 1.5625, 'grad_norm': 24.86301040649414, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 0.4894, 'grad_norm': 11.795477867126465, 'learning_rate': 9.777777777777779e-06, 'epoch': 0.6}
{'loss': 0.6396, 'grad_norm': 14.654577255249023, 'learning_rate': 9.555555555555556e-06, 'epoch': 0.7}
{'loss': 1.3845, 'grad_norm': 23.97956085205078, 'learning_rate': 9.333333333333334e-06, 'epoch': 0.8}
{'loss': 1.253, 'grad_norm': 22.888071060180664, 'learning_rate': 9.111111111111112e-06, 'epoch': 0.9}
{'loss': 1.4743, 'grad_norm': 23.89535903930664, 'learning_rate': 8.888888888888888e-06, 'epoch': 

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0472664833068848, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7208, 'train_samples_per_second': 5.811, 'train_steps_per_second': 5.811, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.1275652647018433, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7596, 'eval_samples_per_second': 5.683, 'eval_steps_per_second': 5.683, 'epoch': 1.0}
{'loss': 0.5292, 'grad_norm': 12.593332290649414, 'learning_rate': 8.666666666666668e-06, 'epoch': 1.1}
{'loss': 1.5692, 'grad_norm': 24.91934585571289, 'learning_rate': 8.444444444444446e-06, 'epoch': 1.2}
{'loss': 1.3811, 'grad_norm': 23.970802307128906, 'learning_rate': 8.222222222222222e-06, 'epoch': 1.3}
{'loss': 0.6333, 'grad_norm': 14.55295467376709, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.4}
{'loss': 0.4844, 'grad_norm': 11.700690269470215, 'learning_rate': 7.77777777777778e-06, 'epoch': 1.5}
{'loss': 1.292, 'grad_norm': 24.17259407043457, 'learning_rate': 7.555555555555556e-06, 'epoch': 1.6}
{'loss': 0.9854, 'grad_norm': 19.60185432434082, 'learning_rate': 7.333333333333333e-06, 'epoch': 1.7}
{'

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0427796840667725, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7824, 'train_samples_per_second': 5.61, 'train_steps_per_second': 5.61, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.1250560283660889, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7591, 'eval_samples_per_second': 5.685, 'eval_steps_per_second': 5.685, 'epoch': 2.0}
{'loss': 0.8587, 'grad_norm': 17.9343318939209, 'learning_rate': 6.444444444444445e-06, 'epoch': 2.1}
{'loss': 0.9798, 'grad_norm': 19.53397560119629, 'learning_rate': 6.222222222222223e-06, 'epoch': 2.2}
{'loss': 1.2442, 'grad_norm': 22.837846755981445, 'learning_rate': 6e-06, 'epoch': 2.3}
{'loss': 0.526, 'grad_norm': 12.547428131103516, 'learning_rate': 5.777777777777778e-06, 'epoch': 2.4}
{'loss': 1.4631, 'grad_norm': 23.855905532836914, 'learning_rate': 5.555555555555557e-06, 'epoch': 2.5}
{'loss': 1.3733, 'grad_norm': 23.952470779418945, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.6}
{'loss': 1.2866, 'grad_norm': 24.157344818115234, 'learning_rate': 5.1111111111111115e-06, 'epoch': 2.7}
{'loss': 1.5838

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0385234355926514, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7284, 'train_samples_per_second': 5.786, 'train_steps_per_second': 5.786, 'epoch': 3.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.1220834255218506, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7317, 'eval_samples_per_second': 5.775, 'eval_steps_per_second': 5.775, 'epoch': 3.0}
{'loss': 1.4575, 'grad_norm': 23.825977325439453, 'learning_rate': 4.222222222222223e-06, 'epoch': 3.1}
{'loss': 0.9717, 'grad_norm': 19.435148239135742, 'learning_rate': 4.000000000000001e-06, 'epoch': 3.2}
{'loss': 0.4803, 'grad_norm': 11.621099472045898, 'learning_rate': 3.777777777777778e-06, 'epoch': 3.3}
{'loss': 1.2363, 'grad_norm': 22.7814998626709, 'learning_rate': 3.555555555555556e-06, 'epoch': 3.4}
{'loss': 1.5876, 'grad_norm': 25.022693634033203, 'learning_rate': 3.3333333333333333e-06, 'epoch': 3.5}
{'loss': 1.3672, 'grad_norm': 23.92259407043457, 'learning_rate': 3.1111111111111116e-06, 'epoch': 3.6}
{'loss': 0.6224, 'grad_norm': 14.377535820007324, 'learning_rate': 2.888888888888889e-06, 'epoch': 3.

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0364038944244385, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7766, 'train_samples_per_second': 5.629, 'train_steps_per_second': 5.629, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.1208440065383911, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.8591, 'eval_samples_per_second': 5.379, 'eval_steps_per_second': 5.379, 'epoch': 4.0}
{'loss': 1.2344, 'grad_norm': 22.768720626831055, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.1}
{'loss': 0.8464, 'grad_norm': 17.76696014404297, 'learning_rate': 1.777777777777778e-06, 'epoch': 4.2}
{'loss': 1.2807, 'grad_norm': 24.123355865478516, 'learning_rate': 1.5555555555555558e-06, 'epoch': 4.3}
{'loss': 0.4792, 'grad_norm': 11.599104881286621, 'learning_rate': 1.3333333333333334e-06, 'epoch': 4.4}
{'loss': 0.9664, 'grad_norm': 19.37066078186035, 'learning_rate': 1.111111111111111e-06, 'epoch': 4.5}
{'loss': 1.5906, 'grad_norm': 25.038902282714844, 'learning_rate': 8.88888888888889e-07, 'epoch': 4.6}
{'loss': 0.5246, 'grad_norm': 12.535374641418457, 'learning_rate': 6.666666666666667e-07, 'epoch': 4

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0356286764144897, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 2.0719, 'train_samples_per_second': 4.826, 'train_steps_per_second': 4.826, 'epoch': 5.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.120453119277954, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 2.1474, 'eval_samples_per_second': 4.657, 'eval_steps_per_second': 4.657, 'epoch': 5.0}
{'train_runtime': 28.9282, 'train_samples_per_second': 1.728, 'train_steps_per_second': 1.728, 'train_loss': 1.0424906867742538, 'epoch': 5.0}


  0%|          | 0/10 [00:00<?, ?it/s]

0,1
accuracy,▆▆▆▆█▆█▆█▆▆▆▁▆▁▆▁▆▁▆▁▁
avg_eval_loss,▁
eval/accuracy,██████▁▁▁▁▁▁
eval/f1,██████▁▁▁▁▁▁
eval/loss,██████▁▁▁▁▁▁
eval/precision,██████▁▁▁▁▁▁
eval/recall,██████▁▁▁▁▁▁
eval/runtime,▂▁▁▁▂▁▁▁▁▂▅█
eval/samples_per_second,▇███▇▇▇▇█▆▃▁
eval/specificity,██████▁▁▁▁▁▁

0,1
accuracy,0.3
avg_eval_loss,1.2611
eval/accuracy,0.3
eval/f1,0.13846
eval/loss,1.12045
eval/precision,0.09
eval/recall,0.3
eval/runtime,2.4401
eval/samples_per_second,4.098
eval/specificity,0.66667


[34m[1mwandb[0m: Agent Starting Run: m6rikv8v with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	lr_scheduler_type: cosine
[34m[1mwandb[0m: 	optim: adamw_torch


Training Fold 1/2...


Some weights of MEGVisionTransformer were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing vit.embeddings.cls_token
Freezing vit.embeddings.position_embeddings
Freezing vit.embeddings.patch_embeddings.projection.weight
Freezing vit.embeddings.patch_embeddings.projection.bias
Freezing vit.encoder.layer.0.attention.attention.query.weight
Freezing vit.encoder.layer.0.attention.attention.query.bias
Freezing vit.encoder.layer.0.attention.attention.key.weight
Freezing vit.encoder.layer.0.attention.attention.key.bias
Freezing vit.encoder.layer.0.attention.attention.value.weight
Freezing vit.encoder.layer.0.attention.attention.value.bias
Freezing vit.encoder.layer.0.attention.output.dense.weight
Freezing vit.encoder.layer.0.attention.output.dense.bias
Freezing vit.encoder.layer.0.intermediate.dense.weight
Freezing vit.encoder.layer.0.intermediate.dense.bias
Freezing vit.encoder.layer.0.output.dense.weight
Freezing vit.encoder.layer.0.output.dense.bias
Freezing vit.encoder.layer.0.layernorm_before.weight
Freezing vit.encoder.layer.0.layernorm_before.bias
Freezing vit.encoder



  0%|          | 0/5 [00:00<?, ?it/s]

{'loss': 1.2589, 'grad_norm': 14.474167823791504, 'learning_rate': 1e-05, 'epoch': 0.8}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.1251177787780762, 'train_accuracy': 0.3, 'train_f1': 0.13846153846153847, 'train_precision': 0.09, 'train_recall': 0.3, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.8321, 'train_samples_per_second': 5.458, 'train_steps_per_second': 5.458, 'epoch': 0.8}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.0498793125152588, 'eval_accuracy': 0.5, 'eval_f1': 0.3333333333333333, 'eval_precision': 0.25, 'eval_recall': 0.5, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.8259, 'eval_samples_per_second': 5.477, 'eval_steps_per_second': 5.477, 'epoch': 0.8}
{'loss': 1.0771, 'grad_norm': 7.614552974700928, 'learning_rate': 8.535533905932739e-06, 'epoch': 1.6}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.1218191385269165, 'train_accuracy': 0.3, 'train_f1': 0.13846153846153847, 'train_precision': 0.09, 'train_recall': 0.3, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.8102, 'train_samples_per_second': 5.524, 'train_steps_per_second': 5.524, 'epoch': 1.6}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.048577904701233, 'eval_accuracy': 0.5, 'eval_f1': 0.3333333333333333, 'eval_precision': 0.25, 'eval_recall': 0.5, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7492, 'eval_samples_per_second': 5.717, 'eval_steps_per_second': 5.717, 'epoch': 1.6}
{'loss': 1.0538, 'grad_norm': 10.747528076171875, 'learning_rate': 5e-06, 'epoch': 2.4}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.1190893650054932, 'train_accuracy': 0.3, 'train_f1': 0.13846153846153847, 'train_precision': 0.09, 'train_recall': 0.3, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7908, 'train_samples_per_second': 5.584, 'train_steps_per_second': 5.584, 'epoch': 2.4}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.0474255084991455, 'eval_accuracy': 0.5, 'eval_f1': 0.3333333333333333, 'eval_precision': 0.25, 'eval_recall': 0.5, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7805, 'eval_samples_per_second': 5.616, 'eval_steps_per_second': 5.616, 'epoch': 2.4}
{'loss': 1.0516, 'grad_norm': 6.99785852432251, 'learning_rate': 1.4644660940672628e-06, 'epoch': 3.2}
{'loss': 1.1683, 'grad_norm': 10.936563491821289, 'learning_rate': 0.0, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.1170228719711304, 'train_accuracy': 0.3, 'train_f1': 0.13846153846153847, 'train_precision': 0.09, 'train_recall': 0.3, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.8638, 'train_samples_per_second': 5.365, 'train_steps_per_second': 5.365, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.04659104347229, 'eval_accuracy': 0.5, 'eval_f1': 0.3333333333333333, 'eval_precision': 0.25, 'eval_recall': 0.5, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.9041, 'eval_samples_per_second': 5.252, 'eval_steps_per_second': 5.252, 'epoch': 4.0}
{'train_runtime': 23.9384, 'train_samples_per_second': 2.089, 'train_steps_per_second': 0.209, 'train_loss': 1.121946358680725, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

Training Fold 2/2...


Some weights of MEGVisionTransformer were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing vit.embeddings.cls_token
Freezing vit.embeddings.position_embeddings
Freezing vit.embeddings.patch_embeddings.projection.weight
Freezing vit.embeddings.patch_embeddings.projection.bias
Freezing vit.encoder.layer.0.attention.attention.query.weight
Freezing vit.encoder.layer.0.attention.attention.query.bias
Freezing vit.encoder.layer.0.attention.attention.key.weight
Freezing vit.encoder.layer.0.attention.attention.key.bias
Freezing vit.encoder.layer.0.attention.attention.value.weight
Freezing vit.encoder.layer.0.attention.attention.value.bias
Freezing vit.encoder.layer.0.attention.output.dense.weight
Freezing vit.encoder.layer.0.attention.output.dense.bias
Freezing vit.encoder.layer.0.intermediate.dense.weight
Freezing vit.encoder.layer.0.intermediate.dense.bias
Freezing vit.encoder.layer.0.output.dense.weight
Freezing vit.encoder.layer.0.output.dense.bias
Freezing vit.encoder.layer.0.layernorm_before.weight
Freezing vit.encoder.layer.0.layernorm_before.bias
Freezing vit.encoder



  0%|          | 0/5 [00:00<?, ?it/s]

{'loss': 0.8922, 'grad_norm': 3.3664658069610596, 'learning_rate': 1e-05, 'epoch': 0.8}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.0877230167388916, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7928, 'train_samples_per_second': 5.578, 'train_steps_per_second': 5.578, 'epoch': 0.8}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3348338603973389, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.8239, 'eval_samples_per_second': 5.483, 'eval_steps_per_second': 5.483, 'epoch': 0.8}
{'loss': 1.2481, 'grad_norm': 10.208475112915039, 'learning_rate': 8.535533905932739e-06, 'epoch': 1.6}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.085632562637329, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.8371, 'train_samples_per_second': 5.443, 'train_steps_per_second': 5.443, 'epoch': 1.6}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.331908106803894, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7884, 'eval_samples_per_second': 5.591, 'eval_steps_per_second': 5.591, 'epoch': 1.6}
{'loss': 0.9878, 'grad_norm': 7.806415557861328, 'learning_rate': 5e-06, 'epoch': 2.4}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.083958625793457, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7853, 'train_samples_per_second': 5.601, 'train_steps_per_second': 5.601, 'epoch': 2.4}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.329612374305725, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.8014, 'eval_samples_per_second': 5.551, 'eval_steps_per_second': 5.551, 'epoch': 2.4}
{'loss': 1.2316, 'grad_norm': 10.576590538024902, 'learning_rate': 1.4644660940672628e-06, 'epoch': 3.2}
{'loss': 1.0678, 'grad_norm': 6.151747703552246, 'learning_rate': 0.0, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'train_loss': 1.082637906074524, 'train_accuracy': 0.5, 'train_f1': 0.3333333333333333, 'train_precision': 0.25, 'train_recall': 0.5, 'train_specificity': 0.6666666666666666, 'train_runtime': 1.7905, 'train_samples_per_second': 5.585, 'train_steps_per_second': 5.585, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3276686668395996, 'eval_accuracy': 0.3, 'eval_f1': 0.13846153846153847, 'eval_precision': 0.09, 'eval_recall': 0.3, 'eval_specificity': 0.6666666666666666, 'eval_runtime': 1.7939, 'eval_samples_per_second': 5.574, 'eval_steps_per_second': 5.574, 'epoch': 4.0}
{'train_runtime': 23.1728, 'train_samples_per_second': 2.158, 'train_steps_per_second': 0.216, 'train_loss': 1.085502803325653, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

0,1
accuracy,▁█▁█▁█▁███▁█▁█▁█▁▁
avg_eval_loss,▁
eval/accuracy,█████▁▁▁▁▁
eval/f1,█████▁▁▁▁▁
eval/loss,▁▁▁▁▁█████
eval/precision,█████▁▁▁▁▁
eval/recall,█████▁▁▁▁▁
eval/runtime,▄▁▂▇█▄▂▃▃▄
eval/samples_per_second,▅█▇▂▁▅▆▆▆▅
eval/specificity,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.3
avg_eval_loss,1.18713
eval/accuracy,0.3
eval/f1,0.13846
eval/loss,1.32767
eval/precision,0.09
eval/recall,0.3
eval/runtime,1.8347
eval/samples_per_second,5.45
eval/specificity,0.66667
