In [None]:
%pip install transformers torch datasets scikit-learn

In [1]:
# Import the libraries
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Define the model
# Model 1
#model_checkpoint = "distilbert-base-uncased" 
# Model 2
model_checkpoint = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [3]:
# Load a small, well-known dataset
dataset = load_dataset("ag_news", split="train[:1000]")  # Use subset for speed
texts, labels = list(dataset['text']), list(dataset['label'])

In [9]:
def tokenize(batch):
    return tokenizer(batch, padding='max_length', truncation=True, max_length=128)

tokenized = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
X = tokenized['input_ids'].numpy()
y = np.array(labels)

In [5]:
# Use the Trainer API for simplicity
def run_experiment(random_seed, model_checkpoint):
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    # Split data
    train_idx, test_idx = train_test_split(np.arange(len(y)), test_size=0.3, random_state=random_seed)
    # Build datasets
    class MyDataset(torch.utils.data.Dataset):
        def __init__(self, idxs):
            self.encodings = {k: v[idxs] for k,v in tokenized.items()}
            self.labels = torch.tensor(y[idxs])
        def __getitem__(self, idx):
            return {**{k: v[idx] for k, v in self.encodings.items()}, 'labels': self.labels[idx]}
        def __len__(self): return len(self.labels)
    train_dataset = MyDataset(train_idx)
    test_dataset = MyDataset(test_idx)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_steps=10,
        report_to="none",
        seed=random_seed,
        save_strategy="no",
        disable_tqdm=True,
    )

    def compute_metrics(p):
        preds = np.argmax(p.predictions, axis=1)
        return {"accuracy": accuracy_score(p.label_ids, preds)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result['eval_accuracy']

In [None]:
# Run for Model 1 - distilbert-base-uncased

# Run multiple times to estimate accuracy and its confidence interval
results = []
for seed in range(8):
    acc = run_experiment(random_seed=seed, model_checkpoint=model_checkpoint)
    results.append(acc)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.1967, 'grad_norm': 4.941572189331055, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 0.9378, 'grad_norm': 3.9741854667663574, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.7898, 'grad_norm': 6.888270378112793, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.682, 'grad_norm': 9.499796867370605, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.5578, 'grad_norm': 5.945507049560547, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.5687, 'grad_norm': 4.156579971313477, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.6301, 'grad_norm': 10.236400604248047, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.5132, 'grad_norm': 10.129493713378906, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 14.722, 'train_samples_per_sec

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.358, 'grad_norm': 4.945007801055908, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.0902, 'grad_norm': 4.470273017883301, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9426, 'grad_norm': 4.89274787902832, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.6902, 'grad_norm': 4.711475372314453, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.6907, 'grad_norm': 2.9768786430358887, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.5443, 'grad_norm': 7.343725204467773, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.6351, 'grad_norm': 6.500596523284912, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4502, 'grad_norm': 2.452087640762329, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.1802, 'train_samples_per_secon

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.2799, 'grad_norm': 3.2554075717926025, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.0429, 'grad_norm': 4.115146636962891, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.8543, 'grad_norm': 5.335216999053955, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.8608, 'grad_norm': 8.963418960571289, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.7155, 'grad_norm': 2.959280490875244, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.485, 'grad_norm': 3.0826690196990967, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.4521, 'grad_norm': 5.237246036529541, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4023, 'grad_norm': 7.694278240203857, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.2058, 'train_samples_per_sec

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.2129, 'grad_norm': 3.7248950004577637, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.164, 'grad_norm': 5.067765235900879, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9261, 'grad_norm': 4.0508928298950195, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.8348, 'grad_norm': 4.49784517288208, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.6858, 'grad_norm': 6.085018634796143, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6446, 'grad_norm': 6.382251262664795, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.6099, 'grad_norm': 3.59616756439209, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.5642, 'grad_norm': 3.860992670059204, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.2339, 'train_samples_per_secon

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.3032, 'grad_norm': 2.2060065269470215, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.0539, 'grad_norm': 5.4949517250061035, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9337, 'grad_norm': 6.369175434112549, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.6952, 'grad_norm': 2.7352945804595947, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.691, 'grad_norm': 11.469582557678223, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6066, 'grad_norm': 13.8048095703125, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.589, 'grad_norm': 3.1733040809631348, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4133, 'grad_norm': 5.282905101776123, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.2305, 'train_samples_per_se

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.2939, 'grad_norm': 2.8107080459594727, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.0517, 'grad_norm': 3.518787145614624, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.7899, 'grad_norm': 6.314426898956299, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.7204, 'grad_norm': 5.498767375946045, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.6206, 'grad_norm': 5.85137939453125, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.4843, 'grad_norm': 1.8235493898391724, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.5814, 'grad_norm': 11.651384353637695, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.552, 'grad_norm': 3.542625665664673, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.1576, 'train_samples_per_sec

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.2719, 'grad_norm': 2.980914831161499, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.0443, 'grad_norm': 4.8156256675720215, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.8473, 'grad_norm': 4.634191513061523, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.6507, 'grad_norm': 4.794312477111816, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.615, 'grad_norm': 3.5180466175079346, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.5884, 'grad_norm': 8.332164764404297, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.5751, 'grad_norm': 11.510383605957031, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.5901, 'grad_norm': 3.529625654220581, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.135, 'train_samples_per_sec

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.3185, 'grad_norm': 3.287883758544922, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.0719, 'grad_norm': 3.392120599746704, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.8473, 'grad_norm': 5.117605686187744, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.7289, 'grad_norm': 8.613532066345215, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.7253, 'grad_norm': 3.647578239440918, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.7519, 'grad_norm': 5.809255123138428, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.4676, 'grad_norm': 9.757250785827637, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4899, 'grad_norm': 4.472189426422119, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 11.0907, 'train_samples_per_seco

In [None]:
# Confidence interval on Model 1 results
def compute_ci(data, confidence=0.95):
    mean = np.mean(data)
    std_err = np.std(data, ddof=1)/np.sqrt(len(data))
    ci = std_err * 1.96  # For 95%
    return (mean - ci, mean, mean + ci)

ci = compute_ci(results)
print(f"\nModel: {model_checkpoint}")
print(f"Accuracies: {np.round(results, 3)}")
print(f"Mean={ci[1]:.3f}, 95% CI=({ci[0]:.3f}, {ci[2]:.3f})\n")


Model: distilbert-base-uncased
Accuracies: [0.85  0.847 0.86  0.843 0.83  0.863 0.877 0.813]
Mean=0.848, 95% CI=(0.834, 0.862)



In [10]:
# Run for Model 2 - albert-base-v2

# Run multiple times to estimate accuracy and its confidence interval
results = []
for seed in range(8):
    acc = run_experiment(random_seed=seed, model_checkpoint=model_checkpoint)
    results.append(acc)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.264, 'grad_norm': 21.015626907348633, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.1941, 'grad_norm': 15.182701110839844, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9254, 'grad_norm': 23.20159912109375, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.7785, 'grad_norm': 15.699048042297363, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.6744, 'grad_norm': 24.603622436523438, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.705, 'grad_norm': 21.13383674621582, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.6585, 'grad_norm': 34.009437561035156, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4859, 'grad_norm': 58.19181823730469, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.6827, 'train_samples_per_s

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.3173, 'grad_norm': 23.26309585571289, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.1604, 'grad_norm': 17.835969924926758, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 1.0021, 'grad_norm': 47.85809326171875, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.7271, 'grad_norm': 26.501739501953125, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.7673, 'grad_norm': 9.426093101501465, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6954, 'grad_norm': 19.708019256591797, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.726, 'grad_norm': 28.26484489440918, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4878, 'grad_norm': 9.90575885772705, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.2416, 'train_samples_per_sec

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.3699, 'grad_norm': 12.71481990814209, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.2555, 'grad_norm': 10.566231727600098, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 1.0062, 'grad_norm': 11.591489791870117, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.8186, 'grad_norm': 18.825834274291992, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.7294, 'grad_norm': 11.901957511901855, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.5241, 'grad_norm': 3.7994890213012695, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.6169, 'grad_norm': 10.25279712677002, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.4181, 'grad_norm': 60.55865478515625, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.2533, 'train_samples_per

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.1992, 'grad_norm': 10.974956512451172, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.1628, 'grad_norm': 54.9497184753418, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9926, 'grad_norm': 25.61110496520996, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.8278, 'grad_norm': 97.79493713378906, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.85, 'grad_norm': 11.796147346496582, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.7473, 'grad_norm': 19.199811935424805, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.7059, 'grad_norm': 6.365169048309326, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.6117, 'grad_norm': 19.67987823486328, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.2497, 'train_samples_per_seco

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.3418, 'grad_norm': 21.000940322875977, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 0.9672, 'grad_norm': 29.98208236694336, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.811, 'grad_norm': 48.89148712158203, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.9171, 'grad_norm': 14.86322021484375, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.8263, 'grad_norm': 72.09468078613281, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.7495, 'grad_norm': 14.349154472351074, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.7705, 'grad_norm': 27.50349235534668, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.639, 'grad_norm': 17.12796401977539, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.2705, 'train_samples_per_seco

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.282, 'grad_norm': 53.42741394042969, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.2383, 'grad_norm': 53.623291015625, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9448, 'grad_norm': 40.14274978637695, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.8599, 'grad_norm': 157.10487365722656, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.805, 'grad_norm': 203.35594177246094, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.7164, 'grad_norm': 29.50766372680664, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.889, 'grad_norm': 127.65093231201172, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.7518, 'grad_norm': 18.75395393371582, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.3625, 'train_samples_per_second

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.2957, 'grad_norm': 35.07734298706055, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.1279, 'grad_norm': 39.68547439575195, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 0.9183, 'grad_norm': 46.67231750488281, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 0.7643, 'grad_norm': 23.1479434967041, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.6756, 'grad_norm': 17.044095993041992, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.7803, 'grad_norm': 34.34391784667969, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.6844, 'grad_norm': 44.39559555053711, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.6508, 'grad_norm': 20.664199829101562, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.2817, 'train_samples_per_sec

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.4055, 'grad_norm': 6.46197509765625, 'learning_rate': 4.488636363636364e-05, 'epoch': 0.11363636363636363}
{'loss': 1.294, 'grad_norm': 30.53546142578125, 'learning_rate': 3.9204545454545456e-05, 'epoch': 0.22727272727272727}
{'loss': 1.1865, 'grad_norm': 14.483648300170898, 'learning_rate': 3.352272727272727e-05, 'epoch': 0.3409090909090909}
{'loss': 1.1072, 'grad_norm': 43.16218566894531, 'learning_rate': 2.784090909090909e-05, 'epoch': 0.45454545454545453}
{'loss': 0.9709, 'grad_norm': 32.465057373046875, 'learning_rate': 2.215909090909091e-05, 'epoch': 0.5681818181818182}
{'loss': 0.8833, 'grad_norm': 39.17475891113281, 'learning_rate': 1.6477272727272726e-05, 'epoch': 0.6818181818181818}
{'loss': 0.7044, 'grad_norm': 19.659196853637695, 'learning_rate': 1.0795454545454547e-05, 'epoch': 0.7954545454545454}
{'loss': 0.8648, 'grad_norm': 27.460329055786133, 'learning_rate': 5.113636363636364e-06, 'epoch': 0.9090909090909091}
{'train_runtime': 21.293, 'train_samples_per_sec

In [11]:
# Confidence interval on Model 2 results
def compute_ci(data, confidence=0.95):
    mean = np.mean(data)
    std_err = np.std(data, ddof=1)/np.sqrt(len(data))
    ci = std_err * 1.96  # For 95%
    return (mean - ci, mean, mean + ci)

ci = compute_ci(results)
print(f"\nModel: {model_checkpoint}")
print(f"Accuracies: {np.round(results, 3)}")
print(f"Mean={ci[1]:.3f}, 95% CI=({ci[0]:.3f}, {ci[2]:.3f})\n")


Model: albert-base-v2
Accuracies: [0.837 0.797 0.85  0.843 0.813 0.78  0.84  0.777]
Mean=0.817, 95% CI=(0.797, 0.838)

