In [1]:
pip install transformers datasets torch scikit-learn --break-system-packages


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Load a dataset (replace with your own)
dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"})

dataset = dataset.rename_column("is_sarcastic", "labels")


  from pandas.core import (
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["headline"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)




In [4]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Convert labels to integers (if necessary)
num_labels = len(set(dataset["train"]["labels"]))


2025-02-21 13:13:46.708160: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-21 13:13:46.878998: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-21 13:13:46.967911: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-21 13:13:46.994091: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-21 13:13:47.130591: I tensorflow/core/platform/cpu_feature_guar

In [5]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch.nn as nn

# Load model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 🔹 1. Freeze Initial Layers (Only Train Last Few Layers)
for param in model.distilbert.parameters():  
    param.requires_grad = False  # Freeze DistilBERT base layers

# Unfreeze the last transformer layer
for param in model.distilbert.transformer.layer[-1].parameters():
    param.requires_grad = True  # Keep last transformer layer trainable

# 🔹 2. Apply Dropout and L2 Regularization
class CustomDistilBERT(nn.Module):
    def __init__(self, model):
        super(CustomDistilBERT, self).__init__()
        self.model = model
        self.dropout = nn.Dropout(0.3)  # Dropout to prevent overfitting
        self.classifier = nn.Linear(768, 2)  # Adjust for binary classification

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model.distilbert(input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Take CLS token representation
        dropped = self.dropout(hidden_state)  # Apply dropout
        logits = self.classifier(dropped)  # Classification layer
        return {"logits": logits}

# Replace original model with custom model
model = CustomDistilBERT(model)

# 🔹 3. Set Training Arguments (L2 Regularization + Early Stopping)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,  # L2 regularization
    load_best_model_at_end=True,  # Load best model using validation loss
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    report_to="none",
)

# 🔹 4. Enable Early Stopping
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Stops if val loss doesn’t improve for 2 epochs
)

# Train the model
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'validation'

In [21]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",         # Evaluate at the end of each epoch
    logging_strategy="epoch",            # Log at the end of each epoch
    logging_dir="./logs",                # Directory for storing logs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (DistilBertTokenizerFast, DistilBertForSequenceClassification, 
                          TrainingArguments, Trainer, EarlyStoppingCallback)
import torch.nn as nn
import torch
import numpy as np
import re

# 1. Load your dataset (assuming CSV with columns "headline" and "is_sarcastic")
data = pd.read_csv("train.csv")

# 1a. Convert any dictionary entries in the "headline" column to strings.
def extract_text(x):
    if isinstance(x, dict):
        return x.get("content", str(x))
    return str(x)

data["headline"] = data["headline"].apply(extract_text)

# 1b. Clean the text in the "headline" column
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data["headline"] = data["headline"].apply(clean_text)

# 2. Split the DataFrame into train and validation (90:10 ratio)
train_df, val_df = train_test_split(data, test_size=0.10, random_state=42, stratify=data["is_sarcastic"])

# 3. Convert Pandas DataFrames into Hugging Face Datasets using Dataset.from_pandas()
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Create a DatasetDict for convenience
datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})

# 4. Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# 5. Tokenize the datasets; note that we use "headline" instead of "text"
def tokenize_function(examples):
    return tokenizer(examples["headline"], padding="max_length", truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Rename "is_sarcastic" to "labels" so the Trainer can find them.
tokenized_datasets = tokenized_datasets.rename_column("is_sarcastic", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 6. Load the DistilBERT model. The number of labels is determined by the number of unique values in "is_sarcastic".
base_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=len(data["is_sarcastic"].unique())
)

# 7. Freeze all DistilBERT parameters except the last transformer layer.
for param in base_model.distilbert.parameters():
    param.requires_grad = False
for param in base_model.distilbert.transformer.layer[-1].parameters():
    param.requires_grad = True

# 8. Create a custom model wrapper to add dropout.
class CustomDistilBert(nn.Module):
    def __init__(self, base_model, dropout_rate=0.3):
        super(CustomDistilBert, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_rate)
        # Use the classifier from base_model.
        self.classifier = base_model.classifier

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.base_model.distilbert(input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Use CLS token representation
        dropped = self.dropout(hidden_state)
        logits = self.classifier(dropped)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
        return {"loss": loss, "logits": logits}

# Replace base_model with the custom model
model = CustomDistilBert(base_model)

# 9. Define training arguments with weight decay and early stopping.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# 10. Define a simple compute_metrics function (calculating accuracy)
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

# 11. Create the Trainer and add EarlyStoppingCallback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 12. Train the model.
trainer.train()

# After training, you can evaluate, save the model, or perform inference as needed.


Map: 100%|██████████| 20605/20605 [00:03<00:00, 5601.14 examples/s]
Map: 100%|██████████| 2290/2290 [00:00<00:00, 6143.55 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 10/6440 [01:16<13:43:46,  7.69s/it]

{'loss': 0.6927, 'grad_norm': 3.139820098876953, 'learning_rate': 4.992236024844721e-05, 'epoch': 0.01}


  0%|          | 20/6440 [02:32<13:26:20,  7.54s/it]

{'loss': 0.6649, 'grad_norm': 1.815093755722046, 'learning_rate': 4.984472049689442e-05, 'epoch': 0.02}


  0%|          | 30/6440 [03:46<12:36:29,  7.08s/it]

{'loss': 0.6468, 'grad_norm': 3.4361214637756348, 'learning_rate': 4.976708074534161e-05, 'epoch': 0.02}


  1%|          | 40/6440 [04:57<12:49:17,  7.21s/it]

{'loss': 0.6146, 'grad_norm': 1.8476978540420532, 'learning_rate': 4.968944099378882e-05, 'epoch': 0.03}


  1%|          | 50/6440 [06:09<12:59:36,  7.32s/it]

{'loss': 0.5512, 'grad_norm': 2.548879384994507, 'learning_rate': 4.961180124223603e-05, 'epoch': 0.04}


  1%|          | 60/6440 [07:24<13:46:21,  7.77s/it]

{'loss': 0.5268, 'grad_norm': 2.097477436065674, 'learning_rate': 4.9534161490683236e-05, 'epoch': 0.05}


  1%|          | 70/6440 [08:36<12:24:52,  7.02s/it]

{'loss': 0.5311, 'grad_norm': 2.476260185241699, 'learning_rate': 4.945652173913044e-05, 'epoch': 0.05}


  1%|          | 80/6440 [09:47<12:39:58,  7.17s/it]

{'loss': 0.518, 'grad_norm': 1.866965651512146, 'learning_rate': 4.937888198757764e-05, 'epoch': 0.06}


  1%|▏         | 90/6440 [10:57<12:31:23,  7.10s/it]

{'loss': 0.4815, 'grad_norm': 1.7802932262420654, 'learning_rate': 4.9301242236024846e-05, 'epoch': 0.07}


  2%|▏         | 100/6440 [12:10<12:41:44,  7.21s/it]

{'loss': 0.4982, 'grad_norm': 2.312425136566162, 'learning_rate': 4.9223602484472054e-05, 'epoch': 0.08}


  2%|▏         | 110/6440 [13:20<12:13:00,  6.95s/it]

{'loss': 0.4802, 'grad_norm': 3.178335428237915, 'learning_rate': 4.914596273291926e-05, 'epoch': 0.09}


  2%|▏         | 120/6440 [14:28<11:55:35,  6.79s/it]

{'loss': 0.4747, 'grad_norm': 2.713679075241089, 'learning_rate': 4.906832298136646e-05, 'epoch': 0.09}


  2%|▏         | 130/6440 [15:37<12:05:16,  6.90s/it]

{'loss': 0.491, 'grad_norm': 3.4979820251464844, 'learning_rate': 4.8990683229813664e-05, 'epoch': 0.1}


  2%|▏         | 140/6440 [16:47<12:25:47,  7.10s/it]

{'loss': 0.5437, 'grad_norm': 3.215796709060669, 'learning_rate': 4.891304347826087e-05, 'epoch': 0.11}


  2%|▏         | 150/6440 [18:00<12:45:35,  7.30s/it]

{'loss': 0.4245, 'grad_norm': 2.239666700363159, 'learning_rate': 4.883540372670808e-05, 'epoch': 0.12}


  2%|▏         | 160/6440 [19:12<12:27:11,  7.14s/it]

{'loss': 0.4614, 'grad_norm': 3.057353973388672, 'learning_rate': 4.875776397515528e-05, 'epoch': 0.12}


  3%|▎         | 170/6440 [20:22<12:04:47,  6.94s/it]

{'loss': 0.4343, 'grad_norm': 2.754807949066162, 'learning_rate': 4.868012422360249e-05, 'epoch': 0.13}


  3%|▎         | 180/6440 [21:33<12:15:10,  7.05s/it]

{'loss': 0.4606, 'grad_norm': 5.8200507164001465, 'learning_rate': 4.860248447204969e-05, 'epoch': 0.14}


  3%|▎         | 190/6440 [22:44<12:19:23,  7.10s/it]

{'loss': 0.4708, 'grad_norm': 2.1326346397399902, 'learning_rate': 4.85248447204969e-05, 'epoch': 0.15}


  3%|▎         | 200/6440 [23:57<13:11:02,  7.61s/it]

{'loss': 0.4049, 'grad_norm': 2.4714953899383545, 'learning_rate': 4.8447204968944106e-05, 'epoch': 0.16}


  3%|▎         | 210/6440 [25:11<12:54:15,  7.46s/it]

{'loss': 0.415, 'grad_norm': 3.02274751663208, 'learning_rate': 4.836956521739131e-05, 'epoch': 0.16}


  3%|▎         | 220/6440 [26:21<12:10:52,  7.05s/it]

{'loss': 0.4117, 'grad_norm': 4.6507158279418945, 'learning_rate': 4.829192546583851e-05, 'epoch': 0.17}


  4%|▎         | 230/6440 [27:32<12:21:33,  7.16s/it]

{'loss': 0.3363, 'grad_norm': 2.730354070663452, 'learning_rate': 4.8214285714285716e-05, 'epoch': 0.18}


  4%|▎         | 240/6440 [28:46<13:00:49,  7.56s/it]

{'loss': 0.3955, 'grad_norm': 3.58805513381958, 'learning_rate': 4.8136645962732924e-05, 'epoch': 0.19}


  4%|▍         | 250/6440 [30:01<12:30:18,  7.27s/it]

{'loss': 0.3952, 'grad_norm': 3.643188238143921, 'learning_rate': 4.8059006211180125e-05, 'epoch': 0.19}


  4%|▍         | 260/6440 [31:13<12:27:29,  7.26s/it]

{'loss': 0.391, 'grad_norm': 1.833207368850708, 'learning_rate': 4.798136645962733e-05, 'epoch': 0.2}


  4%|▍         | 270/6440 [32:27<12:06:13,  7.06s/it]

{'loss': 0.3691, 'grad_norm': 2.4155492782592773, 'learning_rate': 4.7903726708074534e-05, 'epoch': 0.21}


  4%|▍         | 280/6440 [33:35<11:40:59,  6.83s/it]

{'loss': 0.3675, 'grad_norm': 2.0260331630706787, 'learning_rate': 4.782608695652174e-05, 'epoch': 0.22}


  5%|▍         | 290/6440 [34:43<11:36:02,  6.79s/it]

{'loss': 0.4559, 'grad_norm': 3.6935505867004395, 'learning_rate': 4.774844720496895e-05, 'epoch': 0.23}


  5%|▍         | 300/6440 [35:51<11:34:02,  6.78s/it]

{'loss': 0.3977, 'grad_norm': 6.218353748321533, 'learning_rate': 4.767080745341615e-05, 'epoch': 0.23}


  5%|▍         | 310/6440 [37:00<11:44:40,  6.90s/it]

{'loss': 0.3864, 'grad_norm': 5.301943302154541, 'learning_rate': 4.759316770186336e-05, 'epoch': 0.24}


  5%|▍         | 320/6440 [38:11<12:07:09,  7.13s/it]

{'loss': 0.33, 'grad_norm': 4.77354621887207, 'learning_rate': 4.751552795031056e-05, 'epoch': 0.25}


  5%|▌         | 330/6440 [39:31<16:21:09,  9.63s/it]

{'loss': 0.3393, 'grad_norm': 1.7498729228973389, 'learning_rate': 4.743788819875777e-05, 'epoch': 0.26}


  5%|▌         | 340/6440 [41:32<20:14:53, 11.95s/it]

{'loss': 0.3888, 'grad_norm': 4.686152935028076, 'learning_rate': 4.736024844720497e-05, 'epoch': 0.26}


  5%|▌         | 350/6440 [42:50<11:34:47,  6.85s/it]

{'loss': 0.3221, 'grad_norm': 3.1995859146118164, 'learning_rate': 4.7282608695652177e-05, 'epoch': 0.27}


  6%|▌         | 360/6440 [43:57<11:04:36,  6.56s/it]

{'loss': 0.3167, 'grad_norm': 3.27877140045166, 'learning_rate': 4.7204968944099384e-05, 'epoch': 0.28}


  6%|▌         | 370/6440 [45:03<11:02:18,  6.55s/it]

{'loss': 0.3416, 'grad_norm': 3.9813849925994873, 'learning_rate': 4.7127329192546586e-05, 'epoch': 0.29}


  6%|▌         | 380/6440 [46:08<10:58:37,  6.52s/it]

{'loss': 0.4134, 'grad_norm': 4.193871021270752, 'learning_rate': 4.7049689440993793e-05, 'epoch': 0.3}


  6%|▌         | 390/6440 [47:14<11:10:48,  6.65s/it]

{'loss': 0.338, 'grad_norm': 3.3145065307617188, 'learning_rate': 4.6972049689440995e-05, 'epoch': 0.3}


  6%|▌         | 400/6440 [48:19<10:56:38,  6.52s/it]

{'loss': 0.4154, 'grad_norm': 2.3414905071258545, 'learning_rate': 4.68944099378882e-05, 'epoch': 0.31}


  6%|▋         | 410/6440 [49:25<10:52:39,  6.49s/it]

{'loss': 0.3535, 'grad_norm': 3.797760009765625, 'learning_rate': 4.681677018633541e-05, 'epoch': 0.32}


  7%|▋         | 420/6440 [50:30<10:53:46,  6.52s/it]

{'loss': 0.3719, 'grad_norm': 2.041796922683716, 'learning_rate': 4.673913043478261e-05, 'epoch': 0.33}


  7%|▋         | 430/6440 [51:35<10:51:41,  6.51s/it]

{'loss': 0.3186, 'grad_norm': 3.2017037868499756, 'learning_rate': 4.666149068322981e-05, 'epoch': 0.33}


  7%|▋         | 440/6440 [52:40<10:49:01,  6.49s/it]

{'loss': 0.3839, 'grad_norm': 2.8430066108703613, 'learning_rate': 4.658385093167702e-05, 'epoch': 0.34}


  7%|▋         | 450/6440 [53:45<10:57:29,  6.59s/it]

{'loss': 0.3707, 'grad_norm': 3.8788583278656006, 'learning_rate': 4.650621118012423e-05, 'epoch': 0.35}


  7%|▋         | 460/6440 [54:50<10:47:21,  6.50s/it]

{'loss': 0.3527, 'grad_norm': 5.0272016525268555, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.36}


  7%|▋         | 470/6440 [55:55<10:46:13,  6.49s/it]

{'loss': 0.3655, 'grad_norm': 7.646906852722168, 'learning_rate': 4.635093167701863e-05, 'epoch': 0.36}


  7%|▋         | 480/6440 [57:01<10:44:59,  6.49s/it]

{'loss': 0.3334, 'grad_norm': 1.7325174808502197, 'learning_rate': 4.627329192546584e-05, 'epoch': 0.37}


  8%|▊         | 490/6440 [58:05<10:42:21,  6.48s/it]

{'loss': 0.3135, 'grad_norm': 4.537245273590088, 'learning_rate': 4.6195652173913046e-05, 'epoch': 0.38}


  8%|▊         | 500/6440 [59:10<10:41:56,  6.48s/it]

{'loss': 0.3283, 'grad_norm': 3.9079160690307617, 'learning_rate': 4.6118012422360254e-05, 'epoch': 0.39}


  8%|▊         | 510/6440 [1:00:15<10:40:03,  6.48s/it]

{'loss': 0.3074, 'grad_norm': 3.879517078399658, 'learning_rate': 4.6040372670807455e-05, 'epoch': 0.4}


  8%|▊         | 520/6440 [1:01:20<10:39:20,  6.48s/it]

{'loss': 0.3814, 'grad_norm': 2.849317789077759, 'learning_rate': 4.5962732919254656e-05, 'epoch': 0.4}


  8%|▊         | 530/6440 [1:02:25<10:41:42,  6.51s/it]

{'loss': 0.3683, 'grad_norm': 3.6459834575653076, 'learning_rate': 4.5885093167701864e-05, 'epoch': 0.41}


  8%|▊         | 540/6440 [1:03:30<10:37:53,  6.49s/it]

{'loss': 0.3286, 'grad_norm': 2.2463576793670654, 'learning_rate': 4.580745341614907e-05, 'epoch': 0.42}


  9%|▊         | 550/6440 [1:04:35<10:38:36,  6.51s/it]

{'loss': 0.2911, 'grad_norm': 2.7348129749298096, 'learning_rate': 4.572981366459628e-05, 'epoch': 0.43}


  9%|▊         | 560/6440 [1:05:46<11:47:26,  7.22s/it]

{'loss': 0.3356, 'grad_norm': 3.8684489727020264, 'learning_rate': 4.565217391304348e-05, 'epoch': 0.43}


  9%|▉         | 570/6440 [1:06:56<11:22:54,  6.98s/it]

{'loss': 0.3273, 'grad_norm': 2.957855224609375, 'learning_rate': 4.557453416149068e-05, 'epoch': 0.44}


  9%|▉         | 580/6440 [1:08:05<11:11:41,  6.88s/it]

{'loss': 0.3556, 'grad_norm': 2.307860851287842, 'learning_rate': 4.549689440993789e-05, 'epoch': 0.45}


  9%|▉         | 590/6440 [1:09:14<11:16:00,  6.93s/it]

{'loss': 0.443, 'grad_norm': 3.3947012424468994, 'learning_rate': 4.54192546583851e-05, 'epoch': 0.46}


  9%|▉         | 600/6440 [1:10:23<11:06:21,  6.85s/it]

{'loss': 0.2283, 'grad_norm': 2.2569210529327393, 'learning_rate': 4.5341614906832306e-05, 'epoch': 0.47}


  9%|▉         | 610/6440 [1:11:32<11:11:37,  6.91s/it]

{'loss': 0.2704, 'grad_norm': 3.2116823196411133, 'learning_rate': 4.52639751552795e-05, 'epoch': 0.47}


 10%|▉         | 620/6440 [1:12:42<11:10:02,  6.91s/it]

{'loss': 0.3893, 'grad_norm': 2.1368658542633057, 'learning_rate': 4.518633540372671e-05, 'epoch': 0.48}


 10%|▉         | 630/6440 [1:13:51<11:07:02,  6.89s/it]

{'loss': 0.4201, 'grad_norm': 4.765082836151123, 'learning_rate': 4.5108695652173916e-05, 'epoch': 0.49}


 10%|▉         | 640/6440 [1:14:59<11:06:14,  6.89s/it]

{'loss': 0.3315, 'grad_norm': 3.802731990814209, 'learning_rate': 4.5031055900621124e-05, 'epoch': 0.5}


 10%|█         | 650/6440 [1:16:08<11:03:53,  6.88s/it]

{'loss': 0.3928, 'grad_norm': 3.1622207164764404, 'learning_rate': 4.4953416149068325e-05, 'epoch': 0.5}


 10%|█         | 660/6440 [1:17:18<11:08:26,  6.94s/it]

{'loss': 0.2517, 'grad_norm': 2.1222190856933594, 'learning_rate': 4.4875776397515526e-05, 'epoch': 0.51}


 10%|█         | 670/6440 [1:18:27<11:05:39,  6.92s/it]

{'loss': 0.3407, 'grad_norm': 4.3095502853393555, 'learning_rate': 4.4798136645962734e-05, 'epoch': 0.52}


 11%|█         | 680/6440 [1:19:36<11:01:28,  6.89s/it]

{'loss': 0.3908, 'grad_norm': 1.0521281957626343, 'learning_rate': 4.472049689440994e-05, 'epoch': 0.53}


 11%|█         | 690/6440 [1:20:44<10:57:24,  6.86s/it]

{'loss': 0.2501, 'grad_norm': 2.111403226852417, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.54}


 11%|█         | 700/6440 [1:21:53<11:00:38,  6.91s/it]

{'loss': 0.3257, 'grad_norm': 2.9346206188201904, 'learning_rate': 4.456521739130435e-05, 'epoch': 0.54}


 11%|█         | 710/6440 [1:23:02<10:57:22,  6.88s/it]

{'loss': 0.333, 'grad_norm': 4.788524627685547, 'learning_rate': 4.448757763975155e-05, 'epoch': 0.55}


 11%|█         | 720/6440 [1:24:11<10:57:41,  6.90s/it]

{'loss': 0.4018, 'grad_norm': 3.7099223136901855, 'learning_rate': 4.440993788819876e-05, 'epoch': 0.56}


 11%|█▏        | 730/6440 [1:25:19<10:55:06,  6.88s/it]

{'loss': 0.3432, 'grad_norm': 2.001863956451416, 'learning_rate': 4.433229813664597e-05, 'epoch': 0.57}


 11%|█▏        | 740/6440 [1:26:29<10:58:05,  6.93s/it]

{'loss': 0.2918, 'grad_norm': 4.619111061096191, 'learning_rate': 4.425465838509317e-05, 'epoch': 0.57}


 12%|█▏        | 750/6440 [1:27:39<11:04:55,  7.01s/it]

{'loss': 0.2955, 'grad_norm': 1.1550357341766357, 'learning_rate': 4.4177018633540377e-05, 'epoch': 0.58}


 12%|█▏        | 760/6440 [1:28:48<11:00:07,  6.97s/it]

{'loss': 0.2883, 'grad_norm': 4.683504581451416, 'learning_rate': 4.409937888198758e-05, 'epoch': 0.59}


 12%|█▏        | 770/6440 [1:29:56<10:39:27,  6.77s/it]

{'loss': 0.3679, 'grad_norm': 3.645669460296631, 'learning_rate': 4.4021739130434786e-05, 'epoch': 0.6}


 12%|█▏        | 780/6440 [1:31:04<10:40:09,  6.79s/it]

{'loss': 0.3714, 'grad_norm': 3.5866260528564453, 'learning_rate': 4.3944099378881993e-05, 'epoch': 0.61}


 12%|█▏        | 790/6440 [1:32:12<10:37:47,  6.77s/it]

{'loss': 0.3886, 'grad_norm': 5.305403232574463, 'learning_rate': 4.3866459627329195e-05, 'epoch': 0.61}


 12%|█▏        | 800/6440 [1:33:19<10:20:57,  6.61s/it]

{'loss': 0.3663, 'grad_norm': 6.3890862464904785, 'learning_rate': 4.3788819875776396e-05, 'epoch': 0.62}


 13%|█▎        | 810/6440 [1:34:25<10:30:11,  6.72s/it]

{'loss': 0.3165, 'grad_norm': 2.4389894008636475, 'learning_rate': 4.3711180124223603e-05, 'epoch': 0.63}


 13%|█▎        | 820/6440 [1:35:31<10:13:28,  6.55s/it]

{'loss': 0.2902, 'grad_norm': 2.7280142307281494, 'learning_rate': 4.363354037267081e-05, 'epoch': 0.64}


 13%|█▎        | 830/6440 [1:36:37<10:14:35,  6.57s/it]

{'loss': 0.328, 'grad_norm': 4.974148273468018, 'learning_rate': 4.355590062111801e-05, 'epoch': 0.64}


 13%|█▎        | 840/6440 [1:37:42<10:12:27,  6.56s/it]

{'loss': 0.353, 'grad_norm': 4.597991943359375, 'learning_rate': 4.347826086956522e-05, 'epoch': 0.65}


 13%|█▎        | 850/6440 [1:38:49<10:21:50,  6.67s/it]

{'loss': 0.311, 'grad_norm': 3.748033046722412, 'learning_rate': 4.340062111801242e-05, 'epoch': 0.66}


 13%|█▎        | 860/6440 [1:39:58<10:47:34,  6.96s/it]

{'loss': 0.4177, 'grad_norm': 1.897080898284912, 'learning_rate': 4.332298136645963e-05, 'epoch': 0.67}


 14%|█▎        | 870/6440 [1:41:08<10:45:55,  6.96s/it]

{'loss': 0.3208, 'grad_norm': 3.1335086822509766, 'learning_rate': 4.324534161490684e-05, 'epoch': 0.68}


 14%|█▎        | 880/6440 [1:42:17<10:39:38,  6.90s/it]

{'loss': 0.3225, 'grad_norm': 3.9145700931549072, 'learning_rate': 4.316770186335404e-05, 'epoch': 0.68}


 14%|█▍        | 890/6440 [1:43:27<10:36:07,  6.88s/it]

{'loss': 0.3483, 'grad_norm': 5.840506553649902, 'learning_rate': 4.3090062111801246e-05, 'epoch': 0.69}


 14%|█▍        | 900/6440 [1:44:35<10:37:32,  6.90s/it]

{'loss': 0.3427, 'grad_norm': 4.980530261993408, 'learning_rate': 4.301242236024845e-05, 'epoch': 0.7}


 14%|█▍        | 910/6440 [1:45:44<10:31:33,  6.85s/it]

{'loss': 0.3568, 'grad_norm': 2.4886813163757324, 'learning_rate': 4.2934782608695655e-05, 'epoch': 0.71}


 14%|█▍        | 920/6440 [1:46:53<10:33:39,  6.89s/it]

{'loss': 0.2878, 'grad_norm': 3.7645723819732666, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.71}


 14%|█▍        | 930/6440 [1:48:02<10:38:56,  6.96s/it]

{'loss': 0.3097, 'grad_norm': 3.2867205142974854, 'learning_rate': 4.2779503105590064e-05, 'epoch': 0.72}


 15%|█▍        | 940/6440 [1:49:09<10:10:41,  6.66s/it]

{'loss': 0.3121, 'grad_norm': 3.844292640686035, 'learning_rate': 4.270186335403727e-05, 'epoch': 0.73}


 15%|█▍        | 950/6440 [1:50:17<10:34:01,  6.93s/it]

{'loss': 0.4668, 'grad_norm': 6.463583469390869, 'learning_rate': 4.262422360248447e-05, 'epoch': 0.74}


 15%|█▍        | 960/6440 [1:51:27<10:34:03,  6.94s/it]

{'loss': 0.3448, 'grad_norm': 1.3960376977920532, 'learning_rate': 4.254658385093168e-05, 'epoch': 0.75}


 15%|█▌        | 970/6440 [1:52:36<10:26:22,  6.87s/it]

{'loss': 0.3259, 'grad_norm': 4.277111053466797, 'learning_rate': 4.246894409937888e-05, 'epoch': 0.75}


 15%|█▌        | 980/6440 [1:53:45<10:30:21,  6.93s/it]

{'loss': 0.2918, 'grad_norm': 4.153024196624756, 'learning_rate': 4.239130434782609e-05, 'epoch': 0.76}


 15%|█▌        | 990/6440 [1:54:55<10:32:34,  6.96s/it]

{'loss': 0.3198, 'grad_norm': 3.3441052436828613, 'learning_rate': 4.23136645962733e-05, 'epoch': 0.77}


 16%|█▌        | 1000/6440 [1:56:03<10:23:43,  6.88s/it]

{'loss': 0.3084, 'grad_norm': 6.43623685836792, 'learning_rate': 4.22360248447205e-05, 'epoch': 0.78}


 16%|█▌        | 1010/6440 [1:57:12<10:20:38,  6.86s/it]

{'loss': 0.2884, 'grad_norm': 2.590911388397217, 'learning_rate': 4.21583850931677e-05, 'epoch': 0.78}


 16%|█▌        | 1020/6440 [1:58:21<10:18:51,  6.85s/it]

{'loss': 0.3384, 'grad_norm': 2.6532514095306396, 'learning_rate': 4.208074534161491e-05, 'epoch': 0.79}


 16%|█▌        | 1030/6440 [1:59:30<10:19:33,  6.87s/it]

{'loss': 0.3168, 'grad_norm': 1.7549906969070435, 'learning_rate': 4.2003105590062116e-05, 'epoch': 0.8}


 16%|█▌        | 1040/6440 [2:00:35<9:49:29,  6.55s/it] 

{'loss': 0.3586, 'grad_norm': 4.466508388519287, 'learning_rate': 4.192546583850932e-05, 'epoch': 0.81}


 16%|█▋        | 1050/6440 [2:01:43<10:13:37,  6.83s/it]

{'loss': 0.3189, 'grad_norm': 2.5736746788024902, 'learning_rate': 4.1847826086956525e-05, 'epoch': 0.82}


 16%|█▋        | 1060/6440 [2:02:53<10:23:35,  6.95s/it]

{'loss': 0.2459, 'grad_norm': 3.283946990966797, 'learning_rate': 4.1770186335403726e-05, 'epoch': 0.82}


 17%|█▋        | 1070/6440 [2:04:03<10:27:38,  7.01s/it]

{'loss': 0.3156, 'grad_norm': 1.2767102718353271, 'learning_rate': 4.1692546583850934e-05, 'epoch': 0.83}


 17%|█▋        | 1080/6440 [2:05:14<10:26:34,  7.01s/it]

{'loss': 0.303, 'grad_norm': 4.112646579742432, 'learning_rate': 4.161490683229814e-05, 'epoch': 0.84}


 17%|█▋        | 1090/6440 [2:06:26<10:18:31,  6.94s/it]

{'loss': 0.2307, 'grad_norm': 3.5477616786956787, 'learning_rate': 4.153726708074534e-05, 'epoch': 0.85}


 17%|█▋        | 1100/6440 [2:07:33<10:20:44,  6.97s/it]

{'loss': 0.2879, 'grad_norm': 6.435217380523682, 'learning_rate': 4.1459627329192544e-05, 'epoch': 0.85}


 17%|█▋        | 1110/6440 [2:08:51<11:25:19,  7.71s/it]

{'loss': 0.2222, 'grad_norm': 3.1403086185455322, 'learning_rate': 4.138198757763975e-05, 'epoch': 0.86}


 17%|█▋        | 1120/6440 [2:10:14<12:35:21,  8.52s/it]

{'loss': 0.3296, 'grad_norm': 4.268774509429932, 'learning_rate': 4.130434782608696e-05, 'epoch': 0.87}


 18%|█▊        | 1130/6440 [2:11:41<12:41:02,  8.60s/it]

{'loss': 0.2619, 'grad_norm': 4.234813213348389, 'learning_rate': 4.122670807453417e-05, 'epoch': 0.88}


 18%|█▊        | 1140/6440 [2:13:06<12:35:50,  8.56s/it]

{'loss': 0.2872, 'grad_norm': 2.7804625034332275, 'learning_rate': 4.114906832298137e-05, 'epoch': 0.89}


 18%|█▊        | 1150/6440 [2:14:28<12:39:26,  8.61s/it]

{'loss': 0.2581, 'grad_norm': 4.38781213760376, 'learning_rate': 4.107142857142857e-05, 'epoch': 0.89}


 18%|█▊        | 1160/6440 [2:15:47<10:41:13,  7.29s/it]

{'loss': 0.3494, 'grad_norm': 5.977411270141602, 'learning_rate': 4.099378881987578e-05, 'epoch': 0.9}


 18%|█▊        | 1170/6440 [2:16:57<10:18:52,  7.05s/it]

{'loss': 0.2991, 'grad_norm': 3.650660514831543, 'learning_rate': 4.0916149068322986e-05, 'epoch': 0.91}


 18%|█▊        | 1180/6440 [2:18:08<10:17:51,  7.05s/it]

{'loss': 0.3178, 'grad_norm': 4.260091781616211, 'learning_rate': 4.0838509316770193e-05, 'epoch': 0.92}


 18%|█▊        | 1190/6440 [2:19:18<10:00:18,  6.86s/it]

{'loss': 0.3006, 'grad_norm': 3.0935781002044678, 'learning_rate': 4.076086956521739e-05, 'epoch': 0.92}


 19%|█▊        | 1200/6440 [2:20:35<10:15:30,  7.05s/it]

{'loss': 0.2201, 'grad_norm': 5.20722770690918, 'learning_rate': 4.0683229813664596e-05, 'epoch': 0.93}


 19%|█▉        | 1210/6440 [2:21:43<9:54:31,  6.82s/it] 

{'loss': 0.2931, 'grad_norm': 4.218657493591309, 'learning_rate': 4.0605590062111803e-05, 'epoch': 0.94}


 19%|█▉        | 1220/6440 [2:22:54<10:11:32,  7.03s/it]

{'loss': 0.2881, 'grad_norm': 6.613028526306152, 'learning_rate': 4.052795031055901e-05, 'epoch': 0.95}


 19%|█▉        | 1230/6440 [2:24:03<9:51:52,  6.82s/it] 

{'loss': 0.3155, 'grad_norm': 2.952699661254883, 'learning_rate': 4.045031055900621e-05, 'epoch': 0.95}


 19%|█▉        | 1240/6440 [2:25:11<9:37:55,  6.67s/it]

{'loss': 0.2092, 'grad_norm': 4.212965488433838, 'learning_rate': 4.0372670807453414e-05, 'epoch': 0.96}


 19%|█▉        | 1250/6440 [2:26:17<9:47:23,  6.79s/it]

{'loss': 0.3369, 'grad_norm': 2.973684549331665, 'learning_rate': 4.029503105590062e-05, 'epoch': 0.97}


 20%|█▉        | 1260/6440 [2:27:27<9:57:15,  6.92s/it] 

{'loss': 0.2385, 'grad_norm': 1.6125099658966064, 'learning_rate': 4.021739130434783e-05, 'epoch': 0.98}


 20%|█▉        | 1270/6440 [2:28:38<10:03:04,  7.00s/it]

{'loss': 0.302, 'grad_norm': 5.0697455406188965, 'learning_rate': 4.013975155279504e-05, 'epoch': 0.99}


 20%|█▉        | 1280/6440 [2:29:47<9:46:40,  6.82s/it] 

{'loss': 0.2669, 'grad_norm': 4.989175319671631, 'learning_rate': 4.006211180124224e-05, 'epoch': 0.99}


                                                       
 20%|██        | 1288/6440 [4:40:53<9:02:14,  6.32s/it]

{'eval_loss': 0.30807387828826904, 'eval_accuracy': 0.8755458515283843, 'eval_runtime': 7814.7459, 'eval_samples_per_second': 0.293, 'eval_steps_per_second': 0.018, 'epoch': 1.0}


RuntimeError: 
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'classifier.weight', 'base_model.classifier.weight'}, {'classifier.bias', 'base_model.classifier.bias'}].
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            