In [1]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import tabulate
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# pip install transformers==4.45.2 



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('dair-ai/emotion')
print(dataset)

labels = dataset["train"].features["label"].names
print(labels)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [3]:
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_data(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Токенизация всех данных
tokenized_dataset = dataset.map(tokenize_data, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████| 2000/2000 [00:00<00:00, 9433.51 examples/s]


In [5]:
def compute_metrics(eval_pred):
    logits, true_labels = eval_pred  # Переименовали переменную
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(true_labels, predictions),
        **classification_report(
            true_labels, 
            predictions,
            target_names=labels,  # Используем глобальный список названий
            output_dict=True,
            zero_division=0
        )["macro avg"]
    }


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=1000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1-score",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=2,
    # lr_scheduler_type="cosine",
    # warmup_steps=500
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

results = trainer.evaluate(tokenized_dataset["test"])
print(tabulate.tabulate(
    results.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))
trainer.train()

results = trainer.evaluate(tokenized_dataset["test"])
print(tabulate.tabulate(
    results.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))


Test metrics: {'eval_loss': 0.7932651042938232, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.7425, 'eval_precision': 0.7402512345382611, 'eval_recall': 0.5454750466210642, 'eval_f1-score': 0.5292401848522555, 'eval_support': 2000.0, 'eval_runtime': 30.2114, 'eval_samples_per_second': 66.2, 'eval_steps_per_second': 4.138}
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     0.7933 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0000 |
+-----------------------------+------------+
| eval_accuracy               |     0.7425 |
+-----------------------------+------------+
| eval_precision              |     0.7403 |
+-----------------------------+------------+
| eval_recall                 |     0.5455 |
+-----------------------------+------------+
| eval_f1-score               |     0.5292 |
+-----------------------------+------------+
| eval_support                |  2000

Step,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Precision,Recall,F1-score,Support
100,0.4813,0.602014,0.0,0.7985,0.795935,0.695663,0.717404,2000.0
200,0.5563,0.401258,0.0,0.8795,0.863867,0.819401,0.836907,2000.0
300,0.412,0.370869,0.0,0.8875,0.885845,0.833811,0.848792,2000.0
400,0.3452,0.29869,0.0,0.904,0.886308,0.881649,0.879431,2000.0
500,0.2953,0.253372,0.0,0.9145,0.880174,0.895599,0.887184,2000.0
600,0.2682,0.242581,0.0,0.9145,0.89044,0.884771,0.88568,2000.0
700,0.2368,0.229441,0.0,0.9235,0.904171,0.898934,0.900193,2000.0
800,0.233,0.21062,0.0,0.9225,0.893911,0.899958,0.896235,2000.0
900,0.2247,0.195859,0.0,0.9265,0.894704,0.911191,0.902439,2000.0
1000,0.2298,0.194467,0.0,0.925,0.892626,0.914553,0.902922,2000.0


Test metrics: {'eval_loss': 0.19975651800632477, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.9255, 'eval_precision': 0.8758080764820901, 'eval_recall': 0.9015332050069241, 'eval_f1-score': 0.8875884063063505, 'eval_support': 2000.0, 'eval_runtime': 22.5506, 'eval_samples_per_second': 88.689, 'eval_steps_per_second': 5.543, 'epoch': 1.0}
+-----------------------------+------------+
| Метрика                     |   Значение |
| eval_loss                   |     0.1998 |
+-----------------------------+------------+
| eval_model_preparation_time |     0.0000 |
+-----------------------------+------------+
| eval_accuracy               |     0.9255 |
+-----------------------------+------------+
| eval_precision              |     0.8758 |
+-----------------------------+------------+
| eval_recall                 |     0.9015 |
+-----------------------------+------------+
| eval_f1-score               |     0.8876 |
+-----------------------------+------------+
| eval_support      

In [15]:
from torch import nn
from transformers import AutoModel

# Кастомная классификационная голова
class CustomClassifier(nn.Module):
    def __init__(self, hidden_size, num_labels, dropout_prob=0.1):
        super().__init__()
        self.dense = nn.Linear(hidden_size, 1024)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(1024, num_labels)
        self.relu = nn.ReLU()

    def forward(self, pooled_output):
        x = self.dense(pooled_output)  # [batch_size, 256]
        x = self.relu(x)
        x = self.dropout(x)
        return self.out_proj(x)        # [batch_size, num_labels]

# Модифицируем модель BERT
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels)
)

# Замораживаем все слои BERT
for param in model.parameters():
    param.requires_grad = False

# Добавляем кастомную голову
hidden_size = model.config.hidden_size  # 768 для bert-base
num_labels = len(labels)                          # 6 эмоций в датасете
model.classifier = CustomClassifier(hidden_size, num_labels)

# Проверяем, что обучаются только параметры классификатора
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Обучается: {name}")

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=1e-4, # Больше lr, так как обучаем только голову
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=10000,
    logging_steps=200,
    weight_decay=0.01,
    metric_for_best_model="f1-score",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=2,
    # lr_scheduler_type="cosine",
    # warmup_steps=500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate(tokenized_dataset["test"])
print(tabulate.tabulate(
    results.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Обучается: classifier.dense.weight
Обучается: classifier.dense.bias
Обучается: classifier.out_proj.weight
Обучается: classifier.out_proj.bias


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score,Support
200,1.5806,1.566283,0.392,0.130832,0.197472,0.149039,2000.0
400,1.5457,1.558187,0.4415,0.172626,0.236536,0.194022,2000.0
600,1.5166,1.556525,0.412,0.151174,0.204422,0.150668,2000.0
800,1.5263,1.508552,0.4575,0.157347,0.24572,0.18829,2000.0
1000,1.5007,1.478144,0.463,0.159239,0.248456,0.190569,2000.0
1200,1.48,1.465558,0.4675,0.161749,0.25125,0.192665,2000.0
1400,1.4713,1.455693,0.4755,0.160201,0.253182,0.194655,2000.0
1600,1.448,1.436043,0.4865,0.326724,0.255758,0.198223,2000.0
1800,1.4742,1.414599,0.491,0.217141,0.257642,0.199582,2000.0
2000,1.4522,1.413316,0.4935,0.222652,0.262595,0.207293,2000.0


Test metrics: {'eval_loss': 1.381722092628479, 'eval_accuracy': 0.486, 'eval_precision': 0.27523213925002504, 'eval_recall': 0.25627310887950566, 'eval_f1-score': 0.20800118134501563, 'eval_support': 2000.0, 'eval_runtime': 22.5217, 'eval_samples_per_second': 88.803, 'eval_steps_per_second': 5.55, 'epoch': 10.0}
+-------------------------+------------+
| Метрика                 |   Значение |
| eval_loss               |     1.3817 |
+-------------------------+------------+
| eval_accuracy           |     0.4860 |
+-------------------------+------------+
| eval_precision          |     0.2752 |
+-------------------------+------------+
| eval_recall             |     0.2563 |
+-------------------------+------------+
| eval_f1-score           |     0.2080 |
+-------------------------+------------+
| eval_support            |  2000.0000 |
+-------------------------+------------+
| eval_runtime            |    22.5217 |
+-------------------------+------------+
| eval_samples_per_second |   

In [12]:
from peft import (
    PromptTuningConfig,
    get_peft_model,
    TaskType
)
from transformers import AutoModelForSequenceClassification

# 1. Конфигурация Prompt Tuning
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=20,  # Длина мягкого промпта
    token_dim=768,          # Размерность эмбеддингов BERT
    # num_transformer_submodules=1,  # Для encoder-only моделей
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Classify the emotion in the text:",
    base_model_name_or_path="bert-base-uncased",
    tokenizer_name_or_path="bert-base-uncased"
)

# 2. Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=len(labels),
    return_dict=True
)

# 3. Обертка модели в PEFT
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Обучаемые параметры: ~0.1%

# 4. Заморозка основной модели (опционально)
# for param in model.base_model.parameters():
#     param.requires_grad = False

training_args = TrainingArguments(
    output_dir="./peft_results",
    learning_rate=3e-4,           # Выше обычного для промптов
    per_device_train_batch_size=32,
    num_train_epochs=5,          # Нужно больше эпох
    logging_steps=100,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate(tokenized_dataset["test"])
print(tabulate.tabulate(
    results.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 15,360 || all params: 109,502,214 || trainable%: 0.0140


  0%|          | 21/5000 [03:12<12:38:49,  9.14s/it]
  4%|▍         | 100/2500 [01:33<32:01,  1.25it/s]
  4%|▍         | 100/2500 [01:33<32:01,  1.25it/s]

{'loss': 1.8187, 'grad_norm': 0.17120012640953064, 'learning_rate': 0.00028799999999999995, 'epoch': 0.2}


  8%|▊         | 200/2500 [02:54<30:46,  1.25it/s]
  8%|▊         | 200/2500 [02:54<30:46,  1.25it/s]

{'loss': 1.7639, 'grad_norm': 0.30830827355384827, 'learning_rate': 0.000276, 'epoch': 0.4}


 12%|█▏        | 300/2500 [04:14<29:27,  1.24it/s]
 12%|█▏        | 300/2500 [04:14<29:27,  1.24it/s]

{'loss': 1.678, 'grad_norm': 0.3930996060371399, 'learning_rate': 0.00026399999999999997, 'epoch': 0.6}


 16%|█▌        | 400/2500 [05:35<28:04,  1.25it/s]
 16%|█▌        | 400/2500 [05:35<28:04,  1.25it/s]

{'loss': 1.6494, 'grad_norm': 0.4718685448169708, 'learning_rate': 0.00025199999999999995, 'epoch': 0.8}


 20%|██        | 500/2500 [06:55<26:47,  1.24it/s]
 20%|██        | 500/2500 [06:55<26:47,  1.24it/s]

{'loss': 1.6321, 'grad_norm': 0.4933715760707855, 'learning_rate': 0.00023999999999999998, 'epoch': 1.0}


 24%|██▍       | 600/2500 [08:16<25:31,  1.24it/s]
 24%|██▍       | 600/2500 [08:16<25:31,  1.24it/s]

{'loss': 1.6107, 'grad_norm': 0.6840925812721252, 'learning_rate': 0.00022799999999999999, 'epoch': 1.2}


 28%|██▊       | 700/2500 [09:36<24:03,  1.25it/s]
 28%|██▊       | 700/2500 [09:36<24:03,  1.25it/s]

{'loss': 1.6053, 'grad_norm': 0.6605125069618225, 'learning_rate': 0.00021599999999999996, 'epoch': 1.4}


 32%|███▏      | 800/2500 [10:56<22:44,  1.25it/s]
 32%|███▏      | 800/2500 [10:56<22:44,  1.25it/s]

{'loss': 1.5969, 'grad_norm': 0.8295512795448303, 'learning_rate': 0.000204, 'epoch': 1.6}


 36%|███▌      | 900/2500 [12:17<21:23,  1.25it/s]
 36%|███▌      | 900/2500 [12:17<21:23,  1.25it/s]

{'loss': 1.6125, 'grad_norm': 0.7219657897949219, 'learning_rate': 0.00019199999999999998, 'epoch': 1.8}


 40%|████      | 1000/2500 [13:37<20:05,  1.24it/s]
 40%|████      | 1000/2500 [13:37<20:05,  1.24it/s]

{'loss': 1.6037, 'grad_norm': 0.544520914554596, 'learning_rate': 0.00017999999999999998, 'epoch': 2.0}


 44%|████▍     | 1100/2500 [14:58<18:45,  1.24it/s]
 44%|████▍     | 1100/2500 [14:58<18:45,  1.24it/s]

{'loss': 1.5978, 'grad_norm': 0.6628518104553223, 'learning_rate': 0.000168, 'epoch': 2.2}


 48%|████▊     | 1200/2500 [16:27<20:04,  1.08it/s]
 48%|████▊     | 1200/2500 [16:27<20:04,  1.08it/s]

{'loss': 1.586, 'grad_norm': 0.6603737473487854, 'learning_rate': 0.000156, 'epoch': 2.4}


 52%|█████▏    | 1300/2500 [18:00<18:31,  1.08it/s]
 52%|█████▏    | 1300/2500 [18:00<18:31,  1.08it/s]

{'loss': 1.5808, 'grad_norm': 0.7615088820457458, 'learning_rate': 0.00014399999999999998, 'epoch': 2.6}


 56%|█████▌    | 1400/2500 [19:33<16:59,  1.08it/s]
 56%|█████▌    | 1400/2500 [19:33<16:59,  1.08it/s]

{'loss': 1.593, 'grad_norm': 0.7308874130249023, 'learning_rate': 0.00013199999999999998, 'epoch': 2.8}


 60%|██████    | 1500/2500 [21:05<15:26,  1.08it/s]
 60%|██████    | 1500/2500 [21:05<15:26,  1.08it/s]

{'loss': 1.5965, 'grad_norm': 0.7331592440605164, 'learning_rate': 0.00011999999999999999, 'epoch': 3.0}


 64%|██████▍   | 1600/2500 [22:38<13:53,  1.08it/s]
 64%|██████▍   | 1600/2500 [22:38<13:53,  1.08it/s]

{'loss': 1.5865, 'grad_norm': 0.7494508028030396, 'learning_rate': 0.00010799999999999998, 'epoch': 3.2}


 68%|██████▊   | 1700/2500 [24:11<12:21,  1.08it/s]
 68%|██████▊   | 1700/2500 [24:11<12:21,  1.08it/s]

{'loss': 1.5854, 'grad_norm': 0.9168499708175659, 'learning_rate': 9.599999999999999e-05, 'epoch': 3.4}


 72%|███████▏  | 1800/2500 [25:44<10:52,  1.07it/s]
 72%|███████▏  | 1800/2500 [25:44<10:52,  1.07it/s]

{'loss': 1.5882, 'grad_norm': 0.802970826625824, 'learning_rate': 8.4e-05, 'epoch': 3.6}


 76%|███████▌  | 1900/2500 [27:13<08:14,  1.21it/s]
 76%|███████▌  | 1900/2500 [27:13<08:14,  1.21it/s]

{'loss': 1.5878, 'grad_norm': 1.4894355535507202, 'learning_rate': 7.199999999999999e-05, 'epoch': 3.8}


 80%|████████  | 2000/2500 [28:33<06:30,  1.28it/s]
 80%|████████  | 2000/2500 [28:33<06:30,  1.28it/s]

{'loss': 1.5678, 'grad_norm': 0.881062924861908, 'learning_rate': 5.9999999999999995e-05, 'epoch': 4.0}


 84%|████████▍ | 2100/2500 [29:51<05:13,  1.28it/s]
 84%|████████▍ | 2100/2500 [29:51<05:13,  1.28it/s]

{'loss': 1.5794, 'grad_norm': 0.5813419222831726, 'learning_rate': 4.7999999999999994e-05, 'epoch': 4.2}


 88%|████████▊ | 2200/2500 [31:09<03:54,  1.28it/s]
 88%|████████▊ | 2200/2500 [31:09<03:54,  1.28it/s]

{'loss': 1.5765, 'grad_norm': 1.4918239116668701, 'learning_rate': 3.5999999999999994e-05, 'epoch': 4.4}


 92%|█████████▏| 2300/2500 [32:28<02:36,  1.28it/s]
 92%|█████████▏| 2300/2500 [32:28<02:36,  1.28it/s]

{'loss': 1.5803, 'grad_norm': 0.815494179725647, 'learning_rate': 2.3999999999999997e-05, 'epoch': 4.6}


 96%|█████████▌| 2400/2500 [33:46<01:18,  1.28it/s]
 96%|█████████▌| 2400/2500 [33:46<01:18,  1.28it/s]

{'loss': 1.5756, 'grad_norm': 1.0910353660583496, 'learning_rate': 1.1999999999999999e-05, 'epoch': 4.8}


100%|██████████| 2500/2500 [35:04<00:00,  1.28it/s]
100%|██████████| 2500/2500 [35:04<00:00,  1.28it/s]
100%|██████████| 2500/2500 [35:04<00:00,  1.19it/s]


{'loss': 1.5852, 'grad_norm': 0.8338690996170044, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 2104.5312, 'train_samples_per_second': 38.013, 'train_steps_per_second': 1.188, 'train_loss': 1.6135260314941406, 'epoch': 5.0}


100%|██████████| 250/250 [00:25<00:00,  9.70it/s]

+-------------------------+------------+
| Метрика                 |   Значение |
| eval_loss               |     1.5444 |
+-------------------------+------------+
| eval_accuracy           |     0.4125 |
+-------------------------+------------+
| eval_precision          |     0.1381 |
+-------------------------+------------+
| eval_recall             |     0.2071 |
+-------------------------+------------+
| eval_f1-score           |     0.1564 |
+-------------------------+------------+
| eval_support            |  2000.0000 |
+-------------------------+------------+
| eval_runtime            |    25.8858 |
+-------------------------+------------+
| eval_samples_per_second |    77.2620 |
+-------------------------+------------+
| eval_steps_per_second   |     9.6580 |
+-------------------------+------------+
| epoch                   |     5.0000 |
+-------------------------+------------+





In [9]:

from peft import (
    LoraConfig, 
    TaskType, 
    get_peft_model
)
from transformers import AutoModelForSequenceClassification

# 1. Конфигурация LoRA
lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=8,                # Ранг адаптеров
        lora_alpha=16,      # Коэффициент масштабирования
        lora_dropout=0.1,   # Дропаут для регуляризации
        target_modules=["query", "value"],  # Слои для применения LoRA
        bias="none"
    )

# 2. Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=6
)


# Создаем новую конфиг
peft_config = lora_config

# Обертка модели
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

# Обучение
training_args = TrainingArguments(
    output_dir=f"./lora_results",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    logging_steps=200
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate(tokenized_dataset["test"])
print(tabulate.tabulate(
    results.items(),
    headers=["Метрика", "Значение"],
    tablefmt="grid",
    floatfmt=".4f"
))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 299,526 || all params: 109,786,380 || trainable%: 0.2728


  2%|▏         | 105/5000 [01:33<1:13:00,  1.12it/s]
  8%|▊         | 200/2500 [02:48<31:26,  1.22it/s]
  8%|▊         | 200/2500 [02:48<31:26,  1.22it/s]

{'loss': 1.349, 'grad_norm': 2.0876481533050537, 'learning_rate': 0.000276, 'epoch': 0.4}


 16%|█▌        | 400/2500 [05:33<28:45,  1.22it/s]
 16%|█▌        | 400/2500 [05:33<28:45,  1.22it/s]

{'loss': 0.8296, 'grad_norm': 3.4698805809020996, 'learning_rate': 0.00025199999999999995, 'epoch': 0.8}


 20%|██        | 500/2500 [06:55<27:22,  1.22it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.47649213671684265, 'eval_accuracy': 0.8365, 'eval_precision': 0.8267511170691882, 'eval_recall': 0.7541225013167661, 'eval_f1-score': 0.7820812942751005, 'eval_support': 2000.0, 'eval_runtime': 28.4486, 'eval_samples_per_second': 70.302, 'eval_steps_per_second': 8.788, 'epoch': 1.0}


 24%|██▍       | 600/2500 [08:46<25:54,  1.22it/s]  
 24%|██▍       | 600/2500 [08:46<25:54,  1.22it/s]

{'loss': 0.5413, 'grad_norm': 4.005258083343506, 'learning_rate': 0.00022799999999999999, 'epoch': 1.2}


 32%|███▏      | 800/2500 [11:31<23:23,  1.21it/s]
 32%|███▏      | 800/2500 [11:31<23:23,  1.21it/s]

{'loss': 0.3978, 'grad_norm': 3.885700225830078, 'learning_rate': 0.000204, 'epoch': 1.6}


 40%|████      | 1000/2500 [14:15<20:26,  1.22it/s]
 40%|████      | 1000/2500 [14:15<20:26,  1.22it/s]

{'loss': 0.3599, 'grad_norm': 4.011159896850586, 'learning_rate': 0.00017999999999999998, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                   

{'eval_loss': 0.2431042194366455, 'eval_accuracy': 0.9215, 'eval_precision': 0.8936483063804851, 'eval_recall': 0.9011226544768344, 'eval_f1-score': 0.8970911641926804, 'eval_support': 2000.0, 'eval_runtime': 28.3684, 'eval_samples_per_second': 70.501, 'eval_steps_per_second': 8.813, 'epoch': 2.0}


 48%|████▊     | 1200/2500 [17:27<16:56,  1.28it/s]  
 48%|████▊     | 1200/2500 [17:27<16:56,  1.28it/s]

{'loss': 0.2753, 'grad_norm': 2.9487192630767822, 'learning_rate': 0.000156, 'epoch': 2.4}


 56%|█████▌    | 1400/2500 [20:04<14:21,  1.28it/s]
 56%|█████▌    | 1400/2500 [20:04<14:21,  1.28it/s]

{'loss': 0.2438, 'grad_norm': 3.2253737449645996, 'learning_rate': 0.00013199999999999998, 'epoch': 2.8}


 60%|██████    | 1500/2500 [21:22<13:02,  1.28it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.19373109936714172, 'eval_accuracy': 0.9285, 'eval_precision': 0.8998014687986707, 'eval_recall': 0.9034757221810993, 'eval_f1-score': 0.9011861731340605, 'eval_support': 2000.0, 'eval_runtime': 27.0616, 'eval_samples_per_second': 73.906, 'eval_steps_per_second': 9.238, 'epoch': 3.0}


 64%|██████▍   | 1600/2500 [23:09<11:45,  1.28it/s]  
 64%|██████▍   | 1600/2500 [23:09<11:45,  1.28it/s]

{'loss': 0.2356, 'grad_norm': 2.7473952770233154, 'learning_rate': 0.00010799999999999998, 'epoch': 3.2}


 72%|███████▏  | 1800/2500 [25:45<09:08,  1.28it/s]
 72%|███████▏  | 1800/2500 [25:45<09:08,  1.28it/s]

{'loss': 0.2073, 'grad_norm': 1.419308066368103, 'learning_rate': 8.4e-05, 'epoch': 3.6}


 80%|████████  | 2000/2500 [28:22<06:31,  1.28it/s]
 80%|████████  | 2000/2500 [28:22<06:31,  1.28it/s]

{'loss': 0.1996, 'grad_norm': 5.523075580596924, 'learning_rate': 5.9999999999999995e-05, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

 80%|████████  | 2000/2500 [28:49<06:31,  1.28it/s]
[A
[A

{'eval_loss': 0.1785881668329239, 'eval_accuracy': 0.936, 'eval_precision': 0.9144537697201054, 'eval_recall': 0.9291076530980072, 'eval_f1-score': 0.920991715768876, 'eval_support': 2000.0, 'eval_runtime': 25.8254, 'eval_samples_per_second': 77.443, 'eval_steps_per_second': 9.68, 'epoch': 4.0}


 88%|████████▊ | 2200/2500 [31:25<03:54,  1.28it/s]  
 88%|████████▊ | 2200/2500 [31:25<03:54,  1.28it/s]

{'loss': 0.18, 'grad_norm': 4.138106822967529, 'learning_rate': 3.5999999999999994e-05, 'epoch': 4.4}


 96%|█████████▌| 2400/2500 [34:02<01:18,  1.28it/s]
 96%|█████████▌| 2400/2500 [34:02<01:18,  1.28it/s]

{'loss': 0.1844, 'grad_norm': 2.3966174125671387, 'learning_rate': 1.1999999999999999e-05, 'epoch': 4.8}


100%|██████████| 2500/2500 [35:20<00:00,  1.28it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.1654692143201828, 'eval_accuracy': 0.938, 'eval_precision': 0.9193392246932405, 'eval_recall': 0.9178323360827396, 'eval_f1-score': 0.9184753633939998, 'eval_support': 2000.0, 'eval_runtime': 27.0808, 'eval_samples_per_second': 73.853, 'eval_steps_per_second': 9.232, 'epoch': 5.0}
{'train_runtime': 2148.5027, 'train_samples_per_second': 37.235, 'train_steps_per_second': 1.164, 'train_loss': 0.40780635681152344, 'epoch': 5.0}


100%|██████████| 250/250 [00:26<00:00,  9.35it/s]

+-------------------------+------------+
| Метрика                 |   Значение |
| eval_loss               |     0.1978 |
+-------------------------+------------+
| eval_accuracy           |     0.9220 |
+-------------------------+------------+
| eval_precision          |     0.8764 |
+-------------------------+------------+
| eval_recall             |     0.8873 |
+-------------------------+------------+
| eval_f1-score           |     0.8813 |
+-------------------------+------------+
| eval_support            |  2000.0000 |
+-------------------------+------------+
| eval_runtime            |    26.8544 |
+-------------------------+------------+
| eval_samples_per_second |    74.4760 |
+-------------------------+------------+
| eval_steps_per_second   |     9.3090 |
+-------------------------+------------+
| epoch                   |     5.0000 |
+-------------------------+------------+



