In [2]:
# pip install torch

In [4]:
# pip install transformers

In [2]:
# pip install datasets

In [4]:
# pip install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [2]:
# pip install transformers==4.36.2

In [5]:
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)



In [6]:
# 1. Загрузка данных
df = pd.read_csv('data/train.csv')

# 2. Разделение на train и valid
train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

In [7]:
# 3. Преобразование в Huggingface Datasets
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

In [8]:
# 4. Токенизатор
model_checkpoint = "distilbert/distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [9]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding='max_length', max_length=256)

train_dataset = train_dataset.map(tokenize, batched=True)
valid_dataset = valid_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 4727/4727 [00:00<00:00, 9495.69 examples/s]
Map: 100%|██████████| 1182/1182 [00:00<00:00, 9362.73 examples/s]


In [10]:
# 5. Удалим старые колонки и приведём к нужному формату
train_dataset = train_dataset.remove_columns(["text", "__index_level_0__"])
valid_dataset = valid_dataset.remove_columns(["text", "__index_level_0__"])
train_dataset.set_format("torch")
valid_dataset.set_format("torch")

In [11]:
# 6. Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# pip install transformers[torch]

In [12]:
# pip install -U accelerate==0.25.0

In [13]:
# 7. Тренировочные параметры
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: {
        "accuracy": accuracy_score(p.label_ids, p.predictions.argmax(axis=1)),
        "f1": f1_score(p.label_ids, p.predictions.argmax(axis=1)),
    }
)
# 9. Обучение
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  0%|          | 0/888 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 296/888 [15:59<26:01,  2.64s/it]
  0%|          | 0/37 [00:00<?, ?it/s][A
  5%|▌         | 2/37 [00:01<00:31,  1.11it/s][A
  8%|▊         | 3/37 [00:03<00:43,  1.27s/it][A
 11%|█         | 4/37 [00:05<00:48,  1.46s/it][A
 14%|█▎        | 5/37 [00:07<00:50,  1.57s/it][A
 16%|█▌        | 6/37 [00:08<00:50,  1.63s/it][A
 19%|█▉        | 7/37 [00:10<00:50,  1.68s/it][A
 22%|██▏       | 8/37 [00:12<00:49,  1.71s/it][A
 24%|██▍       | 9/37 [00:14<00:48,  1.73s/it][A
 27%|██▋       | 10/37 [00:15<00:46,  1.74s/it][A

{'eval_loss': 0.20527902245521545, 'eval_accuracy': 0.8934010152284264, 'eval_f1': 0.8908145580589255, 'eval_runtime': 65.7955, 'eval_samples_per_second': 17.965, 'eval_steps_per_second': 0.562, 'epoch': 1.0}


IOStream.flush timed out
 56%|█████▋    | 500/888 [28:45<19:55,  3.08s/it]  

{'loss': 0.2725, 'learning_rate': 8.738738738738739e-06, 'epoch': 1.69}


 67%|██████▋   | 592/888 [33:30<12:59,  2.64s/it]
  0%|          | 0/37 [00:00<?, ?it/s][A
  5%|▌         | 2/37 [00:01<00:31,  1.12it/s][A
  8%|▊         | 3/37 [00:03<00:43,  1.27s/it][A
 11%|█         | 4/37 [00:05<00:48,  1.46s/it][A
 14%|█▎        | 5/37 [00:07<00:50,  1.56s/it][A
 16%|█▌        | 6/37 [00:08<00:50,  1.64s/it][A
 19%|█▉        | 7/37 [00:10<00:50,  1.69s/it][A
 22%|██▏       | 8/37 [00:12<00:50,  1.72s/it][A
 24%|██▍       | 9/37 [00:14<00:48,  1.74s/it][A
 27%|██▋       | 10/37 [00:16<00:47,  1.75s/it][A
 30%|██▉       | 11/37 [00:17<00:45,  1.76s/it][A
 32%|███▏      | 12/37 [00:19<00:44,  1.76s/it][A
 35%|███▌      | 13/37 [00:21<00:42,  1.77s/it][A
 38%|███▊      | 14/37 [00:23<00:40,  1.77s/it][A
 41%|████      | 15/37 [00:24<00:39,  1.78s/it][A
 43%|████▎     | 16/37 [00:26<00:37,  1.79s/it][A
 46%|████▌     | 17/37 [00:28<00:35,  1.79s/it][A
 49%|████▊     | 18/37 [00:30<00:33,  1.79s/it][A
 51%|█████▏    | 19/37 [00:32<00:32,  1.78s/it][

{'eval_loss': 0.20923753082752228, 'eval_accuracy': 0.8976311336717429, 'eval_f1': 0.903585657370518, 'eval_runtime': 66.023, 'eval_samples_per_second': 17.903, 'eval_steps_per_second': 0.56, 'epoch': 2.0}


100%|██████████| 888/888 [50:59<00:00,  2.62s/it]  
  0%|          | 0/37 [00:00<?, ?it/s][A
  5%|▌         | 2/37 [00:01<00:31,  1.12it/s][A
  8%|▊         | 3/37 [00:03<00:42,  1.26s/it][A
 11%|█         | 4/37 [00:05<00:48,  1.46s/it][A
 14%|█▎        | 5/37 [00:07<00:50,  1.58s/it][A
 16%|█▌        | 6/37 [00:08<00:51,  1.66s/it][A
 19%|█▉        | 7/37 [00:10<00:50,  1.70s/it][A
 22%|██▏       | 8/37 [00:12<00:50,  1.72s/it][A
 24%|██▍       | 9/37 [00:14<00:48,  1.75s/it][A
 27%|██▋       | 10/37 [00:16<00:47,  1.76s/it][A
 30%|██▉       | 11/37 [00:17<00:45,  1.77s/it][A
 32%|███▏      | 12/37 [00:19<00:44,  1.78s/it][A
 35%|███▌      | 13/37 [00:21<00:42,  1.78s/it][A
 38%|███▊      | 14/37 [00:23<00:41,  1.79s/it][A
 41%|████      | 15/37 [00:25<00:39,  1.80s/it][A
 43%|████▎     | 16/37 [00:26<00:37,  1.80s/it][A
 46%|████▌     | 17/37 [00:28<00:35,  1.80s/it][A
 49%|████▊     | 18/37 [00:30<00:34,  1.80s/it][A
 51%|█████▏    | 19/37 [00:32<00:32,  1.79s/it]

{'eval_loss': 0.17834195494651794, 'eval_accuracy': 0.9094754653130288, 'eval_f1': 0.9119341563786008, 'eval_runtime': 66.1342, 'eval_samples_per_second': 17.873, 'eval_steps_per_second': 0.559, 'epoch': 3.0}


100%|██████████| 888/888 [53:10<00:00,  3.59s/it]

{'train_runtime': 3190.0962, 'train_samples_per_second': 4.445, 'train_steps_per_second': 0.278, 'train_loss': 0.23353415136938696, 'epoch': 3.0}





TrainOutput(global_step=888, training_loss=0.23353415136938696, metrics={'train_runtime': 3190.0962, 'train_samples_per_second': 4.445, 'train_steps_per_second': 0.278, 'train_loss': 0.23353415136938696, 'epoch': 3.0})

In [15]:
# 10. Предсказания и метрики
preds = trainer.predict(valid_dataset)
y_pred = preds.predictions.argmax(axis=1)
y_true = preds.label_ids

100%|██████████| 37/37 [01:04<00:00,  1.73s/it]


In [16]:
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

print("Accuracy:", acc)
print("F1 Score:", f1)
print("Confusion Matrix:\n", cm)

Accuracy: 0.9094754653130288
F1 Score: 0.9119341563786008
Confusion Matrix:
 [[521  65]
 [ 42 554]]


In [17]:
trainer.save_model("./results/best_model")
tokenizer.save_pretrained("./results/best_model")

('./results/best_model/tokenizer_config.json',
 './results/best_model/special_tokens_map.json',
 './results/best_model/vocab.json',
 './results/best_model/merges.txt',
 './results/best_model/added_tokens.json',
 './results/best_model/tokenizer.json')

### 2. MLM 

In [2]:
# pip install transformers datasets accelerate

In [1]:
import nltk
import math
import torch
import os
from transformers import (
    RobertaTokenizerFast,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset, Dataset



In [2]:
model_checkpoint  = "results/best_model"
model  = RobertaForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at results/best_model and are newly initialized: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

### создаем датасет

In [5]:
# nltk.download("punkt")

In [4]:
with open("data/train-test.txt"  , "r", encoding="utf-8") as f:
    text = f.read()

In [5]:
sentences = nltk.sent_tokenize(text)
dataset = Dataset.from_dict({"text": sentences})

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False)

tokenized = dataset.map(tokenize_function, batched=True, num_proc=8, remove_columns=["text"])

Map (num_proc=8): 100%|██████████| 29347/29347 [00:01<00:00, 19681.74 examples/s]


In [7]:
block_size=128
def group_texts(examples):
    concatenated = sum(examples["input_ids"], [])
    total_length = (len(concatenated) // block_size) * block_size
    result = {
        "input_ids": [concatenated[i : i + block_size] for i in range(0, total_length, block_size)],
    }
    result["attention_mask"] = [[1] * block_size] * len(result["input_ids"])
    return result

lm_datasets = tokenized.map(group_texts, batched=True, num_proc=8)

Map (num_proc=8): 100%|██████████| 29347/29347 [00:01<00:00, 16931.81 examples/s]


In [8]:
split = lm_datasets.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [10]:
training_args = TrainingArguments(
    output_dir="mlm-results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


#### метрики 

In [11]:
eval_loss_before = trainer.evaluate(eval_dataset=eval_dataset).get("eval_loss")
perplexity_before = math.exp(eval_loss_before) if eval_loss_before else None
print(f"Before training: Loss = {eval_loss_before:.4f}, Perplexity = {perplexity_before:.2f}")


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 134/134 [02:16<00:00,  1.02s/it]

Before training: Loss = 17.5383, Perplexity = 41378615.23





In [12]:
trainer.train()

  3%|▎         | 100/3600 [05:11<3:01:05,  3.10s/it]

{'loss': 2.583, 'learning_rate': 4.8611111111111115e-05, 'epoch': 0.08}


  6%|▌         | 200/3600 [10:23<2:55:39,  3.10s/it]

{'loss': 1.5792, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.17}


  8%|▊         | 300/3600 [15:34<2:52:36,  3.14s/it]

{'loss': 1.4478, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.25}


 11%|█         | 400/3600 [20:50<2:47:56,  3.15s/it]

{'loss': 1.3467, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.33}


 14%|█▍        | 500/3600 [26:05<2:43:41,  3.17s/it]

{'loss': 1.2991, 'learning_rate': 4.305555555555556e-05, 'epoch': 0.42}


 17%|█▋        | 600/3600 [32:23<2:35:38,  3.11s/it] 

{'loss': 1.2248, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}


 19%|█▉        | 700/3600 [37:36<2:31:05,  3.13s/it]

{'loss': 1.2058, 'learning_rate': 4.027777777777778e-05, 'epoch': 0.58}


 22%|██▏       | 800/3600 [42:48<2:24:53,  3.10s/it]

{'loss': 1.1656, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.67}


 25%|██▌       | 900/3600 [47:59<2:19:30,  3.10s/it]

{'loss': 1.1404, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.75}


 28%|██▊       | 1000/3600 [53:09<2:14:22,  3.10s/it]

{'loss': 1.1319, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.83}


IOStream.flush timed out
 31%|███       | 1100/3600 [59:25<2:08:42,  3.09s/it] 

{'loss': 1.0738, 'learning_rate': 3.472222222222222e-05, 'epoch': 0.92}


 33%|███▎      | 1200/3600 [1:04:33<1:39:14,  2.48s/it]

{'loss': 1.0604, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}



  0%|          | 0/134 [00:00<?, ?it/s][A
  1%|▏         | 2/134 [00:00<01:02,  2.11it/s][A
  2%|▏         | 3/134 [00:01<01:27,  1.50it/s][A
  3%|▎         | 4/134 [00:02<01:40,  1.30it/s][A
  4%|▎         | 5/134 [00:03<01:47,  1.20it/s][A
  4%|▍         | 6/134 [00:04<01:51,  1.15it/s][A
  5%|▌         | 7/134 [00:05<01:53,  1.12it/s][A
  6%|▌         | 8/134 [00:06<01:54,  1.10it/s][A
  7%|▋         | 9/134 [00:07<01:55,  1.09it/s][A
  7%|▋         | 10/134 [00:08<01:55,  1.08it/s][A
  8%|▊         | 11/134 [00:09<01:55,  1.07it/s][A
  9%|▉         | 12/134 [00:10<01:54,  1.07it/s][A
 10%|▉         | 13/134 [00:11<01:55,  1.05it/s][A
 10%|█         | 14/134 [00:12<01:54,  1.05it/s][A
 11%|█         | 15/134 [00:13<01:53,  1.05it/s][A
 12%|█▏        | 16/134 [00:14<01:51,  1.06it/s][A
 13%|█▎        | 17/134 [00:15<01:50,  1.06it/s][A
 13%|█▎        | 18/134 [00:16<01:48,  1.07it/s][A
 14%|█▍        | 19/134 [00:17<01:47,  1.06it/s][A
 15%|█▍        | 20/134 [00:

{'eval_loss': 0.9630648493766785, 'eval_runtime': 125.6274, 'eval_samples_per_second': 16.979, 'eval_steps_per_second': 1.067, 'epoch': 1.0}



100%|██████████| 134/134 [02:04<00:00,  1.35it/s][A
 36%|███▌      | 1300/3600 [1:11:49<1:58:19,  3.09s/it] 

{'loss': 1.0413, 'learning_rate': 3.194444444444444e-05, 'epoch': 1.08}


 39%|███▉      | 1400/3600 [1:16:58<1:53:32,  3.10s/it]

{'loss': 1.0318, 'learning_rate': 3.055555555555556e-05, 'epoch': 1.17}


 42%|████▏     | 1500/3600 [1:22:08<1:48:09,  3.09s/it]

{'loss': 1.0152, 'learning_rate': 2.916666666666667e-05, 'epoch': 1.25}


 44%|████▍     | 1600/3600 [1:28:23<1:43:13,  3.10s/it] 

{'loss': 0.9839, 'learning_rate': 2.777777777777778e-05, 'epoch': 1.33}


 47%|████▋     | 1700/3600 [1:33:32<1:38:04,  3.10s/it]

{'loss': 0.9614, 'learning_rate': 2.6388888888888892e-05, 'epoch': 1.42}


 50%|█████     | 1800/3600 [1:38:42<1:32:56,  3.10s/it]

{'loss': 0.9782, 'learning_rate': 2.5e-05, 'epoch': 1.5}


 53%|█████▎    | 1900/3600 [1:43:52<1:27:32,  3.09s/it]

{'loss': 0.9663, 'learning_rate': 2.361111111111111e-05, 'epoch': 1.58}


 56%|█████▌    | 2000/3600 [1:49:02<1:23:12,  3.12s/it]

{'loss': 0.9679, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}


 58%|█████▊    | 2100/3600 [1:55:19<1:17:45,  3.11s/it] 

{'loss': 0.9208, 'learning_rate': 2.0833333333333336e-05, 'epoch': 1.75}


 61%|██████    | 2200/3600 [2:00:29<1:12:11,  3.09s/it]

{'loss': 0.9167, 'learning_rate': 1.9444444444444445e-05, 'epoch': 1.83}


 64%|██████▍   | 2300/3600 [2:05:37<1:06:41,  3.08s/it]

{'loss': 0.913, 'learning_rate': 1.8055555555555555e-05, 'epoch': 1.92}


 67%|██████▋   | 2400/3600 [2:10:45<49:46,  2.49s/it]  

{'loss': 0.8978, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



  0%|          | 0/134 [00:00<?, ?it/s][A
  1%|▏         | 2/134 [00:00<01:01,  2.16it/s][A
  2%|▏         | 3/134 [00:01<01:26,  1.52it/s][A
  3%|▎         | 4/134 [00:02<01:38,  1.32it/s][A
  4%|▎         | 5/134 [00:03<01:45,  1.22it/s][A
  4%|▍         | 6/134 [00:04<01:49,  1.17it/s][A
  5%|▌         | 7/134 [00:05<01:51,  1.14it/s][A
  6%|▌         | 8/134 [00:06<01:53,  1.11it/s][A
  7%|▋         | 9/134 [00:07<01:53,  1.10it/s][A
  7%|▋         | 10/134 [00:08<01:53,  1.09it/s][A
  8%|▊         | 11/134 [00:09<01:53,  1.08it/s][A
  9%|▉         | 12/134 [00:10<01:53,  1.08it/s][A
 10%|▉         | 13/134 [00:11<01:52,  1.07it/s][A
 10%|█         | 14/134 [00:12<01:51,  1.07it/s][A
 11%|█         | 15/134 [00:13<01:51,  1.07it/s][A
 12%|█▏        | 16/134 [00:14<01:50,  1.06it/s][A
 13%|█▎        | 17/134 [00:14<01:49,  1.07it/s][A
 13%|█▎        | 18/134 [00:15<01:49,  1.06it/s][A
 14%|█▍        | 19/134 [00:16<01:48,  1.06it/s][A
 15%|█▍        | 20/134 [00:

{'eval_loss': 0.7965473532676697, 'eval_runtime': 125.0725, 'eval_samples_per_second': 17.054, 'eval_steps_per_second': 1.071, 'epoch': 2.0}


 69%|██████▉   | 2500/3600 [2:18:00<56:45,  3.10s/it]   

{'loss': 0.8864, 'learning_rate': 1.527777777777778e-05, 'epoch': 2.08}


 72%|███████▏  | 2600/3600 [2:24:16<51:37,  3.10s/it]  

{'loss': 0.9052, 'learning_rate': 1.388888888888889e-05, 'epoch': 2.17}


 75%|███████▌  | 2700/3600 [2:29:24<45:58,  3.06s/it]

{'loss': 0.8848, 'learning_rate': 1.25e-05, 'epoch': 2.25}


 78%|███████▊  | 2800/3600 [2:34:33<41:21,  3.10s/it]

{'loss': 0.8682, 'learning_rate': 1.1111111111111112e-05, 'epoch': 2.33}


 81%|████████  | 2900/3600 [2:39:49<36:48,  3.15s/it]

{'loss': 0.8585, 'learning_rate': 9.722222222222223e-06, 'epoch': 2.42}


 83%|████████▎ | 3000/3600 [2:45:03<31:27,  3.15s/it]

{'loss': 0.8569, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


IOStream.flush timed out
 86%|████████▌ | 3100/3600 [2:51:24<26:07,  3.13s/it]  

{'loss': 0.8492, 'learning_rate': 6.944444444444445e-06, 'epoch': 2.58}


 89%|████████▉ | 3200/3600 [2:56:38<20:39,  3.10s/it]

{'loss': 0.8678, 'learning_rate': 5.555555555555556e-06, 'epoch': 2.67}


 92%|█████████▏| 3300/3600 [3:01:48<15:25,  3.08s/it]

{'loss': 0.8634, 'learning_rate': 4.166666666666667e-06, 'epoch': 2.75}


 94%|█████████▍| 3400/3600 [3:06:59<10:23,  3.12s/it]

{'loss': 0.8464, 'learning_rate': 2.777777777777778e-06, 'epoch': 2.83}


 97%|█████████▋| 3500/3600 [3:12:10<05:10,  3.11s/it]

{'loss': 0.8352, 'learning_rate': 1.388888888888889e-06, 'epoch': 2.92}


100%|██████████| 3600/3600 [3:18:24<00:00,  2.48s/it]

{'loss': 0.8326, 'learning_rate': 0.0, 'epoch': 3.0}



  0%|          | 0/134 [00:00<?, ?it/s][A
  1%|▏         | 2/134 [00:00<01:02,  2.12it/s][A
  2%|▏         | 3/134 [00:01<01:27,  1.49it/s][A
  3%|▎         | 4/134 [00:02<01:40,  1.29it/s][A
  4%|▎         | 5/134 [00:03<01:47,  1.20it/s][A
  4%|▍         | 6/134 [00:04<01:51,  1.15it/s][A
  5%|▌         | 7/134 [00:05<01:52,  1.13it/s][A
  6%|▌         | 8/134 [00:06<01:53,  1.11it/s][A
  7%|▋         | 9/134 [00:07<01:54,  1.09it/s][A
  7%|▋         | 10/134 [00:08<01:54,  1.08it/s][A
  8%|▊         | 11/134 [00:09<01:53,  1.08it/s][A
  9%|▉         | 12/134 [00:10<01:52,  1.08it/s][A
 10%|▉         | 13/134 [00:11<01:51,  1.08it/s][A
 10%|█         | 14/134 [00:12<01:51,  1.08it/s][A
 11%|█         | 15/134 [00:13<01:50,  1.08it/s][A
 12%|█▏        | 16/134 [00:14<01:49,  1.08it/s][A
 13%|█▎        | 17/134 [00:14<01:48,  1.08it/s][A
 13%|█▎        | 18/134 [00:15<01:47,  1.08it/s][A
 14%|█▍        | 19/134 [00:16<01:47,  1.07it/s][A
 15%|█▍        | 20/134 [00:

{'eval_loss': 0.7373327612876892, 'eval_runtime': 125.2696, 'eval_samples_per_second': 17.027, 'eval_steps_per_second': 1.07, 'epoch': 3.0}
{'train_runtime': 12029.9461, 'train_samples_per_second': 4.785, 'train_steps_per_second': 0.299, 'train_loss': 1.0613170199924045, 'epoch': 3.0}





TrainOutput(global_step=3600, training_loss=1.0613170199924045, metrics={'train_runtime': 12029.9461, 'train_samples_per_second': 4.785, 'train_steps_per_second': 0.299, 'train_loss': 1.0613170199924045, 'epoch': 3.0})

In [13]:
eval_loss_after = trainer.evaluate(eval_dataset=eval_dataset).get("eval_loss")
perplexity_after = math.exp(eval_loss_after) if eval_loss_after else None
print(f"After training:  Loss = {eval_loss_after:.4f}, Perplexity = {perplexity_after:.2f}")

100%|██████████| 134/134 [02:04<00:00,  1.08it/s]

After training:  Loss = 0.7448, Perplexity = 2.11





In [15]:
# сохраняем модель 
model.save_pretrained("mlm_results")
tokenizer.save_pretrained("mlm_results")

('mlm_results/tokenizer_config.json',
 'mlm_results/special_tokens_map.json',
 'mlm_results/vocab.json',
 'mlm_results/merges.txt',
 'mlm_results/added_tokens.json',
 'mlm_results/tokenizer.json')

### Перетренировка классификатора с использованием весов из MLM

#### классификационная голова -- results/best_model
#### веса языковой части -- mlm_results

In [3]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import torch



In [4]:
dataset = load_dataset("csv", data_files="data/train.csv")["train"]
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("mlm_results")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [16]:
# модель классификации (голова)
model = AutoModelForSequenceClassification.from_pretrained("results/best_model")

# MLM веса 
mlm_model = AutoModel.from_pretrained("mlm_results")

# подставляем веса в классификационную модель
model.roberta = mlm_model


Some weights of RobertaModel were not initialized from the model checkpoint at mlm_results and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    return {
        "accuracy": acc,
        "f1": f1,
    }

In [18]:
training_args = TrainingArguments(
    output_dir="./final_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

  0%|          | 0/1184 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 25%|██▌       | 296/1184 [04:07<11:05,  1.33it/s]
  0%|          | 0/74 [00:00<?, ?it/s][A
  3%|▎         | 2/74 [00:00<00:06, 11.81it/s][A
  5%|▌         | 4/74 [00:00<00:09,  7.39it/s][A
  7%|▋         | 5/74 [00:00<00:10,  6.61it/s][A
  8%|▊         | 6/74 [00:00<00:10,  6.27it/s][A
  9%|▉         | 7/74 [00:01<00:11,  5.71it/s][A
 11%|█         | 8/74 [00:01<00:11,  5.83it/s][A
 12%|█▏        | 9/74 [00:01<00:11,  5.45it/s][A
 14%|█▎        | 10/74 [00:01<00:11,  5.53it/s][A
 15%|█▍        | 11/74 [00:01<00:11,  5.54it/s][A
 16%|█▌        | 12/74 [00:02<00:11,  5.38it/s][A
 18%|█▊        | 13/74 [00:02<00:11,  5.39it/s][A
 19%|█▉        | 14/74 [00:02<00:11,  5.35it/s][A
 20%|██        | 15/74 [00:02<00:

{'eval_loss': 0.16351749002933502, 'eval_accuracy': 0.9179357021996616, 'eval_f1': 0.9214574898785427, 'eval_runtime': 13.702, 'eval_samples_per_second': 86.265, 'eval_steps_per_second': 5.401, 'epoch': 1.0}


IOStream.flush timed out
 42%|████▏     | 500/1184 [08:19<10:08,  1.12it/s]  

{'loss': 0.1841, 'learning_rate': 1.1554054054054056e-05, 'epoch': 1.69}


 50%|█████     | 592/1184 [09:38<07:13,  1.37it/s]
  0%|          | 0/74 [00:00<?, ?it/s][A
  3%|▎         | 2/74 [00:00<00:06, 10.86it/s][A
  5%|▌         | 4/74 [00:00<00:09,  7.33it/s][A
  7%|▋         | 5/74 [00:00<00:10,  6.46it/s][A
  8%|▊         | 6/74 [00:00<00:10,  6.22it/s][A
  9%|▉         | 7/74 [00:01<00:12,  5.50it/s][A
 11%|█         | 8/74 [00:01<00:11,  5.66it/s][A
 12%|█▏        | 9/74 [00:01<00:12,  5.35it/s][A
 14%|█▎        | 10/74 [00:01<00:11,  5.36it/s][A
 15%|█▍        | 11/74 [00:01<00:11,  5.58it/s][A
 16%|█▌        | 12/74 [00:02<00:11,  5.43it/s][A
 18%|█▊        | 13/74 [00:02<00:11,  5.49it/s][A
 19%|█▉        | 14/74 [00:02<00:10,  5.48it/s][A
 20%|██        | 15/74 [00:02<00:10,  5.72it/s][A
 22%|██▏       | 16/74 [00:02<00:10,  5.67it/s][A
 23%|██▎       | 17/74 [00:02<00:10,  5.44it/s][A
 24%|██▍       | 18/74 [00:03<00:10,  5.52it/s][A
 26%|██▌       | 19/74 [00:03<00:09,  5.78it/s][A
 27%|██▋       | 20/74 [00:03<00:09,  5.45it/s]

{'eval_loss': 0.1626693606376648, 'eval_accuracy': 0.9196277495769881, 'eval_f1': 0.9230769230769231, 'eval_runtime': 13.7166, 'eval_samples_per_second': 86.173, 'eval_steps_per_second': 5.395, 'epoch': 2.0}


IOStream.flush timed out
 75%|███████▌  | 888/1184 [15:01<03:38,  1.35it/s]  
  0%|          | 0/74 [00:00<?, ?it/s][A
  3%|▎         | 2/74 [00:00<00:06, 11.64it/s][A
  5%|▌         | 4/74 [00:00<00:13,  5.26it/s][A
  7%|▋         | 5/74 [00:00<00:13,  5.26it/s][A
  8%|▊         | 6/74 [00:01<00:12,  5.34it/s][A
  9%|▉         | 7/74 [00:01<00:13,  5.13it/s][A
 11%|█         | 8/74 [00:01<00:12,  5.44it/s][A
 12%|█▏        | 9/74 [00:01<00:12,  5.21it/s][A
 14%|█▎        | 10/74 [00:01<00:12,  5.32it/s][A
 15%|█▍        | 11/74 [00:01<00:11,  5.50it/s][A
 16%|█▌        | 12/74 [00:02<00:12,  5.11it/s][A
 18%|█▊        | 13/74 [00:02<00:11,  5.25it/s][A
 19%|█▉        | 14/74 [00:02<00:11,  5.29it/s][A
 20%|██        | 15/74 [00:02<00:10,  5.54it/s][A
 22%|██▏       | 16/74 [00:02<00:10,  5.52it/s][A
 23%|██▎       | 17/74 [00:03<00:10,  5.30it/s][A
 24%|██▍       | 18/74 [00:03<00:10,  5.34it/s][A
 26%|██▌       | 19/74 [00:03<00:10,  5.42it/s][A
 27%|██▋       | 20/

{'eval_loss': 0.17952771484851837, 'eval_accuracy': 0.9382402707275804, 'eval_f1': 0.9396195202646817, 'eval_runtime': 13.7413, 'eval_samples_per_second': 86.018, 'eval_steps_per_second': 5.385, 'epoch': 3.0}


IOStream.flush timed out
 84%|████████▍ | 1000/1184 [17:54<02:30,  1.22it/s] 

{'loss': 0.1308, 'learning_rate': 3.1081081081081082e-06, 'epoch': 3.38}


 95%|█████████▌| 1125/1184 [19:37<00:50,  1.18it/s]

In [7]:
metrics = trainer.evaluate()
metrics

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 74/74 [00:30<00:00,  2.40it/s]


{'eval_loss': 0.17952771484851837,
 'eval_accuracy': 0.9382402707275804,
 'eval_f1': 0.9382151934632695,
 'eval_runtime': 31.8946,
 'eval_samples_per_second': 37.06,
 'eval_steps_per_second': 2.32}

In [8]:
predictions = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(predictions.predictions, axis=-1)
y_true = predictions.label_ids
print("Confusion matrix:")
print(confusion_matrix(y_true, y_pred))

100%|██████████| 74/74 [00:30<00:00,  2.42it/s]

Confusion matrix:
[[541  47]
 [ 26 568]]





In [9]:
trainer.save_model("./final_results/best_model")