In [None]:

# Install the Hugging Face `transformers` package into the runtime before it is imported below.
# NOTE(review): the version is not pinned; the recorded output of this cell shows
# transformers 4.19.2 being installed — consider pinning it for reproducibility.
!pip3 install transformers
from google.colab import drive
# Mount Google Drive at /content/drive; `input_path` below points inside this mount.
# force_remount=True is passed, presumably so that re-running this cell refreshes an
# already existing mount — confirm against the google.colab documentation.
drive.mount("/content/drive", force_remount=True)
from typing import Tuple, List
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, TextDataset, DataCollatorForLanguageModeling,
                          AutoModelForCausalLM,
                          Trainer, TrainingArguments,
                          TextGenerationPipeline, pipeline)
from torch.cuda import empty_cache
import os

# --- Paths ---------------------------------------------------------------
# Directory whose regular files are read as raw training documents.
input_path: str = "/content/drive/MyDrive/datatrue"
# Files the train/test halves are written to and later read back from.
train_path: str = "./train.txt"
test_path: str = "./test.txt"
# Passed to TrainingArguments as the checkpoint/output directory.
output_dir: str = "./output"

# --- Tokenizer / model ---------------------------------------------------
# Separator joined between documents; also registered as the tokenizer's eos and pad token.
split_token: str = "<|endoftext|>"
# Hub identifier used for the tokenizer and (by default) for the model weights.
model_name: str = "sberbank-ai/sbert_large_nlu_ru"
# `block_size` handed to TextDataset.
input_block_size: int = 128

# --- Training schedule ---------------------------------------------------
train_batch_size: int = 6    # per-device train batch size
eval_batch_size: int = 16    # per-device eval batch size
epochs_count: int = 5        # number of training epochs
eval_every: int = 500        # used for both logging_steps and eval_steps
save_every: int = 2500       # save_steps (checkpoint interval)
warmup_steps: int = 50       # warmup_steps passed to TrainingArguments

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 9.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 7.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 1.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.6.

In [None]:

def get_test_train_text(test_size: float = 0.2,
                        random_state: int = 42) -> Tuple[List[str], List[str]]:
    """Read every regular file in ``input_path`` and split the texts into train/test.

    Each file is read whole as UTF-8 and treated as one document.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state`` so
            that repeated runs over the same files give the same split.

    Returns:
        The result of ``train_test_split``: the train documents first, then the
        test documents (note the order, despite the function name).
    """
    documents = []
    # os.listdir gives entries in arbitrary order; sort them so that the input
    # to the (seeded) split is stable between runs.
    for filename in sorted(os.listdir(input_path)):
        candidate = os.path.join(input_path, filename)
        # Sub-directories are skipped; only regular files are read.
        if os.path.isfile(candidate):
            with open(candidate, "r", encoding="utf-8") as file:
                documents.append(file.read())

    return train_test_split(documents, test_size=test_size, random_state=random_state)

def save_text_file(text : List[str], path : str):
    """Write the documents in ``text`` to ``path`` as a single UTF-8 file.

    The documents are concatenated with the module-level ``split_token``
    between them. An existing file at ``path`` is overwritten.
    """
    # Join first so that a failure while joining leaves any existing file untouched.
    joined_documents = split_token.join(text)
    with open(path, "w", encoding="utf-8") as destination:
        destination.write(joined_documents)
    
# Split the corpus once, then write each half to the file the datasets are built from.
train_data, test_data = get_test_train_text()
for _documents, _destination in ((train_data, train_path), (test_data, test_path)):
    save_text_file(_documents, _destination)

In [None]:

def load_dataset(tokenizer : AutoTokenizer) -> Tuple[TextDataset, TextDataset, DataCollatorForLanguageModeling]:
    """Build the train/test datasets and the collator used for causal-LM training.

    Args:
        tokenizer: Tokenizer used to encode ``train_path`` / ``test_path`` and
            handed to the data collator.

    Returns:
        ``(train_dataset, test_dataset, data_collator)``; both datasets use
        ``input_block_size`` as their block size and the collator is created
        with ``mlm=False``.
    """
    def _build(file_path: str) -> TextDataset:
        # overwrite_cache=True: train.txt / test.txt are regenerated by this
        # notebook, and TextDataset would otherwise reuse features cached on
        # disk from a previous version of the same file name.
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=input_block_size,
                           overwrite_cache=True)

    train_dataset = _build(train_path)
    test_dataset = _build(test_path)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)
    return train_dataset, test_dataset, data_collator

def load_model(path : str = model_name, resize : bool = True) -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
    """Load the tokenizer and a causal-LM model.

    Args:
        path: Name or directory the model weights are loaded from; defaults to
            ``model_name``.
        resize: When true, resize the model's token embeddings to
            ``len(tokenizer)`` (the tokenizer is created with ``split_token``
            as an extra eos/pad token).

    Returns:
        ``(tokenizer, model)``.
    """
    # The tokenizer is always loaded from `model_name`, not from `path`.
    # NOTE(review): presumably because checkpoints written by the Trainer here
    # do not contain tokenizer files — confirm before changing.
    tokenizer = AutoTokenizer.from_pretrained(model_name, eos_token=split_token, pad_token=split_token)
    # is_decoder=True makes the BERT-style checkpoint use causal attention when
    # trained as a language model. Without it the library warns ("add
    # is_decoder=True") and the model can attend to the tokens it is asked to
    # predict, which makes the reported loss meaninglessly small.
    model = AutoModelForCausalLM.from_pretrained(path, is_decoder=True)
    if resize:
        model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

In [None]:

def create_metrics_computer():
    """Create a ``compute_metrics`` callback reporting BLEU and ROUGE scores.

    The "bleu" and "rouge" metrics are loaded once here and captured by the
    returned closure.

    NOTE(review): ``load_metric`` is not imported in the visible cells
    (presumably it comes from the ``datasets`` package — confirm), so calling
    this function as-is would fail with a NameError.
    NOTE(review): predictions and labels are forwarded to the metrics exactly
    as received from ``eval_pred``; verify that this is the input format the
    metrics expect.

    Returns:
        A function taking ``eval_pred`` (a ``(predictions, labels)`` pair) and
        returning a dict with keys "bleu", "rouge1", "rouge2", "rougeL" and
        "rougeLsum".
    """
    bleu_metric = load_metric("bleu")
    rouge_metric = load_metric("rouge")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        bleu_scores = bleu_metric.compute(predictions=predictions, references=labels)
        rouge_scores = rouge_metric.compute(predictions=predictions, references=labels)

        summary = {"bleu" : bleu_scores["bleu"]}
        # For every ROUGE variant report the F-measure of the `mid` aggregate.
        for rouge_key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
            summary[rouge_key] = rouge_scores[rouge_key].mid.fmeasure
        return summary

    return compute_metrics

def create_trainer(model : AutoModelForCausalLM,
                   data_collator : DataCollatorForLanguageModeling,
                   train_dataset : TextDataset,
                   test_dataset  : TextDataset) -> Trainer:
    """Assemble a ``Trainer`` configured from the notebook-level constants.

    Args:
        model: Model to train.
        data_collator: Collator used to build batches.
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for the periodic evaluation.

    Returns:
        A ``Trainer`` that evaluates and logs every ``eval_every`` steps and
        saves a checkpoint every ``save_every`` steps into ``output_dir``.
    """
    step_schedule = dict(logging_steps=eval_every,
                         eval_steps=eval_every,
                         save_steps=save_every,
                         warmup_steps=warmup_steps)
    batch_setup = dict(num_train_epochs=epochs_count,
                       per_device_train_batch_size=train_batch_size,
                       per_device_eval_batch_size=eval_batch_size)
    training_args = TrainingArguments(output_dir=output_dir,
                                      overwrite_output_dir=True,
                                      evaluation_strategy="steps",
                                      **batch_setup,
                                      **step_schedule)

    # `compute_metrics` (see create_metrics_computer) is deliberately NOT passed:
    # there is not enough GPU memory to compute the metrics at every
    # `eval_every` evaluation, so they should be computed once at the end and
    # only the loss is tracked during training.
    return Trainer(model=model,
                   args=training_args,
                   data_collator=data_collator,
                   train_dataset=train_dataset,
                   eval_dataset=test_dataset)
# Load the tokenizer and model, build the datasets, then run the training loop.
tokenizer, model = load_model()
train_dataset, test_dataset, data_collator = load_dataset(tokenizer)
trainer = create_trainer(model=model,
                         data_collator=data_collator,
                         train_dataset=train_dataset,
                         test_dataset=test_dataset)
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at sberbank-ai/sbert_large_nlu_ru and are newly initialized: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 13830
  Num Epochs = 5
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation

Step,Training Loss,Validation Loss
500,1.5579,0.12553
1000,0.0864,0.047307
1500,0.0427,0.027781
2000,0.0259,0.01775
2500,0.0138,0.012883
3000,0.0058,0.011862
3500,0.0062,0.011022
4000,0.0055,0.010529
4500,0.0043,0.00916
5000,0.0023,0.008855


***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-2500
Configuration saved in ./output/checkpoint-2500/config.json
Model weights saved in ./output/checkpoint-2500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
***** Running Evaluation *****
  Num examples = 3529
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-5000
Configuration saved in ./output/checkpoint-5000/config.json
