In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

from tqdm import tqdm
tqdm.pandas()

In [3]:
df = pd.read_json("data.jsonl", lines=True)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/mGPT")

model = AutoModelForCausalLM.from_pretrained("sberbank-ai/mGPT").cuda()

In [5]:
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [6]:
class BugurtDataset(Dataset):
    def __init__(self, df, tokenizer):
        super().__init__()
        self.tokens = df["lines"].progress_apply(lambda x: tokenizer("<s>".join(x), 
                                                            max_length=512,
                                                            truncation=True)).tolist()
        
    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, i):
        return self.tokens[i]
    

In [7]:
train_df, eval_df = train_test_split(df, test_size=0.05)
train_dataset = BugurtDataset(train_df, tokenizer)
eval_dataset = BugurtDataset(eval_df, tokenizer)

100%|██████████| 52304/52304 [00:14<00:00, 3512.91it/s]
100%|██████████| 2753/2753 [00:00<00:00, 3545.26it/s]


In [8]:
from transformers import DataCollatorForLanguageModeling

In [9]:
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = "<pad>"
tokenizer.sep_token = "<s>"

In [10]:
from transformers import Trainer, TrainingArguments

In [11]:
args = TrainingArguments(
    warmup_ratio=0.1,
    output_dir = "training",
    evaluation_strategy = "steps",
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 16,
    num_train_epochs = 2,
    save_strategy = "steps",
    save_steps = 500,
    fp16 = True,
    eval_steps = 250,
    dataloader_num_workers = 4
)

trainer = Trainer(
    model = model,
    args = args,
    data_collator=collator,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset
)

Using amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 52304
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 1634


Step,Training Loss,Validation Loss
250,No log,1.477706


***** Running Evaluation *****
  Num examples = 2753
  Batch size = 8


In [None]:
model.save_pretrained("release")

In [None]:
test_input = "СИДИШЬ НА РАБОТЕ<s>"

input_ids = tokenizer([test_input], return_tensors="pt").input_ids

tokenizer.decode(model.generate(input_ids.cuda(),
                                max_length=len(tokenizer([test_input], return_tensors="pt").input_ids[0]) + 32,
                                bad_words_ids=[[tokenizer.pad_token_id]],
                                force_words_ids=[[11649], [11649]],
                                temperature=1.,
                                repetition_penalty=10.,
                                do_sample=True).cpu()[:, input_ids.shape[-1]:][0], skip_special_tokens=False).replace("<s>", "\n@\n")