In [1]:
import torch
import json
from datetime import datetime
import numpy as np
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [138]:
class PersonaDataset (torch.utils.data.Dataset):
    def __init__(self, path, tokenizer, max_len=1) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.eoc = tokenizer('[eoc]')['input_ids'] 
        self.eocs = tokenizer('[eocs]')['input_ids'] 
        self.eos = tokenizer(tokenizer.eos_token)['input_ids'] 
        self.max_len = max_len
        self.data = []
        with open(path, 'r') as inp_file:
            for line in inp_file:
                self.data.append(json.loads(line))

    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, idx) -> list[int]:
        context = self.data[idx]['context']
        response = self.data[idx]['response']
        persona = self.data[idx]['persona']
        golden = self.data[idx]['golden']

        context = [self.tokenizer(c)['input_ids'] for c in context]
        response = self.tokenizer(response)['input_ids'] 
        out = [] 
        for c in context:
            if len(out) + len(self.eocs) + len(c) + len(self.eoc) + len(response) + len(self.eos) < self.max_len:
                out = out + c + self.eocs
            else:
                print(len(out)+len(c)+len(response))
                break
        out += self.eoc + response + self.eos
        out = torch.tensor(out)
        return out




In [139]:
data_path = '/home/posokhov@ad.speechpro.com/projects/persona/TlkPersonaChatRus/tolokapersonachat_gk1.jsonl'
model_path = '/home/posokhov@ad.speechpro.com/projects/models/rugpt2large'
train_size = 3

tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
special_tokens_dict = {'additional_special_tokens': ['[eocs]','[eoc]']} 
# eocs - end of context sentence 
# eoc - end of context
tokenizer.add_special_tokens(special_tokens_dict)

model = transformers.AutoModelWithLMHead.from_pretrained(model_path)
model.resize_token_embeddings(len(tokenizer))

dataset = PersonaDataset(data_path, tokenizer)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [len(dataset)//train_size, len(dataset)-len(dataset)//train_size]
    )
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /home/posokhov@ad.speechpro.com/projects/models/rugpt2large/config.json
Model config GPT2Config {
  "_name_or_path": "/home/posokhov@ad.speechpro.com/projects/models/rugpt2large",
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_pr

In [None]:
for _, b in zip(range(10), test_dataset):
    print(b)

In [140]:
training_args = transformers.TrainingArguments(
    output_dir="/home/posokhov@ad.speechpro.com/projects/persona/output_gpt", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=1, # batch size for training
    per_device_eval_batch_size=1,  # batch size for evaluation
    eval_steps = 1000*32, # Number of update steps between two evaluations.
    save_steps=1000*32, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [141]:
trainer.train()

***** Running training *****
  Num examples = 52923
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 158769


19


RuntimeError: CUDA out of memory. Tried to allocate 246.00 MiB (GPU 0; 10.92 GiB total capacity; 9.98 GiB already allocated; 27.50 MiB free; 10.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF