In [1]:
import torch
import os

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2-imdb')
tokenizer.pad_token = tokenizer.eos_token

tokenizer

GPT2TokenizerFast(name_or_path='gpt2-imdb', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [2]:
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset('imdb')
dataset = concatenate_datasets(list((dataset.values())))
dataset = dataset.remove_columns(['label'])


def f(data):
    data = [i['text'] for i in data]

    data = tokenizer(data,
                     padding=True,
                     truncation=True,
                     max_length=50,
                     return_tensors='pt').to(device)

    data['labels'] = data['input_ids'].clone()
    select = data['labels'] == tokenizer.pad_token_id
    data['labels'][select] = -100

    return data


loader = torch.utils.data.DataLoader(dataset,
                                     batch_size=4,
                                     shuffle=True,
                                     drop_last=True,
                                     collate_fn=f)

len(loader), next(iter(loader))

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

(12500,
 {'input_ids': tensor([[ 3666,  3656,   290,   314,  7342,   428,  3807,   780,   356,  1410,
            284,  3187, 49301,   290,   520,   398,    65, 11106,  2582,    13,
          20525,   357,   273, 12716,     8,   262, 10747,   290,   384,  3372,
           1758,   357, 20751,   351, 38883,     8,   389,   262,   691, 41632,
           1866,   286,   428,  3350,    13,   775,  2938,  6397,   880,    12],
         [16350,    11,   314,  8359,   428,  2646,   290,   561,  4313,   340,
            284, 19907,  2646, 20175, 29847,  1671,  1220,  6927,  1671, 11037,
           4864,    11,   314,  1107,   765,   284,  3465,   262, 20594,  1022,
           3354,   286,   428,  2646,   290, 38403,     6,  1012, 13416,    13,
           1881,  3715,  2592,   810, 21462, 19674,   959,   338,  2095,   318],
         [30402,    11,   257,  4858,  1182, 11699,   318,   262, 13996,   329,
           3488, 21488, 25671,    11,  6225,   257, 34082, 41221,   776,   656,
            257,

In [3]:
from transformers import AutoModelForCausalLM

model_actor = AutoModelForCausalLM.from_pretrained('gpt2-imdb').to(
    device)

model_actor.config

  return self.fget.__get__(instance, owner)()


GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "gpt2-imdb",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [4]:
optimizer = torch.optim.Adam(model_actor.parameters(), lr=1e-5)

for i, data in enumerate(loader):
    out = model_actor(**data)
    out.loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 1000 == 0:
        print(i, len(loader), out.loss.item())
        
        prompt = data['input_ids'][0]
        chosen = prompt[5:]
        prompt = prompt[:5]

        gen = model_actor.generate(prompt.unsqueeze(0),
                                   max_length=32,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id)[0, 5:]

        print('prompt=', tokenizer.decode(prompt))
        print('chosen=', tokenizer.decode(chosen))
        print('gen=', tokenizer.decode(gen))

model_actor.save_pretrained('model/actor')

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


0 12500 4.0063276290893555


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


prompt= This Columbo episode is
chosen=  one of the better and perhaps one of my personal favorites. The cast includes Rosemary's Baby John Cassavetes as the maestro, his wife played by Blythe Danner (Gwyneth Paltrow's
gen=  a great example of how to make a great Columbo episode. The story is very simple and the characters are very likable. The



KeyboardInterrupt

