In [1]:
import torch
import os

# Point Hugging Face Hub downloads at a mirror endpoint. This assignment sits
# before the `transformers` import below — presumably so the hub client sees
# it when it is first loaded; keep this ordering (TODO confirm).
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from transformers import AutoTokenizer

# Load the tokenizer that matches the 'gpt2' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Assign vocabulary id 0 as the padding id so batches can be padded. The
# repr displayed by this cell shows that id 0 is the regular token '!', so
# padding cannot be told apart from a real '!' by token id alone.
tokenizer.pad_token_id = 0

# Last expression of the cell: display the tokenizer configuration.
tokenizer



GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '!'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [3]:
from datasets import load_dataset

# Load the training split of the text-to-SQL dataset from the Hub.
dataset = load_dataset(path='b-mc2/sql-create-context', split='train')

# Cell output: the dataset summary together with its first record.
(dataset, dataset[0])

(Dataset({
     features: ['question', 'context', 'answer'],
     num_rows: 78577
 }),
 {'question': 'How many heads of the departments are older than 56 ?',
  'context': 'CREATE TABLE head (age INTEGER)',
  'answer': 'SELECT COUNT(*) FROM head WHERE age > 56'})

In [5]:
def f(data):
    """Collate a list of dataset rows into one padded causal-LM batch.

    Each row is rendered as
    ``'context: <context> question: <question> answer: <answer>'`` followed by
    the tokenizer's EOS token, then the rendered strings are tokenized
    together, padded to the longest sequence in the batch, truncated to at
    most 512 tokens, and moved to ``device``.

    Args:
        data: A list of rows, each a mapping with the string keys
            ``'context'``, ``'question'`` and ``'answer'``.

    Returns:
        The tokenizer's batch encoding with an added ``'labels'`` entry: a
        copy of ``'input_ids'`` in which every padding position is replaced
        by -100 (the label value the loss ignores).
    """
    texts = [
        'context: ' + i['context'] + ' question: ' + i['question'] +
        ' answer: ' + i['answer'] + tokenizer.eos_token for i in data
    ]

    batch = tokenizer(texts,
                      padding=True,
                      truncation=True,
                      max_length=512,
                      return_tensors='pt').to(device)

    # Mask padding through the attention mask instead of comparing token ids:
    # the pad id (0) is also the id of the ordinary '!' token, so an id-based
    # comparison would silently drop genuine '!' tokens from the loss.
    labels = batch['input_ids'].clone()
    labels[batch['attention_mask'] == 0] = -100
    batch['labels'] = labels

    return batch


# Batch the dataset in shuffled groups of four rows, dropping the final
# incomplete batch; `f` turns each group of rows into model-ready tensors.
loader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=f,
    batch_size=4,
    drop_last=True,
    shuffle=True,
)

# Cell output: the number of batches and one sample batch.
(len(loader), next(iter(loader)))

(19644,
 {'input_ids': tensor([[22866,    25, 29244,  6158, 43679,  3084,    62,  3672,    62,  2670,
            357,  1941, 17828,  7156,  1137,    11,  1964,    62, 28046,   569,
          31315,  1503,    11,  3722,   569, 31315,  1503,    11,  3026,    62,
          33203,  4278,   569, 31315,  1503,     8,  1808,    25,  1867,   318,
            262,   614,  2811,   618,   407,  1479,   373,   262,  3722,    11,
            290,  1342,   621,   767,   373,   262,  3026, 22008,    11,   290,
           1342,   621,   718,  1964,  2489,    30,  3280,    25, 33493, 35224,
              7,  1941,     8, 16034,  3084,    62,  3672,    62,  2670, 33411,
           3722,   796,   366,  1662,  1479,     1,  5357,  3026,    62, 33203,
           4278,  1279,   767,  5357,  1964,    62, 28046,  1279,   718, 50256],
         [22866,    25, 29244,  6158, 43679,  3084,    62,  1558,   940,  2623,
           2231,    62,   940,   357,  8929,    62,   562,  1023,   569, 31315,
           1503, 

In [7]:
from transformers import AutoModelForCausalLM

# Load the pretrained GPT-2 causal language model, then move it to `device`.
model_actor = AutoModelForCausalLM.from_pretrained('gpt2')
model_actor = model_actor.to(device)

# Cell output: the model configuration.
model_actor.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Supervised fine-tuning: one pass over `loader`, one optimizer step per batch.
optimizer = torch.optim.Adam(model_actor.parameters(), lr=1e-5)

# `from_pretrained` hands the model back in evaluation mode (dropout off);
# switch to training mode explicitly before optimizing.
model_actor.train()

for i, data in enumerate(loader):
    # The batch carries `labels`, so the forward pass returns the LM loss.
    out = model_actor(**data)
    out.loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 1000 == 0:
        print(i, len(loader), out.loss.item())

        # Sample a continuation from the first five prompt tokens of the
        # current batch as a qualitative progress check. Dropout is turned
        # off for sampling and restored right after.
        model_actor.eval()
        gen = model_actor.generate(input_ids=data['input_ids'][0:1, :5],
                                   attention_mask=data['attention_mask'][0:1, :5],
                                   min_length=-1,
                                   max_length=100,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id,
                                   top_k=0,  # integer 0 disables top-k filtering
                                   top_p=1.0,
                                   do_sample=True)[0]
        model_actor.train()

        print(tokenizer.decode(gen))

# Persist the fine-tuned weights and config.
model_actor.save_pretrained('model/actor')

0 19644 3.509646415710449
context: CREATE TABLE tx, BUFFER_SIZE '' WHERE tx IS NULL; LIAMETHOD()' type instantiation=COOL and `MZK()' method call semantics

CAST (ncstali's cast) (camelCase) (withId) (brown) (contApiAParamniApimplIFuncAm) (nuk

erra-07-17 08:55:16 collaborators:

' insert: insert a unique key
