In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets

# 从头训练 Transformers 模型

In [None]:
from transformers import pipeline, set_seed
# Text-generation pipelines for the two checkpoints compared below:
# the original GPT ("openai-gpt") and GPT-2 ("gpt2").
generation_gpt = pipeline(
    task="text-generation",
    model="openai-gpt",
)
generation_gpt2 = pipeline(
    task="text-generation",
    model="gpt2",
)
def model_size(model):
    """Return the total number of elements over all of `model.parameters()`.

    Args:
        model: any object exposing `parameters()`, an iterable of objects
            with a `numel()` method.

    Returns:
        The sum of `numel()` over every parameter (0 when there are none).
    """
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total

# Report both parameter counts in millions of parameters.
gpt_params = model_size(generation_gpt.model)
print(f"GPT size: {gpt_params/1000**2:.1f}M parameters")

gpt2_params = model_size(generation_gpt2.model)
print(f"GPT2 size: {gpt2_params/1000**2:.1f}M parameters")


In [None]:
def enum_pipeline_ouputs(pipe, prompt, num_return_sequences):
    """Run `pipe` on `prompt` and return its generations as a numbered list.

    Args:
        pipe: callable invoked as `pipe(prompt, num_return_sequences=...,
            clean_up_tokenization_spaces=True)`; it must return an iterable
            of mappings that each have a "generated_text" entry.
        prompt: text passed to `pipe` unchanged.
        num_return_sequences: forwarded to `pipe`.

    Returns:
        One string with an entry per generation, formatted as
        "<1-based index>.<generated text>" and joined with newlines.
    """
    generations = pipe(
        prompt,
        num_return_sequences=num_return_sequences,
        clean_up_tokenization_spaces=True,
    )
    numbered = []
    for rank, generation in enumerate(generations, start=1):
        numbered.append(f"{rank}." + generation["generated_text"])
    return "\n".join(numbered)

# Sample three continuations of the same prompt from each model so the two
# sets of outputs can be compared; an empty line separates the two reports.
prompt = "\nWhen they came back"
print("GPT completions:\n" + enum_pipeline_ouputs(generation_gpt, prompt, 3))
print("")
print("GPT-2 completions:\n" + enum_pipeline_ouputs(generation_gpt2, prompt, 3))

In [None]:
from datasets import load_dataset, DownloadConfig
# `delete_extracted=True` asks `datasets` not to keep the extracted files
# around after loading.
download_config = DownloadConfig(delete_extracted=True)

# Load the train split from the local "./codeparrot" directory.
dataset = load_dataset(
    path="./codeparrot",
    split="train",
    download_config=download_config,
)


In [None]:
import os

import psutil

# Number of examples in the loaded dataset.
print(f"Number of python files code in dataset : {len(dataset)}")
# Total on-disk size of the cache files backing the dataset.
ds_size = sum(os.stat(f["filename"]).st_size for f in dataset.cache_files)
# os.stat.st_size is expressed in bytes, so we convert to GB
print(f"Dataset size (cache file) : {ds_size / 2**30:.2f} GB")
# Process.memory_info is expressed in bytes, so we convert to MB
print(f"RAM used: {psutil.Process(os.getpid()).memory_info().rss >> 20} MB")
# Re-open the same local dataset in streaming mode and check that the first
# streamed record equals the first record of the fully loaded dataset.
streamed_dataset = load_dataset(
    path='./codeparrot',
    split="train",
    streaming=True,
)
iterator = iter(streamed_dataset)
first_loaded = dataset[0]
first_streamed = next(iterator)
print(first_loaded == first_streamed)

In [None]:
# Same streaming load as above, but addressed by repository id rather than a
# local directory.
remote_dataset = load_dataset(
    path='transformersbook/codeparrot',
    split="train",
    streaming=True,
)


# 构建一个分词器（tokenizer）

In [None]:
from transformers import AutoTokenizer
def tok_list(tokenizer, string):
    """Tokenize `string` and return each token id decoded back to text.

    Args:
        tokenizer: callable invoked as `tokenizer(string,
            add_special_tokens=False)` that returns a mapping with an
            "input_ids" entry, and that offers `decode(token_id)`.
        string: the text to tokenize.

    Returns:
        A list with one decoded piece per id in "input_ids", in order.
    """
    encoding = tokenizer(string, add_special_tokens=False)
    pieces = []
    for token_id in encoding["input_ids"]:
        pieces.append(tokenizer.decode(token_id))
    return pieces
# Two pretrained tokenizers; the pieces each one produces for a single
# English word are printed below.
tokenizer_T5 = AutoTokenizer.from_pretrained("t5-base")
tokenizer_camembert = AutoTokenizer.from_pretrained("camembert-base")

t5_pieces = tok_list(tokenizer_T5, "sex")
print(f'T5 tokens for "sex": {t5_pieces}')

camembert_pieces = tok_list(tokenizer_camembert, "being")
print(f'CamemBERT tokens for "being": {camembert_pieces}')

In [None]:
from transformers import AutoTokenizer
# Small Python snippet used as the running tokenization example. The comment
# and the `say_hello()` call sit on their own lines so that the call is part
# of the sample rather than part of the comment.
python_code = r"""
def say_hello():
	print("Hello, World!")
# Print it
say_hello()
"""

# Inspect how the pretrained GPT-2 tokenizer handles the snippet: the final
# tokens, the normalizer, and the output of the pre-tokenization step.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer(python_code).tokens())
print(tokenizer.backend_tokenizer.normalizer)
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(python_code))

In [None]:
# Show how many bytes UTF-8 needs for an ASCII letter versus the euro sign.
a, e = "a", "€"

a_utf8 = a.encode("utf-8")
byte = ord(a_utf8)  # ord() accepts a bytes object of length one
print(f'`{a}` is encoded as `{a_utf8}` with a single byte: {byte}')

e_utf8 = e.encode("utf-8")
byte = list(e_utf8)  # iterating over bytes yields the integer byte values
print(f'`{e}` is encoded as `{e_utf8}` with three bytes: {byte}')

In [None]:
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

# Build the byte <-> unicode-character mapping used by GPT-2's byte-level
# tokenizer; the mapped characters form the base vocabulary.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())
base_vocab = list(unicode_to_byte_map.keys())
print(f'Size of our base vocabulary: {len(base_vocab)}')
print(f'First element: `{base_vocab[0]}`, last element: `{base_vocab[-1]}`')

# How the pretrained tokenizer pre-tokenizes and tokenizes the sample snippet.
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(python_code))
print(f"Size of the vocabulary: {len(tokenizer)}")
print(tokenizer(python_code).tokens())

# The eight longest tokens in the vocabulary.
tokens = sorted(tokenizer.vocab.items(), key=lambda x: len(x[0]), reverse=True)
print([f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[:8]])

# The twelve tokens with the highest ids.
tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1], reverse=True)
print([f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[:12]])

In [None]:
from tqdm.auto import tqdm
# `length` is the number of examples `batch_iterator` draws per training run.
length = 10000

# Stream the training corpus and keep ONE shared iterator over it, so
# successive `batch_iterator()` calls continue where the previous one stopped.
dataset_name = 'transformersbook/codeparrot-train'
dataset = load_dataset(
    path=dataset_name,
    split="train",
    streaming=True,
)
iter_dataset = iter(dataset)
def batch_iterator(batch_size=10):
    """Yield lists of up to `batch_size` file contents from the shared stream.

    Reads the module-level `iter_dataset` and `length`: it makes
    `length / batch_size` steps (shown on a tqdm progress bar) and takes the
    'content' field of the next `batch_size` examples on each step.

    If the stream runs out, the last batch may be shorter and the generator
    then stops. (Calling `next()` inside a comprehension within a generator
    would instead surface exhaustion as a RuntimeError.)

    Args:
        batch_size: number of examples per yielded list.

    Yields:
        A non-empty list of 'content' values.
    """
    from itertools import islice

    for _ in tqdm(range(0, length, batch_size)):
        batch = [example['content'] for example in islice(iter_dataset, batch_size)]
        if not batch:
            # Stream exhausted: stop cleanly instead of raising.
            return
        yield batch

# Train a new tokenizer with the existing tokenizer's pipeline on the
# streamed batches, seeding its alphabet with the byte-level base vocabulary.
new_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=12500,
    initial_alphabet=base_vocab,
)


In [None]:
# Vocabulary entries of the new tokenizer ordered by ascending token id.
tokens = sorted(new_tokenizer.vocab.items(), key=lambda entry: entry[1])

# Entries 257..279 in id order.
early_tokens = [f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[257:280]]
print(early_tokens)

# The twelve entries with the highest ids.
late_tokens = [f'{new_tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[-12:]]
print(late_tokens)

# Tokenization of the sample snippet with the new tokenizer.
print(new_tokenizer(python_code).tokens())

In [None]:
import keyword

print(f'There are in total {len(keyword.kwlist)} Python keywords.')

# Look the vocabulary up once instead of on every loop iteration.
new_vocab = new_tokenizer.vocab
for keyw in keyword.kwlist:
    if keyw not in new_vocab:
        print(f'No, keyword `{keyw}` is not in the vocabulary')


In [None]:
# Second run: raise the number of examples `batch_iterator` draws and train
# a tokenizer with a larger target vocabulary.
length = 200000
new_tokenizer_larger = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=32768,
    initial_alphabet=base_vocab,
)


In [None]:
# Vocabulary entries of the larger tokenizer ordered by ascending token id.
tokens = sorted(new_tokenizer_larger.vocab.items(), key=lambda entry: entry[1])

# The twelve entries with the highest ids.
late_tokens = [f'{tokenizer.convert_tokens_to_string(t)}' for t, _ in tokens[-12:]]
print(late_tokens)

# Tokenization of the sample snippet with the larger tokenizer.
print(new_tokenizer_larger(python_code).tokens())

In [None]:
# Report every Python keyword that is missing from the larger tokenizer's
# vocabulary. The vocabulary is looked up once, outside the loop.
larger_vocab = new_tokenizer_larger.vocab
for keyw in keyword.kwlist:
    if keyw not in larger_vocab:
        print(f'No, keyword `{keyw}` is not in the vocabulary')

# Recorded output of the original run:
#   No, keyword `nonlocal` is not in the vocabulary


# 从头开始训练一个模型

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# NOTE(review): `model_ckpt` is not defined anywhere earlier in this notebook.
# The value below is an assumption (it matches the `transformersbook/...`
# naming used for the datasets above) - point it at the repository or
# directory where your trained tokenizer was saved.
model_ckpt = "transformersbook/codeparrot"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Freshly initialised (untrained) model with the GPT-2 XL configuration,
# with the vocabulary size taken from the tokenizer.
config = AutoConfig.from_pretrained("gpt2-xl", vocab_size=len(tokenizer))
model = AutoModelForCausalLM.from_config(config)
print(f'GPT-2 (xl) size: {model_size(model)/1000**2:.1f}M parameters')

# Same for the base GPT-2 configuration; the tokenizer loaded above is reused.
config_small = AutoConfig.from_pretrained("gpt2", vocab_size=len(tokenizer))
model_small = AutoModelForCausalLM.from_config(config_small)
print(f'GPT-2 size: {model_size(model_small)/1000**2:.1f}M parameters')

# Formula for the size of the character buffer used by the data loader later
# in the notebook (kept as a comment: these names are not variables here):
#   input_characters = number_of_sequences * sequence_length * characters_per_token


In [None]:
# Estimate the average number of characters per token over the first
# `examples` files of the streamed training split.
examples, total_characters, total_tokens = 500, 0, 0
dataset = load_dataset('transformersbook/codeparrot-train', split='train', streaming=True)
# zip(range(examples), ...) stops the otherwise unbounded stream after `examples` items.
for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):
    total_characters += len(example['content'])
    total_tokens += len(tokenizer(example['content']).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)
# Recorded output of the original run: 3.6233025034779565


In [None]:
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    """Iterable dataset that yields fixed-length tensors of token ids.

    Texts are read from `dataset` (each item must have a 'content' entry)
    into a character buffer, tokenized in one call, concatenated with
    `tokenizer.eos_token_id` after every text, and cut into chunks of exactly
    `seq_length` ids. A trailing chunk shorter than `seq_length` is dropped.

    When the underlying dataset is exhausted it is restarted, so iteration
    over a non-empty, re-iterable dataset never ends; the consumer must stop.
    Iteration ends on its own only when a fresh pass over `dataset` yields no
    example at all (e.g. an empty dataset).

    Args:
        tokenizer: callable invoked as `tokenizer(list_of_str,
            truncation=False)` returning a mapping with 'input_ids' (one list
            of ids per text); must expose `eos_token_id`.
        dataset: iterable of mappings with a 'content' string.
        seq_length: number of token ids per yielded tensor.
        num_of_sequences: together with `seq_length` and `chars_per_token`,
            sets how many characters are buffered before tokenizing.
        chars_per_token: estimated average number of characters per token.
    """

    def __init__(self, tokenizer, dataset, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Number of characters to collect before each tokenization call.
        self.input_characters = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        # True once the current pass over the dataset has produced an example;
        # used to tell "end of pass" apart from "nothing to read at all".
        seen_example = False
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m = f"Buffer full:{buffer_len}>={self.input_characters:.0f}"
                    print(m)
                    break
                try:
                    m = f'Fill buffer:{buffer_len}<{self.input_characters:.0f}'
                    print(m)
                    buffer.append(next(iterator)['content'])
                    buffer_len += len(buffer[-1])
                    seen_example = True
                except StopIteration:
                    if not seen_example:
                        # A fresh pass yielded nothing: stop instead of
                        # restarting forever.
                        more_examples = False
                        break
                    iterator = iter(self.dataset)
                    seen_example = False
            if not buffer:
                # Nothing was collected, so there is nothing to tokenize.
                break
            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs['input_ids']:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i:i + self.seq_length]
                # Only full-length chunks are emitted.
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [None]:
# Draw five sequences from a shuffled stream and check that they all have
# the same length.
shuffled_dataset = dataset.shuffle(buffer_size=100)
constant_length_dataset = ConstantLengthDataset(
    tokenizer,
    shuffled_dataset,
    num_of_sequences=10,
)
dataset_iterator = iter(constant_length_dataset)

lengths = []
# zip with range(5) stops after five items without pulling a sixth.
for _, sequence in zip(range(5), dataset_iterator):
    lengths.append(len(sequence))
print(f"Lengths of the sequences: {lengths}")

In [None]:
import torch
import torch.nn.functional as F
from datasets import load_dataset
from accelerate import Accelerator
# Schematic training loop using Accelerate. `accelerator.prepare` takes care
# of device placement for the model, optimizer and dataloader, so there are
# no manual `.to(device)` calls, and the backward pass goes through
# `accelerator.backward` exactly once per step.
accelerator = Accelerator()
# `device` is kept as a name for any code that still refers to it.
device = accelerator.device

# NOTE(review): 'my_dataset' is a placeholder, and `torch.nn.Transformer`'s
# forward expects both a source and a target sequence - treat this cell as
# an illustration of the Accelerate API, not as a runnable recipe.
model = torch.nn.Transformer()
optimizer = torch.optim.Adam(model.parameters())
dataset = load_dataset('my_dataset')
data = torch.utils.data.DataLoader(dataset, shuffle=True)
model, optimizer, data = accelerator.prepare(model, optimizer, data)

model.train()
for epoch in range(10):
    for source, targets in data:
        optimizer.zero_grad()
        output = model(source)
        loss = F.cross_entropy(output, targets)
        accelerator.backward(loss)
        optimizer.step()


In [None]:
from argparse import Namespace
# Training hyperparameters, exposed as attributes of `args`.
# Commented parameters correspond to the small model
config = {
    "train_batch_size": 2,  # 12
    "valid_batch_size": 2,  # 12
    "weight_decay": 0.1,
    "shuffle_buffer": 1000,
    "learning_rate": 2e-4,  # 5e-4
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 750,  # 2000
    "gradient_accumulation_steps": 16,  # 1
    "max_train_steps": 50000,  # 150000
    "max_eval_steps": -1,
    "seq_length": 1024,
    "seed": 1,
    "save_checkpoint_steps": 50000,  # 15000
}
args = Namespace(**config)


In [None]:
import logging
import os

import datasets
import transformers
import wandb
from torch.utils.tensorboard import SummaryWriter


def setup_logging(project_name):
    """Configure logging and, on the main process, experiment tracking.

    Reads the module-level `accelerator` and `args`.

    Every process logs to `log/debug_<process_index>.log` and to the console.
    The main process additionally starts a Weights & Biases run, creates a
    TensorBoard writer, records the hyperparameters in it, and switches the
    `datasets`/`transformers` loggers to verbose levels; all other processes
    only report errors.

    Args:
        project_name: project name passed to `wandb.init`.

    Returns:
        A tuple `(logger, tb_writer, run_name)`. On non-main processes
        `tb_writer` is None and `run_name` is an empty string.
    """
    logger = logging.getLogger(__name__)
    # FileHandler does not create missing directories.
    os.makedirs("log", exist_ok=True)
    logging.basicConfig(
        format='%(asctime)s-%(levelname)s-%(name)s-%(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
        handlers=[
            logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
            logging.StreamHandler(),
        ],
    )
    if accelerator.is_main_process:
        wandb.init(project=project_name, config=args)
        run_name = wandb.run.name
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {'0': 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ''
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name


In [None]:
def log_metrics(step, metrics):
    """Log `metrics` for `step` to the logger and, on the main process, to
    Weights & Biases and TensorBoard.

    Reads the module-level `logger`, `accelerator`, `wandb` and `tb_writer`
    (presumably the values returned by `setup_logging` - confirm against the
    calling cell).

    Args:
        step: training step the metrics belong to.
        metrics: mapping of metric name to scalar value.
    """
    logger.info(f"Step {step}: {metrics}")
    if accelerator.is_main_process:
        wandb.log(metrics)
        for k, v in metrics.items():
            tb_writer.add_scalar(k, v, step)
