## Fine-Tuning GPT2 to generator Chinese poetry

## Setup

In [1]:
import torch
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline, GPT2Config
from torch.utils.data import RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import pandas as pd
import random
from torch.optim import AdamW
import datetime
import time

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device {device}")

model_name = "uer/gpt2-distil-chinese-cluecorpussmall"  
model_save_path = './model'

device cuda


## Quick Test

In [3]:
configuration = GPT2Config.from_pretrained(model_name)

model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

tokenizer = BertTokenizer.from_pretrained(model_name)

input_sequence = "深度学习"
input_ids = tokenizer.encode(input_sequence, return_tensors='pt')

model = model.to(device)
#combine both sampling techniques
sample_outputs = model.generate(
                              input_ids.to(device),
                              do_sample = True,
                              max_length = 120,
                              top_k = 50,
                              top_p = 0.85,
                              num_return_sequences = 3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}...".format(i, tokenizer.decode(sample_output, skip_special_tokens = True)))
    print('  ---')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Output:
----------------------------------------------------------------------------------------------------
0: 深 度 学 习.? 大 一 新 生 大 一 新 生 大 一 新 生 ， 大 一 刚 入 学 ， 目 前 学 的 是 计 算 机 。 最 近 在 用 python 做 的 一 些 应 用 ， 对 于 我 来 说 很 快 的 就 能 用 。 想 用 python 做 一 些 小 游 戏 ， 不 过 对 于 做 了 一 个 月 左 右 就 可 以 做 的 很 熟 练 了 ， 然 后 再 自 学 一 些 语 法 。 因 为 在 自 学 过 程 中 我 发 现 了 很 多 可 能...
  ---
1: 深 度 学 习 有 点 深 了 但 是 有 点 大 有 时 候 会 把 所 有 的 知 识 点 都 记 进 去 ~ ~ 有 时 候 不 知 道 记 起 来 会 不 会 忘 记 ~ ~ ~ ~ ~ ~ ~ 有 时 候 记 起 来 会 记 不 得 东 西 啊 ~ ~ ~ ~ ~ ~ ~ 。 有 时 候 记 起 来 会 忘 掉 的 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 。 有 时 候 记 得 会 忘 掉...
  ---
2: 深 度 学 习 还 可 以 这 本 书 的 内 容 不 是 很 深 但 也 不 是 说 很 难 但 是 看 完 了 一 遍 以 后 你 会 发 现 有 不 少 地 方 还 是 蛮 不 错 的 。 这 个 作 者 的 作 品 里 面 我 还 可 以 看 到 了 很 多 相 关 的 知 识 在 这 个 学 习 的 过 程 中 你 会 发 现 很 多 不 同 的 知 识 。 希 望 大 家 能 给 自 己 一 些 帮 助 。 。 。...
  ---


## Prepare Data Set

In [4]:
with open('./poetry.txt', 'r') as f:
    poetry_corpus = f.read()
print(poetry_corpus[:100])

寒随穷律变，春逐鸟声开。
初风飘带柳，晚雪间花梅。
碧林青旧竹，绿沼翠新苔。
芝田初雁去，绮树巧莺来。
晚霞聊自怡，初晴弥可喜。
日晃百花色，风动千林翠。
池鱼跃不同，园鸟声还异。
寄言博通者，知予物


In [5]:
poetry_corpus = poetry_corpus.replace('\n', '').replace('\r', '').replace('。', '')
print(poetry_corpus[:100])

寒随穷律变，春逐鸟声开初风飘带柳，晚雪间花梅碧林青旧竹，绿沼翠新苔芝田初雁去，绮树巧莺来晚霞聊自怡，初晴弥可喜日晃百花色，风动千林翠池鱼跃不同，园鸟声还异寄言博通者，知予物外志一朝春夏改，隔夜鸟花迁阴


In [6]:
def str_to_list(s, n=11):
  """将字符串s分割成长度为n的子字符串列表
  Args:
    s: 要分割的字符串
    n: 每个子字符串的长度
  Returns:
    分割后的子字符串列表
  """
  return [s[i:i+n] for i in range(0, len(s), n)]
text = poetry_corpus 
text = str_to_list(text)

In [7]:
print(len(text))
print(text[100])

72514
朱颜含远日，翠色影长津


In [8]:
def add_special_chars(list_of_strings, left_char, right_char):
  """
  在列表中每个字符串的开头和结尾添加特殊字符
  Args:
    list_of_strings: 字符串列表
    special_char: 要添加的特殊字符
  Returns:
    添加了特殊字符的新列表
  """
  return [left_char + s + right_char for s in list_of_strings]

text = add_special_chars(text, left_char='<|startoftext|>', right_char='<|endoftext|>')
print(text[100])

<|startoftext|>朱颜含远日，翠色影长津<|endoftext|>


## GPT2 Tokenizer

In [9]:
tokenizer = BertTokenizer.from_pretrained(model_name,bos_token='<|startoftext|>',eos_token='<|endoftext|>',unk_token='<|unknown|>',pad_token='<|pad|>')

In [10]:
vocab_list = sorted(tokenizer.vocab.items(), key=lambda x:x[1])
print(len(vocab_list))

21128


In [11]:
print("The max model length is {} for this model".format(tokenizer.model_max_length))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The beginning of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The unknown token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.unk_token_id), tokenizer.unk_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

The max model length is 1024 for this model
The end of sequence token <|endoftext|> has the id 21129
The beginning of sequence token <|startoftext|> has the id 21128
The unknown token <|unknown|> has the id 21130
The padding token <|pad|> has the id 21131


## PyTorch Datasets & Dataloaders

In [12]:
# GPT2 is a large model. Increasing the batch size above 2 has lead to out of memory problems.
batch_size = 16
max_length = 180  # maximum sentence length

from torch.utils.data import Dataset, DataLoader, random_split

class MyDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.texts = texts
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        # 使用tokenizer   
        encoding = self.tokenizer(text, truncation=True, return_tensors='pt')
        return encoding
        
dataset = MyDataset(text, tokenizer)

train_size = int(0.9 * len(dataset))

val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

65,262 training samples
7,252 validation samples


In [13]:
print(f"dataset size {dataset.__len__()}")
print(dataset[0])
print(f"dataset[0]: \n  input_ids: {dataset[0].input_ids}\n  attn_masks: {dataset[0].attention_mask}")

dataset size 72514
{'input_ids': tensor([[  101, 21128,  2170,  7390,  4956,  2526,  1359,  8024,  3217,  6852,
          7881,  1898,  2458, 21129,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
dataset[0]: 
  input_ids: tensor([[  101, 21128,  2170,  7390,  4956,  2526,  1359,  8024,  3217,  6852,
          7881,  1898,  2458, 21129,   102]])
  attn_masks: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [14]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

## Finetune GPT2 Language Model

In [15]:
configuration = GPT2Config.from_pretrained(model_name, output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)
model = model.to(device)
print(f"Weight shape {model.transformer.wte.weight.shape}")
# this step is necessary because I've added some tokens (bos_token, etc.) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))
print(f"Number of tokens: {len(tokenizer)}")

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Weight shape torch.Size([21128, 768])
Number of tokens: 21132


In [16]:
word_embeddings = model.transformer.wte.weight # Word Token Embeddings

print(word_embeddings.shape)

torch.Size([21132, 768])


In [17]:
epochs = 3
learning_rate = 2e-5
warmup_steps = 1e2
# The epsilon parameter eps = 1e-8 is “a very small number to prevent any division by zero in the implementation”
epsilon = 1e-8
# optim = Adam(model.parameters(), lr=5e-5)
optim = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [18]:
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

In [19]:
infer_max_length = 24

def infer(prompt):
    # input = f"<|startoftext|>{prompt.strip()}"
    # input = f"<|startoftext|> {prompt.strip()}"
    input = f"{prompt.strip()}"
    input = tokenizer(input, return_tensors="pt")
    input_ids      = input["input_ids"]
    attention_mask = input["attention_mask"]

    output = model.generate(input_ids.to(device),
                            attention_mask=attention_mask.to(device),
                            max_new_tokens=infer_max_length,
                            # temperature = 0.5,
                            do_sample = True, top_k = 50, top_p = 0.85)
                            # num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    return output

In [20]:
total_t0 = time.time()

training_stats = []

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()  # `train` just changes the *mode* (train vs. eval), it doesn't *perform* the training.

    for step, batch in enumerate(train_dataloader):     # step from enumerate() = number of batches

        b_input_ids = batch.input_ids.to(device)   # tokens (of multiple documents in a batch)
        b_labels    = batch.input_ids.to(device)
        b_masks     = batch.attention_mask.to(device)   # mask of [1] for a real word, [0] for a pad

        model.zero_grad()
        # loss = model(X.to(device), attention_mask=a.to(device), labels=X.to(device)).loss
        outputs = model(  input_ids = b_input_ids,
                          labels = b_labels,
                          attention_mask = b_masks,
                          token_type_ids = None
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % 500 == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_output = infer("")
            print(sample_output)

            # `train` just changes the *mode* (train vs. eval), it doesn't *perform* the training.
            model.train()

        loss.backward()
        optim.step()
        scheduler.step()

    # Calculate the average loss over all the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))


    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch.input_ids.to(device)
        b_labels = batch.input_ids.to(device)
        b_masks = batch.attention_mask.to(device)

        with torch.no_grad():

            outputs  = model(input_ids = b_input_ids,
                             attention_mask = b_masks,
                             labels = b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch   500  of  4,079. Loss: 4.796802520751953.   Elapsed: 0:00:10.
门 外 见 月 明 ， 何 处 何 处 有 扉 花


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 1,000  of  4,079. Loss: 4.5659708976745605.   Elapsed: 0:00:20.
闻 云 上 香 ， 何 人 去 多 年 相 惜 天 涯


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 1,500  of  4,079. Loss: 4.4266276359558105.   Elapsed: 0:00:30.
觉 天 堂 有 心 ， 天 地 万 里 开 啼 来 思 旧


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 2,000  of  4,079. Loss: 4.273077964782715.   Elapsed: 0:00:40.
去 到 白 发 白 ， 长 白 似 苍 苍


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 2,500  of  4,079. Loss: 4.059493541717529.   Elapsed: 0:00:49.
日 无 处 看 雨 ， 独 自 望 春 城


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 3,000  of  4,079. Loss: 4.371115684509277.   Elapsed: 0:00:59.
地 不 成 功 ， 风 雨 无 生 涯 前


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 3,500  of  4,079. Loss: 4.077295780181885.   Elapsed: 0:01:09.
闲 听 闻 此 地 ， 未 曾 发 一 人


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 4,000  of  4,079. Loss: 4.264497756958008.   Elapsed: 0:01:18.
闻 香 客 后 路 ， 犹 忆 去 时 情

  Average training loss: 4.47
  Training epoch took: 0:01:20

Running Validation...
  Validation Loss: 4.45
  Validation took: 0:00:03

Training...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch   500  of  4,079. Loss: 4.2762885093688965.   Elapsed: 0:00:10.
知 君 有 子 ， 知 君 为 此 悲 ， 何 处 惜 家 贫


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 1,000  of  4,079. Loss: 3.974090814590454.   Elapsed: 0:00:19.
秋 水 上 船 ， 飞 渡 出 山 村


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 1,500  of  4,079. Loss: 3.7294118404388428.   Elapsed: 0:00:29.
白 松 萝 径 ， 空 悬 落 日 舟


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 2,000  of  4,079. Loss: 4.095635890960693.   Elapsed: 0:00:38.
知 何 时 尽 ， 相 看 一 自 惊 归


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 2,500  of  4,079. Loss: 4.010121822357178.   Elapsed: 0:00:48.
花 满 树 枝 落 ， 不 如 白 云 飞


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 3,000  of  4,079. Loss: 3.901571750640869.   Elapsed: 0:00:58.
花 飘 雨 雪 ， 远 落 花 风


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 3,500  of  4,079. Loss: 3.830249309539795.   Elapsed: 0:01:07.
春 多 白 羽 ， 春 满 若 青 枫


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 4,000  of  4,079. Loss: 3.852318048477173.   Elapsed: 0:01:17.
知 何 去 岁 寒 ， 却 见 一 时 声

  Average training loss: 4.00
  Training epoch took: 0:01:19

Running Validation...
  Validation Loss: 4.15
  Validation took: 0:00:03

Training...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch   500  of  4,079. Loss: 4.047907829284668.   Elapsed: 0:00:10.
地 通 灵 泽 ， 时 时 解 帝 乡 思


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 1,000  of  4,079. Loss: 4.130640506744385.   Elapsed: 0:00:19.
日 下 人 心 静 ， 空 山 独 一 闻


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 1,500  of  4,079. Loss: 4.231122970581055.   Elapsed: 0:00:29.
知 尔 未 遂 至 ， 长 啸 不 相 关


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 2,000  of  4,079. Loss: 4.123702526092529.   Elapsed: 0:00:39.
不 足 日 ， 无 语 与 谁 同


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 2,500  of  4,079. Loss: 3.8870315551757812.   Elapsed: 0:00:48.
风 来 后 路 ， 千 里 故 乡 心


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 3,000  of  4,079. Loss: 4.07708740234375.   Elapsed: 0:00:58.
知 非 君 子 心 ， 犹 自 感 恩 悲


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 3,500  of  4,079. Loss: 3.8599071502685547.   Elapsed: 0:01:08.
去 年 秋 日 里 ， 江 树 绿 新 枝


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


  Batch 4,000  of  4,079. Loss: 3.9179937839508057.   Elapsed: 0:01:17.
地 是 乡 思 远 ， 愁 云 在 北 楼

  Average training loss: 3.88
  Training epoch took: 0:01:19

Running Validation...
  Validation Loss: 4.06
  Validation took: 0:00:03

Training complete!
Total training took 0:04:07 (h:mm:ss)


In [21]:
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.465735,4.445215,0:01:20,0:00:03
2,4.004158,4.153035,0:01:19,0:00:03
3,3.882568,4.055147,0:01:19,0:00:03


## Saving & Loading Fine-Tuned Model

In [22]:
print("Saving model to %s" % model_save_path)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

Saving model to ./model


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [24]:
model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(model_save_path)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21132, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768

## Generate Text

In [25]:
print(infer(""))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


有 佳 句 来 ， 空 馀 古 道 心
