## Train a word-level GPT on some text data

The inputs here are plain text files, which we split into words (using jieba) and then train a GPT on.

In [None]:
import shutil
import os
# Start from a clean working directory. Each leftover is removed on its own so
# that a missing ./input.txt can no longer cause the stale ./mingpt directory to
# be left behind (which would make the copytree() below fail). Only "not there"
# is tolerated; any other failure (e.g. permissions) is reported.
try:
    os.remove('./input.txt')
except FileNotFoundError:
    pass
try:
    shutil.rmtree('./mingpt')
except FileNotFoundError:
    pass

# Bring the corpus and the mingpt package next to the notebook.
shutil.copy('../input/qbqljl/input.txt', './input.txt')
shutil.copytree('../input/qbqljl/mingpt', './mingpt')

In [1]:
# set up logging
import logging

# Configure the root logger once: INFO and above, each record rendered as
# "MM/DD/YYYY HH:MM:SS - LEVEL - logger name -   message".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

In [2]:
# make deterministic
# NOTE(review): set_seed is provided by the mingpt package copied in above;
# presumably it seeds the random number generators used during training --
# confirm in mingpt/utils.py. 42 is simply a fixed seed value.
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset
import jieba

class WordDataset(Dataset):
    """Word-level dataset that serves random fixed-length windows of one text.

    The text is tokenized into words, the ``topK`` most frequent words get ids
    ``1..topK`` and every other word is replaced by ``'<unk>'`` (id 0).

    Attributes:
        stoi: dict mapping word -> integer id (``'<unk>'`` is 0).
        itos: dict mapping integer id -> word (inverse of ``stoi``).
        block_size: number of tokens in each input sequence.
        vocab_size: number of distinct ids, i.e. ``len(stoi)`` including
            ``'<unk>'``, so every id satisfies ``0 <= id < vocab_size``.
        data: the tokenized text as a list of words, rare words already
            replaced by ``'<unk>'``.
    """

    def __init__(self, data, block_size, topK=3000, tokenizer=None):
        """Tokenize ``data`` and build the vocabulary.

        Args:
            data: the raw text (str).
            block_size: length of the input sequences returned by indexing.
            topK: maximum number of real words kept in the vocabulary.
            tokenizer: optional callable ``str -> iterable of str``. Defaults
                to ``jieba.lcut``.
        """
        # Function-scope import: the cell's import list does not include it.
        from collections import Counter

        if tokenizer is None:
            tokenizer = jieba.lcut
        words = list(tokenizer(data))

        # Counter keeps first-seen order and most_common() is a stable sort, so
        # words with equal counts are ranked by first appearance. This makes the
        # vocabulary (and which words fall below the topK cut) reproducible
        # instead of depending on set iteration order.
        word_stats = Counter(words)

        # reserve for unknown
        stoi = { '<unk>': 0 }
        itos = { 0 : '<unk>' }

        for key, _ in word_stats.most_common():
            if len(stoi) > topK:
                break
            if key in stoi:
                # A literal '<unk>' token in the text must not steal a second id.
                continue
            index = len(stoi)
            stoi[key] = index
            itos[index] = key

        # Everything that did not make it into the vocabulary becomes '<unk>'.
        words = [w if w in stoi else '<unk>' for w in words]

        # Count the reserved '<unk>' slot too: ids run from 0 to len(stoi) - 1,
        # so anything sized by vocab_size must have room for the largest id.
        vocab_size = len(stoi)

        print('data has %d words, %d unique words.' % (len(words), vocab_size - 1))

        self.stoi = stoi
        self.itos = itos
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = words

    def __len__(self):
        """Number of samples per epoch: non-overlapping windows, rounded up."""
        return math.ceil(len(self.data) / (self.block_size + 1))

    def __getitem__(self, idx):
        """Return one ``(x, y)`` pair of ``torch.long`` tensors of length ``block_size``.

        ``idx`` is ignored: the window start is drawn at random. ``y`` is ``x``
        shifted one token to the right (next-word targets).

        Raises:
            ValueError: if the text has fewer than ``block_size + 1`` words.
        """
        span = self.block_size + 1
        # Valid starts are 0 .. len - span inclusive; randint's upper bound is
        # exclusive, hence the + 1 (otherwise the final window is unreachable).
        n_starts = len(self.data) - span + 1
        if n_starts <= 0:
            raise ValueError(
                'data has %d words but at least %d are needed for block_size=%d'
                % (len(self.data), span, self.block_size))
        # we're actually going to "cheat" and pick a spot in the dataset at random
        i = np.random.randint(0, n_starts)
        chunk = self.data[i:i + span]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [5]:
# Context length in tokens (words). Passed to WordDataset below and, through
# train_dataset.block_size, to the GPT config; also used to compute final_tokens.
block_size = 32 # spatial extent of the model for its context

In [6]:
# Decode explicitly instead of relying on the platform's default encoding, so
# the corpus is read the same way on every machine.
# NOTE(review): assumes input.txt is UTF-8 encoded -- confirm against the data.
with open('input.txt', 'r', encoding='utf-8') as f:
    train_dataset = WordDataset(f.read(), block_size)

data has 833035 characters, 4229 unique.


In [7]:
# Build the model from the dataset's own vocabulary size and context length so
# that the two always agree.
# NOTE(review): GPT1Config presumably selects a GPT-1-sized preset -- confirm in mingpt/model.py.
from mingpt.model import GPT, GPT1Config
mconf = GPT1Config(train_dataset.vocab_size, train_dataset.block_size)
model = GPT(mconf)

08/19/2020 13:29:49 - INFO - mingpt.model -   number of parameters: 2.961613e+07


In [None]:
from mingpt.trainer import Trainer, TrainerConfig

# Values that appear in more than one setting below are bound once, so the
# schedule lengths cannot drift out of sync with the epoch count / batch size.
max_epochs = 200
batch_size = 512

# initialize a trainer instance and kick off training
# warmup_tokens spans 20 batches' worth of samples; final_tokens is the total
# number of tokens seen over all epochs (samples per epoch * tokens per sample).
tconf = TrainerConfig(max_epochs=max_epochs, batch_size=batch_size, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=batch_size*20,
                      final_tokens=max_epochs*len(train_dataset)*block_size,
                      num_workers=4)
# NOTE(review): the third positional argument is None -- presumably "no held-out
# dataset"; confirm against mingpt/trainer.py.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

In [None]:
# sampling: let the trained model continue a short prompt
from mingpt.utils import sample

# Tokenize the prompt the same way the training text was tokenized.
context = jieba.lcut('草')

# Words missing from the vocabulary map to id 0 (the '<unk>' entry).
prompt_ids = [train_dataset.stoi.get(s, 0) for s in context]

# Add a leading batch dimension of size 1 and move the prompt to the trainer's device.
x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)

# NOTE(review): 2000 is presumably the number of tokens to generate -- confirm in mingpt/utils.py.
y = sample(model, x, 2000, temperature=0.9, sample=True, top_k=5)[0]

# Map ids back to words and glue them together without separators.
completion = ''.join(train_dataset.itos[int(i)] for i in y)
print(completion)

In [None]:
# well that was fun

# Store the model and the dataset (which carries the stoi/itos vocabulary)
# together as one tuple in model.pkl.
# NOTE(review): whole objects are pickled here, so loading the file presumably
# needs the mingpt package importable and WordDataset defined -- confirm.
torch.save((model, train_dataset), 'model.pkl')