In [2]:
from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset, merge_datasets
from aitextgen.utils import build_gpt2_config
from aitextgen.tokenizers import train_tokenizer

## Training the Tokenizer

In [None]:
# Tokenizer training is intentionally left disabled here. The cells below load
# "./aitextgen.tokenizer.json", so that file must already exist on disk.
# NOTE(review): presumably re-enabling this call regenerates that file from
# 'merged_large_bal.csv' — confirm before running on a fresh checkout.
#train_tokenizer('merged_large_bal.csv')

## Specify a Model Configuration

# GPT-2 hyperparameters for the custom model; this object is handed to
# aitextgen(config=...) further down.
# NOTE(review): vocab_size=50_000 should agree with the vocabulary of the
# tokenizer file loaded below; the disabled train_tokenizer call passes no
# explicit vocab size — confirm the two match.
# NOTE(review): n_embd=256 with n_head=32 would give 8 dimensions per head if
# heads split n_embd evenly — confirm this is intended.
config = build_gpt2_config(
    vocab_size=50_000,
    max_length=1024,
    dropout=0.1,
    n_embd=256,
    n_layer=12,
    n_head=32,
)
# Bare last expression so the notebook displays the resulting configuration.
config

In [3]:
import os

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment before
# the model and tokenizer are created in the cells that follow.
# NOTE(review): presumably this is here to silence the Hugging Face tokenizers
# parallelism/fork warning — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

## Instantiating Your Custom GPT-2 Model

# Build the model from `config` (not from a saved model folder), pair it with
# the tokenizer file in the working directory, and request GPU placement.
ai = aitextgen(
    config=config,
    tokenizer_file="./aitextgen.tokenizer.json",
    to_gpu=True,
)

In [None]:
# Alternative to the config-based cell: load a previously saved model from
# ./trained_model instead. This rebinds `ai`, replacing any model created
# earlier in the session.
# NOTE(review): this needs ./trained_model to exist already, so on a fresh
# Run All it will only work after a prior training run — confirm intent.
ai = aitextgen(
    model_folder="./trained_model",
    tokenizer_file="./aitextgen.tokenizer.json",
    to_gpu=True,
)

In [None]:
# Quick smoke test: request two samples using the default generation settings.
ai.generate(n=2)

In [None]:
# Short continuation of the prompt 'e4 c5', limited to max_length=20 and
# sampled with top-k / top-p filtering.
ai.generate(
    prompt='e4 c5',
    max_length=20,
    top_k=40,
    top_p=0.6,
)

## Train GPT-2

Important parameters for `train()`:

- **`line_by_line`**: Set this to `True` if the input text file is a single-column CSV, with one record per row. aitextgen will automatically process it optimally.
- **`from_cache`**: If you compressed your dataset locally (as noted in the previous section) and are using that cache file, set this to `True`.
- **`num_steps`**: Number of steps to train the model for.
- **`generate_every`**: Interval of steps to generate example text from the model; good for qualitatively validating training.
- **`save_every`**: Interval of steps to save the model: the model will be saved in the VM to `/trained_model`.
- **`save_gdrive`**: Set this to `True` to copy the model to a unique folder in your Google Drive, provided you mounted it in an earlier cell.
- **`batch_size`**: Batch size of the model training; setting it too high will cause the GPU to go OOM. _Unlike finetuning, since you are using a small model, you can massively increase the batch size to normalize the training_.
- **`fp16`**: Enables half-precision training for faster/more memory-efficient training. Only works on a T4 or V100 GPU.

Here are other important parameters for `train()` that are useful but you likely do not need to change.

- **`learning_rate`**: Learning rate of the model training.


In [None]:
# Train on '4.csv'. Parameter meanings follow the list in the markdown above.
# NOTE(review): batch_size=2 is small relative to the guidance above that a
# small model can use a much larger batch — confirm this is a memory limit.
ai.train(
    '4.csv',
    line_by_line=True,     # input is treated as one record per CSV row
    from_cache=False,      # read the raw file, not a compressed cache
    num_steps=50_000,      # total training steps
    generate_every=1_000,  # print sample text at this step interval
    save_every=1_000,      # save the model at this step interval
    save_gdrive=False,     # do not copy the model to Google Drive
    learning_rate=1e-3,
    batch_size=2,
)


## Load a Trained Model

In [4]:
# Reload the saved model from the trained_model folder and request GPU placement.
# NOTE(review): the tokenizer is read from inside trained_model/ here, whereas
# the earlier load cell uses ./aitextgen.tokenizer.json — confirm both paths
# point at the same tokenizer.
ai = aitextgen(
    model_folder="trained_model",
    tokenizer_file="trained_model/aitextgen.tokenizer.json",
    to_gpu=True,
)

In [8]:
# Longer sample from the reloaded model, seeded with the 'e4 : ' prompt prefix
# that also begins each `input` value in the CSV preview shown further down.
ai.generate(
    prompt='e4 : ',
    max_length=512,
    temperature=0.7,
    top_k=100,
    top_p=0.9,
)

[1me4 : [0mUnion[Anion]:
    query = {}
    for anion, obj in anions.items():
        query[Anion] = anion
    return query


In [6]:
import pandas as pd
# Read chess_data/39.csv into a DataFrame for inspection.
# NOTE(review): the saved output below lists an "Unnamed: 0" column, which
# usually means the CSV was written together with its index; consider
# index_col=0 if nothing later relies on that column — confirm.
test = pd.read_csv('chess_data/39.csv')
# Bare last expression so the notebook renders the frame.
test

Unnamed: 0,input,target
0,e4 : d4 Nf6 Nf3 e6 Bf4 b6 e3 Bb7 Be2 Nh5 h3 Nx...,Rc7 Ra1 Ra7 a2 0-1
1,e4 : e4 Nf6 Nc3 d5 exd5 Nxd5 Bc4 Nb6 Bb3 c5 d3...,h6 Qg4 Nf6 Qf4 Ne7 Nd6 Qb8 Nxb7 Qxb7 Bb3 Nfd5 ...
2,e4 : c4 e5 g3 Nf6 Bg2 c6 Nf3 e4 Nd4 Qb6 Nc2,a5 O-O Na6 Nc3 d5 d3 exd3 exd3 dxc4 Re1+ Be6 d...
3,e4 : d4 d5 c4 Nc6 Nf3 Bg4 Nc3 e6 a3 Nf6 Bg5 Be...,Bxe7 Qxe7 h3 Nxc3 bxc3 Bxf3 Qxf3 O-O cxd5 exd5...
4,e4 : c4 c6 g3 d5 Nf3 Nf6 Bg2 Bf5 cxd5 Nxd5 O-O...,Bc4+ Kg1 Be3+ Kh2 fxe5 Nd7+ Ke7 Nxe5+ Ke6 Nxc4...
...,...,...
191178,e4 : e4 c5 Nf3 d6 d4 cxd4 Nxd4 Nf6 Nc3 g6 f4 B...,Re2 Kg7 Bd2 Bxd2 Rxd2 Rc8 Rf4 Rac7 Bxa6 Ra8 Bc...
191179,e4 : e4 d5 e5 c5 f4 Nc6 Nf3 Bg4 Be2 e6 O-O Nge...,Nd8 Rxb7 Nxb7 Rxb7 Ne7 Bb4 Qe8 Bxe7+ Kg6 Qe2 Q...
191180,e4 : d4 Nf6 Nf3 g6 c4 Bg7 Nc3 d5 cxd5 Nxd5 Bd2...,Bxg7 Kxg7 Be2 Qg5 g3 c6 Qd4+ Kg8 h4 Qe7 O-O Bh...
191181,e4 : c4 e6 g3 d5 Bg2 dxc4 Nf3 c5,O-O Nc6 Na3 Nf6 Nxc4 Be7 d3 O-O Be3 Nd5 Bd2 b6...
