Adapted from https://github.com/minimaxir/gpt-2-simple

In [None]:
!pip install -q aitextgen
from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset, merge_datasets
from aitextgen.utils import build_gpt2_config
from aitextgen.tokenizers import train_tokenizer

## Training the Tokenizer

In [None]:
#train_tokenizer('../input/chess-data/cleaned_merged_chess_data/cleaned_merged_chess_data.csv')

## Specify a Model Configuration

In [None]:
#config = build_gpt2_config(vocab_size=2000, max_length=500, dropout=0.1, n_embd=256, n_layer=12, n_head=32)
#config

## Instantiating Your Custom GPT-2 Model

In [None]:
#ai = aitextgen(config=config,
#               tokenizer_file="./aitextgen.tokenizer.json",
#               to_gpu=True)

In [None]:
# Load the model stored under ../input/gpt-chess-v1 together with the
# tokenizer file that sits in the same folder.
ai = aitextgen(model_folder="../input/gpt-chess-v1",
               tokenizer_file="../input/gpt-chess-v1/aitextgen.tokenizer.json",
               to_gpu=True)

In [None]:
# Quick sanity check of the loaded model. The positional argument is
# presumably the number of samples to generate (n) -- confirm against the
# aitextgen generate() signature.
ai.generate(2)

## Train GPT-2

Important parameters for `train()`:

- **`line_by_line`**: Set this to `True` if the input text file is a single-column CSV, with one record per row. aitextgen will automatically process it optimally.
- **`from_cache`**: If you compressed your dataset locally (as noted in the previous section) and are using that cache file, set this to `True`.
- **`num_steps`**: Number of steps to train the model for.
- **`generate_every`**: Interval of steps to generate example text from the model; good for qualitatively validating training.
- **`save_every`**: Interval of steps to save the model: the model will be saved in the VM to `/trained_model`.
- **`save_gdrive`**: Set this to `True` to copy the model to a unique folder in your Google Drive, provided you mounted it in an earlier cell.
- **`batch_size`**: Batch size of the model training; setting it too high will cause the GPU to go OOM. _Unlike finetuning, since you are using a small model, you can massively increase the batch size to normalize the training_.
- **`fp16`**: Enables half-precision training for faster/more memory-efficient training. Only works on a T4 or V100 GPU.


In [None]:
# Train the loaded model on the CSV file below. With the settings used here,
# sample text is generated and the model is saved every 1,000 steps over a
# 50,000-step run; nothing is copied to Google Drive.
# NOTE(review): the file name contains "test" -- confirm this is the intended
# training set.
ai.train('../input/chess-data/cleaned_lichess08_test.csv',
         line_by_line=True,
         from_cache=False,
         num_steps=50_000,
         generate_every=1_000,
         save_every=1_000,
         save_gdrive=False,
         learning_rate=1e-3,
         batch_size=4,
         )

## Generate Text From The Trained Model

In [None]:
# Rebind `ai` to a model loaded from ../input/gpt-chess-v1 and its tokenizer.
# NOTE(review): this is the same folder the model was loaded from before
# training, not the directory the training step saves to -- confirm that
# generating from this checkpoint (rather than the freshly trained weights)
# is intended.
ai = aitextgen(model_folder="../input/gpt-chess-v1",
               tokenizer_file="../input/gpt-chess-v1/aitextgen.tokenizer.json",
               to_gpu=True)

Calling `generate()` without any parameters prints a single generated text from the loaded model to the console.

In [None]:
# Generate a short continuation (max_length of 10) of the prompt 'e4',
# restricting sampling with both top_k and top_p.
ai.generate(max_length = 10, prompt = 'e4', top_k = 40, top_p  = 0.6)

Other optional-but-helpful parameters for `ai.generate()`:

* **`max_length`**: Number of tokens to generate (default 256; you can generate up to 1024 tokens with GPT-2, but it will be _much_ slower)
* **`temperature`**: The higher the temperature, the crazier the text (default 0.7, recommended to keep between 0.7 and 1.0)
* **`top_k`**: Limits the generated guesses to the top *k* guesses (default 0 which disables the behavior; if the generated output is super crazy, you may want to set `top_k=40`)
* **`top_p`**: Nucleus sampling: limits the generated guesses to a cumulative probability. (gets good results on a dataset with `top_p=0.9`)

In [None]:
 # Generate one sample continuing the prompt "d4 g6" with temperature 0.7 and
 # both top_p and top_k sampling limits.
 # NOTE(review): batch_size=5 is larger than n=1 -- confirm generate()
 # accepts this combination.
 # NOTE(review): this statement starts with a stray one-space indent; a
 # notebook cell appears to tolerate it, but it would be an IndentationError
 # in a plain Python script.
 ai.generate(n=1,
            batch_size=5,
            prompt ="d4 g6",
            temperature = 0.7,
            top_p=0.9,
           top_k = 40)

In [None]:
from tqdm import tqdm

# How many output files to write for each prompt.
num_files = 1

# Opening prompts: the eight two-square pawn pushes on their own, followed by
# e4, d4 and c4 each paired with a selection of Black replies.
prompts = [
    "a4", "b4", "c4", "d4", "e4", "f4", "g4", "h4",
    'e4 c5', 'e4 e5', 'e4 e6', 'e4 c6', 'e4 d6', 'e4 d5', 'e4 g6', 'e4 Nf6',
    'd4 Nf6', 'd4 d5', 'd4 e6', 'd4 d6', 'd4 f5', 'd4 g6',
    'c4 c5', 'c4 e5', 'c4 e6', 'c4 Nf6', 'c4 f5', 'c4 g6', 'c4 c6',
]

print(len(prompts))

# Settings shared by every generate_to_file() call below.
sampling_options = dict(n=1000, batch_size=100, temperature=0.7, top_p=0.9)

# The progress bar advances once per prompt; each prompt yields `num_files`
# output files.
for prompt in tqdm(prompts):
    for _ in range(num_files):
        ai.generate_to_file(prompt=prompt, **sampling_options)

In [None]:
import os

# Entries of the working directory that are not generated sample files and
# must never be read back in. 'prompted_chess.csv' is the export written from
# `lines`; leaving it out of this set would make a re-run ingest its own output.
skip_names = {'result.txt', 'general_generation.csv', 'prompted_chess.csv'}

# Names of every entry in the current working directory.
list_of_files = os.listdir("./")

# One list per file that was read, holding every other line of that file.
lines = []
for file in list_of_files:
    # Ignore the known non-sample files and anything that is not a regular
    # file: calling open() on a directory raises IsADirectoryError.
    if file in skip_names or not os.path.isfile(file):
        continue
    # The context manager closes the handle even if reading fails.
    with open(file, "r") as f:
        inner = f.readlines()
    # Keep lines 0, 2, 4, ...; presumably the skipped odd lines are the
    # separators between samples -- confirm against the generated file format.
    lines.append(inner[::2])

In [None]:
import pandas as pd

# Each entry of `lines` becomes one row; transposing turns every source file
# into its own column. The frame is then written without the index column.
samples_by_file = pd.DataFrame(lines).transpose()
samples_by_file.to_csv('prompted_chess.csv', index=False)