In [1]:
%%capture
!uv pip install torch tokenizers numpy datasets transformers[torch] accelerate

In [2]:
import torch
from datasets import Dataset, load_from_disk
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)

In [3]:
# Probe for the MPS backend. `mps_device` stays None when it is unavailable
# so later cells can test for it.
mps_available = torch.backends.mps.is_available()
mps_device = torch.device("mps") if mps_available else None

if mps_device is None:
    print("MPS device not found.")
else:
    # Smoke test: allocate a one-element tensor on the device and show it.
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


In [4]:
# Locations of the on-disk inputs, relative to the notebook's working directory.
# DATASET_PATH is read with `load_from_disk`; TOKENIZER_PATH is passed to
# `PreTrainedTokenizerFast(tokenizer_file=...)`.
DATASET_PATH = "../.data/datasets/full/2025-01-20"
TOKENIZER_PATH = "../src/tokenizer/kn1ght-tokenizer.json"

In [5]:
# Load the saved dataset from DATASET_PATH.
raw_dataset = load_from_disk(DATASET_PATH)

# Show the available splits, their columns and row counts.
# NOTE(review): in the recorded output the "train" split is much smaller than
# "test" and "validation" -- confirm the splits were saved under the intended names.
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
        num_rows: 352349
    })
    test: Dataset({
        features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
        num_rows: 1585571
    })
    validation: Dataset({
        features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
        num_rows: 1585572
    })
})


In [6]:
# Wrap the tokenizer definition stored at TOKENIZER_PATH in a transformers fast tokenizer.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_PATH)

In [7]:
from kn1ght.constants import SPECIAL_TOKENS

# The tokenizer is built from the raw tokenizer file only, so its padding token
# is assigned here from the project's special-token table.
tokenizer.pad_token = SPECIAL_TOKENS["PAD"]

In [8]:
# Resolve the compute device: MPS when the probe cell found it, otherwise
# CUDA when available, otherwise CPU.
if mps_device is not None:
    device = mps_device
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data-processing settings.
block_size = 512
batch_size = 1000
num_proc = 4  # worker processes for `Dataset.map`
mlm = False  # forwarded to the data collator

# Model dimensions forwarded to GPT2Config.
n_embed = 768
n_layer = 12
n_head = 12

In [9]:
def sample_data(dataset: "Dataset", samples: int = 1000, seed: int = 1997) -> "Dataset":
    """Return a reproducible random subset of ``dataset``.

    The dataset is shuffled with a fixed seed and the leading rows of the
    shuffled result are kept, so repeated calls with the same arguments give
    the same subset.

    Args:
        dataset: Object supporting ``len()``, ``shuffle(seed=...)`` and
            ``select(indices)``.
        samples: Maximum number of rows to keep. Requests larger than the
            dataset are capped at ``len(dataset)`` instead of failing inside
            ``select`` with out-of-range indices.
        seed: Seed passed to ``shuffle``.

    Returns:
        Whatever ``dataset.select`` returns: the shuffled dataset restricted
        to at most ``samples`` rows.
    """
    # Cap the request so `select` never receives out-of-range indices.
    sample_count = min(samples, len(dataset))
    return dataset.shuffle(seed=seed).select(range(sample_count))

In [10]:
def tokenize(examples):
    """Tokenize the ``PGN`` column of a batch of examples.

    Sequences are truncated to ``block_size`` tokens so that no example can
    be longer than the model's positional-embedding table; without
    truncation a single long game would fail during training.

    Args:
        examples: Mapping with a ``"PGN"`` entry holding the text(s) to encode.

    Returns:
        The output of the notebook-level ``tokenizer``, including the special-tokens
        mask requested via ``return_special_tokens_mask=True``.
    """
    return tokenizer(
        examples["PGN"],
        truncation=True,
        max_length=block_size,
        return_special_tokens_mask=True,
    )

In [11]:
# Draw a fixed-seed sample of 11,000 games from the train split.
training_data = sample_data(raw_dataset["train"], samples=11000)

# Tokenize in batches across `num_proc` worker processes, dropping the
# metadata columns listed below ("PGN" is kept).
metadata_columns = ["Date", "Result", "WhiteElo", "BlackElo"]
tokenized_datasets = training_data.map(
    function=tokenize,
    remove_columns=metadata_columns,
    num_proc=num_proc,
    batched=True,
)

In [12]:
# Inspect the columns and row count produced by the tokenization step.
print(tokenized_datasets)

Dataset({
    features: ['PGN', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 11000
})


In [13]:
# Batch collator for language modelling; `mlm` is False in the
# hyperparameter cell, so masked-language-model collation is disabled.
collator_options = {"tokenizer": tokenizer, "mlm": mlm}
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [14]:
# Architecture for a GPT-2 model trained from scratch on the chess vocabulary.
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    # GPT2Config's hidden-size argument is spelled `n_embd`. The notebook
    # variable is `n_embed`; passing it under that name does not set the
    # model width, so changing the hyperparameter would have had no effect.
    n_embd=n_embed,
    n_layer=n_layer,
    n_head=n_head,
    # NOTE(review): these ids are hard-coded and assumed to match the
    # tokenizer's special-token ids -- confirm against the tokenizer file.
    bos_token_id=0,
    eos_token_id=1,
    unk_token_id=2,
    pad_token_id=3,
)

# Instantiate the model from the config only (no pretrained weights are loaded).
model = GPT2LMHeadModel(config)

In [15]:
training_args = TrainingArguments(
    # Checkpoint location; an existing directory is overwritten.
    output_dir="../.data/models/kn1ght/2025-01-20",
    overwrite_output_dir=True,
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Optimizer settings.
    learning_rate=5e-4,
    weight_decay=0.01,
    warmup_steps=1000,
    # Step-based evaluation, checkpointing and logging cadence.
    eval_strategy="steps",
    eval_steps=2000,
    save_steps=2000,
    logging_steps=500,
    report_to="none",
    # Reproducibility and numeric precision.
    seed=1997,
    fp16=False,
)

In [16]:
# Hold out 1,000 of the sampled games for evaluation. The split is seeded with
# the training seed so the same games land in the eval set on every run;
# without a seed the split is reshuffled each time the cell executes.
dataset = tokenized_datasets.train_test_split(test_size=1000, seed=training_args.seed)

# `train_test_split` always yields both a "train" and a "test" split.
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['PGN', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 10000
})
Dataset({
    features: ['PGN', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 1000
})


In [17]:
# Wire the model, training arguments, datasets and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the training loop configured by `training_args` (the long-running cell of this notebook).
trainer.train()

In [19]:
# Add up the element counts of every parameter tensor in the model.
model_size = 0
for parameter in model.parameters():
    model_size += parameter.numel()

# Report in millions of parameters, one decimal place.
print(f"Model size: {model_size / 1000**2:.1f}M parameters")

Model size: 89.0M parameters


In [None]:
from transformers import pipeline

from kn1ght.constants import SPECIAL_TOKENS

# Use the resolved `device` (MPS, else CUDA, else CPU) rather than
# `mps_device`, which is None on machines without MPS.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

# Prompt: the game-start token followed by the opening moves played so far.
prompt = f"{SPECIAL_TOKENS['START']}1.d4 d5 2.Nf3"

# Ask for exactly one new token. `max_new_tokens` does not depend on how the
# prompt is tokenized, unlike deriving a total `max_length` from a separate
# `tokenizer.encode` call.
output = pipe(
    prompt,
    max_new_tokens=1,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    truncation=True,
)

print(output)

Device set to use mps


[{'generated_text': '[g_start]1.d4 d5 2.Nf3 141.'}]


In [None]:
# A longer game prefix: the game-start token followed by 16 full moves and White's 17th.
prompt = f"{SPECIAL_TOKENS['START']}1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 e5 6.Ndb5 d6 7.Bg5 a6 8.Na3 b5 9.Nd5 Qa5+ 10.Bd2 Qd8 11.Nxf6+ Qxf6 12.Bd3 Be7 13.O-O Qg6 14.f4 exf4 15.Bxf4 Ne5 16.Kh1 O-O 17.Qe2"

# Ask for exactly one new token. `max_new_tokens` does not depend on how the
# prompt is tokenized, unlike deriving a total `max_length` from a separate
# `tokenizer.encode` call.
output = pipe(
    prompt,
    max_new_tokens=1,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    truncation=True,
)

print(output)

[{'generated_text': '[g_start]1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 e5 6.Ndb5 d6 7.Bg5 a6 8.Na3 b5 9.Nd5 Qa5+ 10.Bd2 Qd8 11.Nxf6+ Qxf6 12.Bd3 Be7 13.O-O Qg6 14.f4 exf4 15.Bxf4 Ne5 16.Kh1 O-O 17.Qe2 Rad2'}]


In [22]:
# Export the trained weights and config.
model.save_pretrained("../.data/models/out/kn1ght")
# Save the tokenizer (including the pad token assigned at runtime) next to the
# model so the exported directory can be loaded on its own.
tokenizer.save_pretrained("../.data/models/out/kn1ght")

In [23]:
# Sanity check: load the exported directory and display the architecture.
# `from_pretrained` is a classmethod that returns a new model; it does not
# modify the in-memory `model`, so it is called on the class.
GPT2LMHeadModel.from_pretrained("../.data/models/out/kn1ght")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(4096, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=4096, bias=False)
)

In [None]:
# Upload the in-memory model to the Hugging Face Hub repository.
# NOTE(review): only the model is pushed here; confirm whether the tokenizer
# should be uploaded to the same repo as well.
model.push_to_hub(repo_id="InterwebAlchemy/kn1ght")

model.safetensors:   0%|          | 0.00/356M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/InterwebAlchemy/kn1ght/commit/345038e7c382c79dde8c319180735d48029fcf05', commit_message='Upload model', commit_description='', oid='345038e7c382c79dde8c319180735d48029fcf05', pr_url=None, repo_url=RepoUrl('https://huggingface.co/InterwebAlchemy/kn1ght', endpoint='https://huggingface.co', repo_type='model', repo_id='InterwebAlchemy/kn1ght'), pr_revision=None, pr_num=None)