In [1]:
from datasets import load_dataset
from huggingface_hub import login
from dotenv import load_dotenv
import os
from src.tokenizer.tokenizer import ChessTokenizer


# Read variables from a local .env file into the process environment so the
# Hugging Face token never has to be written into the notebook itself.
load_dotenv()

# Authenticate against the Hugging Face Hub with the HF_TOKEN environment variable.
login(token=os.environ.get("HF_TOKEN"))


  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\James\.cache\huggingface\token
Login successful


In [2]:
# Fixed seed so the train/test partition is identical on every Restart & Run All;
# without it the held-out games change between runs and eval losses are not comparable.
SPLIT_SEED = 42

# Load the dataset from the Hub and drop rows whose "moves" field is missing.
raw_dataset = load_dataset("jimbowyer123/chessformers")
games_with_moves = raw_dataset.filter(lambda x: x["moves"] is not None)

# Hold out 0.05% of the "train" split for evaluation.
dataset = games_with_moves["train"].train_test_split(test_size=0.0005, seed=SPLIT_SEED)
tokenizer = ChessTokenizer()


def tokenize_moves(sample):
    """Tokenize the ``moves`` field of a single dataset row.

    Called by ``dataset.map`` with ``batched=False``, so ``sample`` is one row.

    Args:
        sample: A dataset row containing a ``"moves"`` entry.

    Returns:
        Whatever ``tokenizer`` returns for that row's moves.
    """
    # NOTE(review): with standard Hugging Face tokenizers, return_tensors="pt" on a
    # single sample adds a leading batch dimension; confirm the output shape of
    # ChessTokenizer is what the data collator expects.
    return tokenizer(sample["moves"], return_tensors="pt")


# Replace the raw "moves" column with the tokenizer output in both splits.
dataset = dataset.map(tokenize_moves, batched=False, remove_columns=["moves"])


Map: 100%|██████████| 1761/1761 [00:00<00:00, 3745.94 examples/s]
Map: 100%|██████████| 1762/1762 [00:00<00:00, 3647.19 examples/s]


In [3]:
from transformers import LlamaForCausalLM, LlamaConfig
import torch

# Initialize the Llama configuration with the tokenizer's vocabulary size.
#
# Dropout: LlamaConfig does not define the BERT-style `hidden_dropout_prob` /
# `attention_probs_dropout_prob` options. Passed as extra keyword arguments they are
# accepted but never read by the Llama layers, so no dropout is applied.
# The Llama option is `attention_dropout`; there is no hidden-state dropout setting.
config = LlamaConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    attention_dropout=0.1,
    max_position_embeddings=tokenizer.max_length,
)


# Create a new, randomly initialised Llama causal language model from the custom configuration
model = LlamaForCausalLM(config)

# Count the number of trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")

# Show the parameter dtype the model was created with.
print(model.dtype)


Number of trainable parameters: 113374464
torch.bfloat16


In [4]:
from transformers import DataCollatorForLanguageModeling

# Batch collator built around the chess tokenizer; `mlm=False` turns off
# masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [5]:
from transformers import Trainer, TrainingArguments

import wandb

# Close any run left open by an earlier execution of this cell, then authenticate
# and open a fresh run for this training session.
wandb.finish()
wandb.login()

wandb.init(project="chess-training")

# NOTE(review): `evaluation_strategy` is the older name of this argument; newer
# transformers releases call it `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1500,
    save_steps=1500,
    save_total_limit=2,
    logging_dir="logs",
    logging_steps=100,
    report_to="wandb",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

train_result = trainer.train()

# Checkpoints are written only every `save_steps` optimiser steps, so unless the run
# length is an exact multiple of that, the weights from the final stretch of training
# are never checkpointed. Persist the final model explicitly into `output_dir`.
trainer.save_model()

# Keep the training summary as the cell's displayed value.
train_result




[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbower-james1996[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2643 [00:00<?, ?it/s]