In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd

# Load the tokenizer and the conditional-generation model from the same pretrained checkpoint.
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [2]:
# Read the processed sample of games (relative path; later cells use its 'PGN' column).
sample_csv_path = '../Data/Processed/cleaned_lichess08_sample.csv'
chess_data = pd.read_csv(sample_csv_path)

In [3]:
# Preview the move strings of the first five games.
chess_data['PGN'].head()

0    e4 g6 d4 Bg7 c3 d6 Qf3 Nf6 h3 O-O Bg5 Nbd7 Bc4...
1    d4 Nf6 c4 g6 a3 Bg7 Nf3 O-O Nc3 a5 Bf4 d6 e3 N...
2    e4 c5 Qf3 e5 Bc4 Nf6 Nh3 h6 g4 g5 Ng1 a6 h4 Bg...
3    e4 e5 Nf3 d6 Nc3 f5 exf5 Bxf5 d3 Nf6 h3 Be7 Be...
4    f3 e5 g4 d5 Bg2 d4 c4 dxc3 Nxc3 Nc6 Qa4 Qh4+ K...
Name: PGN, dtype: object

In [4]:
import random

# NOTE(review): this cell originally contained only a stray `s`, which raises NameError on a
# fresh kernel, while the next cell calls `generate_seqs`, which is defined nowhere in the
# notebook. The helper below is a reconstruction inferred from the (input, target) pairs
# displayed further down: a prefix of the game as input and the following three moves as
# target. TODO confirm the split rule against the original, lost definition.
_SPLIT_RNG = random.Random(0)  # fixed seed so Restart & Run All reproduces the same splits


def generate_seqs(pgn, n_target=3, rng=_SPLIT_RNG):
    """Split one game's move string into an ``(input, target)`` pair.

    Args:
        pgn: Space-separated moves of a single game, e.g. ``"e4 e5 Nf3 Nc6"``.
        n_target: Number of moves that form the target sequence.
        rng: ``random.Random`` instance used to choose the split point.

    Returns:
        A 2-tuple of strings ``(inp, out)``. ``inp`` holds the moves before a
        randomly chosen split point (always at least one move) and ``out`` holds
        the ``n_target`` moves that follow it. A game too short to provide a
        full target is returned whole as ``inp`` with an empty ``out``. A
        non-string value (e.g. NaN from a missing CSV field) yields ``("", "")``.
    """
    if not isinstance(pgn, str):
        return "", ""

    moves = pgn.split()
    latest_split = len(moves) - n_target
    if latest_split < 1:
        # Not enough moves for one context move plus a complete target.
        return " ".join(moves), ""

    split = rng.randint(1, latest_split)  # inclusive on both ends
    return " ".join(moves[:split]), " ".join(moves[split:split + n_target])

In [5]:
# Run generate_seqs on every game's move string; the result is a Series with one entry per game.
pgn_column = chess_data['PGN']
data = pgn_column.apply(generate_seqs)

In [6]:
# Expand the per-game pairs into a two-column frame: 'inp' (model input) and 'out' (target).
pair_rows = data.to_list()
data = pd.DataFrame(pair_rows, columns=['inp', 'out'])

In [7]:
# Task-specific token budgets: they are passed as `max_length` (with truncation enabled)
# when the inputs and the targets are tokenized below.
# NOTE(review): 5 tokens looks tight for a three-move target -- TODO check tokenized target lengths.
max_source_length, max_target_length = 40, 5

In [8]:
# Work with the first 1000 (input, target) pairs only, as plain Python lists of strings.
input_sequences = data['inp'].iloc[:1000].tolist()
output_sequences = data['out'].iloc[:1000].tolist()

In [9]:
# Sanity check: both lists should hold the same number of examples.
tuple(map(len, (input_sequences, output_sequences)))

(1000, 1000)

In [10]:
# Show the first training pair: the moves played so far and the moves to predict.
first_input, first_target = input_sequences[0], output_sequences[0]
first_input, first_target

('e4 g6 d4 Bg7 c3 d6 Qf3 Nf6 h3 O-O Bg5 Nbd7 Bc4 a6 h4 b5', 'Bd5 Nxd5 exd5')

In [11]:
# Tokenize the model inputs.
# Every move sequence is prefixed with the task prefix, then the batch is padded to its
# longest member and truncated to `max_source_length` tokens.
task_prefix = "Chess: "
prefixed_inputs = [task_prefix + sequence for sequence in input_sequences]

encoding = tokenizer(
    prefixed_inputs,
    padding='longest',
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoding.input_ids
attention_mask = encoding.attention_mask

In [12]:
# Tokenize the targets: pad to the longest target in the batch and truncate to
# `max_target_length` tokens. No `return_tensors` is given, so `labels` is a list of id lists.
target_encoding = tokenizer(
    output_sequences,
    truncation=True,
    max_length=max_target_length,
    padding='longest',
)
labels = target_encoding["input_ids"]


In [13]:
# Turn the label ids into a tensor and overwrite every padding id with -100
# (the value the Hugging Face T5 docs use for label positions the loss should ignore).
labels = torch.tensor(labels)
labels[labels == tokenizer.pad_token_id] = -100

# Forward pass over the whole batch; passing `labels` makes the model return the loss.
outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)
loss = outputs.loss

In [14]:
# Batched greedy generation for the last three inputs.
# The tokenizer is used exactly as loaded. The earlier cells already pad with its own pad
# token, so there is no need to switch to left padding or to alias the pad token to EOS
# (a recipe for decoder-only models). Overriding `pad_token` here would also change
# `tokenizer.pad_token_id` for the label-masking cell above if that cell is re-run.
sentences = input_sequences[-3:] # use different length sentences to test batching
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],  # marks padded positions so they are ignored
    do_sample=False, # disable sampling to test if batching affects output
)

print(tokenizer.batch_decode(output, skip_special_tokens=True))

['Chess: e4 c5 Nf3 Nc6 Nc3', 'Chess: e4 c5 Nc3 Nc6 Nf3', 'Chess: e4 d5 exd5 Qxd5 Nc']


In [15]:
# Reference targets for the last three inputs, for comparison with the generated text.
output_sequences[-3:] 

['Be6 Bg5 Rxg5', 'O-O Nec3 Re8', 'd5 b5 Bb3']

In [16]:
# The un-prefixed input move sequences that were just fed to the model.
sentences

['e4 c5 Nf3 Nc6 Nc3 e6 d4 cxd4 Nxd4 e5 Nf5 d6 Qg4 Nf6 Nxg7+ Bxg7 Qxg7 Rg8 Qh6 Nd4 Bd3',
 'e4 c5 Nc3 Nc6 Nf3 d6 Bb5 a6 Bxc6+ bxc6 d4 cxd4 Nxd4 Bb7 Qf3 Nf6 Bg5 c5 Nde2 g6 Bxf6 exf6 Nd5 Bg7 O-O',
 'e4 d5 exd5 Qxd5 Nc3 Qd8 d4 g6 Nf3 Bg7 Bc4 Nf6 O-O O-O h3 a6']

In [17]:
# Batched greedy generation for the first three inputs.
# The tokenizer is used exactly as loaded. The earlier cells already pad with its own pad
# token, so there is no need to switch to left padding or to alias the pad token to EOS
# (a recipe for decoder-only models). Overriding `pad_token` here would also change
# `tokenizer.pad_token_id` for the label-masking cell above if that cell is re-run.
sentences = input_sequences[:3] # use different length sentences to test batching
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],  # marks padded positions so they are ignored
    do_sample=False, # disable sampling to test if batching affects output
)

print(tokenizer.batch_decode(output, skip_special_tokens=True))

['Chess: e4 g6 d4 Bg7 c3', 'Chess: d4 Nf6 c4 g6 a3', 'Chess: e4 c5 Qf3 e5 Bc4']


In [18]:
# Reference targets for the first three inputs, for comparison with the generated text.
output_sequences[:3] 

['Bd5 Nxd5 exd5', 'c6 Ne2 Rfe8', 'Bg7 hxg5 hxg5']

In [19]:
# The un-prefixed input move sequences that were just fed to the model.
sentences

['e4 g6 d4 Bg7 c3 d6 Qf3 Nf6 h3 O-O Bg5 Nbd7 Bc4 a6 h4 b5',
 'd4 Nf6 c4 g6 a3 Bg7 Nf3 O-O Nc3 a5 Bf4 d6 e3 Nbd7 Be2 Nh5 Bg3 Nxg3 fxg3 Nf6 O-O Ng4 Qd2 e5 dxe5 Nxe5 Nxe5 Bxe5 Rf2 Be6 b3 Qg5 Raf1 f5 Bf3',
 'e4 c5 Qf3 e5 Bc4 Nf6 Nh3 h6 g4 g5 Ng1 a6 h4']