In [1]:
# first off all we imported libraries which we need
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.models import Sequential

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

from tensorflow.keras.losses import SparseCategoricalCrossentropy

from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

import numpy as np

import pandas as pd

In [2]:
chess_df = pd.read_csv("data/games.csv")
chess_df = chess_df[chess_df.turns > 5]
chess_df.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [3]:
corpus = chess_df.moves
corpus.head()

0    d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...
1    d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2    e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3    d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4    e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...
Name: moves, dtype: object

In [4]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(corpus)

In [5]:
total_words = len(tokenizer.word_index) + 1 
total_words

2694

In [6]:
input_sequences = []

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    for i in range(1, len(token_list)):
        if i < 10:
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
        else:
            n_gram_sequence = token_list[i-9:i+1]
        
input_sequences[0:5]

[[5, 8],
 [5, 8, 11],
 [5, 8, 11, 23],
 [5, 8, 11, 23, 74],
 [5, 8, 11, 23, 74, 12]]

In [23]:
input_sequences[-1][-9:]

[8, 58, 295, 44, 12, 21, 4, 2, 60]

In [7]:
max_sequence_len = max([len(x) for x in input_sequences])
max_sequence_len

10

In [8]:
padded_sentences = []

batchsize = 20
batches = int(len(input_sequences) / batchsize) + 1

for batch in range(batches):
        padded_sentences_batch = pad_sequences(input_sequences[batchsize*batch:batchsize*(batch+1)], maxlen=max_sequence_len)
        
        for sentence in padded_sentences_batch:
            
            padded_sentences.append(sentence)

padded_sentences[0:5]

tokenizer.texts_to_sequences(["e4  e5 Bc4 nc6 Qh5 nf6 Qxf7"])

[array([0, 0, 0, 0, 0, 0, 0, 0, 5, 8], dtype=int32),
 array([ 0,  0,  0,  0,  0,  0,  0,  5,  8, 11], dtype=int32),
 array([ 0,  0,  0,  0,  0,  0,  5,  8, 11, 23], dtype=int32),
 array([ 0,  0,  0,  0,  0,  5,  8, 11, 23, 74], dtype=int32),
 array([ 0,  0,  0,  0,  5,  8, 11, 23, 74, 12], dtype=int32)]

In [24]:

tokenizer.texts_to_sequences(["e4  e5 Bc4 nc6 Qh5 nf6 Qxf7"])

[[3, 6, 18, 7, 111, 4, 348]]

In [9]:
# https://medium.datadriveninvestor.com/keras-training-on-large-datasets-3e9d9dbc09d4
import numpy as np
from tensorflow.keras.utils import Sequence

class My_Generator(Sequence):

    def __init__(self, data, labels, batch_size):
        self.data, self.labels = data, labels
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, idx):
        
        batch_x = self.data[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.labels[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        return np.array(batch_x), np.array(batch_y)

In [10]:
X = []
labels = []

for i in padded_sentences:
    X.append(i[0:len(i) - 1])
    labels.append(i[-1])
    
X = np.array(X)
labels = np.array(labels)

x_train, x_validate, y_train, y_validate = train_test_split(X, labels, test_size = 0.1, random_state = 999)

In [29]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))


loss_fn = SparseCategoricalCrossentropy()
adam = Adam(learning_rate=0.01)


model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 9, 100)            269400    
                                                                 
 bidirectional_4 (Bidirectio  (None, 300)              301200    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 2694)              810894    
                                                                 
Total params: 1,381,494
Trainable params: 1,381,494
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
batch_size = 20
num_training_samples = len(padded_sentences)
num_epochs = 20

my_training_batch_generator = My_Generator(x_train, y_train, batch_size)


early_stop = EarlyStopping(patience=10)
                           #, restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(patience=3)

model.fit(my_training_batch_generator,
                                          steps_per_epoch=(num_training_samples // batch_size),
                                          epochs=num_epochs,
                                          verbose=1,
                                          use_multiprocessing=True,
                                          workers=16,
                                          max_queue_size=32,
                                          validation_data = (x_validate,y_validate),
                                          callbacks = [reduce_lr, early_stop])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f34c39af520>

In [15]:
import pickle

# saving
with open('tokenizer_early.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
model.save("early")



INFO:tensorflow:Assets written to: early/assets


INFO:tensorflow:Assets written to: early/assets


In [17]:
prediction = model.predict([[5, 8, 11, 23, 74, 12, 487, 253, 2]])

In [18]:
tokendict={}
for index in range(len(prediction[0])):
    tokendict[tokenizer.sequences_to_texts([[index]])[0]] = prediction[0][index]

In [19]:
token_df = pd.DataFrame.from_dict({"moves": tokendict.keys(), "probs": tokendict.values()})

In [21]:
token_df.sort_values(by="probs").tail()

Unnamed: 0,moves,probs
17,h6,0.09656
25,be7,0.110761
61,bb4,0.120348
10,c5,0.135323
7,nc6,0.253689


In [None]:
token_df.head()

In [None]:
len(tokenizer.index_docs)

In [None]:
tokenizer.sequences_to_texts([[5, 8, 11, 23, 74, 12, 487, 253, 2]])

In [None]:
tokenizer.texts_to_sequences(["d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3"])

In [None]:
tokenizer.texts_to_sequences(["Bf4+"])

In [None]:
prediction[0][56]

In [None]:
tokenizer.sequences_to_texts([[12]])