# Import Libraries

In [1]:
# pytorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# for visualizing the results
import numpy as np
import matplotlib.pyplot as plt

# for reading input data
import pandas as pd

# for parsing the FEN of chess positions
import re


To represent a chess position, it is common to use [Forsyth–Edwards Notation (FEN)](https://en.wikipedia.org/wiki/Forsyth%E2%80%93Edwards_Notation), which contains all the information necessary to reconstruct a chess game from the current position. To make this information usable for a neural network, we will use a bit (actually a byte) to represent whether a specific piece (white rook, white knight, etc.) is on a specific square of the 8x8 chess board. Since there are 6 different pieces and two different players, there are 12 specific pieces that could potentially be on each square.

However, we still need to keep track of information like whose turn it is, which castling options are still legal, whether en passant is possible, how many half moves have passed since a pawn move or piece capture, and how many turns the game has had. To do this we use an additional 8x8 board where the rook locations represent castling rights, the 3rd and 6th ranks (rows) keep track of possible en passant moves, the e1 and e8 squares represent whose move it is, and the 5th and 4th ranks represent the number of half moves and full moves respectively, written as binary numbers (the maximum possible being 255).

Below is a function to do this conversion.

In [2]:
def fen_to_bit_vector(fen):
    """Encode a FEN string as a (13, 8, 8) uint8 array of 0/1 planes.

    Plane 0 holds the non-piece state; planes 1-12 hold one piece type each
    (see ``piece_to_layer``). Row index 0 is the 8th rank and row index 7 is
    the 1st rank; column index 0 is the a-file.

    Layout of plane 0:
      * corners (rook home squares): castling rights
      * [7, 4] / [0, 4] (e1 / e8): set for white / black to move
      * the en passant target square itself (always on the 3rd or 6th rank)
      * row 3: halfmove clock, low 8 bits, least significant bit in column 7
      * row 4: fullmove number, low 8 bits, least significant bit in column 7

    Parameters
    ----------
    fen : str
        A six-field FEN string, e.g.
        ``rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1``.

    Returns
    -------
    numpy.ndarray
        Array of shape (13, 8, 8) and dtype uint8.

    Raises
    ------
    IndexError
        If the FEN has fewer than six whitespace-separated fields.
    ValueError
        If a clock field or an unknown placement character is not an integer.
    """
    # FEN fields, in order:
    # piece placement - lowercase for black pieces, uppercase for white pieces.
    #                   digits are runs of empty squares, '/' starts a new row
    # active color    - whose turn it is, either 'w' or 'b'
    # castling rights - K/k for kingside, Q/q for queenside, '-' if none remain
    # en passant      - square behind a pawn that just advanced two squares, or '-'
    # halfmove clock  - half moves since the last pawn move or capture
    # fullmove number - starts at 1, increments after black's move
    parts = fen.split()
    piece_placement = parts[0].split("/")
    active_color = parts[1]
    castling_rights = parts[2]
    en_passant = parts[3]
    halfmove_clock = int(parts[4])
    fullmove_clock = int(parts[5])

    bit_vector = np.zeros((13, 8, 8), dtype=np.uint8)

    # piece to layer structure taken from reference [1]
    piece_to_layer = {
        'R': 1,
        'N': 2,
        'B': 3,
        'Q': 4,
        'K': 5,
        'P': 6,
        'p': 7,
        'k': 8,
        'q': 9,
        'b': 10,
        'n': 11,
        'r': 12
    }

    # (row, column) of the rook home square that flags each castling right
    castling = {
        'K': (7, 7),
        'Q': (7, 0),
        'k': (0, 7),
        'q': (0, 0),
    }

    for r, row in enumerate(piece_placement):
        c = 0
        for piece in row:
            if piece in piece_to_layer:
                bit_vector[piece_to_layer[piece], r, c] = 1
                c += 1
            else:
                # a digit: skip that many empty squares
                c += int(piece)

    if en_passant != '-':
        # Use the same orientation as the piece planes: the file letter picks
        # the column and the rank digit picks the row (rank 8 -> row 0).
        # Indexing as [file, rank - 1] would transpose the square and could
        # overwrite the clock rows or the side-to-move flag.
        ep_col = ord(en_passant[0]) - ord('a')
        ep_row = 8 - int(en_passant[1])
        bit_vector[0, ep_row, ep_col] = 1

    if castling_rights != '-':
        for char in castling_rights:
            bit_vector[0, castling[char][0], castling[char][1]] = 1

    if active_color == 'w':
        bit_vector[0, 7, 4] = 1
    else:
        bit_vector[0, 0, 4] = 1

    # Write each clock as an 8-bit binary number, least significant bit in
    # column 7; bits above the 8th are dropped.
    for clock_row, value in ((3, halfmove_clock), (4, fullmove_clock)):
        c = 7
        while value > 0 and c >= 0:
            bit_vector[0, clock_row, c] = value % 2
            value //= 2
            c -= 1

    return bit_vector



In [23]:
# Encode a sample position and print the resulting stack of 8x8 planes
fen = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 5 0"
board = fen_to_bit_vector(fen)
print(board)

# Flatten the planes into a single 1-D tensor
fentensor = torch.from_numpy(board).flatten()
print(fentensor)

# Concatenate every entry into one string of digits
fenstring = ''.join(str(bit) for bit in fentensor.tolist())
print(fenstring)

[[[1 0 0 0 0 0 0 1]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 1 0 1]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [1 0 0 0 1 0 0 1]]

 [[0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [1 0 0 0 0 0 0 1]]

 [[0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 1 0 0 0 0 1 0]]

 [[0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 1 0 0 1 0 0]]

 [[0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 1 0 0 0 0]]

 [[0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 1 0 0 0]]

 [[0 0 0 0 0 0 0 0]
  [0 0 0

The first 8x8 board (0th index) contains all the extra information and the following 12 boards (1 to 12) represent the locations of the pieces in the order

1. White Rook
2. White Knight
3. White Bishop
4. White Queen
5. White King
6. White Pawn
7. Black Pawn
8. Black King
9. Black Queen
10. Black Bishop
11. Black Knight
12. Black Rook

Notice how the pieces line up correctly with the starting position with the first board correctly indicating it is white to move.

# Neural Network

We'll begin with a simple, fully connected Feed-Forward Neural Network. Neural Networks are named for their structure being analogous to neurons in the human brain. The idea is that, in the human brain, when a neuron gets an electrical impulse through its synapses it will sometimes fire an electrical impulse to other neurons, creating a chain reaction. For neural networks, our neurons are nodes, our synapses are edges (with corresponding weights), and the firing of the neuron is the activation function and output of the node.

![Perceptron](http://starship-knowledge.com/wp-content/uploads/2020/10/Perceptrons-1024x724.jpeg)

The goal of the Neural Network is to have weights such that after all the chain reactions of nodes taking in inputs and producing outputs, the information output of the final node represents the evaluation of the chess position that began the process. In order to actually find such weights we will use a method known as backpropagation, which iteratively adjusts the weights in the network to nudge the output closer to the answer we desire.

Technically speaking, for each training record (a FEN and an evaluation) we input the position into the Neural Network, and after we get a result we compute the error between the result and the correct evaluation, which can be represented as an error function. To change the weights in such a way as to minimize this error function, we compute the gradient of the error function and adjust the weights in the opposite direction. This means that if we overshoot we want to decrease our evaluation, and if we undershoot we want to increase our evaluation.

![Gradient Descent](https://sebastianraschka.com/images/blog/2015/singlelayer_neural_networks_files/perceptron_gradient_descent_1.png)


In [4]:
class Net(nn.Module):
    """Fully connected regressor: eight 832-wide ReLU layers and a scalar head.

    The layers are registered as ``fc1`` ... ``fc9``, so parameter names are
    ``fc1.weight``, ``fc1.bias``, ..., ``fc9.bias``.
    """

    def __init__(self):
        super().__init__()
        # fc1..fc8: hidden layers that keep the 832-feature width
        for layer_no in range(1, 9):
            setattr(self, "fc{}".format(layer_no), nn.Linear(832, 832))
        # fc9: projects the last hidden activation down to one value
        self.fc9 = nn.Linear(832, 1)

    def forward(self, x):
        """Flatten each sample to a vector and return a (batch, 1) prediction."""
        hidden = torch.flatten(x, 1)
        for layer_no in range(1, 9):
            layer = getattr(self, "fc{}".format(layer_no))
            hidden = F.relu(layer(hidden))
        # no activation on the output layer
        return self.fc9(hidden)



In [5]:
# ChessDataset code and eval_to_int code taken from reference [1]
class ChessDataset(Dataset):
    """In-memory dataset pairing encoded positions with their evaluations.

    Built from a data frame with a "FEN" column (encoded through
    ``fen_to_bit_vector`` and stored as float32) and an "Evaluation" column
    (stored as one single-element row per record).
    """

    def __init__(self, data_frame):
        encoded_positions = [fen_to_bit_vector(fen) for fen in data_frame["FEN"]]
        self.fens = torch.from_numpy(np.array(encoded_positions, dtype=np.float32))
        self.evals = torch.Tensor([[score] for score in data_frame["Evaluation"]])
        # cache the record count once; the tensors are not modified afterwards
        self._len = len(self.evals)

    def __len__(self):
        """Number of (position, evaluation) records."""
        return self._len

    def __getitem__(self, index):
        """Return the ``(encoded position, evaluation)`` pair at ``index``."""
        position = self.fens[index]
        score = self.evals[index]
        return position, score


def eval_to_int(evaluation):
    """Convert a raw evaluation to a score divided by 100.

    Values that ``int()`` accepts are parsed directly. Anything that raises
    ``ValueError`` is mapped to +10000 when its second character is '+' and
    to -10000 otherwise (presumably forced-mate strings such as "#+3" /
    "#-2" -- confirm against the dataset), before the division by 100.
    """
    try:
        centipawns = int(evaluation)
    except ValueError:
        centipawns = -10000
        if evaluation[1] == '+':
            centipawns = 10000
    return centipawns / 100



In [6]:
def AdamW_main(max_data=5000000, epochs=100, batch_size=1024):
    """Train ``Net`` with AdamW on the chess evaluation data, save it, and test it.

    Steps: load and encode the training and test CSVs, train with MSE loss,
    save the state dict to ``./chessv5.pth``, dump all weight matrices (not
    biases) stacked row-wise to ``nnweightsv5.csv``, then report the mean
    squared error over the test positions.

    Parameters
    ----------
    max_data : int
        Maximum number of rows read from each CSV file.
    epochs : int
        Number of passes over the training set.
    batch_size : int
        Mini-batch size used for both training and evaluation.

    Returns
    -------
    None
    """
    # Prefer CUDA, then Apple's MPS backend, and fall back to the CPU so the
    # function still runs on machines that have neither accelerator.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print("Using device {}".format(device))

    print("Preparing Training Data...")
    # nrows stops parsing after max_data rows and returns a fresh frame, so
    # the column assignment below does not act on a slice of a larger frame.
    train_data = pd.read_csv("kaggle/input/chess-evaluations/chessData.csv", nrows=max_data)
    train_data["Evaluation"] = train_data["Evaluation"].map(eval_to_int)
    trainset = ChessDataset(train_data)

    print("Preparing Test Data...")
    test_data = pd.read_csv("kaggle/input/chess-evaluations/tactic_evals.csv", nrows=max_data)
    test_data["Evaluation"] = test_data["Evaluation"].map(eval_to_int)
    testset = ChessDataset(test_data)

    print("Converting to pytorch Dataset...")

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)

    print("Trainset loaded")

    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)

    print("Testset loaded")

    net = Net().to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(net.parameters())

    print("Training Net")

    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for inputs, labels in trainloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # mean of the per-batch losses for this epoch
        print('[%d, %5d] loss: %.3f' % (epoch + 1, len(trainloader), running_loss / len(trainloader)))

    print('Finished Training')

    PATH = './chessv5.pth'
    torch.save(net.state_dict(), PATH)

    # Export only the weight matrices, stacked row-wise into one 2-D table.
    weights_output = []

    for name, param in net.named_parameters():
        if 'weight' in name:
            print(f"Layer: {name}, Shape: {param.size()}")
            weights_output.append(param.detach().cpu().numpy())

    out = np.vstack(weights_output)
    print(out.shape)

    np.savetxt('nnweightsv5.csv', out, delimiter=',', fmt='%f')

    print('Evaluating model')

    net.eval()
    count = 0
    total_squared_error = 0.0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)
            batch_loss = criterion(outputs, labels)

            # count is the number of positions evaluated, independent of the
            # batch size. MSELoss returns the batch mean, so weight it by the
            # batch size to get a per-position average at the end.
            batch_count = len(labels)
            count += batch_count
            total_squared_error += batch_loss.item() * batch_count

    if count > 0:
        print('Average error of the model on the {} tactics positions is {}'.format(count, total_squared_error / count))
    else:
        print('No test positions were evaluated')

In [7]:
AdamW_main()

Using device mps
Preparing Training Data...
Preparing Test Data...
Converting to pytorch Dataset...
Trainset loaded
Testset loaded
Training Net
Epoch:
1
[1,  4883] loss: 119.869
Epoch:
2
[2,  4883] loss: 77.602
Epoch:
3
[3,  4883] loss: 61.705
Epoch:
4
[4,  4883] loss: 52.809
Epoch:
5
[5,  4883] loss: 46.621
Epoch:
6
[6,  4883] loss: 42.203
Epoch:
7
[7,  4883] loss: 38.969
Epoch:
8
[8,  4883] loss: 36.125
Epoch:
9
[9,  4883] loss: 33.936
Epoch:
10
[10,  4883] loss: 32.129
Epoch:
11
[11,  4883] loss: 30.645
Epoch:
12
[12,  4883] loss: 29.349
Epoch:
13
[13,  4883] loss: 28.072
Epoch:
14
[14,  4883] loss: 27.128
Epoch:
15
[15,  4883] loss: 26.290
Epoch:
16
[16,  4883] loss: 25.573
Epoch:
17
[17,  4883] loss: 24.797
Epoch:
18
[18,  4883] loss: 24.272
Epoch:
19
[19,  4883] loss: 23.779
Epoch:
20
[20,  4883] loss: 23.455
Epoch:
21
[21,  4883] loss: 22.836
Epoch:
22
[22,  4883] loss: 22.522
Epoch:
23
[23,  4883] loss: 22.218
Epoch:
24
[24,  4883] loss: 21.975
Epoch:
25
[25,  4883] loss: 21.77