In [1]:
import torch
import torch.nn as nn
import pandas as pd
from chessEngine import ChessEncoder, MLPEngine
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import CosineAnnealingLR

# Preprocessing

In [2]:
df = pd.read_csv('fen_analysis.csv').sample(frac=1)[:30000] # This shuffles the rows
df

Unnamed: 0,fen_value,score
15778,rn1qb1k1/4b1p1/3p1r1p/p1pPpp2/1pP4P/1P1PB1PN/P...,51
136634,r1b1kb1r/pp1p4/n4qpp/2pPNp1B/2P1nB2/P7/1P3PPP/...,573
53339,r1bqk2r/pp3pbp/2n1pnp1/2pp4/8/1PN2NP1/PBPPPPBP...,23
104926,5r1k/5q1p/pp3pp1/2p5/6QN/1P1P4/PBP4P/5RK1 w - ...,597
190050,rn1qk1nr/ppp2pbp/2bpp1p1/8/1P2P3/2PB4/PB1PNPPP...,-10
...,...,...
191251,8/1b3q2/p1nppkp1/1p3p2/2p1P3/PBP3P1/1PP2P1r/R1...,529
285537,rn1qkbnr/pp2p1pp/5p2/2pp4/2PP2b1/2N1PN2/PP3PPP...,-246
273107,r2qk1nr/pp3pb1/2n1b2p/3pp1p1/2pP4/2P1P2P/PPBNQ...,-10
286611,r2qk2r/1pp1bppp/p1np1n2/4p3/2B1P2B/2NP1Q1P/PPP...,8


In [3]:
encoder_object = ChessEncoder()

In [4]:
fen_encodings = []
for fen_i in df['fen_value']:
    encoded_fen = encoder_object.encode_fen(fen_i)
    fen_encodings.append(encoded_fen)

['rn1qb1k1/4b1p1/3p1r1p/p1pPpp2/1pP4P/1P1PB1PN/P3QPB1/R3K2R', 'b', 'KQ', '-']
['r1b1kb1r/pp1p4/n4qpp/2pPNp1B/2P1nB2/P7/1P3PPP/RN1Q1RK1', 'w', 'kq', '-']
['r1bqk2r/pp3pbp/2n1pnp1/2pp4/8/1PN2NP1/PBPPPPBP/R2Q1RK1', 'w', 'kq', '-']
['5r1k/5q1p/pp3pp1/2p5/6QN/1P1P4/PBP4P/5RK1', 'w', '-', '-']
['rn1qk1nr/ppp2pbp/2bpp1p1/8/1P2P3/2PB4/PB1PNPPP/RN1Q1RK1', 'b', 'kq', '-']
['rn1qk2r/pbpp2pp/1p3n2/5p2/1bBP4/2N2N2/PPP2PPP/R1BQ1RK1', 'w', 'kq', '-']
['rk3b1N/ppRnp1pp/4N3/4p2n/4p3/8/P4PPP/5RK1', 'w', '-', '-']
['1k5r/pp1b2p1/3N1n2/4p3/2B1P2p/4BP2/PPP3PP/2K5', 'w', '-', '-']
['r2qk2r/ppp2ppp/2n5/3p1b2/3P1B2/2P1PN2/P4PPP/R2QK2R', 'w', 'KQkq', '-']
['8/8/5Bn1/6P1/3pN3/1P1Pk3/2K5/8', 'w', '-', '-']
['rnbqkbnr/pppp1ppp/4p3/8/8/1P6/PBPPPPPP/RN1QKBNR', 'b', 'KQkq', '-']
['r2q1rk1/pbpp1pbp/np2p1pn/6N1/P3P2P/1P1P4/2PB1PP1/1R1QKB1R', 'w', 'K', '-']
['r1bqk2r/pppp1ppp/2n2n2/4p3/8/2PPP3/PP1NBPPP/R2QK1NR', 'b', 'KQkq', '-']
['rnbqkbnr/p1pp1ppp/1p2p3/8/4PP2/8/PPPP2PP/RNBQKBNR', 'w', 'KQkq', '-']
['8/pp1k4/6p1/4r1p

In [5]:
X = torch.tensor(fen_encodings, dtype=torch.int32)

In [6]:
score_encodings = []
for y_i in df['score']:
    # print(y_i)
    encoded_score = encoder_object.encode_score(str(y_i))
    score_encodings.append(encoded_score)
y = torch.tensor(score_encodings, dtype=torch.float32)

In [7]:
X.shape, y.shape

(torch.Size([30000, 200]), torch.Size([30000]))

# Helper Functions

In [8]:
val_split = 10000
test_split = 10000

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
def get_batch(X, Y, bs):
    
    assert isinstance(X, torch.Tensor)
    assert isinstance(Y, torch.Tensor)

    batch = torch.randint(0, len(X), (bs,))
    x = X[batch].to(device)
    y = Y[batch].to(device).to(torch.bfloat16)
    return x, y
# b = get_batch(X, y, 64)

In [11]:
X_val = X[:val_split].to(device)
y_val = y[:val_split].to(device)
X_test = X[val_split:test_split].to(device)
y_test = y[val_split:test_split].to(device)
X = X[test_split:]
y = y[test_split:]

# Training

In [12]:
# hyperparameters
lr = 0.1
num_steps = 3000
warmup_steps = 250
bs = len(X)
# allowed_error = 100 #
d1 = {1:10, 2:20}
if bs > len(X): bs = len(X)

In [13]:
from torch.optim.lr_scheduler import LambdaLR


In [14]:
model = MLPEngine(embedding_dim=64).to(device)
loss_category = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr = lr) # i accidenly used a smaller lr for scheduler and it worked better, maybe try it?
# Define warm-up and decay
def lr_lambda(epoch):
    if epoch < warmup_steps:  
        return epoch / warmup_steps
    else:  # Exponential decay after warm-up
        return 0.95 ** (epoch - warmup_steps)

scheduler = LambdaLR(optimiser, lr_lambda)
model = model.to(torch.bfloat16)
model.compile()

In [15]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [16]:
train_history = {}
val_history = {}
start_step = 0

In [None]:
model.train()
# train_history = {}
# val_history = {}
for step_i in range(num_steps):
    tot_step = step_i + start_step
    optimiser.zero_grad()
    x_batch, y_batch = get_batch(X, y, bs) 
    # print(x_batch, y_batch)
    y_pred = model(x_batch).view(bs)
    # print(y_pred.shape, y_batch.shape)
    loss = loss_category(y_pred, y_batch)
    # print(loss.item())
    train_history[tot_step] = loss.item()
    loss.backward()
    optimiser.step()
    scheduler.step()
    # print(f"Epoch {step_i}, Learning Rate: {scheduler.get_last_lr()}")
    print(tot_step, ': ',loss.item())

    if tot_step % 100 == 0:
        # validation phase
        y_pred = model(X_val).view(val_split)
        # print(y_pred.shape, y_batch.shape)
        loss = loss_category(y_pred, y_val)
        # print(loss.item())
        val_history[tot_step] = loss.item()

start_step += num_steps

0 :  2932736.0


In [None]:
plt.plot(list(train_history.keys())[500:], list(train_history.values())[500:], label='train')
# plt.plot(val_history.keys(), val_history.values(), label='validation')
plt.legend()
plt.show()

In [None]:
plt.plot(list(train_history.keys()), list(train_history.values()), label='train')
plt.plot(val_history.keys(), val_history.values(), label='validation')
plt.legend()
plt.show()

In [None]:
'''
# My results
At relu, model is stuck around 25k with 3x1000 steps with xavier
AT relu, moedl went to 9k and then exploded at 3x1000 steps with kaiming; then at 14k
At gelu, model is stuck atound 10k

Adam is better than AdamW for this task
'''

# Inference

In [None]:
fen_sample ='rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq - 0 1'

In [None]:
def get_score(fen_sample):
    # encoding
    encoder_object = ChessEncoder()
    encoded_fen = torch.tensor(encoder_object.encode_fen(fen_i), dtype=torch.int32).to(device).view(70)
    # print(encoded_fen.shape)
    bs=1
    model = MLPEngine(embedding_dim=32).to(device)
    model.eval()
    y_pred = model(encoded_fen)
    return y_pred.item()
    # run the model

In [None]:
get_score(fen_sample)

- [ ] do inference, and run a partially trained model with the GUI intact
- [ ] find a way to fix the fact that our model is giving integer loss
- [ ] implement bitsandbytes
- [ ] improve the model
  - [ ] get a better/ bigger dataset
  - [ ] hyperparameter and architecture
    - [ ] add CNN
    - [ ] better encoding
    - [ ] residual connections
    - [ ] try adamW after tuning b1 and b2
    - [ ] increase embedding dim
    - [ ] increase neurons in the layers
    - [ ] increase layers in the network
    - [ ] change loss function (maybe)
    - [ ] try diff learning rate scheduler(trapeziodal)
    - [ ] Add regularisation
      - [ ] l1,l2
      - [ ] dropout
    - [ ] Better initialisation
    - [ ] diff optimisation algorithm