In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from print_color import print
from torch.optim.lr_scheduler import LambdaLR

from chessEngine import ChessEncoder, MLPEngine

# Preprocessing

In [2]:
# Shuffle with a fixed seed: the validation/test/train split further down is
# taken by row position, so an unseeded shuffle would put different positions
# in each split on every run of the notebook.
SHUFFLE_SEED = 42
df = (
    pd.read_csv('data/fen_analysis.csv')
    .sample(frac=1, random_state=SHUFFLE_SEED)  # frac=1 -> shuffle every row
    .iloc[:50000]                               # keep the first 50,000 shuffled rows
)
# df = pd.read_csv('fen_analysis.csv')[:30000] # no shuffle
df

Unnamed: 0,fen_value,score
36880,r7/5k2/p5pp/1P1PB3/1P2b1B1/8/2pp4/R5K1 w - - 0 40,-342
286427,r1bqkbnr/p1pppppp/1pn5/8/2PP4/4P3/PP3PPP/RNBQK...,-70
334186,3r2k1/pp4pp/5p2/2nP4/1P1qP3/5QN1/P5PP/5RK1 w -...,80
73886,rnb1k1nr/1pqp2pp/p1p1p3/4P3/7B/2PB1N2/P1P2PPP/...,392
78306,6k1/1pp2pb1/p1np3p/6pq/3P1P1N/2P5/PP4QP/4B1K1 ...,512
...,...,...
182485,8/3K2k1/8/6pp/8/8/7r/8 b - - 1 50,578
71689,r3k2r/pb1p4/p1p3p1/P3P1qp/3P4/4PN2/1PP3PP/1NB1...,537
145021,2r1k2r/1bq1bppp/p1nppn2/1pp5/1P2PP2/P1PPBN2/2N...,18
57407,r1b2rk1/pp3p1p/1q2p1p1/2pnP3/3P1P2/P1NQ4/1PB3P...,113


In [3]:
# Encoder whose encode_fen / encode_score methods are used in the cells below.
encoder_object = ChessEncoder()

In [4]:
# Encode every FEN string in the frame, keeping the row order of `df`.
fen_encodings = [encoder_object.encode_fen(fen) for fen in df['fen_value']]

In [5]:
# Stack the per-position encodings into a single int32 tensor (one row per FEN).
X = torch.tensor(fen_encodings, dtype=torch.int32)

In [6]:
# Show the encoded positions.
X

tensor([[ 9,  0,  0,  ...,  0, 14, 13],
        [ 9,  0, 10,  ...,  0, 39, 39],
        [ 0,  0,  0,  ...,  0, 23, 22],
        ...,
        [ 0,  0,  9,  ...,  0, 39, 39],
        [ 9,  0, 10,  ...,  0, 32, 32],
        [ 0,  0,  0,  ...,  0, 13, 19]], dtype=torch.int32)

In [7]:
# Each score is handed to the encoder as a string; the encoded values are then
# stacked into a float32 target tensor aligned row-for-row with `df`.
score_encodings = [
    encoder_object.encode_score(str(raw_score)) for raw_score in df['score']
]
y = torch.tensor(score_encodings, dtype=torch.float32)

In [8]:
# Sanity check: features and targets should have the same number of rows.
X.shape, y.shape

(torch.Size([50000, 200]), torch.Size([50000]))

# Helper Functions

In [9]:
# Number of rows held out for validation and for testing, respectively.
val_split = 10000
test_split = 10000

In [10]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
def get_batch(X, Y, bs, target_device=None):
    """Draw a random batch of `bs` aligned rows from X and Y.

    Rows are drawn without replacement whenever `bs <= len(X)`, so asking for
    `bs == len(X)` yields every row exactly once (a true full batch, in
    shuffled order) instead of a bootstrap sample with duplicates. Requests
    larger than the data fall back to sampling with replacement so the batch
    still contains exactly `bs` rows.

    Args:
        X: feature tensor, indexed along dim 0.
        Y: target tensor with the same number of rows as X.
        bs: number of rows to draw.
        target_device: device the batch is moved to; defaults to the
            notebook-level `device`.

    Returns:
        (x, y): the selected rows on the target device; y is cast to float32.

    Raises:
        AssertionError: if X or Y is not a torch.Tensor.
        ValueError: if X and Y do not have the same number of rows.
    """
    assert isinstance(X, torch.Tensor)
    assert isinstance(Y, torch.Tensor)
    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )

    if target_device is None:
        target_device = device

    n_rows = len(X)
    if bs <= n_rows:
        # A prefix of a random permutation is a sample without replacement.
        batch = torch.randperm(n_rows)[:bs]
    else:
        batch = torch.randint(0, n_rows, (bs,))
    x = X[batch].to(target_device)
    y = Y[batch].to(target_device).to(torch.float32)
    return x, y
# b = get_batch(X, y, 64)

In [12]:
# Positional split of the shuffled rows: the first `val_split` rows become the
# validation set and the next `test_split` rows the test set; both are moved to
# `device` up front.
X_val, y_val = X[:val_split].to(device), y[:val_split].to(device)
X_test, y_test = (
    X[val_split:val_split+test_split].to(device),
    y[val_split:val_split+test_split].to(device),
)
# The remaining rows are the training data. They are not moved to `device`
# here. NOTE: this rebinds X and y, so running this cell a second time
# without re-creating them removes another val_split+test_split rows.
X, y = X[val_split+test_split:], y[val_split+test_split:]

In [13]:
# Number of rows left in X after the validation and test rows were sliced off.
len(X)

30000

# Training

In [14]:
# hyperparameters
lr = 5e-2
num_steps = 500
warmup_steps = 20
bs_train = len(X)
bs_eval = 10000
# allowed_error = 100 #
d1 = {1:10, 2:20}
if bs_train > len(X): bs_train = len(X)

In [15]:
 # I accidentally used a smaller lr for the scheduler and it worked better; maybe try it?

In [16]:
# Build the network on `device`, the regression loss and the optimiser.
model = MLPEngine(embedding_dim=64, bs_train = bs_train, bs_eval=bs_eval).to(device)
loss_category = nn.MSELoss()  # mean squared error between predicted and target scores
optimiser = torch.optim.AdamW(
            model.parameters(), 
            lr = lr,
            betas=(0.9, 0.999),
            # NOTE(review): much larger than AdamW's documented default eps (1e-8) - confirm this is intentional
            eps=1e-3,
            weight_decay=1e-5)

# Define warm-up and decay
def lr_lambda(epoch, warmup=None, decay=0.99):
    """Learning-rate multiplier: linear warm-up, then exponential decay.

    Args:
        epoch: scheduler step counter (0-based). The scheduler passes only
            this argument.
        warmup: number of warm-up steps; defaults to the notebook-level
            `warmup_steps`.
        decay: per-step multiplicative decay applied once warm-up is over.

    Returns:
        The factor the base learning rate is multiplied by at this step:
        (epoch + 1) / warmup during warm-up, decay ** (epoch - warmup) after.
    """
    if warmup is None:
        warmup = warmup_steps
    if epoch < warmup:
        # +1 so the factor at step 0 is non-zero: a factor of 0 would make the
        # very first optimiser update a no-op. The ramp reaches 1.0 on the
        # last warm-up step, matching the decay branch's starting value.
        return (epoch + 1) / warmup
    # Exponential decay after warm-up
    return decay ** (epoch - warmup)

# Multiply the optimiser's learning rate by lr_lambda(step); the training loop
# calls scheduler.step() once per optimisation step, so the counter is in steps.
scheduler = LambdaLR(optimiser, lr_lambda)
# NOTE(review): presumably nn.Module.compile (in-place torch.compile) - confirm MLPEngine does not override it.
model.compile()

In [17]:
import torch._dynamo
# Ask TorchDynamo to suppress compilation errors instead of raising them.
torch._dynamo.config.suppress_errors = True
# Select the 'high' float32 matmul precision mode (see the PyTorch docs for the speed/accuracy trade-off).
torch.set_float32_matmul_precision('high')

In [18]:
# Loss logs keyed by global step number; both are filled in by the training loop.
train_history = {}
val_history = {}
# Offset added to the loop counter so repeated runs of the training cell
# continue the step numbering. Re-running this cell resets all three.
start_step = 0

In [19]:
# Train for `num_steps` optimisation steps, logging the training loss every
# step and the validation loss every 100 global steps.
model.train()
last_completed_step = start_step
try:
    for step_i in range(1, num_steps+1):
        tot_step = step_i + start_step
        optimiser.zero_grad()
        x_batch, y_batch = get_batch(X, y, bs_train)
        y_pred = model(x_batch).view(bs_train)
        loss = loss_category(y_pred, y_batch)
        train_loss = loss.item()  # read the scalar once and reuse it
        train_history[tot_step] = train_loss
        loss.backward()
        optimiser.step()
        scheduler.step()
        last_completed_step = tot_step
        print(tot_step, ': ', train_loss)

        if tot_step % 100 == 0:
            # validation phase
            model.eval()
            # No gradients are needed here; skipping autograd avoids building
            # a graph for the whole validation set.
            with torch.no_grad():
                val_pred = model(X_val).view(val_split)
                val_loss = loss_category(val_pred, y_val).item()
            val_history[tot_step] = val_loss
            print(f'{tot_step}: {val_loss} (Validation)', color='r')
            model.train()
finally:
    # Advance the global counter by the steps that actually ran, so that a
    # run cut short (e.g. by KeyboardInterrupt) does not overwrite history
    # entries when the cell is executed again. A full run still advances it
    # by exactly num_steps.
    start_step = last_completed_step

1 :  3096524.75[0m
2 :  3147884.5[0m
3 :  3120818.0[0m
4 :  3091953.0[0m
5 :  3081020.5[0m
6 :  3075539.5[0m
7 :  3055821.5[0m
8 :  2999301.75[0m
9 :  3204445.5[0m
10 :  2934180.25[0m
11 :  3271672.75[0m
12 :  3124416.5[0m
13 :  3152709.5[0m
14 :  3030268.25[0m
15 :  3233329.0[0m
16 :  3056035.5[0m
17 :  3058113.0[0m
18 :  2975888.5[0m
19 :  2989414.75[0m
20 :  3072248.25[0m
21 :  3080711.25[0m
22 :  2997883.5[0m
23 :  3053162.75[0m
24 :  2953910.25[0m
25 :  3047538.75[0m
26 :  2947671.75[0m
27 :  2920909.5[0m
28 :  2981533.0[0m
29 :  2971322.25[0m
30 :  2978004.0[0m
31 :  2895927.25[0m
32 :  2818621.25[0m
33 :  2861352.0[0m
34 :  2753279.25[0m
35 :  2738956.0[0m
36 :  2739920.5[0m
37 :  2670986.25[0m
38 :  2622482.0[0m
39 :  2515789.0[0m
40 :  2623169.75[0m
41 :  2539506.25[0m
42 :  2662526.5[0m
43 :  2505061.75[0m
44 :  2318516.25[0m
45 :  2555106.75[0m
46 :  2174821.75[0m
47 :  2352730.5[0m
48 :  2241042.5[0m
49 :  2395561.5[0m
50 :  

KeyboardInterrupt: 

In [None]:
# Save only the weights (state_dict). The target directory is created first,
# because torch.save fails if the parent directory does not exist.
save_path = Path('saves/model.pt')
save_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), save_path)

In [None]:
# Training loss with the first recorded steps left out
# (presumably to keep the large initial losses from dominating the y-axis).
skip_first = 100
tail_steps = list(train_history.keys())[skip_first:]
tail_losses = list(train_history.values())[skip_first:]

fig, ax = plt.subplots()
ax.plot(tail_steps, tail_losses, label='train')
ax.set_xlabel('step')
ax.set_ylabel('MSE loss')
ax.set_title(f'Training loss (first {skip_first} recorded steps skipped)')
ax.legend()
plt.show()

In [None]:
# Full training curve together with the validation checkpoints.
fig, ax = plt.subplots()
ax.plot(list(train_history.keys()), list(train_history.values()), label='train')
# Materialise the dict views as lists, the same way as for the training curve.
ax.plot(list(val_history.keys()), list(val_history.values()), label='validation')
ax.set_xlabel('step')
ax.set_ylabel('MSE loss')
ax.set_title('Training and validation loss')
ax.legend()
plt.show()

In [None]:
'''
# My results
At relu, model is stuck around 25k with 3x1000 steps with xavier
At relu, model went to 9k and then exploded at 3x1000 steps with kaiming; then at 14k
At gelu, model is stuck around 10k

Adam is better than AdamW for this task
'''

# TO DO

- [x] do inference, and run a partially trained model with the GUI intact
- [x] **find a way to fix the fact that our model is giving integer loss**
- [x] **Fix the bug in initialisation**
- [ ] implement weights and biases or tensorboard 
- [ ] improve the model
  - [x] get a better/ bigger dataset
  - [ ] hyperparameter and architecture
    - [x] better encoding
    - [ ] residual connections
    - [ ] try adamW after tuning b1 and b2
    - [x] increase embedding dim
    - [x] increase neurons in the layers
    - [x] increase layers in the network
    - [x] change loss function (maybe)
    - [ ] try a different learning rate scheduler (trapezoidal)
    - [ ] Add regularisation
      - [ ] l1,l2
      - [x] dropout
    - [x] Better initialisation
    - [x] diff optimisation algorithm

# Note