In [1]:
import torch
import torch.nn as nn
import pandas as pd
from chessEngine import ChessEncoder, MLPEngine
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import CosineAnnealingLR

# Preprocessing

In [2]:
df = pd.read_csv('fen_analysis.csv').sample(frac=1)[:30000] # This shuffles the rows
df

Unnamed: 0,fen_value,score
57383,rnbqk2r/ppp2ppp/4pn2/3pb3/2PP4/4P3/PP3PPP/RN1Q...,22
189240,rnb1k1nr/pppp1ppp/8/2b5/8/1P2P1q1/PBPP2PP/RN2K...,102
201335,r3k3/pp1n3p/2ppp3/4p3/7B/1P2P3/P1PP1PPP/RN2K1N...,-682
183753,1q3k2/p4p1p/1pn1r1p1/8/2N5/PP2PBP1/5PQP/3R1RK1...,785
88036,rn1q2r1/pbpp1Q1p/2k1p3/N1p2p2/3P4/2P5/P4PPP/R1...,-258
...,...,...
93080,6k1/2p3pp/p4p2/3p1K2/1Pb2P2/r7/6PP/8 w - - 0 36,-650
110254,6k1/1R3pp1/2P3r1/q2Pp2p/3bP3/3B1P2/1r2Q1PP/5K1...,345
3954,rnbqkbnr/pp1ppppp/8/2p5/4P3/8/PPPP1PPP/RNBQKBN...,35
141340,rn2kbnr/pbpp1ppp/1p2pq2/8/2PP4/P1N2P2/1P2P1PP/...,-85


In [3]:
encoder_object = ChessEncoder()

In [4]:
fen_encodings = []
for fen_i in df['fen_value']:
    encoded_fen = encoder_object.encode_fen(fen_i)
    fen_encodings.append(encoded_fen)

In [5]:
X = torch.tensor(fen_encodings, dtype=torch.int32)

In [6]:
X

tensor([[ 9, 11, 10,  ...,  0, 36, 39],
        [ 9, 11, 10,  ...,  0, 29, 38],
        [ 9,  0,  0,  ...,  0, 27, 15],
        ...,
        [ 9, 11, 10,  ...,  0, 39, 39],
        [ 9, 11,  0,  ...,  0, 39, 39],
        [ 9,  0,  0,  ...,  0, 24, 18]], dtype=torch.int32)

In [7]:
score_encodings = []
for y_i in df['score']:
    # print(y_i)
    encoded_score = encoder_object.encode_score(str(y_i))
    score_encodings.append(encoded_score)
y = torch.tensor(score_encodings, dtype=torch.float32)

In [8]:
X.shape, y.shape

(torch.Size([30000, 200]), torch.Size([30000]))

# Helper Functions

In [9]:
val_split = 10000
test_split = 10000

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
def get_batch(X, Y, bs):
    
    assert isinstance(X, torch.Tensor)
    assert isinstance(Y, torch.Tensor)

    batch = torch.randint(0, len(X), (bs,))
    x = X[batch].to(device)
    y = Y[batch].to(device).to(torch.bfloat16)
    return x, y
# b = get_batch(X, y, 64)

In [12]:
X_val = X[:val_split].to(device)
y_val = y[:val_split].to(device)
X_test = X[val_split:test_split].to(device)
y_test = y[val_split:test_split].to(device)
X = X[test_split:]
y = y[test_split:]

In [13]:
len(X)

20000

# Training

In [14]:
# hyperparameters
lr = 0.1
num_steps = 3000
warmup_steps = 50
bs = len(X)
# allowed_error = 100 #
d1 = {1:10, 2:20}
if bs > len(X): bs = len(X)

In [15]:
from torch.optim.lr_scheduler import LambdaLR


In [16]:
model = MLPEngine(embedding_dim=64).to(device)
loss_category = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr = lr) # i accidenly used a smaller lr for scheduler and it worked better, maybe try it?
# Define warm-up and decay
def lr_lambda(epoch):
    if epoch < warmup_steps:  
        return epoch / warmup_steps
    else:  # Exponential decay after warm-up
        return 0.95 ** (epoch - warmup_steps)

scheduler = LambdaLR(optimiser, lr_lambda)
model = model.to(torch.bfloat16)
model.compile()

In [17]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [18]:
train_history = {}
val_history = {}
start_step = 0

In [19]:
model.train()
# train_history = {}
# val_history = {}
for step_i in range(num_steps):
    tot_step = step_i + start_step
    optimiser.zero_grad()
    x_batch, y_batch = get_batch(X, y, bs) 
    # print(x_batch, y_batch)
    y_pred = model(x_batch).view(bs)
    # print(y_pred.shape, y_batch.shape)
    loss = loss_category(y_pred, y_batch)
    # print(loss.item())
    train_history[tot_step] = loss.item()
    loss.backward()
    optimiser.step()
    scheduler.step()
    # print(f"Epoch {step_i}, Learning Rate: {scheduler.get_last_lr()}")
    print(tot_step, ': ',loss.item())

    if tot_step % 100 == 0:
        # validation phase
        y_pred = model(X_val).view(val_split)
        # print(y_pred.shape, y_batch.shape)
        loss = loss_category(y_pred, y_val)
        # print(loss.item())
        val_history[tot_step] = loss.item()

start_step += num_steps

0 :  3375104.0
1 :  3555328.0
2 :  3325952.0
3 :  3358720.0
4 :  3211264.0
5 :  3211264.0
6 :  3309568.0
7 :  3325952.0
8 :  3342336.0
9 :  3588096.0
10 :  3375104.0
11 :  3342336.0
12 :  3260416.0
13 :  3489792.0
14 :  3358720.0
15 :  3293184.0
16 :  3637248.0
17 :  3538944.0
18 :  3309568.0
19 :  3588096.0
20 :  3424256.0
21 :  3325952.0
22 :  3260416.0
23 :  3227648.0
24 :  3178496.0
25 :  3080192.0
26 :  3145728.0
27 :  3162112.0
28 :  2998272.0
29 :  2801664.0
30 :  3047424.0
31 :  2932736.0
32 :  2834432.0
33 :  2785280.0
34 :  2703360.0
35 :  2555904.0
36 :  2490368.0
37 :  2392064.0
38 :  2408448.0
39 :  2228224.0
40 :  2129920.0
41 :  1884160.0
42 :  1990656.0
43 :  1736704.0
44 :  1409024.0
45 :  1376256.0
46 :  1220608.0
47 :  1019904.0
48 :  954368.0
49 :  888832.0
50 :  786432.0
51 :  577536.0
52 :  489472.0
53 :  448512.0
54 :  317440.0
55 :  337920.0
56 :  258048.0
57 :  239616.0
58 :  206848.0
59 :  227328.0
60 :  192512.0
61 :  159744.0
62 :  179200.0
63 :  162816.0
64

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'saves/bad_model.pt')

In [None]:
plt.plot(list(train_history.keys())[500:], list(train_history.values())[500:], label='train')
# plt.plot(val_history.keys(), val_history.values(), label='validation')
plt.legend()
plt.show()

In [None]:
plt.plot(list(train_history.keys()), list(train_history.values()), label='train')
plt.plot(val_history.keys(), val_history.values(), label='validation')
plt.legend()
plt.show()

In [None]:
'''
# My results
At relu, model is stuck around 25k with 3x1000 steps with xavier
AT relu, moedl went to 9k and then exploded at 3x1000 steps with kaiming; then at 14k
At gelu, model is stuck atound 10k

Adam is better than AdamW for this task
'''

# TO DO

- [x] do inference, and run a partially trained model with the GUI intact
- [ ] **find a way to fix the fact that our model is giving integer loss
- [ ] **Fix the bug in initialisation
- [ ] implement weights and biases or tensorboard 
- [ ] improve the model
  - [ ] get a better/ bigger dataset
  - [ ] hyperparameter and architecture
    - [ ] add CNN
    - [x] better encoding
    - [ ] residual connections
    - [ ] try adamW after tuning b1 and b2
    - [ ] increase embedding dim
    - [ ] increase neurons in the layers
    - [ ] increase layers in the network
    - [ ] change loss function (maybe)
    - [ ] try diff learning rate scheduler(trapeziodal)
    - [ ] Add regularisation
      - [ ] l1,l2
      - [ ] dropout
    - [ ] Better initialisation
    - [ ] diff optimisation algorithm

- make the init proper by specifying the activation in the init as claude said
-  no cnn right now, maybe in future
- 