# Dataset Cleaning and Prep

In [1]:
# dependencies
'''
%pip install requests
%pip install bitarray
%pip install peewee
%pip install tqdm
'''

import json
import os
import requests
import base64
from peewee import *
from bitarray import bitarray
from tqdm import tqdm

In [2]:
# uses lichess's evals db
# https://database.lichess.org/lichess_db_eval.jsonl.zst

# if db is already downloaded, use lichess_db_eval.jsonl
# if not, download it
if not os.path.exists('lichess_db_eval.jsonl'):
    url = 'https://database.lichess.org/lichess_db_eval.jsonl.zst'
    r = requests.get(url)
    with open('lichess_db_eval.jsonl.zst', 'wb') as f:
        f.write(r.content)
    os.system('zstd -d lichess_db_eval.jsonl.zst')

In [3]:
# we can encode the fens as bitmaps
# p, n, b, r, q, k is 6 types of piece
# w, b is 2 colors
# 6 * 2 * 8 * 8 = 768
# one bit for whether it's white's turn
# castling is four bits for KQkq
# reserve seven bits for en passant, with 0 being no en passant and 1-64 for the square
# 768 + 1 + 4 + 7 = 780

def fen_to_bitmap(fen):
    # start by splitting the fen into its parts
    parts = fen.split(' ')
    board = parts[0]
    to_move = parts[1]
    castling = parts[2]
    ep = parts[3]

    # create an empty bitarray
    bitmap = bitarray(780)
    
    # fill in the board
    row = 0
    col = 0

    # char order
    pieces = 'PNBRQKpnbrqk'

    for char in board:
        if char == '/':
            row += 1
            col = 0
        elif char.isdigit():
            col += int(char)
        else:
            piece = pieces.index(char)

            bitmap[piece * 64 + row * 8 + col] = 1
            col += 1

    # fill in to move
    bitmap[768] = (to_move == 'w')
    
    # fill in castling
    bitmap[769] = ('K' in castling)
    bitmap[770] = ('Q' in castling)
    bitmap[771] = ('k' in castling)
    bitmap[772] = ('q' in castling)

    # fill in en passant
    if ep != '-':
        # convert ep to a square
        col = ord(ep[0]) - ord('a')
        row = int(ep[1]) - 1
        # convert to a 7 bit number
        ep = bitarray(7)

        # one bit for whether ep is possible
        ep[0] = 1
        # six bits for the square
        square = row * 8 + col
        for i in range(6):
            ep[i + 1] = square % 2
            square //= 2
        bitmap[773:780] = ep


    return bitmap

In [None]:
# cleaning the db
# we only want the eval with the most knodes
# and we want to convert the fens to bitmaps
# for mate in x, we want to convert x to a cp of 1000000 / x

sqlite_db = SqliteDatabase('lichess.db')

# create a model for the db
class Evaluations(Model):
    id = IntegerField(primary_key = True)
    fen = TextField()
    binary = BlobField()
    eval = FloatField()

    class Meta:
        database = sqlite_db
    
    def binary_base64(self):
        return base64.b64encode(self.binary)

# create the table
sqlite_db.connect()
sqlite_db.create_tables([Evaluations])

# per the lichess docs, there are 190,987,505 positions
num_positions = 190987505

position_id = 1

with open('lichess_db_eval.jsonl', 'r') as f:
    for _ in tqdm(range(num_positions)):
        # read a line
        line = f.readline()
        # if the line is empty, we're done
        if not line:
            break

        # parse the line
        position = json.loads(line)

        # find the eval with the most knodes
        best_eval = max(position['evals'], key = lambda x: x['knodes'])

        # convert the fen to a bitmap
        bitmap = fen_to_bitmap(position['fen'])

        # convert the eval to a cp
        # if the eval is mate in x, convert to 1000000 / x
        # we can tell if it's mate by if there is a 'mate' key
        if 'mate' in best_eval['pvs'][0]:
            eval = 1000000 / best_eval['pvs'][0]['mate']
        else:
            eval = best_eval['pvs'][0]['cp']
        
        # insert into the db
        eval = Evaluations.create(fen = position['fen'], binary = bitmap.tobytes(), eval = eval, id = position_id)
        position_id += 1
        eval.save()

  0%|          | 1000/190987505 [00:05<294:10:48, 180.34it/s]


In [5]:
# testing the db
# get the first 10 entries
for eval in Evaluations.select().limit(10):
    print(eval.fen, eval.eval)
    print(eval.binary_base64())

sqlite_db.close()

7r/1p3k2/p1bPR3/5p2/2B2P1p/8/PP4P1/3K4 b - - 69.0
b'AAAQAAQAwgAAAAAAAAAAAAAAAAAgAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAQAECABAEAAAAAAAAAAAAAAAAAIAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAA='
8/4r3/2R2pk1/6pp/3P4/6P1/5K1P/8 b - - 0.0
b'AAAAABACAQAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAEAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAA='
6k1/6p1/8/4K3/4NN2/8/8/8 w - - 55555.555555555555
b'AAAAAAAAAAAAAAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAgAA='
r1b2rk1/1p2bppp/p1nppn2/q7/2P1P3/N1N5/PP2BPPP/R1BQ1RK1 w - - 26.0
b'AAAAACgAxwAAAAAAAKAAAAAAAAAAAAggAAAAAAAAAIQAAAAAAAAAEAAAAAAAAAACAEeYAAAAAAAAACQAAAAAACAIAAAAAAAAhAAAAAAAAAAAAACAAAAAAAIAAAAAAAAAgAA='
6k1/4Rppp/8/8/8/8/5PPP/6K1 w - - 1000000.0
b'AAAAAAAABwAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAgAA='
6k1/6p1/6N1/4K3/4N3/8/8/8 b - - 37037.03703703704
b'AAAAAAAAAAAAAAIAC

True