# Data cleaning
## Data structure
The raw data contains match metadata plus each game's moves in PGN notation.
Example:

```
                Event            White       Black Result     UTCDate   UTCTime \
0           Classical          eisaaaa    HAMID449    1-0  2016.06.30  22:00:01 \
1               Blitz           go4jas  Sergei1973    0-1  2016.06.30  22:00:01 \
2    Blitz tournament  Evangelistaizac      kafune    1-0  2016.06.30  22:00:02 \
3      Correspondence           Jvayne    Wsjvayne    1-0  2016.06.30  22:00:02 \
4    Blitz tournament           kyoday   BrettDale    0-1  2016.06.30  22:00:02 \

    WhiteElo  BlackElo  WhiteRatingDiff  BlackRatingDiff  ECO
0       1901      1896             11.0            -11.0  D10
1       1641      1627            -11.0             12.0  C20
2       1647      1688             13.0            -13.0  B01
3       1706      1317             27.0            -25.0  A00
4       1945      1900            -14.0             13.0  B9 

                                         Opening  TimeControl   Termination                                                     AN
0                                    Slav Defense       300+5  Time forfeit      1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1                       King's Pawn Opening: 2.b3       300+0        Normal      1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2   Scandinavian Defense: Mieses-Kotroc Variation       180+0  Time forfeit      1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....
3                            Van't Kruijs Opening           -        Normal      1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...
4     Sicilian Defense: Najdorf, Lipnitsky Attack       180+0  Time forfeit      1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...
```
Each row has several columns of game metadata followed by an `AN` column that holds the game's PGN movetext, so every row must be replayed move by move to turn this table into board positions (gameboards).
From this data, the most important fields to keep are `Result`, the Elo ratings (`WhiteElo` and `BlackElo`), `TimeControl` and `AN`.


In [5]:
import pandas as pd
import chess

import numpy as np


# Bitboard plane index for each piece symbol: the uppercase symbols P, N, B,
# R, Q, K take planes 0-5 and the lowercase symbols p, n, b, r, q, k take
# planes 6-11 (python-chess uses uppercase symbols for White's pieces).
piece_to_idx = {
    symbol: plane for plane, symbol in enumerate("PNBRQKpnbrqk")
}

def board_to_tensor(board: chess.Board, sideToPlay: chess.Color):
    """Encode a position as 13 bitboards stored in a ``uint64`` array.

    Planes 0-11 hold one bitboard per piece symbol (indexed through
    ``piece_to_idx``): bit ``square`` is set when that piece stands on
    ``square``. Plane 12 has a bit set for every square that is the
    destination of at least one legal move in ``board``.

    ``sideToPlay`` is accepted but never read by this function.

    Returns:
        A NumPy array of shape ``(13,)`` and dtype ``uint64``.
    """
    planes = np.zeros(13, dtype=np.uint64)  # 8*8 squares -> one bit per square

    # Piece placement planes.
    for sq in chess.SQUARES:
        occupant = board.piece_at(sq)
        if not occupant:
            continue
        plane = piece_to_idx[occupant.symbol()]
        planes[plane] |= np.uint64(1 << sq)

    # Legal-move destination plane.
    destinations_plane = 12
    for legal in board.legal_moves:
        planes[destinations_plane] |= np.uint64(1 << legal.to_square)

    return planes

def get_possible_moves(board: chess.Board) -> np.ndarray[np.uint64]: # np.ndarray[np.uint64, shape=(63)]
    """Return a bitmask of the legal moves in ``board``.

    Each legal move is reduced to its 4-character from/to UCI string (a
    promotion suffix is dropped, so all promotions on the same squares share
    one bit) and looked up in ``MOVE_DICTIONARY``. Move index ``i`` sets bit
    ``i % 64`` of word ``i // 64``.

    Returns:
        A ``uint64`` array of 64 words. With the 64*63-entry
        ``MOVE_DICTIONARY`` built in this file only the first 63 words are
        ever written; the extra word keeps the shape homogeneous.
    """
    array = np.zeros(shape=(64,), dtype=np.uint64)
    for move in board.legal_moves:
        move_idx = MOVE_DICTIONARY[move.uci()[:4]]
        num_idx, bit_idx = divmod(move_idx, 64)
        # Wrap the mask in np.uint64 (as board_to_tensor does): OR-ing a
        # uint64 scalar with a bare Python int relies on NumPy's scalar
        # promotion rules, which differ between NumPy versions.
        array[num_idx] |= np.uint64(1 << bit_idx)
    return array


letters = ["a", "b", "c", "d", "e", "f", "g", "h"]
numbers = list(range(1, 9))  # ranks 1..8

# Every square name in file-major order: a1, a2, ..., a8, b1, ..., h8.
_SQUARE_NAMES = [
    f"{file_letter}{rank}" for file_letter in letters for rank in numbers
]

# MOVE_DICTIONARY maps each 4-character "from-square + to-square" string
# (e.g. "e2e4") to a dense index in 0..4031. Pairs are enumerated with the
# from-square as the outer loop and the to-square as the inner loop; pairs
# whose two squares are identical are skipped, which leaves 64 * 63 entries.
# Using the current dictionary size as the next index keeps the numbering
# contiguous without tracking how many pairs have been skipped.
MOVE_DICTIONARY = {}
for _from_square in _SQUARE_NAMES:
    for _to_square in _SQUARE_NAMES:
        if _from_square == _to_square:
            continue
        MOVE_DICTIONARY[f"{_from_square}{_to_square}"] = len(MOVE_DICTIONARY)

# Inverse lookup: dense index -> 4-character move string.
REVERSE_MOVE_DICTIONARY = {
    value: key for key, value in MOVE_DICTIONARY.items()
}

In [6]:
import re

# Compiled once so repeated calls do not re-resolve the patterns.
_PGN_BRACE_COMMENT = re.compile(r'\{[^}]*\}')
_PGN_BRACKET_COMMAND = re.compile(r'\[%[^]]*\]')
_PGN_ANNOTATION_GLYPHS = re.compile(r'[!?]+')


def clean_pgn(pgn_text):
    """Strip comments, embedded commands and move annotations from PGN text.

    Removes, in order:
      * ``{...}`` brace comments (including any ``[%eval ...]`` inside them),
      * any remaining ``[%...]`` commands outside braces,
      * suffix annotation glyphs made of ``!`` and ``?`` in any combination
        (``!``, ``?``, ``!!``, ``??``, ``!?``, ``?!``).

    Move numbers (``1.``, ``1...``) and the result token are left in place;
    only whitespace that was adjacent to removed text may end up doubled.

    Args:
        pgn_text: PGN movetext as a string.

    Returns:
        The movetext with the elements above removed.
    """
    cleaned = _PGN_BRACE_COMMENT.sub('', pgn_text)
    cleaned = _PGN_BRACKET_COMMAND.sub('', cleaned)
    # Strip whole glyph runs; removing only "?" / "?!" would leave a
    # dangling "!" behind for moves annotated "!", "!!" or "!?".
    return _PGN_ANNOTATION_GLYPHS.sub('', cleaned)

def pgn_to_moves(pgn_text):
    """Extract the list of SAN move tokens from PGN movetext.

    The text is first passed through ``clean_pgn`` and then split on
    whitespace. Tokens are dropped when they:
      * start with a digit (move numbers such as ``12.`` / ``12...`` and the
        results ``1-0``, ``0-1``, ``1/2-1/2``),
      * contain a ``.``,
      * are the PGN "unknown / in progress" result marker ``*``.

    Args:
        pgn_text: PGN movetext as a string.

    Returns:
        The remaining tokens, in order, as a list of strings.
    """
    # str.split() with no argument already treats newlines as separators.
    tokens = clean_pgn(pgn_text).split()

    return [
        token
        for token in tokens
        if not token[0].isdigit() and '.' not in token and token != '*'
    ]

In [7]:
def init_move_saver():
    """Create a move-indexing closure pair.

    Returns:
        A tuple ``(inner, get_dict)`` where ``inner(move)`` returns the
        integer index assigned to ``move.uci()`` (allocating the next unused
        index, starting at 0, the first time a UCI string is seen) and
        ``get_dict()`` returns the underlying ``{uci: index}`` dictionary
        itself (not a copy).
    """
    index_by_uci = {}
    next_index = 0

    def inner(move: chess.Move):
        nonlocal next_index

        uci = move.uci()
        try:
            return index_by_uci[uci]
        except KeyError:
            # First sighting of this move: hand out the next free index.
            assigned = next_index
            index_by_uci[uci] = assigned
            next_index += 1
            return assigned

    def get_dict():
        return index_by_uci

    return inner, get_dict

import chess.pgn
import os
def game_file(file):
    """Yield every game parsed from the PGN file at path ``file``.

    Games are read one at a time with ``chess.pgn.read_game`` until it
    returns ``None``; the file stays open while the generator is being
    consumed.
    """
    with open(file, "r") as handle:
        parsed = chess.pgn.read_game(handle)
        while parsed is not None:
            yield parsed
            parsed = chess.pgn.read_game(handle)

In [8]:
import csv
import random
from time import time
import pickle
import traceback

# Column names of the raw games table.
HEADERS = [
    "Event", "White", "Black", "Result", "UTCDate", "UTCTime", "WhiteElo",
    "BlackElo", "WhiteRatingDiff", "BlackRatingDiff", "ECO", "Opening",
    "TimeControl", "Termination", "AN"
]

RUN = True
num_games_saved = 0
num_batches = 1
# Stop once BOTH colours have at least this many saved positions.
NUM_MOVES_TO_BE_SAVED = 10_000_000

# Export loop: replay every game of every PGN file and write one CSV row per
# played move -- [position bitboards, move index] -- to the white or black
# file depending on whose turn it was. The move -> index table is pickled at
# the end.
try:
    # newline="" is what the csv module asks for on files handed to
    # csv.writer; without it some platforms emit blank lines between rows.
    with open("./dataset/processed/test_elite/results_white_13.csv", "w", newline="") as whiteFile, \
            open("./dataset/processed/test_elite/results_black_13.csv", "w", newline="") as blackFile:
        white_writer = csv.writer(whiteFile)
        black_writer = csv.writer(blackFile)
        save_move, get_move_dictionary = init_move_saver()
        black_num_moves_saved = white_num_moves_saved = 0

        # Sorted so repeated runs consume the PGN files in the same order
        # (os.listdir returns entries in arbitrary order).
        files = iter(sorted(os.listdir("./dataset/raw/elite_dataset")))
        # Iterating with `for` ends cleanly when the directory is exhausted
        # instead of raising StopIteration and skipping the pickle dump below.
        for file in files:
            ti = time()
            # Read every game of this PGN file
            for game in game_file(f'./dataset/raw/elite_dataset/{file}'):
                gameboard = game.board()
                num_moves = 0
                for move in game.mainline_moves():
                    # Encode the position and remember the side to move
                    # BEFORE the move is pushed onto the board.
                    bitmap = board_to_tensor(gameboard, gameboard.turn)
                    turn = gameboard.turn

                    num_moves += 1
                    gameboard.push(move)
                    # Skip first 8 moves (to not penalize the model for playing a move that wasn't the expected one)
                    # if num_moves < 8:
                    #     continue
                    # if random.randint(1,32) != 1:
                    #     continue

                    if turn == chess.WHITE:
                        writer = white_writer
                        white_num_moves_saved += 1
                    else:
                        writer = black_writer
                        black_num_moves_saved += 1

                    writer.writerow([
                        bitmap.tolist(),  # bitmaps
                        save_move(move)
                    ])
                num_games_saved += 1

            tf = time()
            print(f"Read game file in {tf-ti} | num_games_saved {num_games_saved}\n\tnum_moves_saved White: {white_num_moves_saved:_d} - Black: {black_num_moves_saved:_d}\n")

            # The quota is only checked between files, so a file is always
            # consumed in full.
            if (NUM_MOVES_TO_BE_SAVED <= black_num_moves_saved) and (NUM_MOVES_TO_BE_SAVED <= white_num_moves_saved):
                RUN = False
                break
        else:
            # Ran out of input before reaching the quota.
            RUN = False
            print("No more PGN files to read; stopping before NUM_MOVES_TO_BE_SAVED was reached.")

    with open("./dataset/processed/test_elite/move_dictionary_13.p", "wb") as f:
        pickle.dump(get_move_dictionary(), f)

except Exception:
    # BaseException.with_traceback() requires a traceback argument, so
    # calling it bare would raise inside the handler; print the active
    # traceback instead.
    traceback.print_exc()

Read game file in 501.40917348861694 | num_games_saved 43310
	num_moves_saved White: 1_812_883 - Black: 1_790_167

Read game file in 535.963387966156 | num_games_saved 90228
	num_moves_saved White: 3_793_464 - Black: 3_746_352

Read game file in 571.394248008728 | num_games_saved 141059
	num_moves_saved White: 5_937_372 - Black: 5_863_604

Read game file in 521.5183157920837 | num_games_saved 187543
	num_moves_saved White: 7_896_488 - Black: 7_798_412

Read game file in 614.577383518219 | num_games_saved 242862
	num_moves_saved White: 10_225_594 - Black: 10_098_917



In [9]:
# Show the move -> index table collected by the export cell, then its size.
_move_table = get_move_dictionary()
display(_move_table)
print(len(_move_table))

{'d2d4': 0,
 'g7g6': 1,
 'c2c4': 2,
 'f8g7': 3,
 'b1c3': 4,
 'd7d6': 5,
 'e2e4': 6,
 'b8d7': 7,
 'f2f4': 8,
 'e7e5': 9,
 'f4e5': 10,
 'd6e5': 11,
 'd4d5': 12,
 'g8h6': 13,
 'h2h3': 14,
 'e8g8': 15,
 'c1e3': 16,
 'c7c6': 17,
 'd1d2': 18,
 'f7f5': 19,
 'g1f3': 20,
 'f5f4': 21,
 'e3f2': 22,
 'h6f7': 23,
 'e1c1': 24,
 'g6g5': 25,
 'c1b1': 26,
 'd7f6': 27,
 'd5c6': 28,
 'd8d2': 29,
 'b7c6': 30,
 'c4c5': 31,
 'c8e6': 32,
 'f1a6': 33,
 'a8b8': 34,
 'h1d1': 35,
 'h7h5': 36,
 'd2d6': 37,
 'f8e8': 38,
 'd6e6': 39,
 'e8e6': 40,
 'a6c4': 41,
 'b8e8': 42,
 'c4e6': 43,
 'b2b4': 44,
 'g5g4': 45,
 'f3h4': 46,
 'f7g5': 47,
 'd1d8': 48,
 'g8h7': 49,
 'h4f5': 50,
 'g5e4': 51,
 'c3e4': 52,
 'f6e4': 53,
 'd8d7': 54,
 'e4f2': 55,
 'd7g7': 56,
 'h7h8': 57,
 'g7g5': 58,
 'e5e4': 59,
 'g5h5': 60,
 'h8g8': 61,
 'f5d4': 62,
 'e6e8': 63,
 'b4b5': 64,
 'e4e3': 65,
 'b5c6': 66,
 'e3e2': 67,
 'd4e2': 68,
 'e8e2': 69,
 'c6c7': 70,
 'e2e8': 71,
 'h5d5': 72,
 'g8f6': 73,
 'b8c6': 74,
 'e2e3': 75,
 'f8b4': 76,
 'd1c2': 

1925
