# Data cleaning
## Data structure
The raw data contains match metadata plus each game's moves in PGN format (the moves played in the game).
Example:

```
                Event            White       Black Result     UTCDate   UTCTime \
0           Classical          eisaaaa    HAMID449    1-0  2016.06.30  22:00:01 \
1               Blitz           go4jas  Sergei1973    0-1  2016.06.30  22:00:01 \
2    Blitz tournament  Evangelistaizac      kafune    1-0  2016.06.30  22:00:02 \
3      Correspondence           Jvayne    Wsjvayne    1-0  2016.06.30  22:00:02 \
4    Blitz tournament           kyoday   BrettDale    0-1  2016.06.30  22:00:02 \

    WhiteElo  BlackElo  WhiteRatingDiff  BlackRatingDiff  ECO
0       1901      1896             11.0            -11.0  D10
1       1641      1627            -11.0             12.0  C20
2       1647      1688             13.0            -13.0  B01
3       1706      1317             27.0            -25.0  A00
4       1945      1900            -14.0             13.0  B90

                                         Opening  TimeControl   Termination                                                     AN
0                                    Slav Defense       300+5  Time forfeit      1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1                       King's Pawn Opening: 2.b3       300+0        Normal      1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2   Scandinavian Defense: Mieses-Kotroc Variation       180+0  Time forfeit      1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....
3                            Van't Kruijs Opening           -        Normal      1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...
4     Sicilian Defense: Najdorf, Lipnitsky Attack       180+0  Time forfeit      1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...
```
Each row holds a few columns of game metadata followed by a column called `AN` that contains the PGN movetext of the game, so we must replay those moves to convert this table into board positions.
From this data, the most important fields to keep are `Result`, the Elo ratings (`WhiteElo`, `BlackElo`), `TimeControl` and `AN`.


In [1]:
import pandas as pd
import chess
import numpy as np

# Bitboard plane index for every piece symbol: the six uppercase symbols
# take planes 0-5 and the six lowercase symbols take planes 6-11.
piece_to_idx = {
    symbol: plane for plane, symbol in enumerate("PNBRQKpnbrqk")
}

def board_to_bitmap(board: chess.Board):
    """Encode the pieces on *board* as twelve 64-bit occupancy masks.

    Returns a ``(12, 1)`` ``uint64`` array. Row ``piece_to_idx[symbol]`` has
    bit ``square`` set for every piece with that symbol standing on
    ``square``.
    """
    planes = np.zeros((12, 1), dtype=np.uint64)
    for sq, occupant in board.piece_map().items():
        row = piece_to_idx[occupant.symbol()]
        # planes[row] is a one-element array view, so the OR happens in place.
        planes[row] |= 1 << sq
    return planes

def board_to_tensor(board: chess.Board, sideToPlay: bool):
    """Encode *board* plus the side to move as thirteen ``uint64`` words.

    Args:
        board: Position to encode.
        sideToPlay: Flag stored verbatim in the last word (the caller passes
            ``True`` for white and ``False`` for black).

    Returns:
        A ``(13,)`` ``uint64`` array. Entries 0-11 are per-piece occupancy
        masks indexed by ``piece_to_idx`` (bit ``square`` is set when that
        piece stands on ``square``); entry 12 is ``1`` if *sideToPlay* is
        truthy, otherwise ``0``.
    """
    tensor = np.zeros(13, dtype=np.uint64)  # one 8*8-bit mask per piece type + side flag
    one = np.uint64(1)
    # Only occupied squares contribute bits, so walk the piece map directly.
    for square, piece in board.piece_map().items():
        idx = piece_to_idx[piece.symbol()]
        # Keep both operands uint64: mixing a uint64 scalar with a plain
        # Python int is promoted differently across NumPy versions.
        tensor[idx] |= one << np.uint64(square)
    tensor[12] = 1 if sideToPlay else 0
    return tensor

def get_possible_moves(board: chess.Board) -> np.ndarray:
    """Return the legal moves of *board* as a packed bit mask.

    Every legal move is looked up in ``MOVE_DICTIONARY`` by the first four
    characters of its UCI string (from-square + to-square), so promotions to
    different pieces share one bit.

    Returns:
        A ``(63,)`` ``uint64`` array (63 * 64 bits). Bit ``index % 64`` of
        word ``index // 64`` is set for each legal move's dictionary index.
    """
    array = np.zeros(shape=(63,), dtype=np.uint64)  # 64*63 bits
    one = np.uint64(1)
    for move in board.legal_moves:
        num_idx, bit_idx = divmod(MOVE_DICTIONARY[move.uci()[:4]], 64)
        # Keep both operands uint64: mixing a uint64 scalar with a plain
        # Python int is promoted differently across NumPy versions.
        array[num_idx] |= one << np.uint64(bit_idx)
    return array


# Dense numbering of every ordered (from-square, to-square) pair with
# from != to, keyed by the four-character string such as "e2e4".
letters = ["a", "b", "c", "d", "e", "f", "g", "h"]
numbers = list(range(1, 10)) # [1..9]
MOVE_DICTIONARY = {}
cumulative = 0  # same-square pairs skipped so far; subtracted to keep indices gap-free
for from_file in range(8):
    for from_rank in range(8):
        from_square = f"{letters[from_file]}{numbers[from_rank]}"
        for to_file in range(8):
            for to_rank in range(8):
                if (from_file, from_rank) == (to_file, to_rank):
                    cumulative += 1
                    continue
                to_square = f"{letters[to_file]}{numbers[to_rank]}"
                # Base-8 positional index of the four coordinates.
                raw_index = ((from_file * 8 + from_rank) * 8 + to_file) * 8 + to_rank
                MOVE_DICTIONARY[from_square + to_square] = raw_index - cumulative

# Inverse lookup: dense index -> "fromto" string.
REVERSE_MOVE_DICTIONARY = dict(
    zip(MOVE_DICTIONARY.values(), MOVE_DICTIONARY.keys())
)

In [2]:
import re

def clean_pgn(pgn_text):
    """Strip non-move decoration from PGN movetext.

    Removes, in order: ``{...}`` comments, ``[%...]`` embedded commands
    (evals, clocks), black move-number markers such as ``12...``, and
    annotation glyphs made of ``!`` / ``?`` (``!``, ``?``, ``!!``, ``??``,
    ``!?``, ``?!``). White move numbers (``12.``) and the result token are
    left in place, and whitespace is not normalised.

    Args:
        pgn_text: Movetext string of a single game.

    Returns:
        The movetext with the decorations above deleted.
    """
    # Remove {...} comments and [%...] evals
    cleaned = re.sub(r'\{[^}]*\}', '', pgn_text)
    cleaned = re.sub(r'\[%[^]]*\]', '', cleaned)
    cleaned = re.sub(r'[0-9]+\.\.\.', '', cleaned)
    # Strip every run of annotation glyphs, not just "?" and "?!", so that a
    # trailing "!" is never left attached to a move.
    cleaned = re.sub(r'[!?]+', '', cleaned)
    return cleaned



In [3]:
import csv
from time import time
# Column names of the raw CSV, in file order.
HEADERS = [
    "Event", "White", "Black", "Result", "UTCDate", "UTCTime",
    "WhiteElo", "BlackElo", "WhiteRatingDiff", "BlackRatingDiff",
    "ECO", "Opening", "TimeControl", "Termination", "AN",
]

# Convert the raw games into one CSV row per recorded position:
#   [13-word board encoding, index of the move just played, legal-move mask]
# The first 9 plies of every game are replayed but not recorded.

start = 1 # To ignore headers; afterwards tracks the offset of the next unread row
total = 0
READ_SIZE = 10_000
RUN = True
i = 1
try:
    # newline="" is what the csv module expects so it can control line
    # endings itself (otherwise blank rows appear on some platforms).
    with open("./dataset/processed/results_with_valid_moves.csv", "w", newline="") as f:
        writer = csv.writer(f)
        # Stream the raw file once in READ_SIZE-row chunks instead of
        # re-opening it and skipping an ever-growing prefix for each batch.
        # Iteration simply ends if the file has fewer rows than the limit.
        with pd.read_csv(
            './dataset/raw/chess_games.csv',
            skiprows=start,
            names=HEADERS,
            chunksize=READ_SIZE,
        ) as reader:
            ti = time()
            for games_df in reader:
                print(f"Reading {i}th batch of games...")
                for an in games_df["AN"]:
                    if not isinstance(an, str):
                        # Missing movetext (e.g. NaN): nothing to replay.
                        continue

                    board = chess.Board()
                    tokens = clean_pgn(an).split()
                    # Drop move numbers ("12.") and result tokens ("1-0", "1/2-1/2").
                    moves = [token for token in tokens if not token[0].isdigit() and '.' not in token]

                    for ply, san in enumerate(moves, start=1):
                        # Deliberately broad best-effort: any failure while
                        # replaying or encoding abandons the rest of this game.
                        try:
                            played = board.push_san(san)
                            if ply < 10:
                                continue

                            # NOTE(review): the board is encoded AFTER the move
                            # was pushed, so each row pairs the resulting
                            # position (and its legal moves) with the move that
                            # led to it - confirm this is the intended pairing.
                            # board.turn is the side to move in that position
                            # (True for white, False for black).
                            bitmap = board_to_tensor(board, board.turn)
                            possible_moves = get_possible_moves(board)
                            writer.writerow([
                                bitmap.tolist(), # bitmaps
                                MOVE_DICTIONARY[played.uci()[:4]], # Played move
                                # 0 for invalid 1 for valid; To keep constant size
                                possible_moves.tolist(),
                            ])
                        except Exception as e:
                            print(f"Skipping bad move: {moves} - {san} – #{e}#")
                            break

                start += READ_SIZE
                tf = time()
                total += len(games_df)
                i += 1
                print(f"Read {len(games_df)} in {tf-ti} | TOTAL: {total}")

                if 50_000 <= total:
                    RUN = False
                    break
                ti = time()  # next batch's timer includes its read time

except Exception:
    # Report the failure with its full traceback instead of raising a second
    # error from inside the handler.
    import traceback
    traceback.print_exc()

Reading 1th batch of games...
Read 10000 in 63.66488695144653 | TOTAL: 10000
Reading 2th batch of games...
Read 10000 in 62.60974836349487 | TOTAL: 20000
Reading 3th batch of games...
Read 10000 in 62.867859840393066 | TOTAL: 30000
Reading 4th batch of games...
Read 10000 in 62.526477575302124 | TOTAL: 40000
Reading 5th batch of games...
Read 10000 in 63.64720368385315 | TOTAL: 50000
