# Data cleaning
## Data structure
The raw data contains match metadata plus each game's moves in PGN notation.
Example:

```
                Event            White       Black Result     UTCDate   UTCTime \
0           Classical          eisaaaa    HAMID449    1-0  2016.06.30  22:00:01 \
1               Blitz           go4jas  Sergei1973    0-1  2016.06.30  22:00:01 \
2    Blitz tournament  Evangelistaizac      kafune    1-0  2016.06.30  22:00:02 \
3      Correspondence           Jvayne    Wsjvayne    1-0  2016.06.30  22:00:02 \
4    Blitz tournament           kyoday   BrettDale    0-1  2016.06.30  22:00:02 \

    WhiteElo  BlackElo  WhiteRatingDiff  BlackRatingDiff  ECO
0       1901      1896             11.0            -11.0  D10
1       1641      1627            -11.0             12.0  C20
2       1647      1688             13.0            -13.0  B01
3       1706      1317             27.0            -25.0  A00
4       1945      1900            -14.0             13.0  B9 

                                         Opening  TimeControl   Termination                                                     AN
0                                    Slav Defense       300+5  Time forfeit      1. d4 d5 2. c4 c6 3. e3 a6 4. Nf3 e5 5. cxd5 e...
1                       King's Pawn Opening: 2.b3       300+0        Normal      1. e4 e5 2. b3 Nf6 3. Bb2 Nc6 4. Nf3 d6 5. d3 ...
2   Scandinavian Defense: Mieses-Kotroc Variation       180+0  Time forfeit      1. e4 d5 2. exd5 Qxd5 3. Nf3 Bg4 4. Be2 Nf6 5....
3                            Van't Kruijs Opening           -        Normal      1. e3 Nf6 2. Bc4 d6 3. e4 e6 4. Nf3 Nxe4 5. Nd...
4     Sicilian Defense: Najdorf, Lipnitsky Attack       180+0  Time forfeit      1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...
```
The table has several columns of game metadata followed by a column called `AN` that holds the game's moves in PGN notation, so each game must be replayed to convert this table into board positions.
From this data, the most important fields to keep are `Result`, the two Elo ratings (`WhiteElo` and `BlackElo`), `TimeControl` and `AN`.


In [10]:
import pandas as pd
import chess
import numpy as np

# Plane/row index for every piece symbol: the six white pieces (upper case)
# take indices 0-5 and the six black pieces (lower case) take indices 6-11.
piece_to_idx = {symbol: index for index, symbol in enumerate("PNBRQKpnbrqk")}

def board_to_bitmap(board: chess.Board):
    """Encode a position as twelve 64-bit occupancy masks.

    Args:
        board: Position to encode.

    Returns:
        A ``(12, 1)`` ``uint64`` array. Row ``piece_to_idx[symbol]`` has bit
        ``square`` set for every square holding a piece with that symbol.
    """
    masks = np.zeros((12, 1), dtype=np.uint64)
    for sq, occupant in board.piece_map().items():
        plane = piece_to_idx[occupant.symbol()]
        # One bit per square index, OR-ed into the mask of that piece kind.
        masks[plane] |= 1 << sq
    return masks

def board_to_tensor(board: chess.Board):
    """Encode a position as a stack of twelve 8x8 one-hot planes.

    Args:
        board: Position to encode.

    Returns:
        A ``(12, 8, 8)`` ``uint8`` array in which
        ``[piece_to_idx[symbol], 7 - square // 8, square % 8]`` is 1 for every
        occupied square and every other entry is 0.
    """
    planes = np.zeros((12, 8, 8), dtype=np.uint8)
    for sq in chess.SQUARES:
        occupant = board.piece_at(sq)
        if not occupant:
            continue
        rank, file = divmod(sq, 8)
        # Rows are flipped so the highest square indices land in row 0.
        planes[piece_to_idx[occupant.symbol()], 7 - rank, file] = 1
    return planes

In [7]:
import re

def clean_pgn(pgn_text):
    """Strip annotations from PGN movetext so only plain move tokens remain.

    Removes, in order:
      * ``{...}`` comments,
      * any remaining ``[%...]`` tags (evals, clocks),
      * black move-number markers such as ``12...``,
      * move-quality suffixes made of ``!`` and ``?`` (``!``, ``?``, ``!!``,
        ``??``, ``!?``, ``?!``).

    White move numbers (``12.``) and whitespace are left untouched; the caller
    is expected to split on whitespace and drop the numbering tokens.

    Args:
        pgn_text: Movetext of a single game.

    Returns:
        The movetext with the items above removed.
    """
    cleaned = re.sub(r'\{[^}]*\}', '', pgn_text)
    cleaned = re.sub(r'\[%[^]]*\]', '', cleaned)
    cleaned = re.sub(r'[0-9]+\.\.\.', '', cleaned)
    # Strip every run of '!' / '?'. The previous pattern only matched '?' and
    # '?!', so 'e4!?' was reduced to 'e4!' and 'e4!' was left as is; neither
    # is a plain SAN move token.
    cleaned = re.sub(r'[!?]+', '', cleaned)
    return cleaned

In [20]:
import csv
from time import time
HEADERS=["Event", "White", "Black", "Result", "UTCDate", "UTCTime", "WhiteElo", "BlackElo", "WhiteRatingDiff", "BlackRatingDiff", "ECO", "Opening", "TimeControl", "Termination", "AN"]

start = 1 # To ignore headers
total = 0
READ_SIZE = 10000   # Games read from the raw CSV per batch
MAX_GAMES = 250_000 # Stop once this many games have been processed
RUN = True


def _write_game_rows(writer, result, white_elo, black_elo, time_control, pgn):
    """Replay one game and write one CSV row per successfully played move.

    Each row is ``[Result, WhiteElo, BlackElo, TimeControl, side, board, uci]``
    where ``side`` is True when White made the move, ``board`` is the nested
    list form of ``board_to_tensor`` and ``uci`` is the move in UCI notation.
    Replay stops at the first token that cannot be played; rows already
    written for that game are kept.

    NOTE(review): the board is encoded AFTER the move has been pushed, so each
    row pairs a move with the position it leads to, not the position it was
    chosen from. Confirm that this is what the training code expects.
    """
    board = chess.Board()
    tokens = clean_pgn(pgn).replace("\n", " ").split()
    # Drop move numbers ("12.") and result markers that start with a digit ("1-0").
    moves = [token for token in tokens if not token[0].isdigit() and '.' not in token]
    white_moved = True  # True for white, False for black
    for san in moves:
        try:
            played = board.push_san(san)
        except ValueError as e:
            # python-chess reports invalid, illegal and ambiguous SAN as ValueError.
            print(f"Skipping bad move: {moves} - {san} – #{e}#")
            break
        # bitmap = board_to_bitmap(board)
        bitmap = board_to_tensor(board)
        writer.writerow([
            result,
            white_elo,
            black_elo,
            time_control,
            white_moved,
            bitmap.tolist(),
            played.uci(),
        ])
        white_moved = not white_moved


try:
    # Append mode: re-running this cell adds to the existing output file.
    # newline="" lets the csv module control line endings itself.
    with open("./dataset/processed/results_2_3.csv", "a", newline="") as f:
        writer = csv.writer(f)
        # Stream the raw file once in READ_SIZE batches instead of re-scanning
        # it from the top with a growing `skiprows` for every batch.
        with pd.read_csv('./dataset/raw/chess_games.csv', skiprows=start, names=HEADERS, chunksize=READ_SIZE) as reader:
            while RUN:
                ti = time()
                games_df = next(reader, None)
                if games_df is None:
                    # The raw file is exhausted: all games have been read.
                    break

                # Result, WhiteElo, BlackElo, TimeControl, BoardPositions: uint_64[], move_played
                wanted = games_df[["Result", "WhiteElo", "BlackElo", "TimeControl", "AN"]]
                for result, white_elo, black_elo, time_control, pgn in wanted.itertuples(index=False):
                    if not isinstance(pgn, str):
                        # A missing move list is read as NaN; skip that game
                        # instead of aborting the whole run.
                        continue
                    _write_game_rows(writer, result, white_elo, black_elo, time_control, pgn)

                start += len(games_df)
                total += len(games_df)
                tf = time()
                print(f"Read {len(games_df)} in {tf-ti} | TOTAL: {total}")

                # Check the limit after counting this batch so that no extra
                # batch is processed beyond MAX_GAMES.
                if total >= MAX_GAMES:
                    RUN = False

except Exception:
    # Top-level boundary of the cell: report the full traceback rather than
    # failing inside the handler.
    import traceback
    traceback.print_exc()

Read 10000 in 90.00027847290039 | TOTAL: 10000
Read 10000 in 88.6656084060669 | TOTAL: 20000
Read 10000 in 87.69201493263245 | TOTAL: 30000
Read 10000 in 88.65520358085632 | TOTAL: 40000
Read 10000 in 89.36712312698364 | TOTAL: 50000
Read 10000 in 90.83561062812805 | TOTAL: 60000
Read 10000 in 89.52825999259949 | TOTAL: 70000
Read 10000 in 89.48972868919373 | TOTAL: 80000
Read 10000 in 89.6222825050354 | TOTAL: 90000
Read 10000 in 89.45392298698425 | TOTAL: 100000
Read 10000 in 89.50855040550232 | TOTAL: 110000
Read 10000 in 89.74093627929688 | TOTAL: 120000
Read 10000 in 89.98530912399292 | TOTAL: 130000
Read 10000 in 90.28936648368835 | TOTAL: 140000
Read 10000 in 90.33854866027832 | TOTAL: 150000
Read 10000 in 91.9039466381073 | TOTAL: 160000
Read 10000 in 93.96782970428467 | TOTAL: 170000
Read 10000 in 92.381671667099 | TOTAL: 180000
Read 10000 in 96.61593675613403 | TOTAL: 190000
Read 10000 in 97.61758852005005 | TOTAL: 200000
Read 10000 in 96.41415047645569 | TOTAL: 210000
Read 1