In [1]:
import pandas as pd
import numpy as np

In [3]:
CHUNKS = 2

white_frames = []
black_frames = []

for i in range(CHUNKS):
    white_frames.append(pd.read_csv(f'database/white_{i*100+1}-{(i+1)*100}.csv'))
    black_frames.append(pd.read_csv(f'database/black_{i*100+1}-{(i+1)*100}.csv'))

white_all_df = pd.concat(white_frames).dropna()
black_all_df = pd.concat(black_frames).dropna()

white_all_df.shape[0], black_all_df.shape[0]

(6182, 6303)

In [4]:
white_all_df

Unnamed: 0,a1P,b1P,c1P,d1P,e1P,f1P,g1P,h1P,a2P,b2P,...,b8k,c8k,d8k,e8k,f8k,g8k,h8k,k,q,score
0,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,46
1,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,83
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,True,77
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,True,152
4,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,True,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,134
3110,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,320
3111,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,999998
3112,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,174


In [8]:
import chess
import chess.engine
import chess.pgn

In [9]:
PIECES = [chess.PAWN, chess.KNIGHT, chess.BISHOP, chess.ROOK, chess.QUEEN, chess.KING]
PLAYERS = [chess.WHITE, chess.BLACK]

fix_names_func = {chess.WHITE: lambda x: x.upper(), chess.BLACK: lambda x: x.lower()}

In [10]:
def get_columns_names():
    columns = []
    for player in PLAYERS:
        for piece in PIECES:
            symbol = fix_names_func[player](chess.piece_symbol(piece))
            columns.extend(map(lambda square: f"{square}{symbol}", chess.SQUARE_NAMES))
        columns.append(fix_names_func[player]("K"))
        columns.append(fix_names_func[player]("Q"))
    return columns

In [11]:
columns = get_columns_names()

white_df = white_all_df.drop_duplicates(subset=columns)
black_df = black_all_df.drop_duplicates(subset=columns)

white_df.shape[0], black_df.shape[0]

(5840, 5865)

In [13]:
SCORE_THRESHOLD = 50
white_dataset = white_df[(white_df['score'] >= SCORE_THRESHOLD) | (white_df['score'] <= -SCORE_THRESHOLD)]
black_dataset = black_df[(black_df['score'] >= SCORE_THRESHOLD) | (black_df['score'] <= -SCORE_THRESHOLD)]

white_dataset.shape[0], black_dataset.shape[0]

(4600, 4485)

# Classification

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
from sklearn.linear_model import LogisticRegression

def test_lr(dataset):
    X = dataset.drop(['score'], axis=1)
    Y = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, Y, test_size=0.33, random_state=1)
    print("score:", LogisticRegression(random_state=1, max_iter=250).fit(X_trn, y_trn).score(X_tst, y_tst))

In [17]:
test_lr(white_dataset)
test_lr(black_dataset)

score: 0.8675889328063241
score: 0.8399729912221472


In [18]:
from sklearn.svm import SVC

def test_svc_poly(dataset):
    X = dataset.drop(['score'], axis=1)
    Y = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, Y, test_size=0.33, random_state=1)
    print("score:", SVC(kernel='poly', degree=2).fit(X_trn, y_trn).score(X_tst, y_tst))

def test_svc_rbf(dataset):
    X = dataset.drop(['score'], axis=1)
    Y = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, Y, test_size=0.33, random_state=1)
    print("score:", SVC(kernel='rbf', gamma='scale', C=30).fit(X_trn, y_trn).score(X_tst, y_tst)) 

In [19]:
test_svc_poly(white_dataset)
test_svc_poly(black_dataset)

score: 0.9235836627140975
score: 0.9041188386225524


In [20]:
test_svc_rbf(white_dataset)
test_svc_rbf(black_dataset)

score: 0.9525691699604744
score: 0.9223497636731938


In [21]:
from sklearn.ensemble import RandomForestClassifier

def test_rf(dataset):
    X = dataset.drop(['score'], axis=1)
    Y = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, Y, test_size=0.33, random_state=1)
    print("score:", RandomForestClassifier(random_state=1, n_estimators=200).fit(X_trn, y_trn).score(X_tst, y_tst))

In [22]:
test_rf(white_dataset)
test_rf(black_dataset)

score: 0.9492753623188406
score: 0.9237002025658338
