In [1]:
from sklearn.feature_extraction import DictVectorizer
from collections import defaultdict, Counter
import chess.pgn
import chess
from chess import QUEEN
import numpy as np
import pandas as pd

In [2]:
def extract_game_features(game):
    """Extract features for a given game"""

    first_check = True
    first_queen_move = True

    features = defaultdict(int)
    node = game

    while node.variations:  # and node.board().fullmove_number < cut_off:
        move = node.variation(0).move

        board = node.board()

        # print(board.fullmove_number, move)
        moved_piece = board.piece_type_at(move.from_square)
        captured_piece = board.piece_type_at(move.to_square)

        if moved_piece == QUEEN and first_queen_move:
            features['queen_moved_at'] = board.fullmove_number
            first_queen_move = False

        if captured_piece == QUEEN:
            features['queen_changed_at'] = board.fullmove_number

        # if captured_piece:
            # print('Capture', board.fullmove_number, move, moved_piece,captured_piece)
            # captures[]
            # if board.fullmove_number == 10:
            #    features['captures_5']
        if move.promotion:
            features['promotion'] += 1
        if board.is_check():
            features['total_checks'] += 1
            if first_check:
                features['first_check_at'] = board.fullmove_number
                first_check = False
        # castling
        uci_repr = move.uci()
        if uci_repr == 'e1g1':
            features['white_king_castle'] = board.fullmove_number
        elif uci_repr == 'e1c1':
            features['white_queen_castle'] = board.fullmove_number
        elif uci_repr == 'e8g8':
            features['black_king_castle'] = board.fullmove_number
        elif uci_repr == 'e8c8':
            features['black_queen_castle'] = board.fullmove_number

        node = node.variation(0)
    if board.is_checkmate():
        features['is_checkmate'] += 1
    if board.is_stalemate():
        features['is_stalemate'] += 1
    if board.is_insufficient_material():
        features['insufficient_material'] += 1
    if board.can_claim_draw():
        features['can_claim_draw'] += 1
    features['total_moves'] = board.fullmove_number

    # Pieces at the end of the game
    piece_placement = board.fen().split()[0]
    end_pieces = Counter(x for x in piece_placement if x.isalpha())

    # count number of piece at end position
    features.update({'end_' + piece: cnt
                     for piece, cnt in end_pieces.items()})
    return features

In [6]:
def games_features():
    l = []
    pgn = open("../givenData/data/data.pgn")
    #game = chess.pgn.read_game(pgn)
    cnt = 0
    elos = []
    results = []
    resultFeature = {}
    while cnt < 50000:
        game = chess.pgn.read_game(pgn)
        features = extract_game_features(game)
        result = game.headers['Result'].split("-")
        if (result[0] == '1/2'):
            result[0] = 0.5
            result[1] = 0.5
        else:
             result[0] = int(result[0])
             result[1] = int(result[1])
        features['WhiteResult'] = result[0]
        features['BlackResult']= result[1]
        results.append(resultFeature)
        l.append(features)
        if cnt < 25000:
            elos.append(int(game.headers['WhiteElo']))
            elos.append(int(game.headers['BlackElo']))
        cnt += 1

    vec = DictVectorizer()
    X = vec.fit_transform(l)
    return X,elos,results, l,vec

X, elos, results, features, vec = games_features()
names=vec.get_feature_names()
games_array = pd.DataFrame(X.toarray(), columns=names)

In [8]:
import pickle
with open('datapgn.pickle', 'wb') as f:
    pickle.dump(features, f)
    pickle.dump(elos, f)
    pickle.dump(vec, f)
    pickle.dump(X, f)
    pickle.dump(results, f)

In [9]:
stockfish = pd.read_csv('../givenData/stockfish/stockfish.csv')
stockfish.MoveScores = stockfish.MoveScores.str.replace('NA','').str.split()
stockfish.MoveScores = stockfish.MoveScores.apply(lambda x: np.asarray(x, dtype=int))
stockfish.drop('Event', axis=1, inplace=True)
#stockfish.index = list(range(0, 2*len(ecos),2))
stockfish.head()

Unnamed: 0,MoveScores
0,"[18, 17, 12, 8, -5, 12, 3, -2, 22, 21, 20, 13,..."
1,"[26, 44, 26, 18, 14, 34, 36, 31, 37, 35, 42, 5..."
2,"[26, 51, 68, 57, 65, 77, 48, 93, 61, 63, 63, 5..."
3,"[2, 21, 5, 53, 35, 45, 37, 54, 10, 22, 8, 48, ..."
4,"[26, 64, 35, 53, 18, 20, 18, 20, 10, 49, 60, 9..."


In [10]:
def extract_score_features(scores):
    features = dict()
    if scores.size == 0:
        return features
    scores = np.r_[0, scores]
    abs_scores = np.abs(scores)
    diffs = np.diff(scores)
    white_diffs = diffs[::2]
    black_diffs = diffs[1::2]
    subset_names = ['diffs', 'white_diffs', 'black_diffs']
    subsets = [diffs, white_diffs, black_diffs]
    stats = [np.min, np.max, np.mean, lambda x: np.median(np.abs(x))]
    stat_names = ['min', 'max', 'mean', 'median_abs']
    for subset, subset_name in zip(subsets, subset_names):
        for stat, stat_name in zip(stats, stat_names):
            features[stat_name + '_' + subset_name] = stat(subset)
            # np.hi
    features['advantage120_idx'] = np.argmax(abs_scores > 120) or len(scores)
    features['advantage70_idx'] = np.argmax(abs_scores > 70) or len(scores)
    return features

In [11]:
l = stockfish.MoveScores.apply(extract_score_features).tolist()
vec = DictVectorizer()
X = vec.fit_transform(l)
scores_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

In [12]:
final_df = scores_df.join(games_array)

In [13]:
elos = np.array(elos)
mean_elos = (elos[::2] + elos[1::2]) / 2
elo_difference = elos[::2] - elos[1::2]
mean_elos.shape, elo_difference.shape

((25000,), (25000,))

In [20]:
X_train, X_test = final_df[:20000], final_df[20000:25000]
mean_elo_train, mean_elo_test = mean_elos[:20000], mean_elos[20000:25000]
elo_diff_train, elo_diff_test = elo_difference[:20000], elo_difference[20000:25000]

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [22]:
rf_mean = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf_mean.fit(X_train, mean_elo_train)
mean_pred_rf = rf_mean.predict(X_train)
mean_absolute_error(mean_elo_train, mean_pred_rf)

65.045735119549647

In [23]:
rf_diff = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf_diff.fit(X_train, elo_diff_train)
elo_difference_pred = rf_diff.predict(X_train)
mean_absolute_error(elo_diff_train, elo_difference_pred)

55.42280952653099

In [24]:
mean_pred_rf = rf_mean.predict(X_test)
elo_difference_pred = rf_diff.predict(X_test)
white_elos = mean_pred_rf + elo_difference_pred / 2
black_elos = mean_pred_rf - elo_difference_pred / 2

In [31]:
real_white_elo = elos[40000:50000:2]
real_black_elo = elos[40001:50000:2]

In [35]:
for i in range(5000):
    print(real_white_elo[i], white_elos[i])
    print(real_black_elo[i], black_elos[i])

1690 2239.959
2248 2247.297
2320 2166.515
2423 2293.659
1963 2196.471
1798 1906.857
1881 1934.065
2141 2155.277
1647 2406.323
1399 2416.307
1803 2121.813
2215 2293.739
2107 2311.419
2388 2324.925
2643 2407.418
2556 2273.94
2198 2160.041
2117 2312.211
2213 2230.115
1898 2099.595
1676 2183.423
1528 1907.721
2198 2015.659
2521 2201.229
2398 2203.097
2187 2054.809
2543 2481.582
2605 2523.374
2005 2270.875
1912 2275.451
2715 2441.106
2707 2474.648
2305 2196.494
2458 2347.004
2096 2138.835
2028 2037.417
2381 2431.078
2212 2244.796
2024 2309.893
2269 2415.677
2262 2346.663
2174 2331.985
2308 2301.286
1979 2137.89
2572 2251.927
2679 2401.287
2431 2241.393
2192 2078.773
2399 2293.365
2025 2169.433
2341 2245.294
2097 2123.684
2608 2363.252
2277 2206.834
2687 2400.579
2617 2394.237
2597 2412.152
2404 2288.768
2392 2420.351
2377 2440.127
2030 2293.669
2027 2352.211
2651 2380.816
2498 2377.59
2172 2303.533
2049 2361.157
2201 2313.636
2183 2244.834
2130 2118.373
2327 2128.501
2163 2188.296
2520 2337