In [1]:
from sklearn.feature_extraction import DictVectorizer
from collections import defaultdict, Counter
import chess.pgn
import chess
from chess import QUEEN
import numpy as np
import pandas as pd

In [205]:
def extract_game_features(game):
    """Extract features for a given game"""

    first_check = True
    first_queen_move = True

    features = defaultdict(int)
    node = game

    while node.variations:  # and node.board().fullmove_number < cut_off:
        move = node.variation(0).move

        #print("NAGs", node.nags)
        if len(node.nags) > 0:
            print("NAGS found at", game.headers['Event'])
        board = node.board()

        # print(board.fullmove_number, move)
        moved_piece = board.piece_type_at(move.from_square)
        captured_piece = board.piece_type_at(move.to_square)

        if moved_piece == QUEEN and first_queen_move:
            features['queen_moved_at'] = board.fullmove_number
            first_queen_move = False

        if captured_piece == QUEEN:
            features['queen_changed_at'] = board.fullmove_number

        # if captured_piece:
            # print('Capture', board.fullmove_number, move, moved_piece,captured_piece)
            # captures[]
            # if board.fullmove_number == 10:
            #    features['captures_5']
        if move.promotion:
            features['promotion'] += 1
        if board.is_check():
            features['total_checks'] += 1
            if first_check:
                features['first_check_at'] = board.fullmove_number
                first_check = False
        # castling
        uci_repr = move.uci()
        if uci_repr == 'e1g1':
            features['white_king_castle'] = board.fullmove_number
        elif uci_repr == 'e1c1':
            features['white_queen_castle'] = board.fullmove_number
        elif uci_repr == 'e8g8':
            features['black_king_castle'] = board.fullmove_number
        elif uci_repr == 'e8c8':
            features['black_queen_castle'] = board.fullmove_number

        node = node.variation(0)
    if board.is_checkmate():
        features['is_checkmate'] += 1
    if board.is_stalemate():
        features['is_stalemate'] += 1
    if board.is_insufficient_material():
        features['insufficient_material'] += 1
    if board.can_claim_draw():
        features['can_claim_draw'] += 1
    features['total_moves'] = board.fullmove_number

    # Pieces at the end of the game
    piece_placement = board.fen().split()[0]
    end_pieces = Counter(x for x in piece_placement if x.isalpha())

    # count number of piece at end position
    features.update({'end_' + piece: cnt
                     for piece, cnt in end_pieces.items()})
    return features

In [206]:
def games_features():
    l = []
    pgn = open("../givenData/data/data.pgn")
    #game = chess.pgn.read_game(pgn)
    cnt = 0
    elos = []
    results = []
    resultFeature = {}
    while cnt < 50000:
        game = chess.pgn.read_game(pgn)
        features = extract_game_features(game)
        result = game.headers['Result'].split("-")
        if (result[0] == '1/2'):
            result[0] = 0.5
            result[1] = 0.5
        else:
             result[0] = int(result[0])
             result[1] = int(result[1])
        features['WhiteResult'] = result[0]
        features['BlackResult']= result[1]
        results.append(resultFeature)
        l.append(features)
        if cnt < 25000:
            elos.append(int(game.headers['WhiteElo']))
            elos.append(int(game.headers['BlackElo']))
        cnt += 1

    vec = DictVectorizer()
    X = vec.fit_transform(l)
    return X,elos,results, l,vec

X, elos, results, features, vec = games_features()
names=vec.get_feature_names()
games_array = pd.DataFrame(X.toarray(), columns=names)

KeyboardInterrupt: 

In [8]:
import pickle
with open('datapgn1.pickle', 'wb') as f:
    pickle.dump(features, f)
    pickle.dump(elos, f)
    pickle.dump(vec, f)
    pickle.dump(X, f)
    pickle.dump(results, f)

In [46]:
with open('datapgn1.pickle', 'rb') as f:
    features = pickle.load(f)

In [50]:
features

[defaultdict(int,
             {'BlackResult': 0.5,
              'WhiteResult': 0.5,
              'black_king_castle': 5,
              'end_B': 1,
              'end_K': 1,
              'end_N': 1,
              'end_P': 6,
              'end_Q': 1,
              'end_R': 2,
              'end_b': 1,
              'end_k': 1,
              'end_n': 1,
              'end_p': 6,
              'end_q': 1,
              'end_r': 2,
              'first_check_at': 14,
              'queen_changed_at': 19,
              'queen_moved_at': 9,
              'total_checks': 1,
              'total_moves': 19,
              'white_king_castle': 7}),
 defaultdict(int,
             {'BlackResult': 0.5,
              'WhiteResult': 0.5,
              'end_B': 2,
              'end_K': 1,
              'end_N': 2,
              'end_P': 7,
              'end_Q': 1,
              'end_R': 2,
              'end_b': 2,
              'end_k': 1,
              'end_n': 2,
              'end_p': 7,
   

In [9]:
stockfish = pd.read_csv('../givenData/stockfish/stockfish.csv')
stockfish.MoveScores = stockfish.MoveScores.str.replace('NA','').str.split()
stockfish.MoveScores = stockfish.MoveScores.apply(lambda x: np.asarray(x, dtype=int))
stockfish.drop('Event', axis=1, inplace=True)
#stockfish.index = list(range(0, 2*len(ecos),2))
stockfish.head()

Unnamed: 0,MoveScores
0,"[18, 17, 12, 8, -5, 12, 3, -2, 22, 21, 20, 13,..."
1,"[26, 44, 26, 18, 14, 34, 36, 31, 37, 35, 42, 5..."
2,"[26, 51, 68, 57, 65, 77, 48, 93, 61, 63, 63, 5..."
3,"[2, 21, 5, 53, 35, 45, 37, 54, 10, 22, 8, 48, ..."
4,"[26, 64, 35, 53, 18, 20, 18, 20, 10, 49, 60, 9..."


In [10]:
def extract_score_features(scores):
    features = dict()
    if scores.size == 0:
        return features
    scores = np.r_[0, scores]
    abs_scores = np.abs(scores)
    diffs = np.diff(scores)
    white_diffs = diffs[::2]
    black_diffs = diffs[1::2]
    subset_names = ['diffs', 'white_diffs', 'black_diffs']
    subsets = [diffs, white_diffs, black_diffs]
    stats = [np.min, np.max, np.mean, lambda x: np.median(np.abs(x))]
    stat_names = ['min', 'max', 'mean', 'median_abs']
    for subset, subset_name in zip(subsets, subset_names):
        for stat, stat_name in zip(stats, stat_names):
            features[stat_name + '_' + subset_name] = stat(subset)
            # np.hi
    features['advantage120_idx'] = np.argmax(abs_scores > 120) or len(scores)
    features['advantage70_idx'] = np.argmax(abs_scores > 70) or len(scores)
    return features

In [11]:
l = stockfish.MoveScores.apply(extract_score_features).tolist()
vec = DictVectorizer()
X = vec.fit_transform(l)
scores_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

In [42]:
stock_fish_features = defaultdict(float)
row = 0
scores = np.r_[0, stockfish.ix[row, 0]]
white_scores = scores[::2]
black_scores = scores[1::2]
diffs = np.diff(scores)
white_diffs = diffs[::2]
black_diffs = diffs[1::2]
stock_fish_features['mean_abs_error'] = np.abs(diffs).sum() / len(scores)

In [55]:
scores_df

Unnamed: 0,advantage120_idx,advantage70_idx,max_black_diffs,max_diffs,max_white_diffs,mean_black_diffs,mean_diffs,mean_white_diffs,median_abs_black_diffs,median_abs_diffs,median_abs_white_diffs,min_black_diffs,min_diffs,min_white_diffs
0,39.0,28.0,44.0,44.0,30.0,6.421053,1.421053,-3.578947,9.0,10.5,11.0,-11.0,-27.0,-27.0
1,14.0,14.0,20.0,26.0,26.0,5.500000,4.230769,3.142857,9.0,7.0,6.0,-8.0,-18.0,-18.0
2,63.0,6.0,5009.0,5009.0,92.0,58.207547,-108.905660,-276.018868,13.0,13.5,17.0,-1550.0,-9578.0,-9578.0
3,69.0,28.0,85.0,85.0,31.0,15.789474,2.545455,-10.358974,13.5,13.0,13.0,-16.0,-69.0,-69.0
4,20.0,12.0,309.0,309.0,39.0,49.791667,17.857143,-12.800000,36.5,21.0,16.0,-11.0,-87.0,-87.0
5,59.0,41.0,27.0,29.0,29.0,2.758621,-0.189655,-3.137931,7.0,7.5,9.0,-28.0,-57.0,-57.0
6,28.0,26.0,5519.0,5519.0,56.0,181.567568,11.973333,-153.157895,21.0,12.0,11.0,-42.0,-5497.0,-5497.0
7,35.0,35.0,4926.0,5371.0,5371.0,-79.555556,0.000000,78.312500,13.0,9.0,8.5,-5375.0,-5375.0,-3825.0
8,70.0,32.0,229.0,229.0,101.0,10.533333,4.505495,-1.391304,8.0,8.0,8.5,-41.0,-108.0,-108.0
9,29.0,24.0,142.0,142.0,29.0,15.241379,5.949153,-3.033333,19.0,14.0,9.0,-32.0,-69.0,-69.0


In [54]:
final_df = scores_df.join(games_array)
final_df.values

array([[   39.,    28.,    44., ...,    19.,     7.,     0.],
       [   14.,    14.,    20., ...,     7.,     7.,     0.],
       [   63.,     6.,  5009., ...,    53.,     8.,     0.],
       ..., 
       [   46.,    44.,  1035., ...,    35.,     7.,     0.],
       [   57.,    31.,   123., ...,    38.,     6.,     0.],
       [   55.,    52.,   533., ...,    54.,    18.,     0.]])

In [13]:
elos = np.array(elos)
mean_elos = (elos[::2] + elos[1::2]) / 2
elo_difference = elos[::2] - elos[1::2]
mean_elos.shape, elo_difference.shape

((25000,), (25000,))

In [20]:
X_train, X_test = final_df[:20000], final_df[20000:25000]
mean_elo_train, mean_elo_test = mean_elos[:20000], mean_elos[20000:25000]
elo_diff_train, elo_diff_test = elo_difference[:20000], elo_difference[20000:25000]

In [167]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [168]:
rf_mean = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf_mean.fit(X_train, mean_elo_train)
mean_pred_rf = rf_mean.predict(X_train)
mean_absolute_error(mean_elo_train, mean_pred_rf)

65.238643299084046

In [169]:
rf_diff = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf_diff.fit(X_train, elo_diff_train)
elo_difference_pred = rf_diff.predict(X_train)
mean_absolute_error(elo_diff_train, elo_difference_pred)

55.444770499561614

In [170]:
mean_pred_rf = rf_mean.predict(X_test)
elo_difference_pred = rf_diff.predict(X_test)
white_elos_rf = mean_pred_rf + elo_difference_pred / 2
black_elos_rf = mean_pred_rf - elo_difference_pred / 2

In [31]:
real_white_elo = elos[40000:50000:2]
real_black_elo = elos[40001:50000:2]

In [172]:
mean_absolute_error(real_black_elo + real_white_elo, black_elos_rf+white_elos_rf)

349.63869384218333

In [61]:
for i in range(5000):
    print(real_white_elo[i], white_elos[i])
    print(real_black_elo[i], black_elos[i])

-549.959
0.703
153.485
129.341
-233.471
-108.857
-53.065
-14.277
-759.323
-1017.307
-318.813
-78.739
-204.419
63.075
235.582
282.06
37.959
-195.211
-17.115
-201.595
-507.423
-379.721
182.341
319.771
194.903
132.191
61.418
81.626
-265.875
-363.451
273.894
232.352
108.506
110.996
-42.835
-9.417
-50.078
-32.796
-285.893
-146.677
-84.663
-157.985
6.714
-158.89
320.073
277.713
189.607
113.227
105.635
-144.433
95.706
-26.684
244.748
70.166
286.421
222.763
184.848
115.232
-28.351
-63.127
-263.669
-325.211
270.184
120.41
-131.533
-312.157
-112.636
-61.834
11.627
198.499
-25.296
182.15
591.062
619.186
-280.879
-88.323
-415.758
-435.438
313.064
461.51
368.92
126.734
-596.517
-391.325
-200.307
-88.767
188.955
299.001
-119.241
190.241
128.089
245.409
316.638
241.276
-353.76
-294.996
213.907
147.459
323.544
-262.838
392.825
364.545
-64.579
-7.769
14.248
33.166
-13.085
-41.991
231.419
79.057
-381.871
-260.471
-127.207
38.763
78.113
-30.181
242.051
-73.031
147.053
-369.603
121.736
12.78
-215.688
-368

In [107]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [181]:
def baseline_model():
 # create model
    model = Sequential()
    model.add(Dense(64, input_dim=40, kernel_initializer='normal', activation='relu'))
    model.add(Dense(32, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

In [189]:
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
diff_estimator = KerasRegressor(build_fn=baseline_model, epochs=100, verbose=2)
mean_estimator = KerasRegressor(build_fn=baseline_model, epochs=100, verbose=2)

In [190]:
mean_estimator.fit(X_train.values, mean_elo_train)

Epoch 1/100
 - 14s - loss: 811.4363
Epoch 2/100
 - 7s - loss: 316.4296
Epoch 3/100
 - 7s - loss: 284.6816
Epoch 4/100
 - 7s - loss: 273.5235
Epoch 5/100
 - 8s - loss: 268.0299
Epoch 6/100
 - 7s - loss: 266.3611
Epoch 7/100
 - 7s - loss: 261.1208
Epoch 8/100
 - 7s - loss: 257.6505
Epoch 9/100
 - 7s - loss: 253.8711
Epoch 10/100
 - 7s - loss: 252.1670
Epoch 11/100
 - 7s - loss: 253.7728
Epoch 12/100
 - 7s - loss: 252.0343
Epoch 13/100
 - 8s - loss: 247.5992
Epoch 14/100
 - 8s - loss: 245.8707
Epoch 15/100
 - 8s - loss: 244.4202
Epoch 16/100
 - 8s - loss: 244.6123
Epoch 17/100
 - 7s - loss: 244.0549
Epoch 18/100
 - 8s - loss: 241.3076
Epoch 19/100
 - 8s - loss: 239.4962
Epoch 20/100
 - 7s - loss: 236.9457
Epoch 21/100
 - 7s - loss: 237.1899
Epoch 22/100
 - 6s - loss: 235.4197
Epoch 23/100
 - 7s - loss: 233.2359
Epoch 24/100
 - 8s - loss: 232.0477
Epoch 25/100
 - 7s - loss: 229.1196
Epoch 26/100
 - 7s - loss: 228.2650
Epoch 27/100
 - 7s - loss: 227.0938
Epoch 28/100
 - 7s - loss: 224.9538


<keras.callbacks.History at 0x1b252791c18>

In [191]:
mean_pred = mean_estimator.predict(X_test.values)

In [192]:
diff_estimator.fit(X_train.values, elo_diff_train)

Epoch 1/100
 - 10s - loss: 179.0145
Epoch 2/100
 - 7s - loss: 162.1872
Epoch 3/100
 - 7s - loss: 159.3778
Epoch 4/100
 - 7s - loss: 158.6998
Epoch 5/100
 - 8s - loss: 157.7445
Epoch 6/100
 - 8s - loss: 157.6511
Epoch 7/100
 - 8s - loss: 157.3156
Epoch 8/100
 - 7s - loss: 157.0364
Epoch 9/100
 - 8s - loss: 156.6723
Epoch 10/100
 - 7s - loss: 156.7509
Epoch 11/100
 - 7s - loss: 156.2287
Epoch 12/100
 - 8s - loss: 156.3272
Epoch 13/100
 - 7s - loss: 155.7955
Epoch 14/100
 - 7s - loss: 155.8879
Epoch 15/100
 - 8s - loss: 155.5559
Epoch 16/100
 - 7s - loss: 155.4478
Epoch 17/100
 - 8s - loss: 155.1137
Epoch 18/100
 - 8s - loss: 155.0431
Epoch 19/100
 - 8s - loss: 155.1060
Epoch 20/100
 - 8s - loss: 154.6100
Epoch 21/100
 - 8s - loss: 154.5940
Epoch 22/100
 - 8s - loss: 154.3903
Epoch 23/100
 - 8s - loss: 154.3311
Epoch 24/100
 - 8s - loss: 154.4174
Epoch 25/100
 - 8s - loss: 154.3206
Epoch 26/100
 - 8s - loss: 153.9382
Epoch 27/100
 - 7s - loss: 154.0498
Epoch 28/100
 - 8s - loss: 153.5959


<keras.callbacks.History at 0x1b252d53390>

In [193]:
diff_pred = diff_estimator.predict(X_test.values)

In [194]:
white_elos = mean_pred + diff_pred / 2
black_elos = mean_pred - diff_pred / 2

In [195]:
real_white_elo = elos[40000:50000:2]
real_black_elo = elos[40001:50000:2]

In [199]:
[print(white_elos_rf[i], white_elos[i], real_white_elo[i]) for i in range(5000)]

2248.751 2186.99 1690
2152.857 2051.6 2320
2198.678 2267.44 1963
1948.682 1949.77 1881
2398.629 2377.27 1647
2122.373 2048.81 1803
2287.484 2338.27 2107
2408.44 2297.24 2643
2179.701 2114.42 2198
2221.181 2267.5 2213
2190.205 2208.75 1676
2006.639 2011.58 2198
2214.682 2283.25 2398
2488.832 2383.57 2543
2287.183 2225.54 2005
2416.785 2451.48 2715
2198.043 1923.01 2305
2178.835 2175.48 2096
2407.243 2409.73 2381
2318.026 2534.02 2024
2355.327 2317.57 2262
2293.146 2279.4 2308
2245.14 2217.15 2572
2254.531 2335.38 2431
2286.757 2285.85 2399
2268.588 2311.65 2341
2361.822 2272.45 2608
2405.972 2358.47 2687
2416.908 2443.69 2597
2412.046 2442.26 2392
2267.457 2272.78 2030
2396.086 2416.77 2651
2315.172 2326.7 2172
2312.811 2321.46 2201
2112.284 2012.54 2130
2179.515 2330.48 2163
2104.104 2169.77 2723
2114.536 2211.22 1805
2178.259 2219.3 1801
2269.591 2243.57 2577
2325.393 2590.39 2674
2189.144 2311.26 1573
2128.638 2127.34 1941
2316.086 2367.81 2491
2390.657 2289.01 2254
2276.434 2337.36 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [200]:
[print(white_elos_rf[i], black_elos[i], real_black_elo[i]) for i in range(5000)]

2248.751 2176.26 2248
2152.857 2168.66 2423
2198.678 1974.49 1798
1948.682 2095.97 2141
2398.629 2345.02 1399
2122.373 2230.33 2215
2287.484 2309.25 2388
2408.44 2319.84 2556
2179.701 2264.19 2117
2221.181 2160.4 1898
2190.205 1966.73 1528
2006.639 2221.34 2521
2214.682 2108.77 2187
2488.832 2344.96 2605
2287.183 2221.52 1912
2416.785 2441.31 2707
2198.043 2102.76 2458
2178.835 2059.57 2028
2407.243 2179.82 2212
2318.026 2734.43 2269
2355.327 2286.8 2174
2293.146 2061.7 1979
2245.14 2323.08 2679
2254.531 2127.48 2192
2286.757 2178.42 2025
2268.588 2125.61 2097
2361.822 2106.81 2277
2405.972 2313.24 2617
2416.908 2306.17 2404
2412.046 2350.67 2377
2267.457 2286.15 2027
2396.086 2401.3 2498
2315.172 2340.28 2049
2312.811 2289.41 2183
2112.284 1995.52 2327
2179.515 2527.3 2520
2104.104 2237.43 2721
2114.536 2389.06 2132
2178.259 1975.64 1614
2269.591 2180.63 2711
2325.393 2506.93 2397
2189.144 2419.2 2023
2128.638 2246.34 2215
2316.086 2277.29 2491
2390.657 2091.53 2387
2276.434 2344.34 2

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [198]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(black_elos+white_elos, real_black_elo + real_white_elo)

400.30956733398438