# Chess Game Results

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

## Wrangle Data

In [2]:
# Define function to get white and blacks first two moves 
def first_move_white(col):
    moves = col.split()
    return moves[1]

def first_move_black(col):
    moves = col.split()
    return moves[2]

def second_move_white(col):
    moves = col.split()
    return moves[4]

def second_move_black(col):
    moves = col.split()
    return moves[5]

# Define function to find length of game
def game_len(col):
    moves = col.split()
    return len(moves)

In [3]:
# Define wrangle function 
def wrangle(data):
    df = pd.read_csv('../data/'+data)
    
    #Create datetime column and make that column the index
    df['UTCDateTime'] = df['UTCDate'] + ' ' + df['UTCTime']
    df.set_index('UTCDateTime', inplace=True)
    
    #Drop UTCDate and UTCTime
    df.drop(columns=['UTCDate', 'UTCTime'], inplace=True)
    
    #Drop * values for result
    df = df[df.Result != '*']
    
    #Drop games with less than 5 turns
    df['length'] = df['AN'].apply(game_len)
    df = df[df.length > 15]
    df.drop(columns=['length'], axis=1, inplace=True)
    
    #Drop player names
    df.drop(columns=['White', 'Black'], axis=1, inplace=True)
    
    #Make new Columns for the first moves of black and white 
    df['WhiteFirst'] = df['AN'].apply(first_move_white)
    df['BlackFirst'] = df['AN'].apply(first_move_black)
    df['WhiteSecond'] = df['AN'].apply(second_move_white)
    df['BlackSecond'] = df['AN'].apply(second_move_black)
    
    #Drop moves and elo change to avoid data leakage
    df.drop(columns=['AN', 'WhiteRatingDiff', 'BlackRatingDiff'], axis=1, inplace=True)
    
    return df

In [4]:
df = wrangle('chess.csv')


df.head()

Unnamed: 0_level_0,Unnamed: 0,Event,Result,WhiteElo,BlackElo,ECO,Opening,TimeControl,Termination,WhiteFirst,BlackFirst,WhiteSecond,BlackSecond
UTCDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016.06.30 22:00:01,0,Classical,1-0,1901,1896,D10,Slav Defense,300+5,Time forfeit,d4,d5,c4,c6
2016.06.30 22:00:01,1,Blitz,0-1,1641,1627,C20,King's Pawn Opening: 2.b3,300+0,Normal,e4,e5,b3,Nf6
2016.06.30 22:00:02,2,Blitz tournament,1-0,1647,1688,B01,Scandinavian Defense: Mieses-Kotroc Variation,180+0,Time forfeit,e4,d5,exd5,Qxd5
2016.06.30 22:00:02,3,Correspondence,1-0,1706,1317,A00,Van't Kruijs Opening,-,Normal,e3,Nf6,Bc4,d6
2016.06.30 22:00:02,4,Blitz tournament,0-1,1945,1900,B90,"Sicilian Defense: Najdorf, Lipnitsky Attack",180+0,Time forfeit,e4,c5,Nf3,d6


## Baseline Accuracy

In [5]:
base_acc = df['Result'].value_counts(normalize=True)[0]
print('Baseline Accuracy:', base_acc)

Baseline Accuracy: 0.49296060843565853


## Split Data

In [6]:
target = 'Result'

X = df.drop(columns=target, axis = 1)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .2, random_state=42)

## Build Model

In [7]:
# Hyperparamater tuning for LogisticRegressionCV
# pipe = make_pipeline(
#                 OneHotEncoder(use_cat_names=True), 
#                 SimpleImputer(), 
#                 LogisticRegression(random_state=42, solver='saga', n_jobs=-1)
# )

# params = {
#     'logisticregression__max_iter': range(75,201,25)
# }

# model_gs = GridSearchCV(
#         pipe,
#         param_grid=params, 
#         cv=5,
#         n_jobs=-1, 
#         verbose=1
# )

# model_gs.fit(X_train, y_train);

# model_gs.best_params_

In [8]:
# Model built with linear classification
model_LR = make_pipeline(
                OrdinalEncoder(), 
                SimpleImputer(), 
                LogisticRegression(random_state=42, solver='saga', n_jobs=-1, max_iter=125)
)
model_LR.fit(X_train, y_train);



In [9]:
print('Training Accuracy:', model_LR.score(X_train, y_train))
print('Validation Accuracy:', model_LR.score(X_val, y_val))

Training Accuracy: 0.6350375557893972
Validation Accuracy: 0.6259796806966619


In [15]:
# Hyperparamater tuning for XGBClassifier
pipe = make_pipeline(
                OrdinalEncoder(), 
                SimpleImputer(), 
                RandomForestClassifier(random_state=42, criterion='entropy', n_estimators=200)
)

params = {
    'randomforestclassifier__max_depth': range(4,16)
}

model_gsr = GridSearchCV(
        pipe,
        param_grid=params, 
        cv=5,
        n_jobs=-1, 
        verbose=1
)

model_gsr.fit(X_train, y_train);

model_gsr.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


{'randomforestclassifier__max_depth': 9}

In [24]:
# Model built with XGB classifier 
model_RF = make_pipeline(
            OrdinalEncoder(), 
            SimpleImputer(), 
            RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=9, n_estimators=200)
)
model_RF.fit(X_train, y_train);

In [25]:
print('Training Accuracy:', model_RF.score(X_train, y_train))
print('Validation Accuracy:', model_RF.score(X_val, y_val))

Training Accuracy: 0.6959613919227838
Test Accuracy: 0.618577648766328


## Test Score
The better of the two mdeels was the logistical Regressor. Below is the test data accuracy for that model. 

In [13]:
test_df = wrangle('test_chess.csv')
target = 'Result'

X_test = test_df.drop(columns=target, axis = 1)
y_test = test_df[target]

In [14]:
model_LR.score(X_test, y_test)

0.6260207963479584