In [17]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import seaborn as sns
import sklearn as sk
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [18]:
# Load the games data from 'games.csv' (the path is relative to the working directory).
games = pd.read_csv('games.csv')

In [19]:
# Inspect the column names to see which fields are available.
games.columns

Index(['id', 'rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
       'winner', 'increment_code', 'white_id', 'white_rating', 'black_id',
       'black_rating', 'moves', 'opening_eco', 'opening_name', 'opening_ply'],
      dtype='object')

I am interested in determining how the difference between two ratings relates to the probability of winning for each player. 

For example, suppose that we both meet to play a chess match. I, having been a decent player, have a rating of 1800. You just started playing the other day, so you have a rating of 1000. The idea behind a rating is to let us know that we are not evenly matched. Given my rating, you should know that I will probably crush you, and that your chance of winning is dismally lower than mine. So how can I quantify this intuition? 

My intuition tells me that the bigger the difference, the more likely the person with the higher rating will win.

Our ingredients for this cake will be a few new variables:

1) diff_rating will quantify the difference between the player ratings,

2) abs_diff_rating will simply take the absolute value of the quantity above.

3) higher_rating_won (hr_won for short) will be a binary variable taking 1 if the player with the higher rating won, and 0 if they didn't.

4) higher_rating_coded will indicate whether white had the higher rating (takes value of 1, 0 otherwise).


Having created these variables, we intend to use abs_diff_rating to predict whether hr_won (the higher_rating_won column) will be a 1 or a 0 using a logistic regression. 

In [20]:
# Signed rating gap between the two players: white's rating minus black's rating.
games['diff_rating'] = games['white_rating'].sub(games['black_rating'])

In [21]:
# Size of the rating gap, ignoring which side is ahead.
games['abs_diff_rating'] = games['diff_rating'].abs()

In [22]:
# Create new key
# -- higher_rating will encode who has the higher rating.
#    Every row starts out as an empty string; the labels are assigned in the following cell.
games['higher_rating'] = ''

In [23]:
# diff_rating is white's rating minus black's, so its sign tells us who is rated higher:
#   positive -> white, negative -> black, zero -> both players have the same rating.
games.loc[games['diff_rating'].gt(0), 'higher_rating'] = 'white'
games.loc[games['diff_rating'].lt(0), 'higher_rating'] = 'black'
games.loc[games['diff_rating'].eq(0), 'higher_rating'] = 'same'

In [24]:
# Flag the games won by the higher-rated player:
# 1 when `winner` equals `higher_rating`, 0 in every other row.
# NOTE(review): rows where higher_rating is 'same', and drawn games, presumably never
# match and therefore end up as 0 -- confirm that this is the intended coding.
games['higher_rating_won'] = games['winner'].eq(games['higher_rating']).astype('int64')

In [25]:
# Quick sanity check: show the new flag next to the two columns it was derived from.
games.loc[:, ['higher_rating_won', 'winner', 'higher_rating']].head()

Unnamed: 0,higher_rating_won,winner,higher_rating
0,1,white,white
1,0,black,white
2,0,white,black
3,0,white,black
4,1,white,white


In [26]:
# Binary version of higher_rating that is easy to feed to a model:
# 1 when white is the higher-rated player, 0 for every other value.
games['higher_rating_coded'] = games['higher_rating'].eq('white').astype('int64')

Now would be a good time to show the variables we just created. Also, let's see the original winner variable. 

In [31]:
# Randomly select a sample of rows (drawn with replacement) from the variables we created.
# We want to see if our created variables match up.
# A dedicated, seeded generator keeps the sampled rows identical on every run, so the
# displayed table is reproducible after a kernel restart.
rand_ind = np.random.RandomState(0).randint(len(games), size=10)
games.iloc[rand_ind][['higher_rating', 'higher_rating_coded', 'diff_rating', 'abs_diff_rating', 'higher_rating_won', 'winner']]

Unnamed: 0,higher_rating,higher_rating_coded,diff_rating,abs_diff_rating,higher_rating_won,winner
1016,white,1,287,287,1,white
14209,white,1,973,973,1,white
19892,black,0,-59,59,1,black
10413,white,1,64,64,1,white
5541,white,1,24,24,1,white
10575,black,0,-72,72,1,black
2121,black,0,-215,215,0,white
3798,white,1,115,115,1,white
13122,white,1,113,113,0,black
5457,black,0,-158,158,1,black


With this small output, we can see that we've done what we set out to do.

After trying a multiple logistic regression with these variables, I wonder whether we are oversimplifying. For one, to make the problem binary (so that it can be modelled with a logistic regression), I hid the number of draws. So, we will make a new variable that includes draws, wins, and losses for the higher-rated player. 

In [32]:
# Three-way outcome code:
#   2 = the higher-rated player won, 1 = the game was a draw, 0 = every other game.
# np.select uses the first matching condition, so a higher-rated win takes
# precedence over a draw, exactly as when the win label is assigned last.
games['result'] = np.select(
    [games['higher_rating_won'].eq(1), games['winner'].eq('draw')],
    [2, 1],
    default=0,
).astype('int64')

In [35]:
# Peek at the last rows to compare the new outcome code with its source columns.
games.loc[:, ['result', 'higher_rating_won', 'winner']].tail()

Unnamed: 0,result,higher_rating_won,winner
20053,2,1,white
20054,0,0,black
20055,0,0,white
20056,2,1,white
20057,2,1,black


In [37]:
# Sanity check: every row coded as result == 1 should have 'draw' as its winner.
games.loc[games['result'].eq(1), 'winner'].value_counts()

draw    950
Name: winner, dtype: int64

In [38]:
# Output a new dataset with the variables we just created.
# Note: by default to_csv also writes the row index as an unnamed first column, so a
# plain pd.read_csv of this file will show it as an extra 'Unnamed: 0' column.
games.to_csv('games_new_vars.csv')