In [39]:
from pathlib import Path
import pandas as pd

In [40]:
def get_color(player, game):
    """Return the side ("White" or "Black") that `player` had in `game`.

    Parameters
    ----------
    player : str
        Player key (the PGN/CSV file stem, e.g. "carlsen" or "WangH").
        Matching is case-insensitive; concatenated keys are translated to the
        spelling with a space before matching.
    game : mapping
        Anything indexable by "White" and "Black" (a dict or a metadata row)
        holding the two players' names.

    Returns
    -------
    str or None
        "White" or "Black". None when the player is not found and at least
        one of the two names is the placeholder "?".

    Raises
    ------
    Exception
        If the player matches neither name and neither name is "?".
    """
    aliases = {
        "dominguezperez": "dominguez perez",
        "vachierlagrave": "vachier lagrave",
        "vallejopons": "vallejo pons",
        "wangh": "wang",
    }
    key = player.lower()
    key = aliases.get(key, key)

    def surname_candidates(side):
        # Two guesses at the surname: the text before the first comma
        # ("Surname, Given") and the text before the first space
        # ("Surname Given", e.g. for Chinese players).
        full_name = game[side]
        if not isinstance(full_name, str):  # e.g. NaN from an empty CSV cell
            return ()
        lowered = full_name.lower()
        return (lowered.split(",")[0], lowered.split(" ")[0])

    sides = ("White", "Black")
    candidates = {side: surname_candidates(side) for side in sides}

    # Pass 1: exact surname match. Without this, a short key such as "le" or
    # "so" is found inside an opponent's surname ("Aleksandrov", "Sokolov")
    # and, because White is tested first, the wrong colour is returned.
    for side in sides:
        if any(key == candidate.strip() for candidate in candidates[side]):
            return side

    # Pass 2: substring match, for abbreviated keys such as "nepo". The order
    # is: comma form for White then Black, then space form for White then Black.
    for form in (0, 1):
        for side in sides:
            if candidates[side] and key in candidates[side][form]:
                return side

    if game['White'] == '?' or game['Black'] == '?':
        return None
    raise Exception("Player not in game")

In [41]:
def get_cp_loss(player):
    """Compute the per-move centipawn loss of `player` and save it as CSV.

    Reads ``output/centipawns/{player}.csv`` and
    ``output/metadata/{player}.csv`` (one row per game in each) and writes
    ``output/cp_loss/{player}.csv``, one row per game, holding the
    non-negative evaluation drop attributed to each of the player's moves.

    NOTE(review): the sign handling below presumes the centipawn columns are
    consecutive evaluations from White's point of view — confirm against the
    code that produces ``output/centipawns``.

    Parameters
    ----------
    player : str
        Player key used in the file names.
    """
    cp = pd.read_csv(f"output/centipawns/{player}.csv", index_col=0)
    # Cap extreme evaluations so a single huge swing cannot dominate a game.
    cp = cp.clip(-1000, 1000)
    # Change in evaluation between consecutive columns of each game.
    diffs = cp.diff(axis=1)
    md = pd.read_csv(f"output/metadata/{player}.csv", index_col=0)
    colors = [get_color(player, row) for _, row in md.iterrows()]

    all_diffs = []
    for i in range(cp.shape[0]):
        color = colors[i]
        # After dropping NaNs (the first diff is always NaN), Black takes the
        # even positions and everything else starts at position 1.
        # NOTE(review): games whose colour is unknown (None) are sliced like
        # White but not sign-flipped; this matches the previous behaviour.
        start = 0 if color == "Black" else 1
        player_diffs = diffs.iloc[i, :].dropna().to_numpy()[start::2]
        if color == "White":
            # Negate into a new array: the array handed out by pandas can be
            # read-only (Copy-on-Write), so an in-place `*= -1` would raise.
            player_diffs = -player_diffs
        # Only drops in evaluation count as loss; improvements become 0.
        all_diffs.append(player_diffs.clip(min=0))

    # Rows have different lengths; pandas pads the shorter ones with NaN.
    df = pd.DataFrame(all_diffs)
    df.to_csv(f"output/cp_loss/{player}.csv")

In [42]:
# Player keys are the stems of the PGN files in ./pgns. They are sorted
# case-insensitively because glob yields entries in a filesystem-dependent
# order, which would make the notebook's outputs differ between machines.
pgns = Path("pgns")
players = sorted((pgn.stem for pgn in pgns.glob("*.pgn")), key=str.lower)
players

['Anand',
 'Andreikin',
 'Aronian',
 'Bu',
 'carlsen',
 'Caruana',
 'ding',
 'DominguezPerez',
 'Duda',
 'Eljanov',
 'erigaisi',
 'Firouzja',
 'Giri',
 'gukesh',
 'Harikrishna',
 'Karjakin',
 'Le',
 'Mamedyarov',
 'Nakamura',
 'nepo',
 'niemann',
 'Rapport',
 'So',
 'Tomashevsky',
 'Topalov',
 'VachierLagrave',
 'VallejoPons',
 'Vitiugov',
 'WangH',
 'Wei',
 'Yu']

Calculate move-by-move centipawn loss.

In [43]:
# Write output/cp_loss/<player>.csv for every player; each printed name
# marks the player currently being processed.
for name in players:
    print(name)
    get_cp_loss(name)

Anand
Andreikin
Aronian
Bu
carlsen
Caruana
ding
DominguezPerez
Duda
Eljanov
erigaisi
Firouzja
Giri
gukesh
Harikrishna
Karjakin
Le
Mamedyarov
Nakamura
nepo
niemann
Rapport
So
Tomashevsky
Topalov
VachierLagrave
VallejoPons
Vitiugov
WangH
Wei
Yu


Calculate mean cp loss

In [44]:
# Row-wise mean of each player's cp-loss table, saved as a single 'Mean_CP'
# column per player.
for player in players:
    df = (
        pd.read_csv(f"output/cp_loss/{player}.csv", index_col=0)
        .mean(axis=1)
        .to_frame('Mean_CP')
    )
    df.to_csv(f"output/mean_cp_loss/{player}.csv")

Calculate std cp loss

In [45]:
# Row-wise standard deviation of each player's cp-loss table, saved as a
# single 'Std_CP' column per player.
for player in players:
    df = (
        pd.read_csv(f"output/cp_loss/{player}.csv", index_col=0)
        .std(axis=1)
        .to_frame('Std_CP')
    )
    df.to_csv(f"output/std_cp_loss/{player}.csv")

Calculate Elo ratings (player and opponent)

In [None]:
import re

# Split each game's WhiteElo/BlackElo into the player's own rating ('Elo')
# and the opponent's rating ('OppElo'), and save both per player.
for player in players:
    md = pd.read_csv(f"output/metadata/{player}.csv", index_col=0)

    # Decide the player's side with get_color (defined earlier in this
    # notebook and already used for the cp-loss files). A plain substring
    # search on both name columns marks a game as White *and* Black whenever
    # the opponent's name contains the player's key, which used to store the
    # wrong 'Elo' and leave 'OppElo' at 0.
    colors = pd.Series(
        [get_color(player, row) for _, row in md.iterrows()],
        index=md.index,
        dtype=object,
    )
    is_white = colors == "White"
    is_black = colors == "Black"

    # 0 marks games in which the player's side could not be determined.
    md['Elo'] = md['WhiteElo'].where(is_white, md['BlackElo'].where(is_black, 0))
    md['OppElo'] = md['BlackElo'].where(is_white, md['WhiteElo'].where(is_black, 0))

    df = md[['Elo', 'OppElo']]
    df.to_csv(f"output/elo/{player}.csv")

Get player-based win-loss

In [None]:
# Encode each game's result from the player's point of view:
# 1 = win, 0 = draw, -1 = loss, <NA> = side or result unknown.
for player in players:
    md = pd.read_csv(f"output/metadata/{player}.csv", index_col=0)

    # Decide the player's side with get_color (defined earlier in this
    # notebook). Searching the name columns for the raw file key never
    # matches keys such as "DominguezPerez" or "WangH", whose header spelling
    # differs, so every game of those players used to be coded as a loss.
    colors = pd.Series(
        [get_color(player, row) for _, row in md.iterrows()],
        index=md.index,
        dtype=object,
    )

    # "1-0" -> ["1", "0"], "1/2-1/2" -> ["1/2", "1/2"]. Results without a
    # "-" (e.g. "*") have no second element and end up as <NA>.
    scores = md['Result'].astype(str).str.split('-')
    white_score = scores.str[0]
    black_score = scores.str[1]
    player_score = white_score.where(
        colors == "White", black_score.where(colors == "Black")
    )

    # Nullable integers keep the CSV free of "1.0"-style floats while still
    # allowing missing values.
    md['WL'] = pd.to_numeric(
        player_score.map({'1': 1, '1/2': 0, '0': -1})
    ).astype('Int64')

    df = md[['WL']]
    df.to_csv(f"output/winloss/{player}.csv")

Get White win-loss

In [None]:
# Encode each game's result from White's point of view:
# 1 = White won, 0 = draw, -1 = White lost, <NA> = no usable result.
for player in players:
    md = pd.read_csv(f"output/metadata/{player}.csv", index_col=0)

    # The text before the "-" is White's score ("1", "1/2" or "0").
    white_score = md['Result'].astype(str).str.split('-').str[0]

    # One explicit mapping gives a purely numeric column. Anything that is
    # not a finished result (e.g. "*" or an empty cell) becomes <NA> instead
    # of leaking a string into the data or crashing on `.split`.
    md['WhiteWL'] = pd.to_numeric(
        white_score.map({'1': 1, '1/2': 0, '0': -1})
    ).astype('Int64')

    df = md[['WhiteWL']]
    df.to_csv(f"output/whitewinloss/{player}.csv")

Get age

In [None]:
## helper functions
from enum import Enum

## players' birthdays
class Birthdays(Enum):
    """Date of birth of every player, as a "YYYY.MM.DD" string keyed by player name.

    Values must be unique: Enum turns a member whose value repeats an earlier
    one into an alias of that earlier member, so it would disappear from
    iteration and report the other player's name.
    """
    Anand = "1969.12.11"
    Andreikin = "1990.02.05"
    Aronian = "1982.10.06"
    Bu = "1985.12.10"
    carlsen = "1990.11.30"
    Caruana = "1992.07.30"
    ding = "1992.10.24"
    DominguezPerez = "1983.09.23"
    Duda = "1998.04.26"
    Eljanov = "1983.05.10"
    erigaisi = "2003.09.03"
    Firouzja = "2003.06.18"
    Giri = "1994.06.28"
    gukesh = "2006.05.29"
    Harikrishna = "1986.05.10"
    Karjakin = "1990.01.12"
    Le = "1991.03.13"
    Mamedyarov = "1985.04.12"
    Nakamura = "1987.12.09"
    nepo = "1990.07.14"
    # Was "2003.06.18", a copy of Firouzja's date, which made this member an
    # alias of Firouzja. Niemann was born on 20 June 2003.
    niemann = "2003.06.20"
    Rapport = "1996.03.25"
    So = "1993.10.09"
    Tomashevsky = "1987.07.01"
    Topalov = "1975.03.15"
    VachierLagrave = "1990.10.21"
    VallejoPons = "1982.08.21"
    Vitiugov = "1987.02.04"
    WangH = "1989.08.04"
    Wei = "1999.06.02"
    Yu = "1994.06.08"
  
from datetime import datetime
from dateutil.parser import parse  

## ref: https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
## ref: https://stackoverflow.com/questions/63121002/how-to-subtract-dates-in-pythons-datetime-module-to-get-age-in-a-year-month-d

def calculateAge(player, df):
    """Return the player's age in whole years on the date of every game.

    Parameters
    ----------
    player : str
        Member name in the module-level ``Birthdays`` enum.
    df : pandas.DataFrame
        Must have a 'Date' column with "YYYY.MM.DD" strings.

    Returns
    -------
    list
        One entry per row of ``df``, in row order: an int age, or NaN when
        the date is missing or not a complete "YYYY.MM.DD" date (for example
        "2019.??.??"). Keeping a placeholder instead of skipping the row
        keeps the list aligned with the other per-game tables.

    Raises
    ------
    KeyError
        If ``player`` is not a member of ``Birthdays``.
    """
    born = datetime.strptime(Birthdays[player].value, '%Y.%m.%d')
    unknown = float("nan")

    ages = []
    for game_date in df['Date']:
        try:
            played = datetime.strptime(game_date, '%Y.%m.%d')
        except (TypeError, ValueError):
            # TypeError: non-string cell (NaN); ValueError: malformed date.
            ages.append(unknown)
            continue

        # One year less if this year's birthday has not happened yet.
        birthday_passed = (played.month, played.day) >= (born.month, born.day)
        ages.append(played.year - born.year - (not birthday_passed))
    return ages

In [None]:
# Save each player's age at every game as a single 'Age' column.
for player in players:
    md = pd.read_csv(f"output/metadata/{player}.csv", index_col=0)
    df = pd.DataFrame({'Age': calculateAge(player, md)})
    df.to_csv(f"output/age/{player}.csv")

Get number of years playing chess

In [None]:
## age when players started playing chess
# Age at which each player started playing chess, looked up by player name
# (Time[player].value). Several players share a starting age, so Enum makes
# the later names aliases of the first member with that value; lookup by
# name still returns the right number.
Time = Enum(
    "Time",
    {
        "Anand": 6,
        "Andreikin": 6,  # could not find online. filling in with mean of other players
        "Aronian": 9,
        "Bu": 6,
        "carlsen": 5,
        "Caruana": 5,
        "ding": 4,
        "DominguezPerez": 6,  # could not find online. filling in with mean of other players
        "Duda": 5,
        "Eljanov": 6,  # could not find online. filling in with mean of other players
        "erigaisi": 8,
        "Firouzja": 8,
        "Giri": 6,
        "gukesh": 7,
        "Harikrishna": 4,
        "Karjakin": 5,
        "Le": 7,
        "Mamedyarov": 8,
        "Nakamura": 7,
        "nepo": 4,
        "niemann": 8,
        "Rapport": 4,
        "So": 6,
        "Tomashevsky": 6,  # could not find online. filling in with mean of other players
        "Topalov": 8,
        "VachierLagrave": 5,
        "VallejoPons": 5,
        "Vitiugov": 6,  # could not find online. filling in with mean of other players
        "WangH": 6,
        "Wei": 8,
        "Yu": 7,
    },
)


In [None]:
# Years of chess experience at each game: age minus the age at which the
# player started, saved as a single 'Time' column.
for player in players:
    age = pd.read_csv(f"output/age/{player}.csv", index_col=0)
    df = (age - Time[player].value).rename(columns={"Age": "Time"})
    df.to_csv(f"output/time/{player}.csv")

Combine into one big data set

In [None]:
# Join all per-player feature tables side by side (aligned on their index),
# tag the rows with the player's name, drop incomplete rows and save the
# result to ./data.
for player in players:
    sources = [
        f"output/age/{player}.csv",
        f"output/time/{player}.csv",
        f"output/mean_cp_loss/{player}.csv",
        f"output/std_cp_loss/{player}.csv",
        f"output/elo/{player}.csv",
        f"output/winloss/{player}.csv",
        f"output/whitewinloss/{player}.csv",
    ]
    frames = [pd.read_csv(path, index_col=0) for path in sources]

    df = (
        pd.concat(frames, axis=1)
        .assign(Name=player)
        .dropna()
        .reset_index(drop=True)
    )

    df.to_csv(f"./data/{player}.csv")