In [None]:
import pandas as pd
import numpy as np

In [None]:
def calc_gini_impurity(frame, column='ECO'):
    """Return the Gini impurity of the values found in one column of ``frame``.

    The impurity is ``1 - sum(p_i ** 2)``, where ``p_i`` is the number of rows
    holding the i-th distinct value of ``column`` divided by ``len(frame)``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to measure; must contain ``column``.
    column : str, default 'ECO'
        Name of the column whose value distribution is measured. The default
        keeps the original single-argument call (and ``groupby(...).apply``
        usage) working unchanged.

    Returns
    -------
    float
        ``0.0`` when every row holds the same value; larger values mean the
        rows are spread over more distinct values.

    Notes
    -----
    ``groupby`` drops rows whose key is missing by default, while
    ``len(frame)`` still counts them, so rows with a missing ``column`` value
    contribute to the denominator but to no group.
    """
    total_rows = len(frame)
    # Share of the rows that falls into each distinct value of `column`.
    value_shares = frame.groupby(column).size() / total_rows
    return 1 - value_shares.pow(2).sum()

In [None]:
# Location of the input CSV that is loaded into `df` below.
file_path = 'BestPlayers.csv'

In [None]:
# Load the semicolon-separated, UTF-8 encoded CSV found at `file_path`.
df = pd.read_csv(file_path, sep=';', encoding='utf-8')
# Sanity check: print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Gini impurity of the 'ECO' values within each 'WhiteName' group (one value per player).
# The grouping object is kept in its own variable because later cells reuse it.
white_name_g = df.groupby('WhiteName')
gini_se = white_name_g.apply(calc_gini_impurity).rename('GiniImpurity')

In [None]:
# Number of rows (games) per 'WhiteName' group, i.e. games each player has as white
size_se = white_name_g.size().rename('GamesInDataset')

In [None]:
# Get max elo
# Stack the (name, Elo) pairs of both colours into one long table. pd.concat keeps
# the column dtypes instead of forcing everything through an object array, and
# ignore_index=True gives the stacked rows a fresh 0..n-1 index.
player_elo_df = (
    pd.concat(
        [
            df[['WhiteName', 'WhiteElo']].rename(columns={'WhiteName': 'Name', 'WhiteElo': 'Elo'}),
            df[['BlackName', 'BlackElo']].rename(columns={'BlackName': 'Name', 'BlackElo': 'Elo'}),
        ],
        ignore_index=True,
    )
    # Rows missing either the name or the rating cannot be used (and a missing
    # Elo could not be cast to int), so drop them before the cast.
    .dropna()
    .assign(Elo=lambda frame: frame['Elo'].astype(int))
)

# Highest Elo per player. Despite the `_se` suffix this is a one-column
# DataFrame ('MaxElo') indexed by 'Name'.
max_elo_se = player_elo_df.groupby('Name').max().rename(columns={'Elo': 'MaxElo'})

In [None]:
# Combine the per-player series above into one DataFrame
min_num_games = 10
result_df = (
    pd.concat([gini_se, size_se], axis=1)
    # Keep only players with at least `min_num_games` games as white
    .loc[lambda frame: frame['GamesInDataset'] >= min_num_games]
    # Turn the 'WhiteName' index into a regular column so it can be the merge key
    .reset_index()
    # Attach each player's 'MaxElo'; the inner join drops players without one
    .merge(max_elo_se, how='inner', left_on='WhiteName', right_index=True)
)
print(result_df.shape)
result_df.head()

In [None]:
# Write the per-player result table to a CSV file
gini_path = 'GiniImpurity.csv'
# Same ';' separator and UTF-8 encoding as the input file; index=False leaves the row index out of the output.
result_df.to_csv(gini_path, sep=';', index=False, encoding='utf-8')