In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence UnicodeWarning and RuntimeWarning messages for the rest of the session.
# NOTE(review): RuntimeWarning presumably covers NumPy's divide-by-zero / log-of-zero
# warnings raised by the feature code below -- confirm that hiding them is intended.
warnings.filterwarnings("ignore", category=UnicodeWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# Google Drive share link of the Stata data file.
url = 'https://drive.google.com/file/d/1ltOks54u7V6tvK22EmT5n_XL8D2V4o8R/view?usp=share_link'

# The file id is the second-to-last path segment of the share link; appending it
# to the "uc?export=download" endpoint gives a URL that can be read directly.
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id

# Read the Stata file into the working DataFrame.
df = pd.read_stata(path)

## RankDist Variable

In [3]:
def _inverse_rank(rank: float) -> float:
    """Return ``1 / rank``, or 0 when the rank is missing (NaN) or zero."""
    # A zero rank is not a usable ranking; it is treated like a missing one so
    # that the division below can never raise ZeroDivisionError.
    if np.isnan(rank) or rank == 0:
        return 0
    return 1 / rank


def get_rankdist(winner_rank: float = 0, loser_rank: float = 0) -> float:
    """Compute the rank distance ``-(1 / winner_rank - 1 / loser_rank)``.

    A rank that is NaN or 0 contributes an inverse rank of 0, so calling the
    function with its default arguments returns 0 instead of raising.

    Parameters
    ----------
    winner_rank : float
        Rank of the first player; NaN or 0 means "no rank available".
    loser_rank : float
        Rank of the second player; NaN or 0 means "no rank available".

    Returns
    -------
    float
        Negative when ``winner_rank`` is the numerically smaller rank,
        positive when ``loser_rank`` is, and 0 when both inverses are equal.
    """
    return -(_inverse_rank(winner_rank) - _inverse_rank(loser_rank))

# Rank distance from each side's point of view: the loser's value is obtained
# by passing the two rank columns in swapped order.
df['WRankDist'] = list(map(get_rankdist, df['wrank'], df['lrank']))
df['LRankDist'] = list(map(get_rankdist, df['lrank'], df['wrank']))

## WikiBuzz Variable

In [4]:
def get_wikibuzz(winner_pageviews, winner_median, loser_pageviews, loser_median):
    """Compute the difference of log page-view ratios between two players.

    The result is ``log(winner_pageviews / winner_median)
    - log(loser_pageviews / loser_median)``.

    Parameters
    ----------
    winner_pageviews, loser_pageviews : float
        Page-view counts for the first and second player.
    winner_median, loser_median : float
        Median page-view counts used to normalise the respective counts.

    Returns
    -------
    float
        The buzz value. NaN is returned when a division by zero is raised
        (a zero median passed as a plain Python number), so the result is
        always numeric and such rows can be dropped with ``notna()``.
        Zero page views are passed to ``np.log`` unchanged and therefore
        produce an infinite (or NaN) result rather than an exception.
    """
    try:
        winner_ratio = winner_pageviews / winner_median
        loser_ratio = loser_pageviews / loser_median
    except ZeroDivisionError:
        # Use a numeric missing value instead of a string sentinel: a string
        # would turn the whole result column into object dtype.
        return np.nan
    return np.log(winner_ratio) - np.log(loser_ratio)

# Buzz from the winner's point of view: winner columns first, loser columns second.
df['W_WikiBuzz'] = list(map(
    get_wikibuzz,
    df['wiki_yesterday_w'], df['wiki_med365_w'],
    df['wiki_yesterday_l'], df['wiki_med365_l'],
))
# Buzz from the loser's point of view: same computation with the two sides swapped.
df['L_WikiBuzz'] = list(map(
    get_wikibuzz,
    df['wiki_yesterday_l'], df['wiki_med365_l'],
    df['wiki_yesterday_w'], df['wiki_med365_w'],
))

## Inverse Odds

In [5]:
def impliedprob(player_odds):
    """Return the reciprocal of ``player_odds``.

    NOTE(review): presumably ``player_odds`` are decimal betting odds, so the
    reciprocal is the bookmaker-implied win probability -- confirm.
    """
    implied = 1 / player_odds
    return implied

# (new column, source odds column) pairs, listed in the order the new columns
# are appended to the frame.
inverse_odds_columns = [
    ('W_inverse_bestodds', 'maxw'),
    ('L_inverse_bestodds', 'maxl'),
    ('W_inverse_avgodds', 'avgw'),
    ('L_inverse_avgodds', 'avgl'),
    ('W_inverse_B365', 'b365w'),
    ('L_inverse_B365', 'b365l'),
]
for inverse_col, odds_col in inverse_odds_columns:
    df[inverse_col] = [impliedprob(odds) for odds in df[odds_col]]

In [6]:
# Build a single datetime column from the separate year / month / day columns.
date_parts = df[['year', 'month', 'day']]
df['date'] = pd.to_datetime(date_parts)

# Use the current row index as the match identifier.
df['match_id'] = df.index

In [7]:
# Reshape from one row per match to one row per player: every match yields a
# winner row (outcome = 1) and a loser row (outcome = 0) with the same match_id.
df_winners = (
    df[['match_id', 'winner', 'date', 'year', 'WRankDist', 'W_WikiBuzz',
        'W_inverse_B365', 'W_inverse_avgodds', 'W_inverse_bestodds']]
    .rename(columns={
        'winner': 'player',
        'WRankDist': 'rankdist',
        'W_WikiBuzz': 'wikibuzz',
        'W_inverse_B365': 'inverse_b365',
        'W_inverse_avgodds': 'inverse_avg',
        'W_inverse_bestodds': 'inverse_best',
    })
    .assign(outcome=1)
)
df_losers = (
    df[['match_id', 'loser', 'date', 'year', 'LRankDist', 'L_WikiBuzz',
        'L_inverse_B365', 'L_inverse_avgodds', 'L_inverse_bestodds']]
    .rename(columns={
        'loser': 'player',
        'LRankDist': 'rankdist',
        'L_WikiBuzz': 'wikibuzz',
        'L_inverse_B365': 'inverse_b365',
        'L_inverse_avgodds': 'inverse_avg',
        'L_inverse_bestodds': 'inverse_best',
    })
    .assign(outcome=0)
)

# Stack winner rows on top of loser rows; the original index labels are kept,
# so each label now appears once per side.
df = pd.concat([df_winners, df_losers], axis=0)

In [8]:
# Remove bad rows: a row is kept only if its wikibuzz is finite, is not the
# 'ZeroDivisionError' marker, and none of wikibuzz / rankdist / inverse_avg
# is missing.
wikibuzz_values = df["wikibuzz"]
is_usable_row = (
    (wikibuzz_values != np.inf)
    & (wikibuzz_values != -np.inf)
    & (wikibuzz_values != 'ZeroDivisionError')
    & wikibuzz_values.notna()
    & df['rankdist'].notna()
    & df['inverse_avg'].notna()
)

# Keep the surviving rows ordered by match date.
df = df.loc[is_usable_row].sort_values(by='date')

In [9]:
# Display the filtered, date-sorted player-level frame (the notebook shows a truncated preview).
df

Unnamed: 0,match_id,player,date,year,rankdist,wikibuzz,inverse_b365,inverse_avg,inverse_best,outcome
12776,12776,Rodina E.,2015-07-02,2015.0,0.023432,0.0,0.250000,0.246914,0.222222,0
12785,12785,Tomljanovic A.,2015-07-02,2015.0,0.064577,0.0,0.166667,0.181818,0.164474,0
12784,12784,Kuznetsova S.,2015-07-02,2015.0,-0.032537,0.0,0.819672,0.813008,0.800000,0
12783,12783,Cepelova J.,2015-07-02,2015.0,0.011399,0.0,0.285714,0.288184,0.266667,0
12782,12782,Lucic M.,2015-07-02,2015.0,0.031481,0.0,0.381679,0.355872,0.327869,0
...,...,...,...,...,...,...,...,...,...,...
4139,4139,Alexandrova E.,2020-02-15,2020.0,0.089286,1.323982,0.500000,0.483092,0.454545,0
4139,4139,Bertens K.,2020-02-15,2020.0,-0.089286,-1.323982,0.555556,0.571429,0.526316,1
4142,4142,Sakkari M.,2020-02-15,2020.0,-0.007619,-1.612937,0.421941,0.431034,0.408163,0
4143,4143,Bertens K.,2020-02-16,2020.0,-0.085000,-2.935837,0.636943,0.641026,0.621118,1
