# Data Processing (Clegg)
Covered in this notebook:
1. Downloading match data from tennis-data.co.uk
2. Obtain pageview counts for each player.
2. Adding columns for all variables described in paper:
    a. RankDist.
    b. WikiBuzz.
    c. Implied Probability / Inverse Odds.
3. Produce a cleaned wta_matches_cleaned.csv file ready for further use.

In [1]:
# Imports, Remove Warnings for notebook readability.
import pandas as pd
from datetime import datetime, timedelta
import warnings
import requests
from statistics import median
import numpy as np
import json
import pandas as pd
warnings.filterwarnings("ignore", category=UserWarning)

In [121]:
# Load dataframe.
url = 'http://www.tennis-data.co.uk/{year}w/{year}.xlsx'
wta_matchess = [(pd.read_excel(url.format(year=str(x)), sheet_name=str(x))) for x in range(2016, 2022 + 1)]
wta_matches = pd.concat(wta_matchess, axis=0, ignore_index=True)
wta_matches['Date'] = pd.to_datetime(wta_matches['Date'])
wta_matches = wta_matches[wta_matches["Date"] > '2016-07-01']
wta_matches['match_id'] = wta_matches.index # Index column.
wta_matches['Winner'] = wta_matches['Winner'].str.replace(' ', '_')
wta_matches['Loser'] = wta_matches['Loser'].str.replace(' ', '_')

In [135]:
# Link players to their Wikipedia pages.
url = "https://drive.google.com/file/d/1PP6qoLuh43Fdkj5oVfqTKN4TWe6dcwfd/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
players = pd.read_csv(path)

player_dict = dict(zip(players.odds_player, players.player))
player_dict = {k.replace(' ', '_'): v.replace(' ', '_') for k, v in player_dict.items() if not isinstance(v, float)}
new_player_dict = {
    'Wang_X.':'Wang_Xiyu',
    'Wang_Xiy.':'Wang_Xiyu',
    'Stephens_S.':'Sloane_Stephens',
    'Pliskova_Ka.':'Karolína_Plíšková',
    'Williams_S.':'Serena_Williams',
    'Pliskova_Kr.':'Kristýna_Plíšková',
    'Alexandra_Krunic':'Aleksandra_Krunić',
    'Yingying_Duan':'Duan_Yingying',
    'Lin_Zhu':'Zhu_Lin_(tennis)',
    'Rodionova_Ar.':'Arina_Rodionova',
    'Carina_Witthoeft':'Carina_Witthöft',
    'Stefanie_Voegele':'Stefanie_Vögele',
    'Sílvia_Soler-Espinosa':'Sílvia_Soler_Espinosa',
    'Xinyun_Han':'Han_Xinyun',
    'Yafan_Wang':'Wang_Yafan',
    'Jana_Cepelova':'Jana_Čepelová',
    'Zhang_Shuai':'Zhang_Shuai',
    'Catherine_McNally':'Caty_McNally',
    'Viktoria_Kuzmova':'Viktória_Kužmová',
    'Barbora_Krejcikova':'Barbora_Krejčíková',
    'En-Shuo_Liang':'Liang_En-shuo',
    'Chloe_Paquet':'Chloé_Paquet',
    'Su-Wei_Hsieh':'Hsieh_Su-wei',
    'Saisai_Zheng':'Zheng_Saisai',
    'Marketa_Vondrousova':'Markéta_Vondroušová',
    'Fangzhou_Liu':'Liu_Fangzhou',
    'Patricia_Maria_Tig':'Patricia_Maria_Țig',
    'Julia_Goerges':'Julia_Görges',
    'Lesley_Pattinama_Kerkhove':'Lesley_Pattinama_Kerkhove',
    'Tamara_Zidansek':'Tamara_Zidanšek',
    'Iga_Swiatek':'Iga_Świątek',
    'Anna_Karolina_Schmiedlova':'Anna_Karolína_Schmiedlová',
    'Shuai_Peng':'Peng_Shuai',
    'Leonie_Kung':'Leonie_Küng',
    'Radwanska_U.':'Agnieszka_Radwańska',
    'Barbora_Strycova':'Barbora_Strýcová',
    'Cagla_Buyukakcay':'Çağla_Büyükakçay',
    'Rodionova_An.':'Arina_Rodionova',
    'Alexandra_Cadantu':'Alexandra_Cadanțu-Ignatik',
    'Marie_Bouzkova':'Marie_Bouzková',
    'Denisa_Allertova':'Denisa_Šátralová',
    'Lucie_Hradecka':'Lucie_Hradecká',
    'Ivana_Jorovic':'Ivana_Jorović',
    'Maia_Lumsden':'Maia_Lumsden',
    'Mirjana_Lucic':'Mirjana_Lučić-Baroni',
    'Hailey_Baptiste':'Hailey_Baptiste',
    'Katerina_Siniakova':'Kateřina_Siniaková',
    'Mihaela_Buzarnescu':'Mihaela_Buzărnescu',
    'Qiang_Wang':'Wang_Qiang_(tennis)',
    'Xiaodi_You':'You_Xiaodi',
    'Paula_Cristina_Goncalves':'Paula_Cristina_Gonçalves',
    'Aliona_Bolsova':'Aliona_Bolsova',
    'Tereza_Smitkova':'Tereza_Smitková',
    'Xinyu_Wang':'Wang_Xinyu',
    'Leylah_Fernandez':'Leylah_Fernandez',
    'Magdalena_Rybarikova':'Magdaléna_Rybáriková',
    'Johanna_Larsson':'Johanna_Larsson',
    'Mirjana_Lucic-Baroni':'Mirjana_Lučić-Baroni',
    'Danka_Kovinic':'Danka_Kovinić',
    'Tereza_Martincova':'Tereza_Martincová',
    'Montserrat_Gonzalez':'Montserrat_González',
    'Maria_Herazo_Gonzalez':'María_Herazo_González',
    'Ipek_Soylu':'İpek_Soylu',
    'Petra_Martic':'Petra_Martić',
    'Timea_Babos':'Tímea_Babos',
    'Nina_Stojanovic':'Nina_Stojanović',
    'Na-Lae_Han':'Han_Na-lae',
    'Jia-Jing_Lu':'Lu_Jiajing', #From 2nd iteration, losers column...
    'Lesley_Pattinama_Kerkhove':'Lesley_Pattinama_Kerkhove',
    'Ya-Hsuan_Lee':'Lee_Ya-hsuan',
    'Alexandra_Cadanțu-Ignatik':'Alexandra_Cadanțu-Ignatik',
    'Maria_Mateas':'Maria_Mateas',
    'Selena_Janicijevic':'Séléna_Janicijevic',
    'Jessica_Pieri':'Jessica_Pieri',
    'Denisa_Šátralová':'Denisa_Šátralová',
    'Kamilla_Rakhimova':'Kamilla_Rakhimova',
    'Tess_Sugnaux':'Tess_Sugnaux',
    'Maia_Lumsden':'Maia_Lumsden',
    'Mirjam_Bjorklund':'Mirjam_Björklund',
    'Alyssa_Mayo':'Alyssa_Mayo',
    'Komola_Umarova':'Komola_Umarova',
    'Ng_Kwan-yau':'Ng_Kwan-yau',
    'Cristiana_Ferrando':'Cristiana_Ferrando',
    'Martina_Capurro_Taborda':'Martina_Capurro_Taborda',
    'Elena-Gabriela_Ruse':'Elena-Gabriela_Ruse',
    'Freya_Christie':'Freya_Christie',
    'Jovana_Jovic':'Jovana_Jović',
    'Mira_Antonitsch':'Mira_Antonitsch',
    'Dayana_Yastremska':'Dayana_Yastremska',
    'Emiliana_Arango':'Emiliana_Arango',
    'Nadia_Echeverria_Alam':'Nadia_Echeverría_Alam',
    'Frances_Altick':'Frances_Altick',
    'Ayaka_Okuno':'Ayaka_Okuno',
    'Anastasiya_Shoshyna':'Anastasiya_Shoshyna',
    'Jesika_Maleckova':'Jesika_Malečková',
    'Wushuang_Zheng':'Zheng_Wushuang',
    'Karolina_Muchova':'Karolína_Muchová',
    'Varvara_Gracheva':'Varvara_Gracheva',
    'Baindl_K.':'Kateryna_Baindl', # third iteration here
    'Uchijima_M.':'Moyuka_Uchijima',
    'Kalieva_E.':'Elvina_Kalieva',
    'Osorio_M.':'Camila_Osorio',
    'Naito_Y.':'Yuki_Naito',
    'Lazaro_A.':'Andrea_Lázaro_García',
    'Alves_C.':'Carolina_Alves_(tennis)',
    'Schunk_N.':'Nastasja_Schunk',
    'Saigo_R.':'Rina_Saigo',
    'Ioana_Minca':'Ioana_Mincă',
    'Zheng_Q.':'Zheng_Qinwen',
    'Guth_M.':'Mara_Guth',
    'Palicova_B.':'Barbora_Palicová',
    'Shibahara_E.':'Ena_Shibahara',
    'Pliskova_Ka.':'Karolína_Plíšková',
    'Reyngold_E.':'Ekaterina_Reyngold',
    'Middendorf_J.':'Julia_Middendorf',
    'Cengiz_B.':'Berfu_Cengiz',
    'Jeong_B.':'Jeong_Bo-young',
    'Szabanin_N.':'Natália_Szabanin',
    'Stakusic_M.':'Marina_Stakusic',
    'Mandlik_E.':'Elizabeth_Mandlik',
    'Contreras_Gomez_F.':'Fernanda_Contreras_Gómez',
    'Paoletti_M.':'Matilde_Paoletti',
    'Lamens_S.':'Suzan_Lamens',
    'Sherif_M.':'Mayar_Sherif',
    'Alexandra_Krunic':'Aleksandra_Krunić',
    'Bartunkova_N.':'Nikola_Bartůňková',
    'Jones_F.':'Francesca_Jones_(tennis)',
    'Yashina_E.':'Ekaterina_Yashina',
    'Mariana_Duque-Mariño':'Mariana_Duque_Mariño',
    'Bassols_Ribera_M.':'Marina_Bassols_Ribera',
    'Fruhvirtova_L.':'Linda_Fruhvirtová',
    'Laura_Pous-Tio':'Laura_Pous_Tió',
    'Williams_S.':'Serena_Williams',
    'Serban_R.':'Raluca_Șerban',
    'Caregaro_M.':'Martina_Caregaro',
    'Gabueva_A.':'Angelina_Gabueva',
    'Stearns_P.':'Peyton_Stearns',
    'Na-Lae_Han':'Han_Na-lae',
    'Scott_K.':'Katrina_Scott',
    'Lovric_P.':'Pia_Lovrič',
    'Bandecchi_S.':'Susan_Bandecchi',
    'Preston_T.':'Taylah_Preston',
    'Bronzetti_L.':'Lucia_Bronzetti',
    'Berberovic_N.':'Nefisa_Berberović',
    'Herazo_M.':'María_Herazo_González',
    'Montgomery_R.':'Robin_Montgomery',
    'Turati_B.':'Bianca_Turati',
    'Hewitt_D.':'Dalayna_Hewitt',
    'Hanatani_N.':'Nagi_Hanatani',
    'Zueger_J.':'Joanne_Züger',
    'Noskova_L.':'Linda_Nosková',
    'Marcinko_P.':'Petra_Marčinko',
    'Kraus_S.':'Sinja_Kraus',
    'Chang_S.':'Sophie_Chang',
    'Yu_E.':'Eleana_Yu',
    'Krawczyk_D.':'Desirae_Krawczyk',
    'Krueger_A.':'Ashlyn_Krueger',
    'Prisacariu_A.':'Andreea_Prisăcariu',
    'Jimenez_V.':'Victoria_Jiménez_Kasintseva',
    'En-Shuo_Liang':'Liang_En-shuo',
    'Monnet_C.':'Carole_Monnet',
    'Radivojevic_L.':'Lola_Radivojević',
    'Ruse_E-G.':'Elena-Gabriela_Ruse',
    'Morderger_Y.':'Yana_Morderger',
    'Burillo_I.':'Irene_Burillo_Escorihuela',
    'Sebov_K.':'Katherine_Sebov',
    'Mansouri_Y.':'Yasmine_Mansouri',
    'Bejlek_S.':'Sára_Bejlek',
    'Jang_S.':'Jang_Su-jeong',
    'Wurth_T.':'Tara_Würth',
    'Yingying_Duan':'Duan_Yingying',
    'Pigato_L.':'Lisa_Pigato',
    'Back_D.':'Back_Da-yeon',
    'Glushko_L.':'Lina_Glushko',
    'Kartal_S.':'Sonay_Kartal',
    'Andreeva_E.':'Erika_Andreeva',
    'Ignatik_A.':'Alexandra_Cadanțu-Ignatik',
    'Bassols_M.':'Marina_Bassols_Ribera',
    'Cross_K.':'Kayla_Cross',
    'Yifan_Xu':'Xu_Yifan',
    'Maria_Camila_Osorio_Serrano':'Camila_Osorio',
    'Anderson_R.':'Robin_Anderson_(tennis)',
    'Radisic_N.':'Nika_Radišić',
    'Parrizas_Diaz_N.':'Nuria_Párrizas_Díaz',
    'Sun_L.':'Lulu_Sun',
    'Sakatsume_H.':'Himeno_Sakatsume',
    'Bucsa_C.':'Cristina_Bucșa',
    'Jacquemot_E.':'Elsa_Jacquemot',
    'Bektas_E.':'Emina_Bektas',
    'María-Teresa_Torró-Flor':'María_Teresa_Torró_Flor',
    'Yang_Zha.':'Yang_Zhaoxuan',
    'Nuudi_M.':'Maileen_Nuudi',
    'Okamura_K.':'Kyōka_Okamura',
    'Falkowska_W.':'Weronika_Falkowska',
    'Talaba_G.':'Gabriela_Lee',
    'Kulambayeva_Z.':'Zhibek_Kulambayeva',
    'Fruhvirtova_B.':'Linda_Fruhvirtová',
    'Falkner_Z.':'Živa_Falkner',
    'Harrison_Ca.':'Catherine_Harrison_(tennis)',
    'Brace_C.':'Cadence_Brace',
    'Radwanska_U.':'Agnieszka_Radwańska',
    'Eala_A.':'Alex_Eala',
    'Plazas_J.':'Jessica_Plazas',
    'Grey_S.':'Sarah_Beth_Grey',
    'Gervais_J.':'Julie_Gervais',
    'Raducanu_E.':'Emma_Raducanu',
    'Carle_M.':'María_Carlé',
    'Tjandramulia_O.':'Olivia_Tjandramulia',
    'Di_Sarra_F.':'Federica_Di_Sarra',
    'Olyanovskaya_V.':'Valeriia_Olianovskaia',
    'Kubka_M.':'Martyna_Kubka',
    'McNally_C.':'Caty_McNally',
    'Natalija_Kostic':'Natalija_Stevanović',
    'Baszak_W.':'Weronika_Baszak',
    'Papamichail_D.':'Despina_Papamichail',
    'Bhatia_R.':'Riya_Bhatia',
    'Chwalinska_M.':'Maja_Chwalińska',
    'Havlickova_L.':'Lucie_Havlíčková',
    'Hartono_A.':'Arianne_Hartono',
    'Hontama_M.':'Mai_Hontama',
    'Gleason_Q.':'Quinn_Gleason',
    'Burrage_J.':'Jodie_Burrage',
    'Hatouka_Y.':'Yuliya_Hatouka',
    'Pliskova_Kr.':'Kristýna_Plíšková',
    'Lys_E.':'Eva_Lys',
    'Gadecki_O.':'Olivia_Gadecki',
    'Rodionova_Ar.':'Arina_Rodionova',
    'Lee_G.':'Gabriela_Lee',
    'Malygina_E.':'Elena_Malõgina',
    'Stefanini_L.':'Lucrezia_Stefanini',
    'Sutjiadi_A.':'Aldila_Sutjiadi',
    'Jeanjean_L.':'Léolia_Jeanjean',
    'Miyazaki_Y.':'Yuriko_Miyazaki',
    'Maria_Herazo_Gonzalez':'María_Herazo_González',
    'Tkacheva_M.':'Mariia_Tkacheva',
    'Mendez_S.':'Seone_Mendez',
    'Rodionova_An.':'Arina_Rodionova',
    'Saville_D.':'Daria_Saville',
    'Mishina_D.':'Daria_Mishina',
    'Andreea_Rosca':'Andreea_Roșca',
    'Babel_O.':'Océane_Babel',
    'Ji-Hee_Choi':'Choi_Ji-hee',
    'Anshba_A.':'Amina_Anshba',
    'Brancaccio_N.':'Nuria_Brancaccio',
    'Shymanovich_I.':'Iryna_Shymanovich',
    'Mboko_V.':'Victoria_Mboko',
    'Zakharova_A.':'Maria_Zakharova',
    'Riske-Amritraj_A.':'Alison_Riske-Amritraj',
    'Gasanova_A.':'Anastasia_Gasanova',
    'Selekhmeteva_O.':'Oksana_Selekhmeteva',
    'Wagner_S.':'Stephanie_Wagner',
    'Tikhonova_A.':'Anastasia_Tikhonova_(tennis)',
    'Da_Silva_Fick_G.':'Gabriella_Da_Silva-Fick',
    'Jani_R-L.':'Réka_Luca_Jani',
    'Avanesyan_E.':'Elina_Avanesyan',
    'Silva_E.':'Eden_Silva',
    'Monroy_Y.':'Yuliana_Monroy',
    'Raducànu_E.':'Emma_Raducanu',
    'Zuger_J.':'Joanne_Züger',
    'Andreeva_M.':'Erika_Andreeva',
    'Salkova_D.':'Dominika_Šalková',
    'Parks_A.':'Alycia_Parks',
    'Snigur_D.':'Daria_Snigur',
    'Davis_L.':'Lauren_Davis',
} # Introduce new player:wikipedia_page dictionary.
player_dict = player_dict | new_player_dict

wta_matches.replace({'Winner': player_dict}, inplace=True)
wta_matches.replace({'Loser': player_dict}, inplace=True)

## 2. Page view counts.

In [3]:
request_url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'\
              'per-article/en.wikipedia.org/all-access/all-agents/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                         '(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
           'From': 'faxulous@gmail.com'}

def get_yday_views(player, date) -> int:
    """
    Pageviews for player from yesterday.
    :param player: Two formats, either betfair in or the other historical tennis files in.
    :param date: potentially two formats again. but easy to deal with.
    """
    match_date = date.date()
    prev_day_date = (match_date - timedelta(days=1)).strftime('%Y%m%d')
    try:
        prev_day_request = requests.get(request_url +
                                        f'{player}/daily/{prev_day_date}/{prev_day_date}',
                                        headers=headers).json()
        prev_day_views = prev_day_request['items'][0]['views']
    except KeyError:
        prev_day_views = 'KeyError'
    print(date)
    return prev_day_views

def get_median_views(player, date) -> float:
    """
    Median pageviews for player over past year.
    :param player: Two formats, either betfair in or the other historical tennis files in.
    :param date: potentially two formats again. but easy to deal with.
    """
    match_date = date.date()
    end_of_year = (match_date - timedelta(days=2)).strftime('%Y%m%d')
    start_of_year = (match_date - timedelta(days=365)).strftime('%Y%m%d')
    try:
        past_year_request = requests.get(request_url +
                                         f'{player}/daily/{start_of_year}/{end_of_year}',
                                         headers=headers).json()
        view_list = []
        for n in range(0, len(past_year_request['items'])):
            views = past_year_request['items'][n]['views']
            view_list.append(views)
        past_year_median = median(view_list)
    except KeyError:
        past_year_median = 'KeyError'
    return past_year_median

In [58]:
all_p_requests = []
for player in list(player_dict.values()):
    print(player)
    try:
        p_request = requests.get(request_url + f'{player}/daily/20150101/20230505', headers=headers).json()["items"]
    except:
        pass
    all_p_requests.extend(p_request)

# Convert combined JSON data to a pandas dataframe
df = pd.DataFrame(all_p_requests)

# Convert 'timestamp' to datetime format and set it as the index
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d00')
df.set_index('timestamp', inplace=True)
# Pivot the dataframe to have 'article' names as columns
pivot_df = df.pivot_table(values='views', columns='article', index=df.index)

Francoise_Abanda
Nigina_Abduraimova
Kristie_Ahn
Kristie_Ahn
Destanee_Aiava
Ayla_Aksu
Fatma_Al_Nabhani
Lauren_Albanese
Silvia_Albano
Audrey_Albie
Ekaterina_Alexandrova
Denisa_Allertova
Ellen_Allgurin
Frances_Altick
Carolina_Alves
Maria_Fernanda_Alves
Akgul_Amanmuradova
Bianca_Andreescu
Tessah_Andrianjafitrimo
Amanda_Anisimova
Mira_Antonitsch
Shuko_Aoyama
Emiliana_Arango
Manon_Arcangioli
Usue_Maitane_Arconada
Tetyana_Arefyeva
Greta_Arn
Lara_Arruabarrena
Lara_Arruabarrena
Lara_Arruabarrena
Sofia_Arvidsson
Rita_Atik
Victoria_Azarenka
Timea_Babos
Timea_Bacsinszky
Paula_Badosa
Paula_Badosa
Paula_Badosa
Alison_Bai
Naiktha_Bains
Elena_Baltacha
Sybille_Bammer
Hailey_Baptiste
Irina_Bara
Gioia_Barbieri
Kristina_Barrois
Hilary_Barte
Mona_Barthel
Marion_Bartoli
Ashleigh_Barty
Annika_Beck
Irina-Camelia_Begu
Catherine_Bellis
Belinda_Bencic
Iveta_Benesova
Ghita_Benhadi
Marie_Benoit
Kiki_Bertens
Yuliya_Beygelzimer
Eva_Birnerova
Kimberly_Birrell
Mirjam_Bjorklund
Tornado_Alicia_Black
Anna_Blinkova
Bojana

In [136]:
# Function to get the day before value and median value
def get_values(article, date):
    start_date = date - timedelta(days=365)
    end_date = date - timedelta(days=1)

    try:
        past_year_data = pivot_df.loc[start_date:end_date, article]
    except KeyError:
        failed_articles.append(article)
        return (float('nan'), float('nan'))

    if past_year_data.isna().sum() / len(past_year_data) > 0.25:
        return (float('nan'), float('nan'))
    
    day_before_value = past_year_data.iloc[-1]
    median_value = past_year_data.median()
    print(article)
    return (day_before_value, median_value)

failed_articles = []
wta_matches['Date'] = pd.to_datetime(wta_matches['Date'])
# Apply the get_values function to the winner and loser columns
wta_matches['winner_day_before_value'], wta_matches['winner_median_value'] = zip(*wta_matches.apply(lambda row: get_values(row['Winner'], row['Date']), axis=1))
wta_matches['loser_day_before_value'], wta_matches['loser_median_value'] = zip(*wta_matches.apply(lambda row: get_values(row['Loser'], row['Date']), axis=1))
print("Failed articles:", set(failed_articles))

Sloane_Stephens
Timea_Bacsinszky
Ekaterina_Makarova
Misaki_Doi
Angelique_Kerber
Lucie_Safarova
Simona_Halep
Yaroslava_Shvedova
Madison_Keys
Dominika_Cibulkova
Agnieszka_Radwanska
Anastasia_Pavlyuchenkova
Svetlana_Kuznetsova
Elena_Vesnina
Coco_Vandeweghe
Serena_Williams
Ekaterina_Makarova
Angelique_Kerber
Anastasia_Pavlyuchenkova
Dominika_Cibulkova
Simona_Halep
Yaroslava_Shvedova
Elena_Vesnina
Serena_Williams
Venus_Williams
Angelique_Kerber
Venus_Williams
Serena_Williams
Elena_Vesnina
Serena_Williams
Angelique_Kerber
Serena_Williams
Aliaksandra_Sasnovich
Vania_King
Çağla_Büyükakçay
Misa_Eguchi
Polona_Hercog
Monica_Niculescu
Laura_Siegemund
Danka_Kovinić
Sara_Errani
Patricia_Maria_Țig
Pauline_Parmentier
Francesca_Schiavone
Isabella_Shinikova
Anastasija_Sevastova
Simona_Halep
Polona_Hercog
Laura_Siegemund
Pauline_Parmentier
Vania_King
Danka_Kovinić
Anastasija_Sevastova
Simona_Halep
Sara_Errani
Laura_Siegemund
Vania_King
Anastasija_Sevastova
Simona_Halep
Anastasija_Sevastova
Simona_Halep
S

In [137]:
wta_matches

Unnamed: 0,WTA,Location,Tournament,Date,Tier,Court,Surface,Round,Best of,Winner,...,PSL,MaxW,MaxL,AvgW,AvgL,match_id,winner_day_before_value,winner_median_value,loser_day_before_value,loser_median_value
1540,34,London,Wimbledon,2016-07-02,Grand Slam,Outdoor,Grass,2nd Round,3,Sloane_Stephens,...,5.00,1.21,5.60,1.17,4.95,1540,1206.0,220.0,1101.0,40.0
1541,34,London,Wimbledon,2016-07-02,Grand Slam,Outdoor,Grass,2nd Round,3,Timea_Bacsinszky,...,3.56,1.34,3.85,1.29,3.53,1541,642.0,161.0,334.0,70.0
1542,34,London,Wimbledon,2016-07-02,Grand Slam,Outdoor,Grass,2nd Round,3,Ekaterina_Makarova,...,2.42,3.13,1.46,2.88,1.40,1542,792.0,172.0,17.0,11.0
1545,34,London,Wimbledon,2016-07-02,Grand Slam,Outdoor,Grass,3rd Round,3,Misaki_Doi,...,2.34,1.71,2.40,1.64,2.24,1545,587.0,53.0,363.0,38.0
1546,34,London,Wimbledon,2016-07-02,Grand Slam,Outdoor,Grass,3rd Round,3,Angelique_Kerber,...,7.72,1.14,7.72,1.10,6.77,1546,1615.0,566.0,460.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15829,55,Fort Worth,WTA Finals,2022-11-05,Tour Championships,Indoor,Hard,Round Robin,3,Caroline_Garcia,...,2.48,1.71,2.48,1.64,2.28,15829,1476.0,233.0,4232.0,292.0
15830,55,Fort Worth,WTA Finals,2022-11-06,Tour Championships,Indoor,Hard,Round Robin,3,Iga_Świątek,...,6.23,1.18,6.55,1.14,5.61,15830,4557.0,2344.0,6.0,11.0
15831,55,Fort Worth,WTA Finals,2022-11-06,Tour Championships,Indoor,Hard,Semifinals,3,Caroline_Garcia,...,1.65,2.47,1.66,2.32,1.61,15831,5478.0,236.0,4687.0,719.0
15832,55,Fort Worth,WTA Finals,2022-11-07,Tour Championships,Indoor,Hard,Semifinals,3,Aryna_Sabalenka,...,1.18,5.90,1.20,5.25,1.16,15832,1754.0,746.0,8995.0,2375.0


In [55]:
session = requests.Session()
def get_yday_views(player, date) -> int:
    """
    Pageviews for player from yesterday.
    :param player: Two formats, either betfair in or the other historical tennis files in.
    :param date: potentially two formats again. but easy to deal with.
    """
    match_date = date.date()
    prev_day_date = (match_date - timedelta(days=1)).strftime('%Y%m%d')
    try:
        prev_day_request = session.get(request_url +
                                        f'{player}/daily/{prev_day_date}/{prev_day_date}',
                                        headers=headers).json()
        prev_day_views = prev_day_request['items'][0]['views']
    except KeyError:
        prev_day_views = 'KeyError'
    return prev_day_views

def get_median_views(player, date) -> float:
    """
    Median pageviews for player over past year.
    :param player: Two formats, either betfair in or the other historical tennis files in.
    :param date: potentially two formats again. but easy to deal with.
    """
    match_date = date.date()
    end_of_year = (match_date - timedelta(days=2)).strftime('%Y%m%d')
    start_of_year = (match_date - timedelta(days=365)).strftime('%Y%m%d')
    try:
        past_year_request = session.get(request_url +
                                         f'{player}/daily/{start_of_year}/{end_of_year}',
                                         headers=headers).json()
        view_list = []
        for n in range(0, len(past_year_request['items'])):
            views = past_year_request['items'][n]['views']
            view_list.append(views)
        past_year_median = median(view_list)
    except KeyError:
        past_year_median = 'KeyError'
    return past_year_median

In [56]:
%%time
wta_matches['Wprevdayviews'] = [get_yday_views(x,y) for x,y in zip(wta_matches['Winner'], wta_matches['Date'])]
print("1Done")
wta_matches['Lprevdayviews'] = [get_yday_views(x,y) for x,y in zip(wta_matches['Loser'], wta_matches['Date'])]
print("2Done")
wta_matches['Wmedian_views'] = [get_median_views(x,y) for x,y in zip(wta_matches['Winner'], wta_matches['Date'])]
print("3Done")
wta_matches['Lmedian_views'] = [get_median_views(x,y) for x,y in zip(wta_matches['Loser'], wta_matches['Date'])]

1Done
2Done
3Done
CPU times: total: 2min 24s
Wall time: 4h 29min 52s


In [49]:
#wta_matches.to_csv("wta_matches.csv")
wta_matches = pd.read_csv("wta_matches.csv")

In [50]:
print('Total Rows: '+str(len(wta_matches)))
wta_matches = wta_matches[wta_matches.Wprevdayviews != 'KeyError']
wta_matches = wta_matches[wta_matches.Lprevdayviews != 'KeyError']
wta_matches = wta_matches[wta_matches.Wmedian_views != 'KeyError']
wta_matches = wta_matches[wta_matches.Lmedian_views != 'KeyError']
print('Total (Usable) Rows: '+str(len(wta_matches)))

Total Rows: 14292
Total (Usable) Rows: 12413


## 3a. RankDist.

In [51]:
def RankDist_ij(winner_rank: int = 0, loser_rank: int = 0) -> float:
    """
    Calculates the RankDist variable.
    If no rank available, use 0.
    """
    if np.isnan(winner_rank):
        inverse_wr = 0
    else:
        inverse_wr = 1 / winner_rank

    if np.isnan(loser_rank):
        inverse_lr = 0
    else:
        inverse_lr = 1 / loser_rank
    rankdist = -(inverse_wr - inverse_lr)
    return rankdist

In [52]:
wta_matches['WRankDist'] = [RankDist_ij(x,y) for x,y in zip(wta_matches['WRank'], wta_matches['LRank'])]
wta_matches['LRankDist'] = [RankDist_ij(y, x) for x, y in zip(wta_matches['WRank'], wta_matches['LRank'])]

## 3b. WikiBuzz.

In [53]:
def WikiBuzz_ij(winner_pageviews, winner_median, loser_pageviews, loser_median) -> float:
    """
    Calculates the WikiBuzz variable.
    """
    w_ij = winner_pageviews
    w_dash_ij = loser_pageviews
    wcurl_ij = winner_median
    wcurl_dash_ij = loser_median
    wikibuzz = np.log(w_ij / wcurl_ij) - np.log(w_dash_ij / wcurl_dash_ij)
    return wikibuzz

In [54]:
wta_matches['Wprevdayviews'] = wta_matches['Wprevdayviews'].astype(str).astype(float)
wta_matches['Wmedian_views'] = wta_matches['Wmedian_views'].astype(str).astype(float)
wta_matches['Lprevdayviews'] = wta_matches['Lprevdayviews'].astype(str).astype(float)
wta_matches['Lmedian_views'] = wta_matches['Lmedian_views'].astype(str).astype(float)

In [55]:
# Using the view columns from above.
wta_matches['W_WikiBuzz'] = [WikiBuzz_ij(row[0], row[1], row[2], row[3]) for row in
                          zip(wta_matches['Wprevdayviews'], wta_matches['Wmedian_views'],
                              wta_matches['Lprevdayviews'], wta_matches['Lmedian_views'])]
wta_matches['L_WikiBuzz'] = [WikiBuzz_ij(row[0], row[1], row[2], row[3]) for row in
                          zip(wta_matches['Lprevdayviews'], wta_matches['Lmedian_views'],
                              wta_matches['Wprevdayviews'], wta_matches['Wmedian_views'])]

## 3c. Implied Probability / Inverse Odds.

In [56]:
def implied_probability(player_odds) -> float:
    """
    Get probability of winning for player, implied by odds.
    """
    return 1 / player_odds

In [57]:
wta_matches['W_inverse_bestodds'] = [implied_probability(x) for x in wta_matches['MaxW']]
wta_matches['L_inverse_bestodds'] = [implied_probability(x) for x in wta_matches['MaxL']]
wta_matches['W_inverse_B365'] = [implied_probability(x) for x in wta_matches['B365W']]
wta_matches['L_inverse_B365'] = [implied_probability(x) for x in wta_matches['B365L']]
wta_matches['W_inverse_avg'] = [implied_probability(x) for x in wta_matches['AvgW']]
wta_matches['L_inverse_avg'] = [implied_probability(x) for x in wta_matches['AvgL']]

## 4. Cleaned CSV File Produced.

In [58]:
wta_matches_winners = wta_matches[['match_id', 'WTA', 'Winner', 'Date', 'WRankDist', 'W_WikiBuzz',
                       'W_inverse_B365', 'W_inverse_bestodds', 'W_inverse_avg']].copy()
wta_matches_winners['outcome'] = 1
wta_matches_losers = wta_matches[['match_id', 'WTA', 'Loser', 'Date', 'LRankDist', 'L_WikiBuzz',
                      'L_inverse_B365', 'L_inverse_bestodds', 'L_inverse_avg']].copy()
wta_matches_losers['outcome'] = 0
wta_matches_winners.rename({'Winner': 'player', 'Date': 'date', 'WRankDist': 'rankdist', 'W_WikiBuzz': 'wikibuzz',
                   'W_inverse_B365': 'inverse_b365', 'W_inverse_bestodds': 'inverse_best',
                   'W_inverse_avg': 'inverse_avg'}, axis=1, inplace=True)
wta_matches_losers.rename({'Loser': 'player', 'Date': 'date', 'LRankDist': 'rankdist', 'L_WikiBuzz': 'wikibuzz',
                  'L_inverse_B365': 'inverse_b365', 'L_inverse_bestodds': 'inverse_best',
                  'L_inverse_avg': 'inverse_avg'}, axis=1, inplace=True)
wta_matches = pd.concat([wta_matches_winners, wta_matches_losers], axis=0)

In [59]:
wta_matches['year'] = wta_matches['date'].str[:4]
wta_matches.sort_values(by='date', inplace=True)

In [60]:
wta_matches.to_csv("wta_matches_cleaned.csv")