# Data Processing (CC)
Covered in this notebook:
1. Downloading match data from tennis-data.co.uk.
2. Obtaining pageview counts for each player.
3. Adding columns for all variables described in the paper:
    a. RankDist.
    b. WikiBuzz.
    c. Implied Probability / Inverse Odds.
4. Producing a CSV file ready for further use.

In [1]:
# Imports, Remove Warnings for notebook readability.
import pandas as pd
from datetime import datetime, timedelta
import warnings
import requests
from statistics import median
import numpy as np
import json
import pandas as pd
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load all data: one workbook per season (2015-2022), stacked into a single frame.
url = 'http://www.tennis-data.co.uk/{year}w/{year}.xlsx'
wta_matchess = [(pd.read_excel(url.format(year=str(x)), sheet_name=str(x))) for x in range(2015, 2022 + 1)]
wta_matches = pd.concat(wta_matchess, axis=0, ignore_index=True)
wta_matches['Date'] = pd.to_datetime(wta_matches['Date'])
# Keep only matches strictly after 2015-07-01. The explicit copy makes the
# column assignments below write to an independent frame instead of a slice
# of the concatenated one (which triggers SettingWithCopyWarning).
wta_matches = wta_matches[wta_matches["Date"] > '2015-07-01'].copy()
wta_matches['match_id'] = wta_matches.index # Index column.
# Player names use underscores instead of spaces; the space is a literal, not a regex.
wta_matches['Winner'] = wta_matches['Winner'].str.replace(' ', '_', regex=False)
wta_matches['Loser'] = wta_matches['Loser'].str.replace(' ', '_', regex=False)

In [3]:
# Link players to their Wikipedia pages.
# The Google Drive share link is turned into a direct-download URL by extracting its file id.
url = "https://drive.google.com/file/d/1PP6qoLuh43Fdkj5oVfqTKN4TWe6dcwfd/view?usp=share_link"
drive_file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + drive_file_id
players = pd.read_csv(path)

# Map the name used in the odds data to the `player` column value, with spaces
# replaced by underscores on both sides. Entries whose value is a float
# (i.e. a missing cell read as NaN) are skipped.
player_dict = dict(zip(players.odds_player, players.player))
player_dict = {
    odds_name.replace(' ', '_'): wiki_title.replace(' ', '_')
    for odds_name, wiki_title in player_dict.items()
    if not isinstance(wiki_title, float)
}
# Hand-maintained overrides: name as it appears in the match data -> Wikipedia page title.
# Some keys appear more than once (always with the same value); in a dict literal
# the last occurrence wins, so repeated keys must be kept in agreement.
# Abbreviated keys for players who share a surname must each point at their own
# page: 'Radwanska_U.', 'Rodionova_An.', 'Fruhvirtova_B.', 'Andreeva_M.' and
# 'Zakharova_A.' no longer reuse a sibling's or namesake's page.
# NOTE(review): 'Wang_X.' could be ambiguous between Wang Xinyu and Wang Xiyu - confirm against the match data.
new_player_dict = {
    'Wang_X.':'Wang_Xiyu',
    'Wang_Xiy.':'Wang_Xiyu',
    'Stephens_S.':'Sloane_Stephens',
    'Pliskova_Ka.':'Karolína_Plíšková',
    'Williams_S.':'Serena_Williams',
    'Pliskova_Kr.':'Kristýna_Plíšková',
    'Alexandra_Krunic':'Aleksandra_Krunić',
    'Yingying_Duan':'Duan_Yingying',
    'Lin_Zhu':'Zhu_Lin_(tennis)',
    'Rodionova_Ar.':'Arina_Rodionova',
    'Carina_Witthoeft':'Carina_Witthöft',
    'Stefanie_Voegele':'Stefanie_Vögele',
    'Sílvia_Soler-Espinosa':'Sílvia_Soler_Espinosa',
    'Xinyun_Han':'Han_Xinyun',
    'Yafan_Wang':'Wang_Yafan',
    'Jana_Cepelova':'Jana_Čepelová',
    'Zhang_Shuai':'Zhang_Shuai',
    'Catherine_McNally':'Caty_McNally',
    'Viktoria_Kuzmova':'Viktória_Kužmová',
    'Barbora_Krejcikova':'Barbora_Krejčíková',
    'En-Shuo_Liang':'Liang_En-shuo',
    'Chloe_Paquet':'Chloé_Paquet',
    'Su-Wei_Hsieh':'Hsieh_Su-wei',
    'Saisai_Zheng':'Zheng_Saisai',
    'Marketa_Vondrousova':'Markéta_Vondroušová',
    'Fangzhou_Liu':'Liu_Fangzhou',
    'Patricia_Maria_Tig':'Patricia_Maria_Țig',
    'Julia_Goerges':'Julia_Görges',
    'Lesley_Pattinama_Kerkhove':'Lesley_Pattinama_Kerkhove',
    'Tamara_Zidansek':'Tamara_Zidanšek',
    'Iga_Swiatek':'Iga_Świątek',
    'Anna_Karolina_Schmiedlova':'Anna_Karolína_Schmiedlová',
    'Shuai_Peng':'Peng_Shuai',
    'Leonie_Kung':'Leonie_Küng',
    'Radwanska_U.':'Urszula_Radwańska',  # was Agnieszka's page; "U." is Urszula
    'Barbora_Strycova':'Barbora_Strýcová',
    'Cagla_Buyukakcay':'Çağla_Büyükakçay',
    'Rodionova_An.':'Anastasia_Rodionova',  # was Arina's page; "An." is Anastasia
    'Alexandra_Cadantu':'Alexandra_Cadanțu-Ignatik',
    'Marie_Bouzkova':'Marie_Bouzková',
    'Denisa_Allertova':'Denisa_Šátralová',
    'Lucie_Hradecka':'Lucie_Hradecká',
    'Ivana_Jorovic':'Ivana_Jorović',
    'Maia_Lumsden':'Maia_Lumsden',
    'Mirjana_Lucic':'Mirjana_Lučić-Baroni',
    'Hailey_Baptiste':'Hailey_Baptiste',
    'Katerina_Siniakova':'Kateřina_Siniaková',
    'Mihaela_Buzarnescu':'Mihaela_Buzărnescu',
    'Qiang_Wang':'Wang_Qiang_(tennis)',
    'Xiaodi_You':'You_Xiaodi',
    'Paula_Cristina_Goncalves':'Paula_Cristina_Gonçalves',
    'Aliona_Bolsova':'Aliona_Bolsova',
    'Tereza_Smitkova':'Tereza_Smitková',
    'Xinyu_Wang':'Wang_Xinyu',
    'Leylah_Fernandez':'Leylah_Fernandez',
    'Magdalena_Rybarikova':'Magdaléna_Rybáriková',
    'Johanna_Larsson':'Johanna_Larsson',
    'Mirjana_Lucic-Baroni':'Mirjana_Lučić-Baroni',
    'Danka_Kovinic':'Danka_Kovinić',
    'Tereza_Martincova':'Tereza_Martincová',
    'Montserrat_Gonzalez':'Montserrat_González',
    'Maria_Herazo_Gonzalez':'María_Herazo_González',
    'Ipek_Soylu':'İpek_Soylu',
    'Petra_Martic':'Petra_Martić',
    'Timea_Babos':'Tímea_Babos',
    'Nina_Stojanovic':'Nina_Stojanović',
    'Na-Lae_Han':'Han_Na-lae',
    'Jia-Jing_Lu':'Lu_Jiajing', #From 2nd iteration, losers column...
    'Lesley_Pattinama_Kerkhove':'Lesley_Pattinama_Kerkhove',
    'Ya-Hsuan_Lee':'Lee_Ya-hsuan',
    'Alexandra_Cadanțu-Ignatik':'Alexandra_Cadanțu-Ignatik',
    'Maria_Mateas':'Maria_Mateas',
    'Selena_Janicijevic':'Séléna_Janicijevic',
    'Jessica_Pieri':'Jessica_Pieri',
    'Denisa_Šátralová':'Denisa_Šátralová',
    'Kamilla_Rakhimova':'Kamilla_Rakhimova',
    'Tess_Sugnaux':'Tess_Sugnaux',
    'Maia_Lumsden':'Maia_Lumsden',
    'Mirjam_Bjorklund':'Mirjam_Björklund',
    'Alyssa_Mayo':'Alyssa_Mayo',
    'Komola_Umarova':'Komola_Umarova',
    'Ng_Kwan-yau':'Ng_Kwan-yau',
    'Cristiana_Ferrando':'Cristiana_Ferrando',
    'Martina_Capurro_Taborda':'Martina_Capurro_Taborda',
    'Elena-Gabriela_Ruse':'Elena-Gabriela_Ruse',
    'Freya_Christie':'Freya_Christie',
    'Jovana_Jovic':'Jovana_Jović',
    'Mira_Antonitsch':'Mira_Antonitsch',
    'Dayana_Yastremska':'Dayana_Yastremska',
    'Emiliana_Arango':'Emiliana_Arango',
    'Nadia_Echeverria_Alam':'Nadia_Echeverría_Alam',
    'Frances_Altick':'Frances_Altick',
    'Ayaka_Okuno':'Ayaka_Okuno',
    'Anastasiya_Shoshyna':'Anastasiya_Shoshyna',
    'Jesika_Maleckova':'Jesika_Malečková',
    'Wushuang_Zheng':'Zheng_Wushuang',
    'Karolina_Muchova':'Karolína_Muchová',
    'Varvara_Gracheva':'Varvara_Gracheva',
    'Baindl_K.':'Kateryna_Baindl', # third iteration here
    'Uchijima_M.':'Moyuka_Uchijima',
    'Kalieva_E.':'Elvina_Kalieva',
    'Osorio_M.':'Camila_Osorio',
    'Naito_Y.':'Yuki_Naito',
    'Lazaro_A.':'Andrea_Lázaro_García',
    'Alves_C.':'Carolina_Alves_(tennis)',
    'Schunk_N.':'Nastasja_Schunk',
    'Saigo_R.':'Rina_Saigo',
    'Ioana_Minca':'Ioana_Mincă',
    'Zheng_Q.':'Zheng_Qinwen',
    'Guth_M.':'Mara_Guth',
    'Palicova_B.':'Barbora_Palicová',
    'Shibahara_E.':'Ena_Shibahara',
    'Pliskova_Ka.':'Karolína_Plíšková',
    'Reyngold_E.':'Ekaterina_Reyngold',
    'Middendorf_J.':'Julia_Middendorf',
    'Cengiz_B.':'Berfu_Cengiz',
    'Jeong_B.':'Jeong_Bo-young',
    'Szabanin_N.':'Natália_Szabanin',
    'Stakusic_M.':'Marina_Stakusic',
    'Mandlik_E.':'Elizabeth_Mandlik',
    'Contreras_Gomez_F.':'Fernanda_Contreras_Gómez',
    'Paoletti_M.':'Matilde_Paoletti',
    'Lamens_S.':'Suzan_Lamens',
    'Sherif_M.':'Mayar_Sherif',
    'Alexandra_Krunic':'Aleksandra_Krunić',
    'Bartunkova_N.':'Nikola_Bartůňková',
    'Jones_F.':'Francesca_Jones_(tennis)',
    'Yashina_E.':'Ekaterina_Yashina',
    'Mariana_Duque-Mariño':'Mariana_Duque_Mariño',
    'Bassols_Ribera_M.':'Marina_Bassols_Ribera',
    'Fruhvirtova_L.':'Linda_Fruhvirtová',
    'Laura_Pous-Tio':'Laura_Pous_Tió',
    'Williams_S.':'Serena_Williams',
    'Serban_R.':'Raluca_Șerban',
    'Caregaro_M.':'Martina_Caregaro',
    'Gabueva_A.':'Angelina_Gabueva',
    'Stearns_P.':'Peyton_Stearns',
    'Na-Lae_Han':'Han_Na-lae',
    'Scott_K.':'Katrina_Scott',
    'Lovric_P.':'Pia_Lovrič',
    'Bandecchi_S.':'Susan_Bandecchi',
    'Preston_T.':'Taylah_Preston',
    'Bronzetti_L.':'Lucia_Bronzetti',
    'Berberovic_N.':'Nefisa_Berberović',
    'Herazo_M.':'María_Herazo_González',
    'Montgomery_R.':'Robin_Montgomery',
    'Turati_B.':'Bianca_Turati',
    'Hewitt_D.':'Dalayna_Hewitt',
    'Hanatani_N.':'Nagi_Hanatani',
    'Zueger_J.':'Joanne_Züger',
    'Noskova_L.':'Linda_Nosková',
    'Marcinko_P.':'Petra_Marčinko',
    'Kraus_S.':'Sinja_Kraus',
    'Chang_S.':'Sophie_Chang',
    'Yu_E.':'Eleana_Yu',
    'Krawczyk_D.':'Desirae_Krawczyk',
    'Krueger_A.':'Ashlyn_Krueger',
    'Prisacariu_A.':'Andreea_Prisăcariu',
    'Jimenez_V.':'Victoria_Jiménez_Kasintseva',
    'En-Shuo_Liang':'Liang_En-shuo',
    'Monnet_C.':'Carole_Monnet',
    'Radivojevic_L.':'Lola_Radivojević',
    'Ruse_E-G.':'Elena-Gabriela_Ruse',
    'Morderger_Y.':'Yana_Morderger',
    'Burillo_I.':'Irene_Burillo_Escorihuela',
    'Sebov_K.':'Katherine_Sebov',
    'Mansouri_Y.':'Yasmine_Mansouri',
    'Bejlek_S.':'Sára_Bejlek',
    'Jang_S.':'Jang_Su-jeong',
    'Wurth_T.':'Tara_Würth',
    'Yingying_Duan':'Duan_Yingying',
    'Pigato_L.':'Lisa_Pigato',
    'Back_D.':'Back_Da-yeon',
    'Glushko_L.':'Lina_Glushko',
    'Kartal_S.':'Sonay_Kartal',
    'Andreeva_E.':'Erika_Andreeva',
    'Ignatik_A.':'Alexandra_Cadanțu-Ignatik',
    'Bassols_M.':'Marina_Bassols_Ribera',
    'Cross_K.':'Kayla_Cross',
    'Yifan_Xu':'Xu_Yifan',
    'Maria_Camila_Osorio_Serrano':'Camila_Osorio',
    'Anderson_R.':'Robin_Anderson_(tennis)',
    'Radisic_N.':'Nika_Radišić',
    'Parrizas_Diaz_N.':'Nuria_Párrizas_Díaz',
    'Sun_L.':'Lulu_Sun',
    'Sakatsume_H.':'Himeno_Sakatsume',
    'Bucsa_C.':'Cristina_Bucșa',
    'Jacquemot_E.':'Elsa_Jacquemot',
    'Bektas_E.':'Emina_Bektas',
    'María-Teresa_Torró-Flor':'María_Teresa_Torró_Flor',
    'Yang_Zha.':'Yang_Zhaoxuan',
    'Nuudi_M.':'Maileen_Nuudi',
    'Okamura_K.':'Kyōka_Okamura',
    'Falkowska_W.':'Weronika_Falkowska',
    'Talaba_G.':'Gabriela_Lee',
    'Kulambayeva_Z.':'Zhibek_Kulambayeva',
    'Fruhvirtova_B.':'Brenda_Fruhvirtová',  # was Linda's page; "B." is Brenda
    'Falkner_Z.':'Živa_Falkner',
    'Harrison_Ca.':'Catherine_Harrison_(tennis)',
    'Brace_C.':'Cadence_Brace',
    'Radwanska_U.':'Urszula_Radwańska',  # repeated key, kept in agreement with the entry above
    'Eala_A.':'Alex_Eala',
    'Plazas_J.':'Jessica_Plazas',
    'Grey_S.':'Sarah_Beth_Grey',
    'Gervais_J.':'Julie_Gervais',
    'Raducanu_E.':'Emma_Raducanu',
    'Carle_M.':'María_Carlé',
    'Tjandramulia_O.':'Olivia_Tjandramulia',
    'Di_Sarra_F.':'Federica_Di_Sarra',
    'Olyanovskaya_V.':'Valeriia_Olianovskaia',
    'Kubka_M.':'Martyna_Kubka',
    'McNally_C.':'Caty_McNally',
    'Natalija_Kostic':'Natalija_Stevanović',
    'Baszak_W.':'Weronika_Baszak',
    'Papamichail_D.':'Despina_Papamichail',
    'Bhatia_R.':'Riya_Bhatia',
    'Chwalinska_M.':'Maja_Chwalińska',
    'Havlickova_L.':'Lucie_Havlíčková',
    'Hartono_A.':'Arianne_Hartono',
    'Hontama_M.':'Mai_Hontama',
    'Gleason_Q.':'Quinn_Gleason',
    'Burrage_J.':'Jodie_Burrage',
    'Hatouka_Y.':'Yuliya_Hatouka',
    'Pliskova_Kr.':'Kristýna_Plíšková',
    'Lys_E.':'Eva_Lys',
    'Gadecki_O.':'Olivia_Gadecki',
    'Rodionova_Ar.':'Arina_Rodionova',
    'Lee_G.':'Gabriela_Lee',
    'Malygina_E.':'Elena_Malõgina',
    'Stefanini_L.':'Lucrezia_Stefanini',
    'Sutjiadi_A.':'Aldila_Sutjiadi',
    'Jeanjean_L.':'Léolia_Jeanjean',
    'Miyazaki_Y.':'Yuriko_Miyazaki',
    'Maria_Herazo_Gonzalez':'María_Herazo_González',
    'Tkacheva_M.':'Mariia_Tkacheva',
    'Mendez_S.':'Seone_Mendez',
    'Rodionova_An.':'Anastasia_Rodionova',  # repeated key, kept in agreement with the entry above
    'Saville_D.':'Daria_Saville',
    'Mishina_D.':'Daria_Mishina',
    'Andreea_Rosca':'Andreea_Roșca',
    'Babel_O.':'Océane_Babel',
    'Ji-Hee_Choi':'Choi_Ji-hee',
    'Anshba_A.':'Amina_Anshba',
    'Brancaccio_N.':'Nuria_Brancaccio',
    'Shymanovich_I.':'Iryna_Shymanovich',
    'Mboko_V.':'Victoria_Mboko',
    'Zakharova_A.':'Anastasia_Zakharova',  # was 'Maria_Zakharova', which does not match the initial "A."
    'Riske-Amritraj_A.':'Alison_Riske-Amritraj',
    'Gasanova_A.':'Anastasia_Gasanova',
    'Selekhmeteva_O.':'Oksana_Selekhmeteva',
    'Wagner_S.':'Stephanie_Wagner',
    'Tikhonova_A.':'Anastasia_Tikhonova_(tennis)',
    'Da_Silva_Fick_G.':'Gabriella_Da_Silva-Fick',
    'Jani_R-L.':'Réka_Luca_Jani',
    'Avanesyan_E.':'Elina_Avanesyan',
    'Silva_E.':'Eden_Silva',
    'Monroy_Y.':'Yuliana_Monroy',
    'Raducànu_E.':'Emma_Raducanu',
    'Zuger_J.':'Joanne_Züger',
    'Andreeva_M.':'Mirra_Andreeva',  # was Erika's page; "M." is Mirra
    'Salkova_D.':'Dominika_Šalková',
    'Parks_A.':'Alycia_Parks',
    'Snigur_D.':'Daria_Snigur',
    'Davis_L.':'Lauren_Davis',
} # Introduce new player:wikipedia_page dictionary.
# Combine both lookups; on a key clash the hand-maintained entry wins.
player_dict = {**player_dict, **new_player_dict}

# Rewrite both name columns using the combined lookup.
for name_column in ('Winner', 'Loser'):
    wta_matches.replace({name_column: player_dict}, inplace=True)

## 2. Page view counts.

In [None]:
# Download daily pageview counts for every Wikipedia page in `player_dict`
# and arrange them as a date x article table.
request_url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'\
              'per-article/en.wikipedia.org/all-access/all-agents/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                         '(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
           'From': 'https://github.com/Faxulous/notFeelingTheBuzz'}
all_p_requests = []
failed_requests = []  # pages whose download failed or returned no "items"
# Several names map to the same page, so request each distinct page only once
# (dict.fromkeys de-duplicates while keeping first-seen order).
for player in dict.fromkeys(player_dict.values()):
    try:
        response = requests.get(request_url + f'{player}/daily/20150101/20230505',
                                headers=headers, timeout=30)
        response.raise_for_status()
        p_request = response.json()["items"]
    except (requests.RequestException, ValueError, KeyError):
        # Skip this page entirely. Falling through to the extend() below would
        # re-append the previous player's rows (or hit an undefined name on the
        # very first iteration).
        failed_requests.append(player)
        continue
    all_p_requests.extend(p_request)

if failed_requests:
    print("Pageview requests failed for:", failed_requests)
if not all_p_requests:
    raise RuntimeError("No pageview data could be downloaded.")

# Convert combined JSON data to a pandas dataframe
df = pd.DataFrame(all_p_requests)

# Convert 'timestamp' to datetime format and set it as the index
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d00')
df.set_index('timestamp', inplace=True)
# Pivot the dataframe to have 'article' names as columns
pivot_df = df.pivot_table(values='views', columns='article', index=df.index)

In [None]:
# For every article, treat missing days as zero views, but only on rows after
# the article's first recorded value; rows before (and including) that first
# value are left as they are.
for article in pivot_df.columns:
    views = pivot_df[article]
    after_first_value = views.notna().cummax().shift(fill_value=False)
    pivot_df[article] = views.mask(after_first_value, views.fillna(0))

In [None]:
# Function to get the day before value and median value
def get_values(article, date, views=None, failures=None):
    """
    Look up an article's pageviews for the day before `date` and the median
    over the days preceding that.

    The window runs from `date - 366 days` to `date - 1 day`, both inclusive.
    The last row of the window is the "day before" value; the median is taken
    over every other row of the window.

    Parameters:
        article: column label of the article in the pageview table.
        date: match date (must support subtraction of a `timedelta`).
        views: date-indexed pageview table with one column per article.
            Defaults to the notebook-level `pivot_df`.
        failures: list that collects articles missing from `views`.
            Defaults to the notebook-level `failed_articles`.

    Returns:
        (day_before_value, median_value), or (nan, nan) when the article is
        unknown, the window is empty, or the window contains any NaN.
    """
    if views is None:
        views = pivot_df
    missing = (float('nan'), float('nan'))
    start_date = date - timedelta(days=366)
    end_date = date - timedelta(days=1)

    try:
        past_year_data = views.loc[start_date:end_date, article]
    except KeyError:
        # Resolve the default list only here, so the global is needed only on failure.
        (failed_articles if failures is None else failures).append(article)
        return missing
    # An empty window (date outside the table's range) would otherwise raise
    # IndexError on .iloc[-1] below.
    if past_year_data.empty or past_year_data.isna().any():
        return missing
    # NOTE(review): the window length is not checked. For dates less than 366
    # days after the first row of `views`, the slice is shorter than a year and
    # the median is silently taken over fewer days - confirm this is intended.

    day_before_value = past_year_data.iloc[-1]
    median_value = past_year_data.iloc[:-1].median()
    return (day_before_value, median_value)

failed_articles = []
wta_matches['Date'] = pd.to_datetime(wta_matches['Date'])
# For each side of the match, look up the (previous-day views, median views)
# pair row by row and unpack the pairs into two new columns.
for name_column, prefix in (('Winner', 'winner'), ('Loser', 'loser')):
    view_pairs = wta_matches.apply(lambda row: get_values(row[name_column], row['Date']), axis=1)
    wta_matches[f'{prefix}_previous_day_views'], wta_matches[f'{prefix}_median_views'] = zip(*view_pairs)
print("Failed articles:", set(failed_articles))

In [7]:
wta_matches = wta_matches.copy()
print(f'Total Rows: {len(wta_matches)}')
# Discard every match that is missing any of the four pageview features.
view_columns = ["winner_previous_day_views", 'winner_median_views',
                "loser_previous_day_views", "loser_median_views"]
wta_matches.dropna(subset=view_columns, inplace=True)
print(f'Total (Usable) Rows: {len(wta_matches)}')

Total Rows: 14292
Total (Usable) Rows: 13613


## 3a. RankDist.

In [8]:
def _inverse_rank(rank):
    """Return 1/rank, or 0 when the rank is missing (None, NaN) or not positive."""
    if rank is None or np.isnan(rank) or rank <= 0:
        return 0
    return 1 / rank


def RankDist_ij(winner_rank: int = 0, loser_rank: int = 0) -> float:
    """
    Calculates the RankDist variable: -(1/winner_rank - 1/loser_rank).

    A missing rank (None, NaN, or a non-positive placeholder such as the
    default 0) contributes 0 instead of its inverse, so the defaults no longer
    raise ZeroDivisionError.

    Parameters:
        winner_rank: rank of the player the value is computed for.
        loser_rank: rank of her opponent.

    Returns:
        The RankDist value.
    """
    inverse_wr = _inverse_rank(winner_rank)
    inverse_lr = _inverse_rank(loser_rank)
    rankdist = -(inverse_wr - inverse_lr)
    return rankdist

In [9]:
# RankDist from each player's point of view: the loser's value is computed
# with the two ranks swapped.
rank_pairs = list(zip(wta_matches['WRank'], wta_matches['LRank']))
wta_matches['WRankDist'] = [RankDist_ij(w_rank, l_rank) for w_rank, l_rank in rank_pairs]
wta_matches['LRankDist'] = [RankDist_ij(l_rank, w_rank) for w_rank, l_rank in rank_pairs]

## 3b. WikiBuzz.

In [10]:
def WikiBuzz_ij(winner_pageviews, winner_median, loser_pageviews, loser_median) -> float:
    """
    Calculates the WikiBuzz variable.

    For each player the term is log(pageviews / median); the result is the
    first player's term minus the second player's term. A median of 0 is
    replaced by 1, and a player with 0 pageviews contributes a term of 0.
    """
    def log_relative_views(pageviews, median):
        # Zero pageviews: no log is taken, the term is simply 0.
        if pageviews == 0:
            return 0
        baseline = 1 if median == 0 else median
        return np.log(pageviews / baseline)

    own_term = log_relative_views(winner_pageviews, winner_median)
    opponent_term = log_relative_views(loser_pageviews, loser_median)
    return own_term - opponent_term

In [11]:
# Ensure the four pageview columns are float-typed. The cast is done directly;
# converting to `str` first adds nothing for numeric data.
for view_column in ('winner_previous_day_views', 'winner_median_views',
                    'loser_previous_day_views', 'loser_median_views'):
    wta_matches[view_column] = wta_matches[view_column].astype(float)

In [12]:
# Using the view columns from above.
# WikiBuzz_ij subtracts the second player's term from the first player's, so
# the value depends on whose point of view it is computed from.
wta_matches['W_WikiBuzz'] = [WikiBuzz_ij(row[0], row[1], row[2], row[3]) for row in
                          zip(wta_matches['winner_previous_day_views'], wta_matches['winner_median_views'],
                              wta_matches['loser_previous_day_views'], wta_matches['loser_median_views'])]
# For the loser the roles are swapped (loser's views first, winner's second),
# mirroring how LRankDist swaps the ranks. Passing the arguments in the same
# order as above would just duplicate W_WikiBuzz.
wta_matches['L_WikiBuzz'] = [WikiBuzz_ij(row[2], row[3], row[0], row[1]) for row in
                          zip(wta_matches['winner_previous_day_views'], wta_matches['winner_median_views'],
                              wta_matches['loser_previous_day_views'], wta_matches['loser_median_views'])]

## 3. Cleaned CSV File Produced.

In [13]:
# Split every match into a winner row (outcome 1) and a loser row (outcome 0)
# sharing one column layout, then stack the two halves.
winner_columns = {'Winner': 'player', 'Date': 'date', 'WRankDist': 'rankdist', 'W_WikiBuzz': 'wikibuzz',
                  'B365W':'B365_Odds', 'MaxW':'Max_Odds', 'AvgW':'Avg_Odds'}
loser_columns = {'Loser': 'player', 'Date': 'date', 'LRankDist': 'rankdist', 'L_WikiBuzz': 'wikibuzz',
                 'B365L':'B365_Odds', 'MaxL':'Max_Odds', 'AvgL':'Avg_Odds'}

wta_matches_winners = (
    wta_matches[['match_id', 'WTA', *winner_columns]]
    .rename(columns=winner_columns)
    .assign(outcome=1)
)
wta_matches_losers = (
    wta_matches[['match_id', 'WTA', *loser_columns]]
    .rename(columns=loser_columns)
    .assign(outcome=0)
)
wta_matches = pd.concat([wta_matches_winners, wta_matches_losers], axis=0)

In [14]:
# Bring the two rows of each match next to each other by sorting on the shared
# pre-concat index, then renumber the rows from 0.
# A stable sort keeps the concat order within each pair (winner row first);
# the default sort does not guarantee any order among equal index values.
# The old index is discarded by drop=True, so there is no 'index' column to rename.
wta_matches = wta_matches.sort_index(kind='mergesort').reset_index(drop=True)

In [16]:
# Snapshot of the current row count; judging by the name, this is the baseline
# for the Bet365-vs-Max odds filter applied to `wta_matches` afterwards.
bet365cleaning_before = len(wta_matches)

In [18]:
# Flag rows where the Bet365 odds are greater than the maximum odds and drop them.
exceeds_max = wta_matches['B365_Odds'].gt(wta_matches['Max_Odds'])
indexes_to_remove = wta_matches.index[exceeds_max]
wta_matches.drop(index=indexes_to_remove, inplace=True)
len(wta_matches)

27206

In [17]:
# Display the row count stored in `bet365cleaning_before`.
bet365cleaning_before

27226

## 4. Implied Probability / Inverse Odds.

In [58]:
def implied_probability(player_odds) -> float:
    """Return the inverse of the given odds (1 / player_odds)."""
    inverse_odds = 1 / player_odds
    return inverse_odds

# Add one inverse-odds column per odds column, then remove the raw odds.
for inverse_column, odds_column in (('inverse_best', 'Max_Odds'),
                                    ('inverse_b365', 'B365_Odds'),
                                    ('inverse_avg', 'Avg_Odds')):
    wta_matches[inverse_column] = [implied_probability(odds) for odds in wta_matches[odds_column]]

wta_matches.drop(columns=['B365_Odds', 'Max_Odds', 'Avg_Odds'], inplace=True)

In [59]:
# Write the final table to the working directory. With to_csv's default
# index=True, the row index is written as an unnamed first column.
wta_matches.to_csv("clegg_matches_cleaned.csv")