# In this notebook I build a ranking of the current top 30 female tennis players. I then expand this dataframe to include the four Grand Slam tournaments (French Open, US Open, Australian Open and Wimbledon) and the performance of the top 30 in these events.


In [1]:
import pandas as pd
import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): a blanket "ignore" also hides any pandas warnings raised by
# the data-cleaning cells below — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Top tennis players


In [2]:
# Eurosport page with the WTA standings that the ranking is scraped from.
top_url = 'https://www.eurosport.com/tennis/wta/standings.shtml'

In [3]:
# Download the page and parse its HTML tables; read_html returns a list of DataFrames.
top1 = pd.read_html(top_url)

In [4]:
# Keep the first parsed table; the preview below shows it holds the players and their points.
top = top1[0]

In [5]:
# Preview the first nine rows of the scraped standings.
top.head(9)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Players,Pts,Unnamed: 4
0,,1,I. Swiatek,11695,
1,,2,A. Sabalenka,8138,
2,,3,C. Gauff,7638,
3,,4,E. Rybakina,5673,
4,,5,J. Pegula,4550,
5,,6,M. Vondroušová,4143,
6,,7,M. Sakkari,3980,
7,,8,Q. Zheng,3945,
8,,9,O. Jabeur,3748,


In [6]:
# Keep only the columns from 'Players' through 'Pts' (label slices are inclusive).
# .copy() makes the result an independent frame, so the name clean-up that
# follows modifies this table rather than a slice of the scraped one.
top = top.loc[:, 'Players':'Pts'].copy()

In [7]:
# Removing leading/trailing whitespace around the player names.
# String methods return new values, so the result must be assigned back to
# the column for the clean-up to take effect.
top['Players'] = top['Players'].str.strip()

In [8]:
# Using just the last name: split on spaces and keep the final token.
# The whole column is assigned at once; element-wise chained indexing
# (top['Players'][i] = ...) does not update the frame under pandas
# Copy-on-Write and relies on the index being 0..n-1.
top['Players'] = top['Players'].str.split(" ").str[-1]

In [9]:
# Correcting the spelling so the names match the ones used on Wikipedia.
# A single replace() on the column avoids chained boolean-mask assignment,
# which may write to a temporary copy instead of the frame.
top['Players'] = top['Players'].replace(
    {'Swiatek': 'Świątek', 'Krejcíková': 'Krejčíková'}
)

# Adding new info:  Roland Garros winners

In [10]:
# Wikipedia list of French Open (Roland Garros) women's singles champions.
rg_url = 'https://en.wikipedia.org/wiki/List_of_French_Open_women%27s_singles_champions#French_Open'

In [11]:
# Parse every HTML table on the page into a list of DataFrames.
rg1 = pd.read_html(rg_url)

In [12]:
# Pick the table at position 3; the preview below shows it starts with the 1968 final.
# NOTE(review): the position is hard-coded and will shift if the page layout
# changes — confirm it still points at the intended table.
rg=rg1[3]

In [13]:
# Preview the first nine finals of the selected table.
rg.head(9)

Unnamed: 0,Year[d],Country,Champion,Country.1,Runner-up,Score in the final[10]
0,1968,USA,Nancy Richey,GBR,Ann Haydon Jones,"5–7, 6–4, 6–1"
1,1969,AUS,Margaret Court,GBR,Ann Haydon Jones,"6–1, 4–6, 6–3"
2,1970,AUS,Margaret Court (2),FRG[h],Helga Niessen,"6–2, 6–4"
3,1971,AUS,Evonne Goolagong[i],AUS,Helen Gourlay,"6–3, 7–5"
4,1972,USA,Billie Jean King,AUS,Evonne Goolagong,"6–3, 6–3"
5,1973,AUS,Margaret Court (3),USA,Chris Evert,"6–7(5–7),[j] 7–6(8–6), 6–4"
6,1974,USA,Chris Evert,URS,Olga Morozova,"6–1, 6–2"
7,1975,USA,Chris Evert (2),TCH[k],Martina Navratilova[l],"2–6, 6–2, 6–1"
8,1976,GBR[m],Sue Barker,TCH,Renáta Tomanová,"6–2, 0–6, 6–2"


In [14]:
# Cleaning data: the Champion column carries a running title count, e.g.
# "Margaret Court (2)", and Wikipedia footnote markers, e.g.
# "Evonne Goolagong[i]". Both are removed first, then surrounding spaces are
# stripped and only the last name (the final space-separated token) is kept.
#
# Everything is done with vectorised string methods and assigned back to the
# column in one step: in-place methods and item assignment on a column
# selection (rg['Champion'].replace(..., inplace=True), rg['Champion'][i] = ...)
# may operate on a temporary copy and leave the frame unchanged.
rg['Champion'] = (
    rg['Champion']
    .str.replace(r'\(\d+\)|\[\w+\]', '', regex=True)
    .str.strip()
    .str.split(" ")
    .str[-1]
)

In [15]:
# We get rid of all columns, just keeping the count of wins per player.
# The output column names are set explicitly ('index' = player name,
# 'Champion' = number of titles) because the default names produced by
# value_counts() changed in pandas 2.0, and the merge below joins on 'index'.
rg = rg.Champion.value_counts().rename_axis('index').reset_index(name='Champion')

In [16]:
# Preview the three players with the most French Open titles.
rg.head(3)

Unnamed: 0,index,Champion
0,Evert,7
1,Graf,6
2,Henin,4


# Merging together two dataframes:

In [17]:
# Left join: every ranked player is kept; players without a French Open
# title get NaN in the columns coming from rg.
partial1 = top.merge(
    rg,
    how='left',
    left_on='Players',
    right_on='index',
)

In [18]:
# Preview the merged table.
partial1.head(3)

Unnamed: 0,Players,Pts,index,Champion
0,Świątek,11695,Świątek,3.0
1,Sabalenka,8138,,
2,Gauff,7638,,


In [19]:
# Renaming the Champion column to 'French Open Wins' and removing the helper
# 'index' column left over from the merge.
partial1 = (
    partial1
    .rename(columns={'Champion': 'French Open Wins'})
    .drop(columns='index')
)

# Replacing NaNs (players without a title) with zeros. The filled column is
# assigned back: fillna(..., inplace=True) on a column selection may act on a
# temporary copy and leave the NaNs in the frame.
partial1['French Open Wins'] = partial1['French Open Wins'].fillna(0)

In [20]:
# Preview the table after renaming and filling the missing counts.
partial1.head(3)

Unnamed: 0,Players,Pts,French Open Wins
0,Świątek,11695,3.0
1,Sabalenka,8138,0.0
2,Gauff,7638,0.0


# US Open

In [21]:
# Wikipedia list of US Open women's singles champions.
us_url = 'https://en.wikipedia.org/wiki/List_of_US_Open_women%27s_singles_champions'

In [22]:
# Parse every HTML table on the page into a list of DataFrames.
us1 = pd.read_html(us_url)

In [23]:
# Pick the table at position 3; the preview below shows it starts with the 1968 final.
# NOTE(review): hard-coded position — confirm it still matches the page layout.
us = us1[3]

In [24]:
# Preview the first four finals of the selected table.
us.head(4)

Unnamed: 0,Year[d],Country,Champion,Country.1,Runner-up,Score[14]
0,1968,GBR,Virginia Wade,USA,Billie Jean King,"6–4, 6–2"
1,1969,AUS,Margaret Court (3),USA,Nancy Richey,"6–2, 6–2"
2,1970,AUS,Margaret Court (4),USA,Rosemary Casals,"6–2, 2–6, 6–1"
3,1971,USA,Billie Jean King (2),USA,Rosemary Casals,"6–4, 7–6(5–2)"


In [25]:
# Cleaning data: the Champion column carries a running title count, e.g.
# "Margaret Court (3)". It has to be removed before taking the last name,
# otherwise the final space-separated token is "(3)" rather than "Court".
# Wikipedia footnote markers such as "[i]" are removed as well.
# Then surrounding spaces are stripped and only the last name is kept.
#
# The cleaned column is assigned back in one step; item assignment through
# chained indexing (us['Champion'][i] = ...) may write to a temporary copy.
us['Champion'] = (
    us['Champion']
    .str.replace(r'\(\d+\)|\[\w+\]', '', regex=True)
    .str.strip()
    .str.split(" ")
    .str[-1]
)

In [26]:
# We get rid of all columns, just keeping the count of wins per player.
# The output column names are set explicitly ('index' = player name,
# 'Champion' = number of titles) because the default names produced by
# value_counts() changed in pandas 2.0, and the merge below joins on 'index'.
us = us.Champion.value_counts().rename_axis('index').reset_index(name='Champion')

# Merging US Open with partial1:


In [27]:
# Left join: every ranked player is kept; players without a US Open title
# get NaN in the columns coming from us.
partial2 = partial1.merge(
    us,
    how='left',
    left_on='Players',
    right_on='index',
)

In [28]:
# Renaming the Champion column to 'US Open Wins' and removing the helper
# 'index' column left over from the merge.
partial2 = (
    partial2
    .rename(columns={'Champion': 'US Open Wins'})
    .drop(columns='index')
)

# Replacing NaNs (players without a title) with zeros. The filled column is
# assigned back: fillna(..., inplace=True) on a column selection may act on a
# temporary copy and leave the NaNs in the frame.
partial2['US Open Wins'] = partial2['US Open Wins'].fillna(0)

In [29]:
# Preview the table with the US Open column added.
partial2.head(5)

Unnamed: 0,Players,Pts,French Open Wins,US Open Wins
0,Świątek,11695,3.0,1.0
1,Sabalenka,8138,0.0,0.0
2,Gauff,7638,0.0,1.0
3,Rybakina,5673,0.0,0.0
4,Pegula,4550,0.0,0.0


# Australian Open


In [30]:
# Wikipedia list of Australian Open women's singles champions.
ao_url = 'https://en.wikipedia.org/wiki/List_of_Australian_Open_women%27s_singles_champions'

In [31]:
# Parse every HTML table on the page into a list of DataFrames.
ao1 = pd.read_html(ao_url)

In [32]:
# Pick the table at position 3.
# NOTE(review): presumably the same kind of champions table as for the other
# tournaments; the position is hard-coded and not previewed here — confirm.
ao = ao1[3]

In [33]:
# Trim leading and trailing whitespace from every champion name.
ao['Champion'] = ao['Champion'].str.strip()

In [34]:
# Keeping just the last name (the final space-separated token).
# Running title counts such as "(2)" and footnote markers such as "[i]" are
# removed first, in case this table carries them like the French Open and
# US Open tables do; otherwise they would replace or pollute the last name.
#
# The cleaned column is assigned back in one step; item assignment through
# chained indexing (ao['Champion'][i] = ...) may write to a temporary copy.
ao['Champion'] = (
    ao['Champion']
    .str.replace(r'\(\d+\)|\[\w+\]', '', regex=True)
    .str.strip()
    .str.split(" ")
    .str[-1]
)

In [35]:
# We get rid of all columns, just keeping the count of wins per player.
# The output column names are set explicitly ('index' = player name,
# 'Champion' = number of titles) because the default names produced by
# value_counts() changed in pandas 2.0, and the merge below joins on 'index'.
ao = ao.Champion.value_counts().rename_axis('index').reset_index(name='Champion')

# Merging Australian Open with partial2

In [36]:
# Left join: every ranked player is kept; players without an Australian Open
# title get NaN in the columns coming from ao.
partial3 = partial2.merge(
    ao,
    how='left',
    left_on='Players',
    right_on='index',
)

In [37]:
# Renaming the Champion column to 'AUS Open Wins' and removing the helper
# 'index' column left over from the merge.
partial3 = (
    partial3
    .rename(columns={'Champion': 'AUS Open Wins'})
    .drop(columns='index')
)

# Replacing NaNs (players without a title) with zeros. The filled column is
# assigned back: fillna(..., inplace=True) on a column selection may act on a
# temporary copy and leave the NaNs in the frame.
partial3['AUS Open Wins'] = partial3['AUS Open Wins'].fillna(0)

In [38]:
# Preview the table with the Australian Open column added.
partial3.head(5)

Unnamed: 0,Players,Pts,French Open Wins,US Open Wins,AUS Open Wins
0,Świątek,11695,3.0,1.0,0.0
1,Sabalenka,8138,0.0,0.0,2.0
2,Gauff,7638,0.0,1.0,0.0
3,Rybakina,5673,0.0,0.0,0.0
4,Pegula,4550,0.0,0.0,0.0


# Wimbledon: same drill


In [39]:
# Wikipedia list of Wimbledon ladies' singles champions.
w_url = 'https://en.wikipedia.org/wiki/List_of_Wimbledon_ladies%27_singles_champions'

In [40]:
# Parse every HTML table on the page into a list of DataFrames.
w1 = pd.read_html(w_url)
# Pick the table at position 3.
# NOTE(review): hard-coded position, not previewed here — confirm it is the
# intended champions table.
w = w1[3]

In [41]:
# Cleaning data: running title counts such as "(2)" and Wikipedia footnote
# markers such as "[i]" are removed, surrounding spaces are stripped and only
# the last name (the final space-separated token) is kept.
#
# Everything is done with vectorised string methods and assigned back to the
# column in one step: in-place methods and item assignment on a column
# selection (w['Champion'].replace(..., inplace=True), w['Champion'][i] = ...)
# may operate on a temporary copy and leave the frame unchanged.
w['Champion'] = (
    w['Champion']
    .str.replace(r'\(\d+\)|\[\w+\]', '', regex=True)
    .str.strip()
    .str.split(" ")
    .str[-1]
)

# We get rid of all columns, just keeping the count of wins per player.
# The output column names are set explicitly ('index' = player name,
# 'Champion' = number of titles) because the default names produced by
# value_counts() changed in pandas 2.0, and the merge below joins on 'index'.
w = w.Champion.value_counts().rename_axis('index').reset_index(name='Champion')

# Merging Wimbledon with partial3

In [42]:
# Left join: every ranked player is kept; players without a Wimbledon title
# get NaN in the columns coming from w.
final = partial3.merge(
    w,
    how='left',
    left_on='Players',
    right_on='index',
)

In [43]:
# Renaming the Champion column to 'Wimbledon Wins' and removing the helper
# 'index' column left over from the merge.
final = (
    final
    .rename(columns={'Champion': 'Wimbledon Wins'})
    .drop(columns='index')
)

# Replacing NaNs (players without a title) with zeros. The filled column is
# assigned back: fillna(..., inplace=True) on a column selection may act on a
# temporary copy and leave the NaNs in the frame.
final['Wimbledon Wins'] = final['Wimbledon Wins'].fillna(0)

# Adding total wins column to the final dataframe

In [44]:
# Row-wise sum over the tournament columns, from 'French Open Wins' through
# 'Wimbledon Wins' (label slices include both end points).
final['Total Wins'] = (
    final
    .loc[:, 'French Open Wins':"Wimbledon Wins"]
    .sum(axis='columns', numeric_only=True)
)

In [45]:
# Display the complete table.
final

Unnamed: 0,Players,Pts,French Open Wins,US Open Wins,AUS Open Wins,Wimbledon Wins,Total Wins
0,Świątek,11695,3.0,1.0,0.0,0.0,4.0
1,Sabalenka,8138,0.0,0.0,2.0,0.0,2.0
2,Gauff,7638,0.0,1.0,0.0,0.0,1.0
3,Rybakina,5673,0.0,0.0,0.0,1.0,1.0
4,Pegula,4550,0.0,0.0,0.0,0.0,0.0
5,Vondroušová,4143,0.0,0.0,0.0,1.0,1.0
6,Sakkari,3980,0.0,0.0,0.0,0.0,0.0
7,Zheng,3945,0.0,0.0,0.0,0.0,0.0
8,Jabeur,3748,0.0,0.0,0.0,0.0,0.0
9,Collins,3472,0.0,0.0,0.0,0.0,0.0


In [46]:
# Players with at least one title, most successful first.
(
    final
    .loc[final['Total Wins'] != 0, ['Players', 'Total Wins']]
    .sort_values(by='Total Wins', ascending=False)
)

Unnamed: 0,Players,Total Wins
0,Świątek,4.0
1,Sabalenka,2.0
20,Azarenka,2.0
2,Gauff,1.0
3,Rybakina,1.0
5,Vondroušová,1.0
10,Ostapenko,1.0
25,Krejčíková,1.0
