In [1]:
import pandas as pd
from datetime import datetime
matches_2022 = pd.read_csv('data/matches_2022.csv')
matches_2022.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2022-8888,Atp Cup,Hard,16,A,20220103,300,200000,,,...,50.0,32.0,7.0,10.0,3.0,5.0,11.0,3308.0,19.0,2260.0
1,2022-8888,Atp Cup,Hard,16,A,20220103,299,133430,,,...,33.0,21.0,8.0,9.0,3.0,6.0,14.0,2475.0,20.0,2230.0
2,2022-8888,Atp Cup,Hard,16,A,20220103,298,105138,,,...,80.0,62.0,20.0,16.0,6.0,7.0,19.0,2260.0,9.0,3706.0
3,2022-8888,Atp Cup,Hard,16,A,20220103,297,105807,,,...,27.0,17.0,1.0,7.0,4.0,8.0,20.0,2230.0,860.0,18.0
4,2022-8888,Atp Cup,Hard,16,A,20220103,296,106421,,,...,35.0,22.0,4.0,8.0,3.0,7.0,2.0,8640.0,11.0,3308.0


## Get unique players by name and ID

In [2]:
# Get only the column names which include the substring 'name'
matches_2022.filter(regex='name').columns

Index(['tourney_name', 'winner_name', 'loser_name'], dtype='object')

In [3]:
# Get only rows where the value of column 'winner_name' or 'loser_name' includes 'Tim van Rijthoven' without accounting for case
matches_2022[matches_2022['winner_name'].str.contains('Tim van Rijthoven', case=False) | matches_2022['loser_name'].str.contains('Tim van Rijthoven', case=False)]

# Same as above but only show the winner name or loser name columns
matches_2022[matches_2022['winner_name'].str.contains('Tim van Rijthoven', case=False) | matches_2022['loser_name'].str.contains('Tim van Rijthoven', case=False)][['winner_name', 'loser_name']]

Unnamed: 0,winner_name,loser_name
1374,Tim Van Rijthoven,Daniil Medvedev
1376,Tim Van Rijthoven,Felix Auger Aliassime
1379,Tim Van Rijthoven,Hugo Gaston
1386,Tim Van Rijthoven,Taylor Fritz
1397,Tim Van Rijthoven,Matthew Ebden
1550,Tim Van Rijthoven,Federico Delbonis
1611,Tim Van Rijthoven,Reilly Opelka
1641,Tim Van Rijthoven,Nikoloz Basilashvili
1656,Novak Djokovic,Tim Van Rijthoven
1721,Mitchell Krueger,Tim Van Rijthoven


In [4]:
# Turn the above 2 lines into a function which outputs the index number of the found rows and the ratio of Tim's presence in the winner_name to loser_name columns
def find_matches_by_name(name):
    # Find the rows where name is present in the winner_name or loser_name columns
    name_matches_2022 = matches_2022[matches_2022['winner_name'].str.contains(name, case=False) | matches_2022['loser_name'].str.contains(name, case=False)]
    name_matches_index = name_matches_2022.index

    # Get the percent wins of the player
    won_percent = len(name_matches_2022[name_matches_2022['winner_name'].str.contains(name, case=False)]) / len(name_matches_2022)

    # From the latest row in the row index, get the value of the largest date value in the tourney_date column
    name_matches_latest_date = name_matches_2022.iloc[-1]['tourney_date']
    name_matches_latest_date = datetime.strptime(str(name_matches_latest_date), '%Y%m%d').strftime('%Y-%m-%d') # YYYY-MM-DD format

    # If the winner_name in the last row is equal to the name (not accounting for case), get that row's winner_id, otherwise get the loser_id
    if name_matches_2022.iloc[-1]['winner_name'].lower() == name.lower():
        id = name_matches_2022.iloc[-1]['winner_id']
    else:
        id = name_matches_2022.iloc[-1]['loser_id']

    # As a dictionary, return the name of the player, a list of the index numbers of the rows where the player is present and the ratio of the player's presence in the winner_name to loser_name columns
    return {'name': name, 'id': id, 'matches_2022': [name_matches_index], 'win_percent': won_percent, 'latest_date': name_matches_latest_date}

# Get the index number of the rows where Tim is present and the ratio of Tim's presence in the winner_name to loser_name columns
find_matches_by_name('Tim van Rijthoven')


{'name': 'Tim van Rijthoven',
 'id': 126646,
 'matches_2022': [Index([1374, 1376, 1379, 1386, 1397, 1550, 1611, 1641, 1656, 1721, 2125, 2174,
         2331, 2452, 2554],
        dtype='int64')],
 'win_percent': 0.6,
 'latest_date': '2022-10-17'}

In [5]:
def find_matches_by_id(id):
    # Find the rows where name is present in the winner_name or loser_name columns
    id_won_matches_2022 = matches_2022[matches_2022['winner_id'] == id]
    id_lost_matches_2022 = matches_2022[matches_2022['loser_id'] == id]
    id_matches_2022 = pd.concat([id_won_matches_2022, id_lost_matches_2022])
    
    id_matches_index = id_matches_2022.index

    # Get the percent wins of the player
    won_percent = len(id_won_matches_2022) / len(id_matches_2022)
    
    # From the latest row in the row index, get the value of the largest date value in the tourney_date column
    id_matches_latest_date = id_matches_2022.iloc[-1]['tourney_date']
    id_matches_latest_date = datetime.strptime(str(id_matches_latest_date), '%Y%m%d').strftime('%Y-%m-%d') # YYYY-MM-DD format

    # Get the name of the player
    name = id_matches_2022.iloc[-1]['winner_name'] if id_matches_2022.iloc[-1]['winner_id'] == id else id_matches_2022.iloc[-1]['loser_name']

    # As a dictionary, return the name of the player, a list of the index numbers of the rows where the player is present and the ratio of the player's presence in the winner_name to loser_name columns
    return {'name': name, 'id': id, 'matches_2022': [id_matches_index], 'win_percent': won_percent, 'latest_date': id_matches_latest_date}

find_matches_by_id(126646)

{'name': 'Tim Van Rijthoven',
 'id': 126646,
 'matches_2022': [Index([1374, 1376, 1379, 1386, 1397, 1550, 1611, 1641, 2125, 1656, 1721, 2174,
         2331, 2452, 2554],
        dtype='int64')],
 'win_percent': 0.6,
 'latest_date': '2022-10-17'}

In [6]:
# Get the unique ids of the players
unique_ids = set(list(matches_2022['winner_id'].unique()) + list(matches_2022['loser_id'].unique()))

# Now make this performance even better by using multiprocessing
import multiprocessing as mp

pool = mp.Pool(processes=4)
all_people = pool.map(find_matches_by_id, unique_ids)
pool.close()

# Turn the list of dictionaries into a dataframe
all_people_df = pd.DataFrame(all_people)

# # Sort the dataframe by the win/lose ratio
# all_matches_matches_2022.sort_values(by='win_lose', ascending=False)

# # Get the top 10 players with the highest win/lose ratio
# all_matches_matches_2022.sort_values(by='win_lose', ascending=False).head(10)

all_people_df

Unnamed: 0,name,id,matches_2022,win_percent,latest_date
0,Nicolas Mejia,200711,"[[2824, 2808, 2810]]",0.333333,2022-03-04
1,Dennis Novak,110602,"[[1594, 2778, 366, 561, 1633, 2623, 2780]]",0.285714,2022-03-04
2,Gilles Simon,104468,"[[342, 1303, 1339, 1389, 2271, 2645, 2665, 330...",0.466667,2022-10-31
3,Kasidit Samrej,208937,"[[2868, 2870]]",1.000000,2022-09-16
4,Brandon Nakashima,206909,"[[146, 157, 448, 480, 740, 832, 1001, 1265, 13...",0.603448,2022-10-31
...,...,...,...,...,...
403,J J Wolf,200670,"[[588, 718, 818, 862, 1899, 1909, 1925, 2102, ...",0.535714,2022-10-24
404,Sergey Fomin,200672,"[[2845, 2846, 2861, 2863]]",0.500000,2022-03-04
405,Rio Noguchi,200677,"[[2424, 2409]]",0.500000,2022-10-03
406,Soon Woo Kwon,126952,"[[74, 128, 184, 335, 426, 501, 623, 989, 1054,...",0.454545,2022-09-15


## Get more characteristics

In [37]:
# Let's use Rafael Nadal as an example, because I've heard his name before. Get just his id number
nadal_id = all_people_df[all_people_df['name'].str.contains('Nadal', case=False)]['id'].values[0]

nadal = find_matches_by_id(nadal_id)
nadal

{'name': 'Rafael Nadal',
 'id': 104745,
 'matches_2022': [Index([  81,   82,   84,   88,  193,  241,  265,  277,  283,  286,  288,  563,
          564,  567,  572,  582,  653,  656,  661,  671,  691, 1092, 1102, 1157,
         1262, 1318, 1346, 1360, 1367, 1371, 1373, 1607, 1639, 1655, 1663, 1667,
         2158, 2190, 2206, 2697,  652, 1087, 1147, 1669, 2023, 2214, 2654, 2698,
         2699],
        dtype='int64')],
 'win_percent': 0.8163265306122449,
 'latest_date': '2022-11-14'}

In [39]:
# Get rows from the matches_2022 dataframe based on gilles['matches_2022']
nadal_df = matches_2022.iloc[nadal['matches_2022'][0]]
nadal_df.shape

(49, 49)

In [40]:
# Get list of columns from nadal_df
nadal_df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

Columns include information about the tournament, the winner and the loser, and more. Using the dictionary text file in the source Github repo (found here: https://github.com/JeffSackmann/tennis_atp/blob/master/matches_data_dictionary.txt), here are some explanations of the columns.
> - draw_size: number of players in the draw, often rounded up to the nearest power of 2. (For instance, a tournament with 28 players may be shown as 32.)
> - tourney_level: 
>   - For men: 'G' = Grand Slams, 'M' = Masters 1000s, 'A' = other tour-level events, 'C' = Challengers, 'S' = Satellites/ITFs, 'F' = Tour finals and other season-ending events, and 'D' = Davis Cup 
>   - For women, there are several additional tourney_level codes, including 'P' = Premier, 'PM' = Premier Mandatory, and 'I' = International. The various levels of ITFs are given by the prize money (in thousands), such as '15' = ITF $15,000. Other codes, such as 'T1' for Tier I (and so on) are used for older WTA tournament designations. 'D' is used for Federation/Fed/Billie Jean King Cup, and also for Wightman Cup and Bonne Bell Cup.
>   - Others, eventually for both genders: 'E' = exhibition (events not sanctioned by the tour, though the definitions can be ambiguous), 'J' = juniors, and 'T' = team tennis, which does yet appear anywhere in the dataset but will at some point.
> - match_num: somewhat arbitrary
> - winner_entry: 'WC' = wild card, 'Q' = qualifier, 'LL' = lucky loser, 'PR' = protected ranking, 'ITF' = ITF entry, and there are a few others that are occasionally used.
> - winner_hand: R = right, L = left, U = unknown. For ambidextrous players, this is their serving hand.
> - winner_ht: height in centimeters, where available
> - winner_ioc: three-character country code
> - winner_age: age, in years, as of the tourney_date
> - best_of: '3' or '5', indicating the the number of sets for this match
> - minutes: match length

Possibly interesting filters are 
1. Which tournaments has this person participated in?
2. The tournament levels this person has participated in
3. Main hand
4. Country they're from
5. Age
6. How many sets they play on average
7. Average game length, or longest, shortest games
8. Which surface they're strongest on (win percentage they have on that surface)

In [52]:
# From nadal_df, find row where the loser name is 'Daniil Medvedev' and the tourney_name is 'Australian Open'
nadal_medvedev = nadal_df[(nadal_df['loser_name'] == 'Daniil Medvedev') & (nadal_df['tourney_name'] == 'Australian Open')][['winner_name', 
  'loser_name', 'tourney_name', 'tourney_level', 'tourney_date',
  'draw_size', 'score', 'round',
  'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced',
  'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced']]

# Print the row in a nice to read format, with column-value pairs on separate lines
for col in nadal_medvedev.columns:
    print(col, ':', nadal_medvedev[col].values[0])


winner_name : Rafael Nadal
loser_name : Daniil Medvedev
tourney_name : Australian Open
tourney_level : G
tourney_date : 20220117
draw_size : 128
score : 2-6 6-7(5) 6-4 6-4 7-5
round : F
w_ace : 3.0
w_df : 5.0
w_svpt : 189.0
w_1stIn : 117.0
w_1stWon : 78.0
w_2ndWon : 34.0
w_SvGms : 26.0
w_bpSaved : 16.0
w_bpFaced : 22.0
l_ace : 23.0
l_df : 5.0
l_svpt : 182.0
l_1stIn : 126.0
l_1stWon : 89.0
l_2ndWon : 23.0
l_SvGms : 26.0
l_bpSaved : 15.0
l_bpFaced : 22.0


In [74]:
def make_person(id):
    # From all people, get the row where the winner_id is equal to the id
    index_2022 = find_matches_by_id(id)['matches_2022']

    # Get the rows from the matches_2022 dataframe based on the index, split into won and lost
    matches = matches_2022.iloc[index_2022[0]]
    won_matches = matches.loc[matches_2022['winner_id'] == id]
    lost_matches = matches.loc[matches_2022['loser_id'] == id]
    
    # Create a dictionary of lists of wanted properties (same as above), using the won_matches and lost_matches dataframes
    person = {
        'name': won_matches.iloc[-1]['winner_name'] if won_matches.iloc[-1]['winner_id'] == id else lost_matches.iloc[-1]['loser_name'],
        'id': id,
        'rank': won_matches.iloc[-1]['winner_rank'] if won_matches.iloc[-1]['winner_id'] == id else lost_matches.iloc[-1]['loser_rank'],
        'rank_points': won_matches.iloc[-1]['winner_rank_points'] if won_matches.iloc[-1]['winner_id'] == id else lost_matches.iloc[-1]['loser_rank_points'],
        'matches_2022': index_2022,
        'won_matches_2022': len(won_matches),
        'lost_matches_2022': len(lost_matches),
        'win_percent': len(won_matches) / len(matches),
        'latest_date': matches.iloc[-1]['tourney_date'],
        'tourney_names': list(matches['tourney_name'].unique()),
        # summed stats
        'sum_aces': won_matches['w_ace'].sum() + lost_matches['l_ace'].sum(),
        'sum_double_faults': won_matches['w_df'].sum() + lost_matches['l_df'].sum(),
        'sum_service_points': won_matches['w_svpt'].sum() + lost_matches['l_svpt'].sum(),
        'sum_first_serve_points': won_matches['w_1stIn'].sum() + lost_matches['l_1stIn'].sum(),
        'sum_first_serve_points_won': won_matches['w_1stWon'].sum() + lost_matches['l_1stWon'].sum(),
        'sum_second_serve_points_won': won_matches['w_2ndWon'].sum() + lost_matches['l_2ndWon'].sum(),
        'sum_serve_games': won_matches['w_SvGms'].sum() + lost_matches['l_SvGms'].sum(),
        'sum_break_points_saved': won_matches['w_bpSaved'].sum() + lost_matches['l_bpSaved'].sum(),
        'sum_break_points_faced': won_matches['w_bpFaced'].sum() + lost_matches['l_bpFaced'].sum(),
        # average stats
        'avg_aces': (won_matches['w_ace'].sum() + lost_matches['l_ace'].sum()) / len(matches),
        'avg_double_faults': (won_matches['w_df'].sum() + lost_matches['l_df'].sum()) / len(matches),
        'avg_service_points': (won_matches['w_svpt'].sum() + lost_matches['l_svpt'].sum()) / len(matches),
        'avg_first_serve_points': (won_matches['w_1stIn'].sum() + lost_matches['l_1stIn'].sum()) / len(matches),
        'avg_first_serve_points_won': (won_matches['w_1stWon'].sum() + lost_matches['l_1stWon'].sum()) / len(matches),
        'avg_second_serve_points_won': (won_matches['w_2ndWon'].sum() + lost_matches['l_2ndWon'].sum()) / len(matches),
        'avg_serve_games': (won_matches['w_SvGms'].sum() + lost_matches['l_SvGms'].sum()) / len(matches),
        'avg_break_points_saved': (won_matches['w_bpSaved'].sum() + lost_matches['l_bpSaved'].sum()) / len(matches),
        'avg_break_points_faced': (won_matches['w_bpFaced'].sum() + lost_matches['l_bpFaced'].sum()) / len(matches)
    }
    
    return person

make_person(104745)

{'name': 'Rafael Nadal',
 'id': 104745,
 'rank': 2.0,
 'rank_points': 5820.0,
 'matches_2022': [Index([  81,   82,   84,   88,  193,  241,  265,  277,  283,  286,  288,  563,
          564,  567,  572,  582,  653,  656,  661,  671,  691, 1092, 1102, 1157,
         1262, 1318, 1346, 1360, 1367, 1371, 1373, 1607, 1639, 1655, 1663, 1667,
         2158, 2190, 2206, 2697,  652, 1087, 1147, 1669, 2023, 2214, 2654, 2698,
         2699],
        dtype='int64')],
 'won_matches_2022': 40,
 'lost_matches_2022': 9,
 'win_percent': 0.8163265306122449,
 'latest_date': 20221114,
 'tourney_names': ['Melbourne',
  'Australian Open',
  'Acapulco',
  'Indian Wells Masters',
  'Madrid Masters',
  'Rome Masters',
  'Roland Garros',
  'Wimbledon',
  'Us Open',
  'Tour Finals',
  'Cincinnati Masters',
  'Paris Masters'],
 'sum_aces': 227.0,
 'sum_double_faults': 168.0,
 'sum_service_points': 4126.0,
 'sum_first_serve_points': 2683.0,
 'sum_first_serve_points_won': 1949.0,
 'sum_second_serve_points_won': 793.

In [7]:
# TODO: import all the csv files from the past 10 years and then do the same thing as above. Can just add the new csv file names to the functions above and then run the functions again.
