In [1]:
import re, numpy as np, pandas as pd

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

In [26]:
url = 'https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/master/charting-m-points.csv'
df = pd.read_csv(url, encoding='latin-1', low_memory=False)

# Add tourney date
df['date'] = pd.to_datetime([i.split('-')[0] for i in df['match_id']])

# Add player names
names_tuple = [(i.split('-')[-2].replace('_', ' '), i.split('-')[-1].replace('_', ' '), i.split('-')[-3]) for i in df['match_id']]
df['P1'] = [i[0] for i in names_tuple]
df['P2'] = [i[1] for i in names_tuple]

# Add round
df['round'] = [i[2] for i in names_tuple]

# Add point winner & loser name
df['pointWinner'] = df.apply(lambda row: row['P1'] if row['PtWinner'] == 1 else row['P2'] if row['PtWinner'] == 2 else np.nan, axis=1)
df['pointLoser'] = df.apply(lambda row: row['P2'] if row['PtWinner'] == 1 else row['P1'] if row['PtWinner'] == 2 else np.nan, axis=1)

df.head(2)

Unnamed: 0,match_id,Pt,Set1,Set2,Gm1,Gm2,Pts,Gm#,TbSet,TB?,TBpt,Svr,Ret,Serving,1st,2nd,Notes,1stSV,2ndSV,1stIn,2ndIn,isAce,isUnret,isRallyWinner,isForced,isUnforced,isDouble,PtWinner,isSvrWinner,rallyCount,date,P1,P2,round,pointWinner,pointLoser
0,20220713-M-Newport-R16-Andy_Murray-Max_Purcell,1,0,0,0.0,0.0,0-0,1 (1),1,0,,1,2,AM,S,,,0,,0,,False,False,False,False,False,False,1,1,1,2022-07-13,Andy Murray,Max Purcell,R16,Andy Murray,Max Purcell
1,20220713-M-Newport-R16-Andy_Murray-Max_Purcell,2,0,0,0.0,0.0,15-0,1 (2),1,0,,1,2,AM,S,,Video feed froze for first 2 points,0,,0,,False,False,False,False,False,False,1,1,1,2022-07-13,Andy Murray,Max Purcell,R16,Andy Murray,Max Purcell


In [58]:
# Match Stats
wins = df.groupby(['match_id', 'pointWinner', 'round'])[['isRallyWinner', ]].sum()
errs = df.drop(columns=['pointWinner']).rename(columns={'pointLoser': 'pointWinner'}).groupby(['match_id', 'pointWinner', 'round'])[[ 'isUnforced', ]].sum()

stats = wins.join(errs, how='outer').reset_index()

stats['Ratio'] = stats['isRallyWinner'] / stats['isUnforced']
stats['Year'] = [int(i[:4]) for i in stats['match_id']]
stats['Tournament'] = [i.split('-')[2].replace('_', ' ').title() for i in stats['match_id']]
stats = stats.rename(columns={'pointWinner': 'Player', 'round': 'Round', 'isRallyWinner': 'Winner', 'isUnforced': 'Unf Error'})

stats.sample(10)

Unnamed: 0,match_id,Player,Round,Winner,Unf Error,Ratio,Year,Tournament
236,19870605-M-Roland_Garros-SF-Miloslav_Mecir-Iva...,Ivan Lendl,SF,21.0,33.0,0.636364,1987,Roland Garros
6223,20210113-M-Antalya-F-Alex_De_Minaur-Alexander_...,Alexander Bublik,F,2.0,3.0,0.666667,2021,Antalya
2176,20080420-M-Monte_Carlo_Masters-F-Rafael_Nadal-...,Rafael Nadal,F,13.0,16.0,0.8125,2008,Monte Carlo Masters
2182,20080511-M-Hamburg_Masters-F-Roger_Federer-Raf...,Rafael Nadal,F,30.0,21.0,1.428571,2008,Hamburg Masters
2650,20111122-M-Tour_Finals-RR-Roger_Federer-Rafael...,Rafael Nadal,RR,3.0,15.0,0.2,2011,Tour Finals
7148,20220605-M-Roland_Garros-F-Rafael_Nadal-Casper...,Casper Ruud,F,15.0,31.0,0.483871,2022,Roland Garros
211,19860907-M-US_Open-F-Miloslav_Mecir-Ivan_Lendl,Miloslav Mecir,F,14.0,45.0,0.311111,1986,Us Open
2711,20120406-M-Davis_Cup_World_Group_QF-RR-Ivo_Kar...,Juan Martin Del Potro,RR,14.0,12.0,1.166667,2012,Davis Cup World Group Qf
6369,20210507-M-Madrid_Masters-QF-John_Isner-Domini...,John Isner,QF,25.0,35.0,0.714286,2021,Madrid Masters
700,19940305-M-Indian_Wells_Masters-SF-Aaron_Krick...,Aaron Krickstein,SF,3.0,12.0,0.25,1994,Indian Wells Masters


In [70]:
# Player
player = [ 'Carlos Alcaraz' ]
rnd = 'F'
tournament = ['Australian Open', 'Roland Garros', 'Wimbledon', 'US Open', 'Madrid Masters']

stats[
    (stats['Player'].isin(player)) & 
    (stats['Round'] ==  rnd.upper()) &
    (stats['Tournament'].isin(tournament))
    ].sort_values('Year')

Unnamed: 0,match_id,Player,Round,Winner,Unf Error,Ratio,Year,Tournament
7079,20220508-M-Madrid_Masters-F-Carlos_Alcaraz-Ale...,Carlos Alcaraz,F,12.0,14.0,0.857143,2022,Madrid Masters
