In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
path = "https://raw.githubusercontent.com/DSEI21000-S21/project-tennis-ml/main/atp_matches/atp_matches-2000-2021_expanded.csv"

def readData(path):
    df = pd.read_csv(path)
    return df

def dropGibberishRows(dataframe, column_name, filter_variable):
    index_list = list(dataframe[dataframe[column_name] == filter_variable].index)
    dataframe = dataframe.drop(index_list)
    return dataframe

def encodeColumn(dataframe, column_list = []):
    df = pd.get_dummies(dataframe, columns= column_list)
    return df

def convertDate(dataframe, date):
    dataframe[date] = pd.to_datetime(dataframe[date], format = '%Y%m%d', errors='coerce')
    return dataframe

def addDateFeatures(dataframe, date):
    dataframe['year'] = dataframe[date].dt.year
    dataframe['month'] = dataframe[date].dt.month
    dataframe['day'] = dataframe[date].dt.day
    return dataframe

def appendTarget(dataframe, winner, loser):
    dataframe[winner] = 1
    dataframe[loser] = 0
    return dataframe

def getMissingDataPercentage(dataframe):
    percent_missing = dataframe.isnull().sum() * 100 / len(dataframe)
    missing_value_df = pd.DataFrame({'column_name': dataframe.columns,
                                 'percent_missing': percent_missing})
    pd.set_option("max_rows", None)
    return missing_value_df

def formatColumns(dataframe):
    columns = list(dataframe.columns)
    renamed_columns = [i.lower().strip().replace(' ', "_").replace(':','') for i in columns]
    renamed_columns_dict = {columns[i]: renamed_columns[i] for i in range(len(columns))}
    dataframe = dataframe.rename(columns = renamed_columns_dict)
    return dataframe

def convertColumnsToNumeric(dataframe, columns = []):
    dataframe[columns] = dataframe[columns].apply(pd.to_numeric, errors ='coerce')
    return dataframe 

def fillMissingDataMode(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(dataframe[cols].mode().iloc[0])
    return dataframe

def dropColumn(dataframe, column_list = []):
    return dataframe.drop(column_list, axis =1)

def fillMissingDataMean(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(dataframe[cols].mean().iloc[0])
    return dataframe

def fillMissingData999(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(999)
    return dataframe

In [14]:
atp_data = readData(path)
atp_data = dropColumn(atp_data, ['player_entry'])
atp_data = fillMissingDataMode(atp_data, ['surface', 'player_hand'])
atp_data = fillMissingDataMean(atp_data, ['player_height', 'player_age', 'minutes', 'ace', 'double_fault', 'service_points_won', 'first_serve_made', 'first_serve_won', 
'second_serve_won', 'serve_game', 'break_point_saved', 'break_point_faced', 'player_rank','player_rank_points'])
atp_data = fillMissingData999(atp_data, ['player_seed'])
atp_data = convertDate(atp_data, 'tourney_date')
atp_data = addDateFeatures(atp_data, 'tourney_date')
atp_data = dropColumn(atp_data, ['player_ioc', 'score', 'tourney_id', 'tourney_date', 'player_name'])


In [17]:
pd.set_option("display.max_columns", 35)
atp_data.head()

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,match_num,player_id,player_seed,player_hand,player_height,player_age,best_of,round,minutes,ace,double_fault,service_points_won,first_serve_made,first_serve_won,second_serve_won,serve_game,break_point_saved,break_point_faced,player_rank,player_rank_points,target,year,month,day
0,Auckland,Hard,32,A,1,103163,1.0,R,188.0,21.771389,3,R32,108.0,18.0,4.0,96.0,49.0,39.0,28.0,17.0,3.0,5.0,11.0,1612.0,1,2000,1,10
1,Auckland,Hard,32,A,2,102607,999.0,R,190.0,24.558522,3,R32,85.0,5.0,3.0,76.0,52.0,39.0,13.0,12.0,5.0,6.0,211.0,157.0,1,2000,1,10
2,Auckland,Hard,32,A,3,103252,999.0,R,175.0,21.390828,3,R32,56.0,0.0,0.0,55.0,35.0,25.0,12.0,8.0,1.0,1.0,48.0,726.0,1,2000,1,10
3,Auckland,Hard,32,A,4,103507,7.0,R,183.0,19.909651,3,R32,68.0,5.0,1.0,53.0,28.0,26.0,15.0,10.0,0.0,0.0,45.0,768.0,1,2000,1,10
4,Auckland,Hard,32,A,5,102103,999.0,R,180.0,27.381246,3,R32,115.0,1.0,2.0,98.0,66.0,39.0,14.0,13.0,6.0,11.0,167.0,219.0,1,2000,1,10


In [43]:
atp_data = readData(path)
atp_data = dropColumn(atp_data, ['player_entry'])
atp_data = fillMissingDataMode(atp_data, ['surface', 'player_hand'])
atp_data = fillMissingDataMean(atp_data, ['player_height', 'player_age', 'minutes', 'ace', 'double_fault', 'service_points_won', 'first_serve_made', 'first_serve_won', 
'second_serve_won', 'serve_game', 'break_point_saved', 'break_point_faced', 'player_rank','player_rank_points'])
atp_data = fillMissingData999(atp_data, ['player_seed'])
atp_data = convertDate(atp_data, 'tourney_date')
atp_data = addDateFeatures(atp_data, 'tourney_date')
atp_data = dropColumn(atp_data, ['player_ioc', 'score', 'tourney_id', 'tourney_date', 'player_name'])
atp_data = encodeColumn(atp_data, ['tourney_name','surface', 'tourney_level', 'player_hand','round'])
atp_data = formatColumns(atp_data)

In [44]:
getMissingDataPercentage(atp_data).head(5)

Unnamed: 0,column_name,percent_missing
draw_size,draw_size,0.0
match_num,match_num,0.0
player_id,player_id,0.0
player_seed,player_seed,0.0
player_height,player_height,0.0


In [45]:
atp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128070 entries, 0 to 128069
Columns: 1709 entries, draw_size to round_sf
dtypes: float64(15), int64(8), uint8(1686)
memory usage: 228.4 MB


In [46]:
atp_data.describe()

Unnamed: 0,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,double_fault,...,round_br,round_er,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_rr,round_sf
count,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,...,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0,128070.0
mean,54.295932,78.659202,105708.957445,681.098228,185.490424,26.35303,3.464777,114.749293,22.810784,20.109738,...,9.4e-05,0.0005,0.022066,0.085141,0.102444,0.169407,0.316186,0.15212,0.108722,0.04332
std,39.472986,124.849842,10142.957321,462.642477,6.454959,4.591264,0.844714,46.330934,52.540061,53.216963,...,0.009679,0.022349,0.146899,0.279092,0.303232,0.375113,0.464988,0.359139,0.311291,0.203577
min,4.0,1.0,100644.0,1.0,163.0,14.513347,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,10.0,103344.0,11.0,183.0,23.411362,3.0,79.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,32.0,25.0,104214.0,999.0,185.0,26.195756,3.0,105.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.0,87.0,105023.0,999.0,188.0,29.051335,3.0,144.0,10.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,128.0,1701.0,210013.0,999.0,208.0,185.490424,5.0,1266.0,185.490424,185.490424,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
atp_data.to_pickle("./atp_cleaned_data.pkl")

In [48]:
atp_data = pd.read_pickle("./atp_cleaned_data.pkl")
atp_data.head()

Unnamed: 0,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,double_fault,...,round_br,round_er,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_rr,round_sf
0,32,1,103163,1.0,188.0,21.771389,3,108.0,18.0,4.0,...,0,0,0,0,0,0,1,0,0,0
1,32,2,102607,999.0,190.0,24.558522,3,85.0,5.0,3.0,...,0,0,0,0,0,0,1,0,0,0
2,32,3,103252,999.0,175.0,21.390828,3,56.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,32,4,103507,7.0,183.0,19.909651,3,68.0,5.0,1.0,...,0,0,0,0,0,0,1,0,0,0
4,32,5,102103,999.0,180.0,27.381246,3,115.0,1.0,2.0,...,0,0,0,0,0,0,1,0,0,0
