In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
path = "https://raw.githubusercontent.com/DSEI21000-S21/project-tennis-ml/main/wta_matches/wta_matches-2000-2021_expanded.csv"

def readData(path):
    df = pd.read_csv(path)
    return df

def dropGibberishRows(dataframe, column_name, filter_variable):
    index_list = list(dataframe[dataframe[column_name] == filter_variable].index)
    dataframe = dataframe.drop(index_list)
    return dataframe

def encodeColumn(dataframe, column_list = []):
    df = pd.get_dummies(dataframe, columns= column_list)
    return df

def convertDate(dataframe, date):
    dataframe[date] = pd.to_datetime(dataframe[date], format = '%Y%m%d', errors='coerce')
    return dataframe

def addDateFeatures(dataframe, date):
    dataframe['year'] = dataframe[date].dt.year
    dataframe['month'] = dataframe[date].dt.month
    dataframe['day'] = dataframe[date].dt.day
    return dataframe

def appendTarget(dataframe, winner, loser):
    dataframe[winner] = 1
    dataframe[loser] = 0
    return dataframe

def getMissingDataPercentage(dataframe):
    percent_missing = dataframe.isnull().sum() * 100 / len(dataframe)
    missing_value_df = pd.DataFrame({'column_name': dataframe.columns,
                                 'percent_missing': percent_missing})
    pd.set_option("max_rows", None)
    return missing_value_df

def formatColumns(dataframe):
    columns = list(dataframe.columns)
    renamed_columns = [i.lower().strip().replace(' ', "_").replace(':','') for i in columns]
    renamed_columns_dict = {columns[i]: renamed_columns[i] for i in range(len(columns))}
    dataframe = dataframe.rename(columns = renamed_columns_dict)
    return dataframe

def convertColumnsToNumeric(dataframe, columns = []):
    dataframe[columns] = dataframe[columns].apply(pd.to_numeric, errors ='coerce')
    return dataframe 

def fillMissingDataMode(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(dataframe[cols].mode().iloc[0])
    return dataframe

def dropColumn(dataframe, column_list = []):
    return dataframe.drop(column_list, axis =1)

def fillMissingDataMean(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(dataframe[cols].mean().iloc[0])
    return dataframe

def fillMissingData999(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(999)
    return dataframe

In [16]:
wta_data = readData(path)
wta_data = dropColumn(wta_data, ['player_entry'])
wta_data = fillMissingDataMode(wta_data, ['surface', 'player_hand'])
wta_data = fillMissingDataMean(wta_data, ['player_height', 'player_age', 'minutes', 'ace', 'double_fault', 'service_points_won', 'first_serve_made', 'first_serve_won', 
'second_serve_won', 'serve_game', 'break_point_saved', 'break_point_faced', 'player_rank','player_rank_points'])
wta_data = fillMissingData999(wta_data, ['player_seed'])
wta_data = convertDate(wta_data, 'tourney_date')
wta_data = addDateFeatures(wta_data, 'tourney_date')
wta_data = dropColumn(wta_data, ['player_ioc', 'score', 'tourney_id', 'tourney_date', 'player_name'])
wta_data = encodeColumn(wta_data, ['tourney_name','surface', 'tourney_level', 'player_hand','round'])
wta_data = formatColumns(wta_data)

In [17]:
getMissingDataPercentage(wta_data)

Unnamed: 0,column_name,percent_missing
draw_size,draw_size,0.0
match_num,match_num,0.0
player_id,player_id,0.0
player_seed,player_seed,0.0
player_height,player_height,0.0
player_age,player_age,0.0
best_of,best_of,0.0
minutes,minutes,0.0
ace,ace,0.0
double_fault,double_fault,0.0


In [18]:
wta_data.info(verbose =True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119438 entries, 0 to 119437
Data columns (total 2782 columns):
 #    Column                                    Dtype  
---   ------                                    -----  
 0    draw_size                                 int64  
 1    match_num                                 int64  
 2    player_id                                 int64  
 3    player_seed                               float64
 4    player_height                             float64
 5    player_age                                float64
 6    best_of                                   int64  
 7    minutes                                   float64
 8    ace                                       float64
 9    double_fault                              float64
 10   service_points_won                        float64
 11   first_serve_made                          float64
 12   first_serve_won                           float64
 13   second_serve_won                         

In [19]:
wta_data.describe()

Unnamed: 0,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,double_fault,...,player_hand_u,round_br,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_rr,round_sf
count,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,...,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0,119438.0
mean,54.5766,82.883705,202298.02958,678.641488,173.385424,24.219136,3.002579,155.92505,72.4792,73.225291,...,0.039694,1.7e-05,0.021501,0.084395,0.111723,0.168573,0.31454,0.143171,0.113013,0.043068
std,41.109025,171.380133,3351.576382,463.529275,5.000963,4.647098,0.07177,37.753336,84.053933,83.532933,...,0.195241,0.004092,0.145047,0.277981,0.315027,0.374376,0.464334,0.350248,0.31661,0.203012
min,2.0,1.0,200001.0,1.0,153.0,14.039699,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,9.0,201212.0,11.0,173.0,21.122519,3.0,173.385424,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,32.0,25.0,201450.0,999.0,173.385424,23.956194,3.0,173.385424,5.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.0,92.0,201619.0,999.0,174.0,26.964408,3.0,173.385424,173.385424,173.385424,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,128.0,2701.0,223140.0,999.0,189.0,173.385424,5.0,2475.0,173.385424,173.385424,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
wta_data.to_pickle("./wta_cleaned_data.pkl")

In [21]:
wta_data = pd.read_pickle("./wta_cleaned_data.pkl")
wta_data.head()

Unnamed: 0,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,double_fault,...,player_hand_u,round_br,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_rr,round_sf
0,4,1,201419,999.0,172.0,18.173854,3,173.385424,173.385424,173.385424,...,0,0,0,0,0,0,0,0,1,0
1,4,2,200085,999.0,163.0,24.821355,3,173.385424,173.385424,173.385424,...,0,0,0,0,0,0,0,0,1,0
2,4,1,200652,999.0,173.385424,26.973306,3,173.385424,173.385424,173.385424,...,0,0,0,0,0,0,0,0,1,0
3,4,2,200128,999.0,189.0,24.457221,3,173.385424,173.385424,173.385424,...,0,0,0,0,0,0,0,0,1,0
4,4,1,200017,999.0,173.385424,28.928131,3,173.385424,173.385424,173.385424,...,0,0,0,0,0,0,0,0,1,0
