In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [46]:
path = "https://raw.githubusercontent.com/DSEI21000-S21/project-tennis-ml/main/wta_matches/wta_matches-2000-2021_expanded.csv"

def readData(path):
    df = pd.read_csv(path)
    return df

def dropGibberishRows(dataframe, column_name, filter_variable):
    index_list = list(dataframe[dataframe[column_name] == filter_variable].index)
    dataframe = dataframe.drop(index_list)
    return dataframe

def encodeColumn(dataframe, column_list = []):
    df = pd.get_dummies(dataframe, columns= column_list)
    return df

def convertDate(dataframe, date):
    dataframe[date] = pd.to_datetime(dataframe[date], format = '%Y%m%d', errors='coerce')
    return dataframe

def addDateFeatures(dataframe, date):
    dataframe['year'] = dataframe[date].dt.year
    dataframe['month'] = dataframe[date].dt.month
    dataframe['day'] = dataframe[date].dt.day
    return dataframe

def appendTarget(dataframe, winner, loser):
    dataframe[winner] = 1
    dataframe[loser] = 0
    return dataframe

def getMissingDataPercentage(dataframe):
    percent_missing = dataframe.isnull().sum() * 100 / len(dataframe)
    missing_value_df = pd.DataFrame({'column_name': dataframe.columns,
                                 'percent_missing': percent_missing})
    pd.set_option("max_rows", None)
    return missing_value_df

def formatColumns(dataframe):
    columns = list(dataframe.columns)
    renamed_columns = [i.lower().strip().replace(' ', "_").replace(':','') for i in columns]
    renamed_columns_dict = {columns[i]: renamed_columns[i] for i in range(len(columns))}
    dataframe = dataframe.rename(columns = renamed_columns_dict)
    return dataframe

def convertColumnsToNumeric(dataframe, columns = []):
    dataframe[columns] = dataframe[columns].apply(pd.to_numeric, errors ='coerce')
    return dataframe 

def fillMissingDataMode(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(dataframe[cols].mode().iloc[0])
    return dataframe

def dropColumn(dataframe, column_list = []):
    return dataframe.drop(column_list, axis =1)

def fillMissingDataMean(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(dataframe[cols].mean().iloc[0])
    return dataframe

def fillMissingData999(dataframe, cols):
    dataframe[cols]= dataframe[cols].fillna(999)
    return dataframe

In [47]:
wta_data = readData(path)
wta_data = dropColumn(wta_data, ['player_entry'])
wta_data = fillMissingDataMode(wta_data, ['surface', 'player_hand'])
wta_data = fillMissingDataMean(wta_data, ['player_height', 'player_age', 'minutes', 'ace', 'double_fault', 'service_points_won', 'first_serve_made', 'first_serve_won', 
'second_serve_won', 'serve_game', 'break_point_saved', 'break_point_faced', 'player_rank','player_rank_points'])
wta_data = fillMissingData999(wta_data, ['player_seed'])
wta_data = convertDate(wta_data, 'tourney_date')
wta_data = addDateFeatures(wta_data, 'tourney_date')
wta_data = dropColumn(wta_data, ['player_ioc', 'score', 'tourney_id', 'tourney_date', 'player_name'])
mask = (wta_data['year'] == 2020) & (wta_data['surface'] == "Clay") & (wta_data['tourney_name'].str.contains('Roland'))
wta_data = wta_data[mask]
wta_data = wta_data.reset_index()
wta_data = encodeColumn(wta_data, ['tourney_name','surface', 'tourney_level', 'player_hand','round'])
wta_data = formatColumns(wta_data)

In [48]:
getMissingDataPercentage(wta_data)
wta_data

Unnamed: 0,index,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,...,player_hand_l,player_hand_r,player_hand_u,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_sf
0,58441,128,2101,201594,1.0,168.0,29.004791,3,82.0,1.0,...,0,1,0,0,0,1,0,0,0,0
1,58442,128,2102,201593,999.0,181.0,30.091718,3,150.0,1.0,...,0,1,0,0,0,1,0,0,0,0
2,58443,128,2103,211095,999.0,173.385424,25.820671,3,86.0,2.0,...,1,0,0,0,0,1,0,0,0,0
3,58444,128,2104,216153,25.0,173.385424,19.077344,3,59.0,2.0,...,0,1,0,0,0,1,0,0,0,0
4,58445,128,2105,201611,999.0,173.385424,26.568104,3,89.0,1.0,...,0,1,0,0,0,1,0,0,0,0
5,58446,128,2106,202446,999.0,173.385424,26.590007,3,81.0,1.0,...,0,1,0,0,0,1,0,0,0,0
6,58447,128,2107,201444,999.0,169.0,34.732375,3,94.0,0.0,...,0,1,0,0,0,1,0,0,0,0
7,58448,128,2108,216347,999.0,173.385424,19.329227,3,63.0,0.0,...,0,1,0,0,0,1,0,0,0,0
8,58449,128,2109,221103,999.0,173.385424,16.544832,3,101.0,3.0,...,0,1,0,0,0,1,0,0,0,0
9,58450,128,2110,203354,999.0,173.385424,26.902122,3,61.0,0.0,...,1,0,0,0,0,1,0,0,0,0


In [44]:
wta_data.info(verbose =True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       254 non-null    int64  
 1   draw_size                   254 non-null    int64  
 2   match_num                   254 non-null    int64  
 3   player_id                   254 non-null    int64  
 4   player_seed                 254 non-null    float64
 5   player_height               254 non-null    float64
 6   player_age                  254 non-null    float64
 7   best_of                     254 non-null    int64  
 8   minutes                     254 non-null    float64
 9   ace                         254 non-null    float64
 10  double_fault                254 non-null    float64
 11  service_points_won          254 non-null    float64
 12  first_serve_made            254 non-null    float64
 13  first_serve_won             254 non

In [49]:
wta_data.describe()

Unnamed: 0,index,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,...,player_hand_l,player_hand_r,player_hand_u,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_sf
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,...,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,88363.5,128.0,2216.488189,206775.244094,700.338583,174.1102,26.167224,3.0,98.669291,2.845555,...,0.094488,0.834646,0.070866,0.007874,0.031496,0.503937,0.062992,0.125984,0.251969,0.015748
std,29918.475223,0.0,117.721777,5971.83822,453.740714,3.820335,4.409994,0.0,32.403595,15.323434,...,0.293084,0.372234,0.257108,0.08856,0.174999,0.500972,0.243428,0.332487,0.435,0.124745
min,58441.0,128.0,2101.0,200033.0,1.0,157.0,16.544832,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58504.25,128.0,2132.25,201594.25,26.25,173.385424,23.329227,3.0,73.25,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,88363.5,128.0,2164.0,203389.0,999.0,173.385424,26.045175,3.0,96.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,118222.75,128.0,2231.75,211688.5,999.0,173.385424,28.80219,3.0,121.75,2.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75,0.0
max,118286.0,128.0,2701.0,221103.0,999.0,185.0,40.281999,3.0,191.0,173.385424,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [52]:
wta_data.to_pickle("./wta_cleaned_data_french_open2020.pkl")

In [53]:
wta_data = pd.read_pickle("./wta_cleaned_data_french_open2020.pkl")
wta_data.head()

Unnamed: 0,index,draw_size,match_num,player_id,player_seed,player_height,player_age,best_of,minutes,ace,...,player_hand_l,player_hand_r,player_hand_u,round_f,round_qf,round_r128,round_r16,round_r32,round_r64,round_sf
0,58441,128,2101,201594,1.0,168.0,29.004791,3,82.0,1.0,...,0,1,0,0,0,1,0,0,0,0
1,58442,128,2102,201593,999.0,181.0,30.091718,3,150.0,1.0,...,0,1,0,0,0,1,0,0,0,0
2,58443,128,2103,211095,999.0,173.385424,25.820671,3,86.0,2.0,...,1,0,0,0,0,1,0,0,0,0
3,58444,128,2104,216153,25.0,173.385424,19.077344,3,59.0,2.0,...,0,1,0,0,0,1,0,0,0,0
4,58445,128,2105,201611,999.0,173.385424,26.568104,3,89.0,1.0,...,0,1,0,0,0,1,0,0,0,0
