In [88]:
import numpy as np
import pandas as pd
import os
import tarfile
from six.moves import urllib
import zipfile
import json
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics

In [3]:
ATP_df = pd.read_csv("https://raw.githubusercontent.com/DSEI21000-S21/project-tennis-ml/main/atp_matches/atp_matches-2000-2021.csv")
ATP_df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2000-301,Auckland,Hard,32,A,20000110,1,103163,1.0,,...,55,39,29,17,4,7,11,1612,63,595
1,2000-301,Auckland,Hard,32,A,20000110,2,102607,,Q,...,32,25,18,12,3,6,211,157,49,723
2,2000-301,Auckland,Hard,32,A,20000110,3,103252,,,...,33,20,7,8,7,11,48,726,59,649
3,2000-301,Auckland,Hard,32,A,20000110,4,103507,7.0,,...,43,29,14,10,6,8,45,768,61,616
4,2000-301,Auckland,Hard,32,A,20000110,5,102103,,Q,...,46,34,18,12,5,9,167,219,34,873


In [4]:
ATP_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64056 entries, 0 to 64055
Data columns (total 49 columns):
tourney_id            64056 non-null object
tourney_name          64056 non-null object
surface               63858 non-null object
draw_size             64056 non-null object
tourney_level         64056 non-null object
tourney_date          64056 non-null object
match_num             64056 non-null object
winner_id             64056 non-null object
winner_seed           26583 non-null object
winner_entry          7749 non-null object
winner_name           64056 non-null object
winner_hand           64038 non-null object
winner_ht             57292 non-null object
winner_ioc            64056 non-null object
winner_age            64044 non-null object
loser_id              64056 non-null object
loser_seed            14537 non-null object
loser_entry           12700 non-null object
loser_name            64056 non-null object
loser_hand            64006 non-null object
loser_ht    

We have numerical and categorical attributes. 

In [6]:
#Percentage of missing values
(ATP_df.isna().sum() / len(ATP_df))*100

tourney_id             0.000000
tourney_name           0.000000
surface                0.309105
draw_size              0.000000
tourney_level          0.000000
tourney_date           0.000000
match_num              0.000000
winner_id              0.000000
winner_seed           58.500375
winner_entry          87.902773
winner_name            0.000000
winner_hand            0.028100
winner_ht             10.559510
winner_ioc             0.000000
winner_age             0.018734
loser_id               0.000000
loser_seed            77.305795
loser_entry           80.173598
loser_name             0.000000
loser_hand             0.078057
loser_ht              15.025915
loser_ioc              0.000000
loser_age              0.024978
score                  0.000000
best_of                0.000000
round                  0.000000
minutes               11.418134
w_ace                  9.363682
w_df                   9.363682
w_svpt                 9.363682
w_1stIn                9.363682
w_1stWon

**Drop columns with more than 50% of missing values.**

In [7]:
df_clean = ATP_df.drop(['winner_seed', 'winner_entry', 'loser_seed', 'loser_entry'], axis=1)

In [9]:
#Percentage of missing values
(df_clean.isna().sum() / len(df_clean))*100

tourney_id             0.000000
tourney_name           0.000000
surface                0.309105
draw_size              0.000000
tourney_level          0.000000
tourney_date           0.000000
match_num              0.000000
winner_id              0.000000
winner_name            0.000000
winner_hand            0.028100
winner_ht             10.559510
winner_ioc             0.000000
winner_age             0.018734
loser_id               0.000000
loser_name             0.000000
loser_hand             0.078057
loser_ht              15.025915
loser_ioc              0.000000
loser_age              0.024978
score                  0.000000
best_of                0.000000
round                  0.000000
minutes               11.418134
w_ace                  9.363682
w_df                   9.363682
w_svpt                 9.363682
w_1stIn                9.363682
w_1stWon               9.363682
w_2ndWon               9.363682
w_SvGms                9.363682
w_bpSaved              9.363682
w_bpFace

**Drop all the row that with missing values.**

In [10]:
df_clean1 = df_clean.dropna()
df_clean1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47522 entries, 0 to 64052
Data columns (total 45 columns):
tourney_id            47522 non-null object
tourney_name          47522 non-null object
surface               47522 non-null object
draw_size             47522 non-null object
tourney_level         47522 non-null object
tourney_date          47522 non-null object
match_num             47522 non-null object
winner_id             47522 non-null object
winner_name           47522 non-null object
winner_hand           47522 non-null object
winner_ht             47522 non-null object
winner_ioc            47522 non-null object
winner_age            47522 non-null object
loser_id              47522 non-null object
loser_name            47522 non-null object
loser_hand            47522 non-null object
loser_ht              47522 non-null object
loser_ioc             47522 non-null object
loser_age             47522 non-null object
score                 47522 non-null object
best_of    

In [11]:
#Percentage of missing values
(df_clean1.isna().sum() / len(df_clean1))*100

tourney_id            0.0
tourney_name          0.0
surface               0.0
draw_size             0.0
tourney_level         0.0
tourney_date          0.0
match_num             0.0
winner_id             0.0
winner_name           0.0
winner_hand           0.0
winner_ht             0.0
winner_ioc            0.0
winner_age            0.0
loser_id              0.0
loser_name            0.0
loser_hand            0.0
loser_ht              0.0
loser_ioc             0.0
loser_age             0.0
score                 0.0
best_of               0.0
round                 0.0
minutes               0.0
w_ace                 0.0
w_df                  0.0
w_svpt                0.0
w_1stIn               0.0
w_1stWon              0.0
w_2ndWon              0.0
w_SvGms               0.0
w_bpSaved             0.0
w_bpFaced             0.0
l_ace                 0.0
l_df                  0.0
l_svpt                0.0
l_1stIn               0.0
l_1stWon              0.0
l_2ndWon              0.0
l_SvGms     

**Convert tourney_date column to datetime**

In [62]:
def convertDate(dataframe, date):
    dataframe[date] = pd.to_datetime(dataframe[date], format = '%Y%m%d', errors='coerce')
    return dataframe
df_clean2 = convertDate(df_clean1, 'tourney_date')
df_clean2.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tournament_year
0,2000-301,Auckland,Hard,32,A,2000-01-10,1,103163,Tommy Haas,R,...,39,29,17,4,7,11,1612,63,595,200001


**Convert categorial columns to numeric**

In [76]:
def encodeColumn(dataframe, column_list = []):
    df = pd.get_dummies(dataframe, columns= column_list)
    return df
atp_data = encodeColumn(df_clean2, column_list =['surface', 'tourney_name', 'tourney_level', 'winner_hand', 'winner_ioc', 'loser_ioc','round'])
atp_data.head(1)

Unnamed: 0,tourney_id,draw_size,tourney_date,match_num,winner_id,winner_name,winner_ht,winner_age,loser_id,loser_name,...,round_ER,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,round_round
0,2000-301,32,2000-01-10,1,103163,Tommy Haas,188,21.7713894593,101543,Jeff Tarango,...,0,0,0,0,0,1,0,0,0,0


In [78]:
def addDateFeatures(dataframe, date):
    dataframe['year'] = dataframe[date].dt.year
    dataframe['month'] = dataframe[date].dt.month
    dataframe['day'] = dataframe[date].dt.day
    return dataframe
atp_data = addDateFeatures(atp_data, 'tourney_date')

In [79]:
atp_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47522 entries, 0 to 64052
Columns: 424 entries, tourney_id to day
dtypes: datetime64[ns](1), float64(3), object(38), uint8(382)
memory usage: 34.2+ MB


**Target Variable**

In [89]:
def appendTarget(dataframe, winner, loser):
    dataframe[winner] = 1
    dataframe[loser] = 0
    return dataframe
atp_data = appendTarget(atp_data, 'winner_name', 'loser_name')
atp_data.head(1)

Unnamed: 0,tourney_id,draw_size,tourney_date,match_num,winner_id,winner_name,winner_ht,winner_age,loser_id,loser_name,...,round_R16,round_R32,round_R64,round_RR,round_SF,round_round,year,month,day,_name
0,2000-301,32,2000-01-10,1,103163,1,188,21.7713894593,101543,0,...,0,1,0,0,0,0,2000.0,1.0,10.0,0


In [None]:
X = df1['winner_name']
y = atp_data.drop(columns=['winner_name'],axis=1)
