In [1]:
import os
import string
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
PATH = os.path.abspath('')

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
pd.set_option('display.max_columns', 500)
sns.set_style("ticks")

In [4]:
def perc_print(a, b):
    return round((a-b)/a*100)

## Importation des données

In [5]:
data = pd.read_csv(PATH+'/data/Données_nettoyées.csv', index_col=0, low_memory=False)

Aperçu de la base de donnée.

In [6]:
data.head(3)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year
0,2000-301,Auckland,Hard,32,A,2000-01-10,1,103163,1.0,,Tommy Haas,R,188.0,GER,21.771389,101543,,,Jeff Tarango,L,180.0,USA,31.137577,7-5 4-6 7-5,3,R32,108.0,18.0,4.0,96.0,49.0,39.0,28.0,17.0,3.0,5.0,7.0,8.0,106.0,55.0,39.0,29.0,17.0,4.0,7.0,11.0,1612.0,63.0,595.0,2000
1,2000-301,Auckland,Hard,32,A,2000-01-10,2,102607,,Q,Juan Balcells,R,190.0,ESP,24.558522,102644,,,Franco Squillari,L,183.0,ARG,24.386037,7-5 7-5,3,R32,85.0,5.0,3.0,76.0,52.0,39.0,13.0,12.0,5.0,6.0,5.0,10.0,74.0,32.0,25.0,18.0,12.0,3.0,6.0,211.0,157.0,49.0,723.0,2000
2,2000-301,Auckland,Hard,32,A,2000-01-10,3,103252,,,Alberto Martin,R,175.0,ESP,21.390828,102238,,,Alberto Berasategui,R,173.0,ESP,26.53525,6-3 6-1,3,R32,56.0,0.0,0.0,55.0,35.0,25.0,12.0,8.0,1.0,1.0,0.0,6.0,56.0,33.0,20.0,7.0,8.0,7.0,11.0,48.0,726.0,59.0,649.0,2000


Afin de créer des variables propres à chaque joueurs, nous retravaillons nos colonnes _winner_name_ et _loser_name_ en dédoublant chaque ligne pour obtenir une ligne par joueur du match renommée _player_name_ et son adversaire dans la colonne _opponent_name_

In [8]:
data_winner = data.copy()
data_loser = data.copy()
data_winner.rename({'winner_name':'player_name'}, inplace=True)
data_loser.rename({'loser_name':'player_name'}, inplace=True)
data_winner['win_lose'] = ['w' for i in range(len(data_winner))]
data_loser['win_lose'] = ['l' for i in range(len(data_loser))]
data_winner = data_winner.rename(columns=lambda x: re.sub('winner','player',x))
data_winner = data_winner.rename(columns=lambda x: re.sub('^w_','player_',x))
data_winner = data_winner.rename(columns=lambda x: re.sub('loser','opponent',x))
data_winner = data_winner.rename(columns=lambda x: re.sub('^l_','opponent_',x))
data_loser = data_loser.rename(columns=lambda x: re.sub('loser','player',x))
data_loser = data_loser.rename(columns=lambda x: re.sub('^l_','player_',x))
data_loser = data_loser.rename(columns=lambda x: re.sub('winner','opponent',x))
data_loser = data_loser.rename(columns=lambda x: re.sub('^w_','opponent_',x))
data = pd.concat([data_winner, data_loser], axis=0, sort=False)
data.reset_index(inplace=True)

In [9]:
data.shape

(103844, 52)

## Création de variables relatives à la rencontre

Les attributs présents dans notre base sont les suivantes.

In [10]:
print('Features de base :')
print(data.columns.values)

Features de base :
['index' 'tourney_id' 'tourney_name' 'surface' 'draw_size' 'tourney_level'
 'tourney_date' 'match_num' 'player_id' 'player_seed' 'player_entry'
 'player_name' 'player_hand' 'player_ht' 'player_ioc' 'player_age'
 'opponent_id' 'opponent_seed' 'opponent_entry' 'opponent_name'
 'opponent_hand' 'opponent_ht' 'opponent_ioc' 'opponent_age' 'score'
 'best_of' 'round' 'minutes' 'player_ace' 'player_df' 'player_svpt'
 'player_1stIn' 'player_1stWon' 'player_2ndWon' 'player_SvGms'
 'player_bpSaved' 'player_bpFaced' 'opponent_ace' 'opponent_df'
 'opponent_svpt' 'opponent_1stIn' 'opponent_1stWon' 'opponent_2ndWon'
 'opponent_SvGms' 'opponent_bpSaved' 'opponent_bpFaced' 'player_rank'
 'player_rank_points' 'opponent_rank' 'opponent_rank_points' 'year'
 'win_lose']


A partir de ces données initiales, nous allons créer de nouvelles features dans le but de maximiser le signal inclu dans nos attributs. 

### 3.1 Ecart de classement entre les deux joueurs

In [11]:
data['diff_ranking'] = data.apply(lambda x: abs(x.player_rank - x.opponent_rank), axis=1)
data['diff_ranking'].head(3)

0     52.0
1    162.0
2     11.0
Name: diff_ranking, dtype: float64

### 3.2 Classement moyen des deux joueurs du match

In [12]:
data['avg_ranking'] = data.apply(lambda x: (x.player_rank + x.opponent_rank)/2, axis=1)
data['avg_ranking'].head(3)

0     37.0
1    130.0
2     53.5
Name: avg_ranking, dtype: float64

### 3.3 Les joueurs jouent de la même main

In [13]:
data['same_hand'] = data.apply(lambda x: 1 if x['player_hand']!=x['opponent_hand'] else 0, axis=1)
data['same_hand'].head(3)

0    1
1    1
2    0
Name: same_hand, dtype: int64

##  Création des variables caractéristiques du joueur 

Suite au dédoublement de nos lignes, nous allons créer des features différentes en fonction de chaque joueur.

In [14]:
nb_joueur = data['player_name'].unique().shape[0]
print('Nombre de joueur différents dans notre base :', nb_joueur)

Nombre de joueur différents dans notre base : 1355


Nous distinguons à partir d'ici les observations relatives à des Best_of 3 / Best_of 5 (matchs en 2 ou 3 sets gagnants). 

In [15]:
data_bo5 = data[data['best_of']==5].copy()
data_bo3 = data[data['best_of']==3].copy()

data_bo5.shape[0] + data_bo3.shape[0] == data.shape[0]

True

### Temps de match moyen d'un joueur

In [16]:
mean_time = data_bo3.groupby(['player_name'])['minutes'].mean()
data_bo3['player_mean_time'] = np.zeros(data_bo3.shape[0])
data_bo3['player_mean_time'] = data_bo3.apply(lambda x: mean_time[x['player_name']], axis=1)
data_bo3[['player_name', 'player_mean_time']].head(3)

Unnamed: 0,player_name,player_mean_time
0,Tommy Haas,94.272549
1,Juan Balcells,86.941176
2,Alberto Martin,94.424332


In [17]:
mean_time = data_bo5.groupby(['player_name'])['minutes'].mean()
data_bo5['player_mean_time'] = np.zeros(data_bo5.shape[0])
data_bo5['player_mean_time'] = data_bo5.apply(lambda x: mean_time[x['player_name']], axis=1)
data_bo5[['player_name', 'player_mean_time']].head(3)

Unnamed: 0,player_name,player_mean_time
255,Magnus Gustafsson,128.555556
329,Franco Squillari,142.777778
420,Thomas Enqvist,133.865385


### Temps moyen des 10 derniers matchs d'un joueur

On crée une moyenne mobile du temps moyen des 10 derniers matchs d'un joueur.

Cette opération étant amenée à être répétée, dans le but de rendre le code moins redondant nous créons une fonction qui nous permet de faire des variables représentant la dynamique d'un joueur.  

In [18]:
def dynamic_features(df, target_feature, new_feature_name, list_sort_features, window_size=10):
    # player
    player_new_feature_name = 'player_' + new_feature_name
    df[player_new_feature_name] = np.zeros(df.shape[0])
    df = df.sort_values(['player_name']+list_sort_features).copy()
    df[player_new_feature_name] = df.sort_values(['player_name']+list_sort_features).groupby('player_name')[target_feature].rolling(window=window_size,min_periods=1).mean().values
    df[player_new_feature_name+'_shifted'] = df.sort_values(['player_name']+list_sort_features).groupby('player_name')[player_new_feature_name].shift(1).values
    df.sort_values(['player_name']+list_sort_features, inplace=True)
    print(df[df['player_name']=='Roger Federer'][['player_name', target_feature, player_new_feature_name, player_new_feature_name+'_shifted']].head(5))
    print('\n')
    # opponent
    opponent_new_feature_name = 'opponent_' + new_feature_name
    df[opponent_new_feature_name] = np.zeros(df.shape[0])
    df = df.sort_values(['opponent_name']+list_sort_features).copy()
    df[opponent_new_feature_name] = df.sort_values(list_sort_features).groupby('opponent_name')[target_feature].rolling(window=window_size,min_periods=1).mean().values
    df[opponent_new_feature_name+'_shifted'] = df.sort_values(['opponent_name']+list_sort_features).groupby('opponent_name')[opponent_new_feature_name].shift(1).values
    df.sort_values(['opponent_name']+list_sort_features, inplace=True)
    print(df[df['opponent_name']=='Roger Federer'][['opponent_name', target_feature, opponent_new_feature_name, opponent_new_feature_name+'_shifted']].head(5))
    return df.copy()

Nous appliquons cette fonction.

In [19]:
data_bo3 = dynamic_features(df=data_bo3, target_feature='minutes', new_feature_name='mean_10time', list_sort_features=['tourney_date', 'match_num'], window_size=10)

         player_name  minutes  player_mean_10time  player_mean_10time_shifted
562    Roger Federer     45.0           45.000000                         NaN
52499  Roger Federer     84.0           64.500000                   45.000000
51925  Roger Federer     68.0           65.666667                   64.500000
1931   Roger Federer     72.0           67.250000                   65.666667
1945   Roger Federer     63.0           66.400000                   67.250000


       opponent_name  minutes  opponent_mean_10time  \
52484  Roger Federer     45.0             45.000000   
577    Roger Federer     84.0             64.500000   
3      Roger Federer     68.0             65.666667   
53853  Roger Federer     72.0             67.250000   
53867  Roger Federer     63.0             66.400000   

       opponent_mean_10time_shifted  
52484                           NaN  
577                       45.000000  
3                         64.500000  
53853                     65.666667  
53867    

In [20]:
data_bo5 = dynamic_features(df=data_bo5, target_feature='minutes', new_feature_name='mean_10time', list_sort_features=['tourney_date', 'match_num'], window_size=10)

         player_name  minutes  player_mean_10time  player_mean_10time_shifted
2529   Roger Federer    125.0          125.000000                         NaN
2565   Roger Federer    106.0          115.500000                  125.000000
54506  Roger Federer    101.0          110.666667                  115.500000
2097   Roger Federer    147.0          119.750000                  110.666667
2133   Roger Federer    132.0          122.200000                  119.750000


       opponent_name  minutes  opponent_mean_10time  \
54451  Roger Federer    125.0            125.000000   
54487  Roger Federer    106.0            115.500000   
2584   Roger Federer    101.0            110.666667   
54019  Roger Federer    147.0            119.750000   
54055  Roger Federer    132.0            122.200000   

       opponent_mean_10time_shifted  
54451                           NaN  
54487                    125.000000  
2584                     115.500000  
54019                    110.666667  
54055    

### Nombre de jeux moyen joué par un joueur

Nous commencons par extraire le nombre de jeux joués dans chaque match.

In [21]:
data_bo3['nb_games'] = data_bo3.apply(lambda x: x['score'].split(' '), axis=1)
data_bo3['nb_games'] = data_bo3.apply(lambda x: [string.split('-') for string in x['nb_games']], axis=1)
data_bo3['nb_games'] = data_bo3.apply(lambda x: [[int(elt[0]) for elt in liste if len(elt)>0] for liste in x['nb_games']], axis=1)
data_bo3['nb_games'] = data_bo3.apply(lambda x: [sum(liste) for liste in x['nb_games']], axis=1)
data_bo3['nb_games'] = data_bo3.apply(lambda x: sum(x['nb_games']), axis=1)

In [22]:
data_bo5['nb_games'] = data_bo5.apply(lambda x: x['score'].split(' '), axis=1)
data_bo5['nb_games'] = data_bo5.apply(lambda x: [string.split('-') for string in x['nb_games']], axis=1)
data_bo5['nb_games'] = data_bo5.apply(lambda x: [[int(elt[0]) for elt in liste if len(elt)>0] for liste in x['nb_games']], axis=1)
data_bo5['nb_games'] = data_bo5.apply(lambda x: [sum(liste) for liste in x['nb_games']], axis=1)
data_bo5['nb_games'] = data_bo5.apply(lambda x: sum(x['nb_games']), axis=1)

In [23]:
data_bo3['nb_games'].unique()

array([14, 15, 13, 28, 19, 21, 24, 17, 18, 34, 16, 27, 32, 22, 20, 25, 23,
       26, 30, 31, 36, 35, 33, 29, 39, 38, 12, 37, 10, 11])

In [24]:
data_bo5['nb_games'].unique()

array([41, 28, 37, 36, 23, 32, 22, 26, 49, 29, 25, 53, 21, 50, 24, 30, 47,
       35, 46, 40, 27, 38, 31, 52, 43, 39, 48, 55, 51, 34, 33, 45, 42, 44,
       56, 59, 20, 54, 60, 19, 57, 66, 61, 58, 64, 65, 62, 15, 63])

Un match de tennis fait au minimum 12 jeux, l'analyse du nombre de set joué dans nos donnée nous indique que certaines rencontrent finissent après 10 et 11 jeux. Nous supprimons ces matchs car ils ne respectent pas ce seuil (l'abandon n'est pas notifié dans la base).

In [25]:
nb_obs = data_bo3.shape[0]
data_bo3 = data_bo3[data_bo3['nb_games']>=12].copy()
print('{} observations supprimées ({}%).'.format(nb_obs - data_bo3.shape[0], perc_print(nb_obs, data_bo3.shape[0])))

6 observations supprimées (0%).


In [26]:
nb_obs = data_bo5.shape[0]
data_bo5 = data_bo5[data_bo5['nb_games']>=18].copy()
print('{} observations supprimées ({}%).'.format(nb_obs - data_bo5.shape[0], perc_print(nb_obs, data_bo5.shape[0])))

2 observations supprimées (0%).


A partir du nombre de jeux joués dans un match, nous déterminons le nombre de sets joué en moyenne pour chaque joueur ainsi que son advsersaire.

In [27]:
data_bo3['player_mean_games'] = np.zeros(data_bo3.shape[0])
for player in data_bo3['player_name'].unique():
    player_mean_games = data_bo3[data_bo3['player_name']==player]['nb_games'].mean()
    data_bo3['player_mean_games'].loc[data_bo3[data_bo3['player_name']==player].index] = player_mean_games.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [28]:
data_bo5['player_mean_games'] = np.zeros(data_bo5.shape[0])
for player in data_bo5['player_name'].unique():
    player_mean_games = data_bo5[data_bo5['player_name']==player]['nb_games'].mean()
    data_bo5['player_mean_games'].loc[data_bo5[data_bo5['player_name']==player].index] = player_mean_games.copy()

In [29]:
data_bo3['player_mean_games'].describe()

count    84700.000000
mean        23.022597
std          1.081867
min         12.000000
25%         22.536082
50%         22.992000
75%         23.537634
max         38.000000
Name: player_mean_games, dtype: float64

In [30]:
data_bo5['player_mean_games'].describe()

count    19136.000000
mean        35.959971
std          2.157068
min         21.000000
25%         34.716216
50%         36.053691
75%         37.240000
max         56.000000
Name: player_mean_games, dtype: float64

### Nombre de moyen de jeux d'un joueur sur ses 10 derniers matchs

On crée des moyennes mobiles du nombre de jeux joués par un joueur lors de ses 10 derniers matchs.

In [31]:
data_bo3 = dynamic_features(df=data_bo3, target_feature='nb_games', new_feature_name='mean_10games', list_sort_features=['tourney_date', 'match_num'], window_size=10)

         player_name  nb_games  player_mean_10games  \
562    Roger Federer        17                 17.0   
52499  Roger Federer        23                 20.0   
51925  Roger Federer        20                 20.0   
1931   Roger Federer        20                 20.0   
1945   Roger Federer        17                 19.4   

       player_mean_10games_shifted  
562                            NaN  
52499                         17.0  
51925                         20.0  
1931                          20.0  
1945                          20.0  


       opponent_name  nb_games  opponent_mean_10games  \
52484  Roger Federer        17                   17.0   
577    Roger Federer        23                   20.0   
3      Roger Federer        20                   20.0   
53853  Roger Federer        20                   20.0   
53867  Roger Federer        17                   19.4   

       opponent_mean_10games_shifted  
52484                            NaN  
577                     

In [32]:
data_bo5 = dynamic_features(df=data_bo5, target_feature='nb_games', new_feature_name='mean_10games', list_sort_features=['tourney_date', 'match_num'], window_size=10)

         player_name  nb_games  player_mean_10games  \
2529   Roger Federer        33            33.000000   
2565   Roger Federer        30            31.500000   
54506  Roger Federer        26            29.666667   
2097   Roger Federer        38            31.750000   
2133   Roger Federer        31            31.600000   

       player_mean_10games_shifted  
2529                           NaN  
2565                     33.000000  
54506                    31.500000  
2097                     29.666667  
2133                     31.750000  


       opponent_name  nb_games  opponent_mean_10games  \
54451  Roger Federer        33              33.000000   
54487  Roger Federer        30              31.500000   
2584   Roger Federer        26              29.666667   
54019  Roger Federer        38              31.750000   
54055  Roger Federer        31              31.600000   

       opponent_mean_10games_shifted  
54451                            NaN  
54487                   

### Suppression des lignes contenant les NA crée par shift

In [33]:
nb_obs = data_bo3.shape[0]
data_bo3.dropna(subset=['player_mean_10games_shifted',
                    'opponent_mean_10games_shifted',
                    'player_mean_10time_shifted', 
                    'opponent_mean_10time_shifted'],
           axis=0,
           inplace=True)
print('{} observations supprimées ({}%).'.format(nb_obs - data_bo3.shape[0], perc_print(nb_obs, data_bo3.shape[0])))

2478 observations supprimées (3%).


In [34]:
nb_obs = data_bo5.shape[0]
data_bo5.dropna(subset=['player_mean_10games_shifted',
                    'opponent_mean_10games_shifted',
                    'player_mean_10time_shifted', 
                    'opponent_mean_10time_shifted'],
           axis=0,
           inplace=True)
print('{} observations supprimées ({}%).'.format(nb_obs - data_bo5.shape[0], perc_print(nb_obs, data_bo5.shape[0])))

1386 observations supprimées (7%).


On remarque que le nombre d'observations supprimées n'est pas égale au double du nombre de joueurs uniques présent dans nos données. Après vérification, ce résultat est dû aux matchs dans lesquels les deux joueurs apparaissent pour la première fois dans notre base de donnée (et donc leur _shiffted_value_ est égale à NaN pour les deux sur la même ligne).

### 4.6 Features de différences

On recrée ici les variables _diff_ranking_ et _avg_ranking_ que nous avons étudié dans l'analyse descriptive des données.

In [35]:
data_bo3['diff_mean_10time_shifted'] = data_bo3.apply(lambda x: abs(x['player_mean_10time_shifted']-x['opponent_mean_10time_shifted']), axis=1)

In [36]:
data_bo5['diff_mean_10time_shifted'] = data_bo5.apply(lambda x: abs(x['player_mean_10time_shifted']-x['opponent_mean_10time_shifted']), axis=1)

In [37]:
data_bo3['diff_mean_10games_shifted'] = data_bo3.apply(lambda x: abs(x['player_mean_10games_shifted']-x['opponent_mean_10games_shifted']), axis=1)

In [38]:
data_bo5['diff_mean_10games_shifted'] = data_bo5.apply(lambda x: abs(x['player_mean_10games_shifted']-x['opponent_mean_10games_shifted']), axis=1)

In [39]:
data_bo5_2 = data_bo5.copy()
data_bo3_2 = data_bo3.copy()

In [40]:
data_bo5_2.shape

(17750, 68)

In [41]:
data_bo3_2.shape

(82222, 68)

#### Création d'une clef synthétique pour pouvoir droper les duplicates (ne plus avoir les matchs en double)

Afin de ne plus avoir qu'une ligne par match on crée un identifiant unique à l'aide du l'id du tournoi et du numéro de match. Ainsi nous pouvons effectuer un drop duplicates sur cette colonne pour inverser le dédoublement de ligne fait au préalable.

In [42]:
data_bo3['synth_key'] = data_bo3.apply(lambda x: x['tourney_id']+'-'+str(x['match_num']), axis=1)
data_bo3[['tourney_id', 'match_num', 'synth_key']].head(1)
len_before = len(data_bo3)
data_bo3 = data_bo3.copy()
data_bo3.drop_duplicates('synth_key', keep='first', inplace=True)
len_after = len(data_bo3)
if len_before == 2 * len_after:
    print('Dédoublement réussi (avant {} - après {})'.format(len_before, len_after))
else:
    print('Erreur lors du dédoublement des observations (avant {} - après {})'.format(len_before, len_after))

Dédoublement réussi (avant 82222 - après 41111)


In [43]:
data_bo5['synth_key'] = data_bo5.apply(lambda x: x['tourney_id']+'-'+str(x['match_num']), axis=1)
data_bo5[['tourney_id', 'match_num', 'synth_key']].head(1)
len_before = len(data_bo5)
data_bo5 = data_bo5.copy()
data_bo5.drop_duplicates('synth_key', keep='first', inplace=True)
len_after = len(data_bo5)
if len_before == 2 * len_after:
    print('Dédoublement réussi (avant {} - après {})'.format(len_before, len_after))
else:
    print('Erreur lors du dédoublement des observations (avant {} - après {})'.format(len_before, len_after))

Dédoublement réussi (avant 17750 - après 8875)


In [44]:
8909*2

17818

In [None]:
data