In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from scipy.stats import randint

# Scraping

In [2]:
url = 'https://www.tennisexplorer.com/results/?type=atp-single&year=2023&month=05&day=29'
tables = pd.read_html(url)
print(len(tables))

5


In [3]:
print(tables[0])

              0                 1  2    3     4    5    6    7     8      9   \
0    French Open       French Open  S  1.0   2.0  3.0  4.0  5.0     H      A   
1          20:30     Sinner J. (8)  3  6.0   6.0  6.0  NaN  NaN  1.03  13.18   
2          20:30         Muller A.  0  1.0   4.0  1.0  NaN  NaN  1.03  13.18   
3          19:35     Van Assche L.  3  6.0   6.0  6.0  NaN  NaN  1.82   2.00   
4          19:35     Cecchinato M.  0  1.0   1.0  3.0  NaN  NaN  1.82   2.00   
..           ...               ... ..  ...   ...  ...  ...  ...   ...    ...   
547        10:40   Lavagno E. (10)  1  6.0   1.0  5.0  NaN  NaN  1.91   1.79   
548        10:40  Travaglia S. (7)  2  6.0   6.0  NaN  NaN  NaN  1.32   3.10   
549        10:40          Forti F.  0  2.0   4.0  NaN  NaN  NaN  1.32   3.10   
550        10:40     Sachko V. (9)  2  6.0   7.0  NaN  NaN  NaN  2.07   1.68   
551        10:40      Fonio G. (6)  0  4.0  61.0  NaN  NaN  NaN  2.07   1.68   

     10    11  
0   NaN   NaN  
1   NaN

In [4]:
def scrapurl(url):
    list_of_tables = []
    for j in range (1, 9): # 1st month to the 9th
        for i in range(1, 29): # 1st day to 29th 
            tables1 = pd.read_html(url+str(j)+'&day='+str(i))
            list_of_tables.append(tables1[0])
            i += 1
    return list_of_tables

list_of_tables = scrapurl('https://www.tennisexplorer.com/results/?type=atp-single&year=2023&month=0')
df = pd.concat(list_of_tables)

In [5]:
print(df.head())
print(df.shape)

           0                 1  2     3    4    5    6    7     8     9    10  \
0  United Cup        United Cup  S   1.0  2.0  3.0  4.0  5.0     H     A  NaN   
1       12:55           Gojo B.  2   7.0  6.0  NaN  NaN  NaN  1.34  3.22  NaN   
2       12:55          Coria F.  0  65.0  4.0  NaN  NaN  NaN  1.34  3.22  NaN   
3       11:05          Evans D.  2   6.0  1.0  6.0  NaN  NaN  1.26  3.84  NaN   
4       11:05  Ramos-Vinolas A.  1   3.0  6.0  3.0  NaN  NaN  1.26  3.84  NaN   

     11  
0   NaN  
1  info  
2  info  
3  info  
4  info  
(50054, 12)


Here we are with the 50k lines, let's see how many games we can have for the model we'll build later.

# Cleaning

In [6]:
df.to_csv('tenniscrap2.csv')

In [7]:
df = df.drop([0, 3, 4, 5, 6, 7, 10, 11], axis=1) # Get rid of useless columns
print(df.columns)
print(df.head())

Int64Index([1, 2, 8, 9], dtype='int64')
                  1  2     8     9
0        United Cup  S     H     A
1           Gojo B.  2  1.34  3.22
2          Coria F.  0  1.34  3.22
3          Evans D.  2  1.26  3.84
4  Ramos-Vinolas A.  1  1.26  3.84


In [8]:
df = df.rename(columns={1: 'Player 1', 2: 'Score', 8: 'Odds Player 1', 9: 'Odds Player 2'})

In [9]:
print(df.head())

           Player 1 Score Odds Player 1 Odds Player 2
0        United Cup     S             H             A
1           Gojo B.     2          1.34          3.22
2          Coria F.     0          1.34          3.22
3          Evans D.     2          1.26          3.84
4  Ramos-Vinolas A.     1          1.26          3.84


We create a "Tournois" column because as of now, the name of the tournament is in the column Player 1. 
To differ the name of a tournament and the name of player we look at the column "Player 1", if the string ends with a '.' or a ')', it means that it's a player because the format of the player's name is either "Lastname" "First letter of the firstname." or the latter with the ATP rank of the player between brackets. cf "condition *" 

In [10]:
# Create the column "Tournois" with None default values
df['Tournois'] = None

# Reset the DF Index
df = df.reset_index(drop=True)

# Putting the "Player 1" value in "Tournois" in the condition is matched
df.loc[~df['Player 1'].str.endswith((".", ")")), 'Tournois'] = df['Player 1']

# Putting "Tournois" in the first place
df = df[['Tournois'] + [col for col in df.columns if col != 'Tournois']]

print(df)


         Tournois                       Player 1  \
0      United Cup                     United Cup   
1            None                        Gojo B.   
2            None                       Coria F.   
3            None                       Evans D.   
4            None               Ramos-Vinolas A.   
...           ...                            ...   
50049        None  No schedule for this day yet.   
50050        None  No schedule for this day yet.   
50051        None  No schedule for this day yet.   
50052        None  No schedule for this day yet.   
50053        None  No schedule for this day yet.   

                               Score                  Odds Player 1  \
0                                  S                              H   
1                                  2                           1.34   
2                                  0                           1.34   
3                                  2                           1.26   
4                   

In [11]:
df = df[df['Player 1'] != "No schedule for this day yet."] # Getting rid of lines with no game

We forwardfill (ffill) the values in the column Tournois to replace the None values by the proper values of the tournament names.

In [12]:
df['Tournois'] = df['Tournois'].fillna(method='ffill')
df = df[df['Player 1'] != df['Tournois']]
print(df.head())

     Tournois          Player 1 Score Odds Player 1 Odds Player 2
1  United Cup           Gojo B.     2          1.34          3.22
2  United Cup          Coria F.     0          1.34          3.22
3  United Cup          Evans D.     2          1.26          3.84
4  United Cup  Ramos-Vinolas A.     1          1.26          3.84
5  United Cup        Hurkacz H.     2          1.35          3.19


Our main problem now is that the Players for one match are displayed one on top of the other, we want them to be next to each other in two separate columns. Before moving the players 2 in their proper column, we want to make sure we don't mix the matches, it would destroy the dataframe. To do so, we look at the odds of the games, if they are the same on two consecutive lines, it means the two lines are for the same game.
Disclaimer : The Player 1 is always the winner, it's built like this on the website.

In [13]:
import numpy as np

# Create the Match ID Column
df['Match ID'] = (df['Odds Player 1'] != df['Odds Player 1'].shift(1)) | (df['Odds Player 2'] != df['Odds Player 2'].shift(1))
df['Match ID'] = df['Match ID'].cumsum()

# Group by 'Match ID' and create 'Player 2'
df_grouped = df.groupby('Match ID').agg({
    'Tournois': 'first',
    'Player 1': lambda x: list(x),
    'Score': 'first',
    'Odds Player 1': 'first',
    'Odds Player 2': 'first'
}).reset_index()

df_grouped['Player 2'] = df_grouped['Player 1'].apply(lambda x: x[1] if len(x) > 1 else np.nan)
df_grouped['Player 1'] = df_grouped['Player 1'].apply(lambda x: x[0])

# Clean the DataFrame and reorganize columns
df_grouped = df_grouped[['Tournois', 'Player 1', 'Player 2', 'Score', 'Odds Player 1', 'Odds Player 2']]

print(df_grouped)


           Tournois         Player 1          Player 2 Score Odds Player 1  \
0        United Cup          Gojo B.          Coria F.     2          1.34   
1        United Cup         Evans D.  Ramos-Vinolas A.     2          1.26   
2        United Cup       Hurkacz H.         Bublik A.     2          1.35   
3        United Cup      Kuzmanov D.          Bergs Z.     2          2.13   
4        United Cup          Otte O.        Svrcina D.     2          1.41   
...             ...              ...               ...   ...           ...   
31158  Futures 2023           Kim D.               NaN  None          None   
31159  Futures 2023           Kim D.               NaN  None          None   
31160  Futures 2023          Kang J.               NaN  None          None   
31161         Halle  Altmaier D. (4)               NaN  None          None   
31162         Halle   Eubanks C. (7)               NaN  None          None   

      Odds Player 2  
0              3.22  
1              3.84

In [14]:
print(df_grouped.isnull().sum())

Tournois             0
Player 1             0
Player 2         14021
Score               42
Odds Player 1    13990
Odds Player 2    13990
dtype: int64


In [15]:
# Getting rid of games without both odds
df = df_grouped.dropna(subset=['Odds Player 1', 'Odds Player 2']) 
print(df.isnull().sum())
print(df.head())

Tournois          0
Player 1          0
Player 2         31
Score            14
Odds Player 1     0
Odds Player 2     0
dtype: int64
     Tournois     Player 1          Player 2 Score Odds Player 1 Odds Player 2
0  United Cup      Gojo B.          Coria F.     2          1.34          3.22
1  United Cup     Evans D.  Ramos-Vinolas A.     2          1.26          3.84
2  United Cup   Hurkacz H.         Bublik A.     2          1.35          3.19
3  United Cup  Kuzmanov D.          Bergs Z.     2          2.13          1.71
4  United Cup      Otte O.        Svrcina D.     2          1.41          2.87


Now we wanna add a bit more info with ATP Ranks of all the players, even though it's hard to update it regularly, the rankings are not meant to change everyday so we'll proxy them for the whole year.

In [16]:
df_atp = pd.read_csv('/Users/corentinlequeux-peltier/Desktop/ATP.csv')
print(df_atp.head())

   Rank              Player
0     1      Novak Djokovic
1     2      Carlos Alcaraz
2     3     Daniil Medvedev
3     4       Jannik Sinner
4     5  Stefanos Tsitsipas


The problem here is that the names of the players don't match in the two Dataframes.

In [17]:
# Preparing the ATP Dataframe
df_atp[['Prenom', 'Nom']] = df_atp['Player'].str.split(' ', 1, expand=True)
df_atp = df_atp.drop('Prenom', axis=1)  # Deleting the firstname column, not so useful

df = df.copy()

# Spliting the Names so the columns of the two Dataframes can match
df.loc[:, 'Player 1'] = df['Player 1'].str.split(' ', 1, expand=True)[0]
df.loc[:, 'Player 2'] = df['Player 2'].str.split(' ', 1, expand=True)[0]

print(df.head())
print(df_atp.head())

     Tournois  Player 1       Player 2 Score Odds Player 1 Odds Player 2
0  United Cup      Gojo          Coria     2          1.34          3.22
1  United Cup     Evans  Ramos-Vinolas     2          1.26          3.84
2  United Cup   Hurkacz         Bublik     2          1.35          3.19
3  United Cup  Kuzmanov          Bergs     2          2.13          1.71
4  United Cup      Otte        Svrcina     2          1.41          2.87
   Rank              Player        Nom
0     1      Novak Djokovic   Djokovic
1     2      Carlos Alcaraz    Alcaraz
2     3     Daniil Medvedev   Medvedev
3     4       Jannik Sinner     Sinner
4     5  Stefanos Tsitsipas  Tsitsipas


In [18]:
# Merging for the Player 1 rank
df = df.merge(df_atp, left_on='Player 1', right_on='Nom', how='left', suffixes=('', '_x'))
df = df.rename(columns={'rank': 'Rank Player 1'})

# Merging for the Player 2 rank
df = df.merge(df_atp, left_on='Player 2', right_on='Nom', how='left', suffixes=('', '_y'))
df = df.rename(columns={'rank': 'Rank Player 2'})

print(df.head())


     Tournois  Player 1       Player 2 Score Odds Player 1 Odds Player 2  \
0  United Cup      Gojo          Coria     2          1.34          3.22   
1  United Cup     Evans  Ramos-Vinolas     2          1.26          3.84   
2  United Cup   Hurkacz         Bublik     2          1.35          3.19   
3  United Cup  Kuzmanov          Bergs     2          2.13          1.71   
4  United Cup      Otte        Svrcina     2          1.41          2.87   

    Rank            Player       Nom  Rank_y          Player_y    Nom_y  
0  115.0        Borna Gojo      Gojo   137.0    Federico Coria    Coria  
1   68.0      Daniel Evans     Evans     NaN               NaN      NaN  
2   16.0    Hubert Hurkacz   Hurkacz    94.0  Alexander Bublik   Bublik  
3  234.0  Dimitar Kuzmanov  Kuzmanov   160.0       Zizou Bergs    Bergs  
4  141.0        Oscar Otte      Otte   175.0   Dalibor Svrcina  Svrcina  


In [19]:
print(df.columns)

Index(['Tournois', 'Player 1', 'Player 2', 'Score', 'Odds Player 1',
       'Odds Player 2', 'Rank', 'Player', 'Nom', 'Rank_y', 'Player_y',
       'Nom_y'],
      dtype='object')


In [20]:
df = df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 9]] # Keeping the columns we want only
print(df.head())

     Tournois  Player 1       Player 2 Score Odds Player 1 Odds Player 2  \
0  United Cup      Gojo          Coria     2          1.34          3.22   
1  United Cup     Evans  Ramos-Vinolas     2          1.26          3.84   
2  United Cup   Hurkacz         Bublik     2          1.35          3.19   
3  United Cup  Kuzmanov          Bergs     2          2.13          1.71   
4  United Cup      Otte        Svrcina     2          1.41          2.87   

    Rank  Rank_y  
0  115.0   137.0  
1   68.0     NaN  
2   16.0    94.0  
3  234.0   160.0  
4  141.0   175.0  


In [21]:
df = df.rename(columns={'Rank': 'Rank Player 1'})
df = df.rename(columns={'Rank_y': 'Rank Player 2'})
print(df.head())

     Tournois  Player 1       Player 2 Score Odds Player 1 Odds Player 2  \
0  United Cup      Gojo          Coria     2          1.34          3.22   
1  United Cup     Evans  Ramos-Vinolas     2          1.26          3.84   
2  United Cup   Hurkacz         Bublik     2          1.35          3.19   
3  United Cup  Kuzmanov          Bergs     2          2.13          1.71   
4  United Cup      Otte        Svrcina     2          1.41          2.87   

   Rank Player 1  Rank Player 2  
0          115.0          137.0  
1           68.0            NaN  
2           16.0           94.0  
3          234.0          160.0  
4          141.0          175.0  


In [22]:
print(df.dtypes)
indices_1 = np.where(df['Odds Player 1'] == 'H')
print("Indices où 'H' est présent dans 'Odds Player 1':", indices_1)

Tournois          object
Player 1          object
Player 2          object
Score             object
Odds Player 1     object
Odds Player 2     object
Rank Player 1    float64
Rank Player 2    float64
dtype: object
Indices où 'H' est présent dans 'Odds Player 1': (array([  750,   879,   999,  8927,  9109,  9143,  9279,  9451,  9566,
        9639,  9689, 10440, 10454, 10476, 10636, 10712, 10727, 10818,
       10826, 10867, 10872, 11014, 11019, 11135, 11140, 11201, 11205,
       11241, 11243]),)


In [23]:
df = df[df['Odds Player 1'] != 'H']
print("Indices où 'H' est présent dans 'Odds Player 1':", indices_1)

Indices où 'H' est présent dans 'Odds Player 1': (array([  750,   879,   999,  8927,  9109,  9143,  9279,  9451,  9566,
        9639,  9689, 10440, 10454, 10476, 10636, 10712, 10727, 10818,
       10826, 10867, 10872, 11014, 11019, 11135, 11140, 11201, 11205,
       11241, 11243]),)


In [24]:
print(df.head())

     Tournois  Player 1       Player 2 Score Odds Player 1 Odds Player 2  \
0  United Cup      Gojo          Coria     2          1.34          3.22   
1  United Cup     Evans  Ramos-Vinolas     2          1.26          3.84   
2  United Cup   Hurkacz         Bublik     2          1.35          3.19   
3  United Cup  Kuzmanov          Bergs     2          2.13          1.71   
4  United Cup      Otte        Svrcina     2          1.41          2.87   

   Rank Player 1  Rank Player 2  
0          115.0          137.0  
1           68.0            NaN  
2           16.0           94.0  
3          234.0          160.0  
4          141.0          175.0  


Now we're going to think more about our model. The thing we want to assess here is weather the favorite won the game or not. As we said before, the Player 1 is always the winner. We're creating a column "Favorite Won" that is 1 the player with the smallest odds wins and 0 if not.

In [25]:
df['Favorite Won'] = (df['Odds Player 1'] < df['Odds Player 2']).astype(int)

In [26]:
print(df.head())
print(df.dtypes)
df['Odds Player 1'] = pd.to_numeric(df['Odds Player 1'], errors='coerce')
df['Odds Player 2'] = pd.to_numeric(df['Odds Player 2'], errors='coerce')


     Tournois  Player 1       Player 2 Score Odds Player 1 Odds Player 2  \
0  United Cup      Gojo          Coria     2          1.34          3.22   
1  United Cup     Evans  Ramos-Vinolas     2          1.26          3.84   
2  United Cup   Hurkacz         Bublik     2          1.35          3.19   
3  United Cup  Kuzmanov          Bergs     2          2.13          1.71   
4  United Cup      Otte        Svrcina     2          1.41          2.87   

   Rank Player 1  Rank Player 2  Favorite Won  
0          115.0          137.0             1  
1           68.0            NaN             1  
2           16.0           94.0             1  
3          234.0          160.0             0  
4          141.0          175.0             1  
Tournois          object
Player 1          object
Player 2          object
Score             object
Odds Player 1     object
Odds Player 2     object
Rank Player 1    float64
Rank Player 2    float64
Favorite Won       int64
dtype: object


In [27]:
print(df.dtypes)
df.to_csv('Tenniscrap3.csv')

Tournois          object
Player 1          object
Player 2          object
Score             object
Odds Player 1    float64
Odds Player 2    float64
Rank Player 1    float64
Rank Player 2    float64
Favorite Won       int64
dtype: object


# Model

In [28]:
label_encoder = LabelEncoder() # Turning categorical features into something the program can understand

categorical_features = ['Tournois', 'Player 1', 'Player 2', 'Score']
for col in categorical_features:
    df[col] = label_encoder.fit_transform(df[col])

In [29]:
X = df.drop('Favorite Won', axis=1) # Features
y = df['Favorite Won'] # Target

# Train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Had a NaN value in X_train and X_test, didn't really know if I should drop them or fill
imputer = SimpleImputer(strategy='most_frequent') 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputer.fit(X_train)

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [31]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9988489208633093


In [32]:
y_pred = clf.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)

Precision:  0.9995901639344262
Recall:  0.9987714987714987
F1 Score:  0.9991806636624335
