# TENNIS PREDICTOR

## 1. Data Importing and Preprocessing

### 1.1 Importing Data 
- Go through all csv files and merge all games since 1989 in one big Pandas Dataframe.
- Rename columns with better names.

In [289]:
# imports .
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Get data file names
path = r'./data/atp_matches'
all_files = glob.glob(path + "/*.csv")


start_year_considered = 1989

tournament_old_new_columns = {
    "tourney_name": "Tourney",
    "surface": "Surface",
    "tourney_date": "Date", 
    "winner_name": "Player1", 
    "winner_age": "Player1Age", 
    "winner_ioc": "Player1Nationality",
    "loser_name": "Player2", 
    "loser_age": "Player2Age",
    "loser_ioc": "Player2Nationality",
    "score": "Score",
    "best_of": "BestOf"
}

df_tournaments = pd.DataFrame([])

for filename in all_files:
    start_year = filename.split("/")[-1].split("_")[-1].split(".")[0]
    if int(start_year) > start_year_considered: 
        df = pd.read_csv(filename, index_col = None)        
        df_tournaments = pd.concat([df_tournaments, df])

df_tournaments = df_tournaments[tournament_old_new_columns.keys()]
df_tournaments.rename(columns = tournament_old_new_columns, inplace = True)

### 1.2 Preprocessing

- Format Date column as a DateTime object.
- Sort rows by dates (older to newest)
- Drop games from exibition tournaments (Davis Cup and Laver Cup)
- Create Target column "Player1Won" (the value that we will try to predict)

In [290]:
# PREPROCESSING
# Format Date as DateTime.
df_tournaments["Date"] = df_tournaments["Date"].apply(
    lambda x: str(x)[0:4] + "-" + str(x)[4:6] + "-" + str(x)[6:]
)
df_tournaments['Date'] =  pd.to_datetime(df_tournaments['Date'])

# Sort by Date, Reset Index and drop Na and Davis/Laver Cup
df_tournaments = df_tournaments.sort_values(by = "Date").dropna()
df_tournaments = df_tournaments[~(df_tournaments.Tourney.str.contains("Davis"))]
df_tournaments = df_tournaments[~(df_tournaments.Tourney.str.contains("Laver"))]

# Create target column
df_tournaments["Player1Won"] = 1.0

- Use Geocoders to get country from tournament name
- Make new Columns "Location" that specifies country in which the tournament is played

In [291]:
from geopy.geocoders import Nominatim

## INITIALISE ONLY WHEN RERUNNING FROM SCRATCH
# tournament_to_country_dict = {}
#############################################

# Dictionary of Countries 
for tournament in df_tournaments.Tourney.unique():
           
    if tournament in tournament_to_country_dict:
        continue
    if "Cup" in tournament or "Finals" in tournament:
        tournament_to_country_dict[tournament] = "International"
    
    else: 
        t = tournament.replace("Masters", "").replace("ATP", "").replace("Indoor", "")\
                      .replace("Outdoor", "").replace("Olympics", "").replace("Open", "")
        geolocator = Nominatim(user_agent = "tennis_predictor")
        location = geolocator.geocode(t, language = "en")

        country = location.address.split(",")[-1]
        tournament_to_country_dict[tournament] = country


In [292]:
# Standardize Locations to Countries
df_tournaments["Location"] = df_tournaments["Tourney"].apply(lambda x: tournament_to_country_dict[x])

- Make dictionary of IOC to IOS codes
- Use CountryConverter to convert players' Nationalities in countries

In [293]:
import country_converter as coco

ioc_to_ios = {"ALG": "DZA", "ANG": "AGO", "ANT": "ATG", "ARU": "ABW",
    "BAH": "BHS", "BRN": "BHR", "BAN": "BGD", "BAR": "BRB", "BIZ": "BLZ", "BER": "BMU", "BHU": "BTN", "BOT": "BWA",
    "IVB": "VGB", "BRU": "BRN", "BUL": "BGR", "BUR": "BFA", "CAM": "KHM", "CAY": "CYM", "CHA": "TCD", "CHI": "CHL",
    "CGO": "COG", "CRC": "CRI", "CRO": "HRV", "DEN": "DNK", "ESA": "SLV", "GEQ": "GNQ", "FIJ": "FJI", "GAM": "GMB",
    "GER": "DEU", "GRE": "GRC", "GRN": "GRD", "GUA": "GTM", "GUI": "GIN", "GBS": "GNB", "HAI": "HTI", "HON": "HND", 
    "INA": "IDN", "IRI": "IRN", "KUW": "KWT", "LAT": "LVA", "LIB": "LBN", "LES": "LSO", "LBA": "LBY", "MAD": "MDG", 
    "MAW": "MWI", "MAS": "MYS", "MTN": "MRT", "MRI": "MUS", "MON": "MCO", "MGL": "MNG", "MYA": "MMR", "NEP": "NPL", 
    "NED": "NLD", "NCA": "NIC", "NIG": "NER", "NGR": "NGA", "OMA": "OMN", "PLE": "PSE", "PAR": "PRY", "PHI": "PHL", 
    "POR": "PRT", "PUR": "PRI", "SKN": "KNA", "VIN": "VCT", "SAM": "WSM", "KSA": "SAU", "SEY": "SYC", "SIN": "SGP", 
    "SLO": "SVN", "SOL": "SLB", "RSA": "ZAF", "SRI": "LKA", "SUD": "SDN", "SUI": "CHE", "TPE": "TWN", "TAN": "TZA", 
    "TOG": "TGO", "TGA": "TON", "TRI": "TTO", "UAE": "ARE", "ISV": "VIR", "URU": "URY", "VAN": "VUT", "VIE": "VNM", 
    "YUG": "SRB","ZAM": "ZMB", "ZIM": "ZWE"
}

In [294]:
coco_converter = coco.CountryConverter()

df_tournaments["Player1Nationality"] = df_tournaments["Player1Nationality"].apply( \
    lambda x: ioc_to_ios[x] if x in ioc_to_ios else x \
)

df_tournaments["Player1Nationality"] = \
    coco_converter.convert(
        names = df_tournaments["Player1Nationality"].to_numpy(), to = "name_short"
    )

df_tournaments["Player2Nationality"] = df_tournaments["Player2Nationality"].apply( \
    lambda x: ioc_to_ios[x] if x in ioc_to_ios else x \
)

df_tournaments["Player2Nationality"] = \
    coco_converter.convert(
        names = df_tournaments["Player2Nationality"].to_numpy(), to = "name_short"
    )


- Custom-define function to parse the score (disregard anything containing alphabetic characters)
- Compute four new columns for games and sets won and lost by player 1.
- Compute two new binary columns to flag if player1 and player2 respectively are playing in their home country

In [295]:
def parseScore(score):
    sets_won = 0
    sets_lost = 0
    games_won = 0
    games_lost = 0
    
    sets = score.split(" ")
    for s in sets:
        if not(s):
            break
        games = s.split("-")
        
        games_p1 = games[0]
        games_p2 = games[1][0:2] if len(games[1]) and games[1].isnumeric() > 1 else games[1][0]
        
        if not games_p1.isnumeric() or not games_p2.isnumeric():
            break
        
        games_won += int(games_p1)
        games_lost += int(games_p2)
        if games_p1 > games_p2:
            sets_won += 1
        else:
            sets_lost += 1
    return [sets_won, sets_lost, games_won, games_lost]

In [296]:
# Remove retirements and walk overs
df_tournaments = df_tournaments[~df_tournaments.Score.str.contains(r'[a-zA-Z]')]


# Parse Scores in Games Won, Games Lost, Sets Won, Sets Lost
df_tournaments["Player1SetsWon"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[0]
)
df_tournaments["Player1SetsLost"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[1]                                                       
)
df_tournaments["Player1GamesWon"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[2]
)
df_tournaments["Player1GamesLost"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[3]                                                       
)

In [297]:
df_tournaments["Player1PlaysHome"] = \
    df_tournaments["Player1Nationality"] == df_tournaments["Location"]
df_tournaments["Player2PlaysHome"] = \
    df_tournaments["Player2Nationality"] == df_tournaments["Location"]

- Reorder columns for easier readability
- Drop score column
- Randomly select 50% of the rows and swap data for player1 and player2
- For these rows, the target column also has to be changed to 0, instead of 1.

In [298]:
## Order Columns and restructure like this:
# winner = player1
# loser = player2
# shuffle winner-loser if rand(0, 1) > 0.5
# create a target Player1Won that is 1 if player1 won, 0 if they lost.
# drop score column, not needed anymore

columns_final_order = ["Date", "Tourney", "Location", "Surface", "BestOf",
                       "Player1", "Player1Age", "Player1Nationality", "Player1PlaysHome",
                       "Player2", "Player2Age", "Player2Nationality", "Player2PlaysHome",
                       "Player1SetsWon", "Player1SetsLost", "Player1GamesWon", 
                       "Player1GamesLost", "Player1Won"]

df_tournaments = df_tournaments.reset_index()[columns_final_order]

In [299]:
# swap 50% of the players to balance dataset.
random_indexes = np.array(df_tournaments.sample(frac = 0.50, axis = 'rows').index)

pairs_cols_toswap = [[5, 9], [6, 10], [7, 11], [8, 12], [13, 14], [15, 16]]
for pair in pairs_cols_toswap:
    tmp = df_tournaments.iloc[random_indexes, pair[0]].copy()
    df_tournaments.iloc[random_indexes, pair[0]] = df_tournaments.iloc[random_indexes, pair[1]]
    df_tournaments.iloc[random_indexes, pair[1]] = tmp

# Player 1 when swapped lost
df_tournaments.iloc[random_indexes, 17] = 0.0

### 1.3 Data Exporting
- Display final dataset
- Export dataset in CSV for immediate future use.

In [302]:
## UNCOMMENT ONLY WHEN NEEDING TO RERUN THE WHOLE PREPROCESSING ##
# df_tournaments.to_csv("data/processed_data.csv", index = False)

## 2 Glicko2 Score

### 2.1 Preprocessed Data Import

In [366]:
print(processed_df["Surface"].unique())

['Hard' 'Carpet' 'Clay' 'Grass']


In [358]:
processed_df = pd.read_csv("data/processed_data.csv", index_col = None) 
display(processed_df)

Unnamed: 0,Date,Tourney,Location,Surface,BestOf,Player1,Player1Age,Player1Nationality,Player1PlaysHome,Player2,Player2Age,Player2Nationality,Player2PlaysHome,Player1SetsWon,Player1SetsLost,Player1GamesWon,Player1GamesLost,Player1Won
0,1990-01-01,Wellington,New Zealand,Hard,3,Thomas Hogstedt,26.2,Sweden,False,Shuzo Matsuoka,22.1,Japan,False,2,1,18,15,1.0
1,1990-01-01,Wellington,New Zealand,Hard,3,Brett Steven,20.6,New Zealand,False,Lars Jonsson,19.5,Sweden,False,0,2,9,13,0.0
2,1990-01-01,Wellington,New Zealand,Hard,3,Richard Fromberg,19.6,Australia,False,Olivier Delaitre,22.5,France,False,2,0,13,7,1.0
3,1990-01-01,Wellington,New Zealand,Hard,3,Jens Woehrmann,22.3,Germany,False,Paul Chamberlin,27.7,United States,False,1,2,10,18,0.0
4,1990-01-01,Wellington,New Zealand,Hard,3,Magnus Gustafsson,22.9,Sweden,False,Gilad Bloom,22.8,Israel,False,0,2,6,13,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92776,2022-08-29,Us Open,United States,Hard,5,Steve Johnson,32.6,United States,True,Grigor Dimitrov,31.2,Bulgaria,False,0,3,7,18,0.0
92777,2022-08-29,Us Open,United States,Hard,5,Lorenzo Musetti,20.4,Italy,False,David Goffin,31.7,Belgium,False,3,2,26,27,1.0
92778,2022-08-29,Us Open,United States,Hard,5,Gijs Brouwer,26.4,Netherlands,False,Adrian Mannarino,34.1,France,False,3,0,18,11,1.0
92779,2022-08-29,Us Open,United States,Hard,5,Daniil Medvedev,26.5,Russia,False,Arthur Rinderknech,27.1,France,False,3,0,19,10,1.0


In [341]:
import math

def G(theta):
    return 1 / math.sqrt(1 + 3 * (theta / b) ** 2)

def E(mu, mu_j, theta_j): 
    return 1 / (1 + math.exp(-G(theta_j) * (mu - mu_j)))

def V(mu, list_mu_j, list_theta_j):
    l = 0
    for mu_j, theta_j in zip(list_mu_j, list_theta_j):
        l += (G(theta_j) ** 2) * E(mu, mu_j, theta_j) * (1 - E(mu, mu_j, theta_j))
                                                  
    return 1 / l
                                                  
def Delta(mu, list_mu_j, list_theta_j, list_score_j):
    l = 0
    for mu_j, theta_j, score_j in zip(list_mu_j, list_theta_j, list_score_j):
        l += G(theta_j) * (score_j - E(mu, mu_j, theta_j))
                           
    return V(mu, list_mu_j, list_theta_j) * l

def F(x, delta, v, theta, sigma):
    return \
        (math.exp(x) * (delta ** 2 - theta ** 2 - v - math.exp(x))) / \
        ((2 * theta ** 2 + v + math.exp(x)) ** 2) - \
        (x - math.log(sigma ** 2)) / 0.04    
    

In [343]:
from tqdm import tqdm
## Glicko Score dictionary ## 
############# Init:      1500     350         0.06
############# [Clay -  [Rating, Variance, Volatility]]
# namePlayer: [Grass - [Rating, Variance, Volatility]]
############# [Hard -  [Rating, Variance, Volatility]]

init_glicko = [[0, 350 / 173.7178, 0.06]] * 3

all_players = np.unique(np.array([processed_df.Player1, processed_df.Player2]))

glicko_dict = {key: init_glicko for key in all_players}
surface_dict = {"Clay": 0, "Grass": 1, "Hard": 2}

## Computing Glicko score for each tournament separately:
curr_tourney = processed_df["Tourney"][0]
curr_surface = surface_dict[processed_df["Surface"][0]]

r_step, phi_step, sigma_step = (0, 0, 0)

## Curr Player Dictionary ##
# namePlayer: [p1_mu, p1_theta, p1_sigma, list_mu_p1, list_theta_p1, list_scores]
############################

curr_players_dict = {}

for i, row in tqdm(processed_df.iterrows()):
    if curr_tourney == row["Tourney"]:
        p1_mu = glicko_dict[row.Player1][0][curr_surface]
        p1_theta = glicko_dict[row.Player1][1][curr_surface]
        p1_sigma = glicko_dict[row.Player1][2][curr_surface]
        p1_score = row.Player1Won
        
        p2_mu = glicko_dict[row.Player2][0][curr_surface]
        p2_theta = glicko_dict[row.Player2][1][curr_surface]
        p2_sigma = glicko_dict[row.Player2][2][curr_surface]
        p2_score = 1 - p1_score
        
        if row.Player1 not in curr_players_dict:
            curr_players_dict[row.Player1] = \
                (p1_mu, p1_theta, p1_sigma, [p2_mu], [p2_theta], [p1_score])
        else : 
            curr_players_dict[row.Player1][3].append(p2_mu)
            curr_players_dict[row.Player1][4].append(p2_theta)
            curr_players_dict[row.Player1][5].append(p1_score)
            
        
        if row.Player2 not in curr_players_dict:
            curr_players_dict[row.Player2] = \
                (p2_mu, p2_theta, p2_sigma, [p1_mu], [p1_theta], [p2_score])
        else : 
            curr_players_dict[row.Player2][3].append(p1_mu)
            curr_players_dict[row.Player2][4].append(p1_theta)
            curr_players_dict[row.Player2][5].append(p2_score)
    
    else: 
        for player in curr_players_dict.keys():     
            p_mu, p_theta, p_sigma, list_mu, list_theta, p_scores = curr_players_dict[player]
            
            # Compute sigma step
            a = math.log(p_sigma ** 2)
            b = 1
            delta = Delta(p_mu, list_mu, list_theta, p_scores)
            v = V(p_mu, list_mu, list_theta)
            
            if delta ** 2 > (p_theta ** 2 + v):
                b = math.log(delta ** 2 - p_theta ** 2 - v)
            else :
                k = 1
                while F(a - k * 0.2, delta, v, p_theta, p_sigma) < 0:
                    k += 1
                    b = a - k * 0.2

            
            fa = F(a, delta, v, p_theta, p_sigma)
            fb = F(b, delta, v, p_theta, p_sigma)
            
            while abs(b - a) > 0.1:
                c = a + (a - b) *  fa / (fb - fa)
                
                fc = F(c, delta, v, p_theta, p_sigma)
                
                if fc * fb <= 0:
                    a, fa = (b, fb)
                else: 
                    fa = fa / 2
                
                b, fb = (c, fc)
            
            sigma_step = math.exp(a / 2)
            
            # Compute theta step
            theta_step = \
                1 / math.sqrt((1 / math.sqrt(p_theta ** 2 + sigma_step ** 2) + 1 / v))
            
            # Compute mu step
            mu_step = p_mu
            for mu_opp, theta_opp, sc in zip(list_mu, list_theta, p_scores):
                mu_step += (theta_step ** 2) * G(theta_opp) \
                            * (sc - E(p_mu, mu_opp, theta_opp))
            
            # Finally change glicko in dictionary
            new_glicko = glicko_dict[player]                                      
            new_glicko[curr_surface] = [mu_step, theta_step, sigma_step]
            glicko_dict[player] = new_glicko
        
        curr_tourney = row["Tourney"]
        curr_surface = surface_dict[surface_dict[row.Surface]]


92781it [54:46, 28.23it/s] 


In [365]:
## TODO: GLICKO
processed_df["Glicko"] = 1500
print(glicko_dict)

{'Aaron Krickstein': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467896]], 'Abdul Hamid Makhkamov': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467896]], 'Abdulla Hajji': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467896]], 'Adam Chadaj': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467896]], 'Adam Kennedy': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467896]], 'Adam Malik': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467896]], 'Adam Pavlasek': [[0, 2.014761872416068, 0.06], [0, 2.014761872416068, 0.06], [0.01842572624286437, 0.2883795089753077, 0.06007971560467

In [364]:
 processed_df.drop(columns = ["Date", "Tourney", "Location", "Player1", "Player2",
                              "Player1Nationality", "Player2Nationality",
                             "Player1SetsWon", "Player1SetsLost",
                             "Player1GamesWon", "Player1GamesLost"], inplace = True)

display(processed_df)

KeyError: "['Date', 'Tourney', 'Location', 'Player1', 'Player2', 'Player1Nationality', 'Player2Nationality', 'Player1SetsWon', 'Player1SetsLost', 'Player1GamesWon', 'Player1GamesLost'] not found in axis"

In [361]:
# Initial split: 70-15-15

df_train = processed_df.sample(frac = 0.7, random_state = 200) #random state is a seed value
val_and_test = processed_df.drop(df_train.index)
# df_val = val_and_test.sample(frac = 0.5, random_state = 200)
# df_test = val_and_test.drop(df_val.index)

df_test = val_and_test

In [362]:
from sklearn.naive_bayes import GaussianNB

y_train = df_train["Player1Won"]
x_train = df_train.drop(columns = ["Player1Won"])
display(x_train)
gnb = GaussianNB()
trained_gnb = gnb.fit(x_train, y_train)
y_pred = trained_gnb.predict(x_train)
# y_proba = trained_gnb.predict_proba(x_train)

print("Number of mislabeled points out of a total %d points : %d" \
      % (x_train.shape[0], (y_train != y_pred).sum()))

print(f"Accuracy: {(y_train == y_pred).sum() / (len(y_train))}")
# for pred, act, prob in zip(y_pred, y_train, y_proba):
#     if pred != act:
#         print(f"Misclassified as {pred} with proba: {prob[0]}")

print(f"1.0 preds: {(y_train == 1.0).sum()}, 0.0 preds: {(y_train == 0.0).sum()}")

Unnamed: 0,Surface,BestOf,Player1Age,Player1PlaysHome,Player2Age,Player2PlaysHome,Glicko
81277,Hard,3,35.7,False,32.0,False,
16541,Hard,3,23.5,False,26.8,False,
67206,Clay,3,28.0,False,26.9,False,
7332,Clay,3,24.1,False,19.0,False,
17033,Clay,3,24.6,False,29.6,False,
...,...,...,...,...,...,...,...
86789,Hard,3,26.1,False,38.2,False,
84245,Hard,3,34.0,False,24.8,False,
37090,Clay,3,27.1,False,26.6,False,
50130,Hard,3,30.5,False,26.1,False,


ValueError: could not convert string to float: 'Hard'

In [349]:
y_test = df_test["Player1Won"]
x_test = df_test.drop(columns = ["Player1Won"])
display(x_test)
y_pred = trained_gnb.predict(x_test)
# y_proba = trained_gnb.predict_proba(x_test)

print("Number of mislabeled points out of a total %d points : %d" \
      % (x_test.shape[0], (y_test != y_pred).sum()))

print(f"Accuracy: {(y_test == y_pred).sum() / (len(y_test))}")


Unnamed: 0,BestOf,Player1Age,Player1PlaysHome,Player2Age,Player2PlaysHome,Glicko
0,3,26.2,False,22.1,False,1500
3,3,22.3,False,27.7,False,1500
4,3,22.9,False,22.8,False,1500
7,3,18.2,False,21.9,False,1500
8,3,24.7,False,21.1,False,1500
...,...,...,...,...,...,...
92771,5,32.0,False,34.6,False,1500
92773,5,23.9,False,21.0,False,1500
92774,5,26.3,True,25.3,False,1500
92777,5,20.4,False,31.7,False,1500


Number of mislabeled points out of a total 27834 points : 13853
Accuracy: 0.5022993461234462
