# TENNIS PREDICTOR

## 1. Data Importing and Preprocessing

### 1.1 Importing Data 
- Go through all csv files and merge all games since 1989 in one big Pandas Dataframe.
- Rename columns with better names.

In [76]:
# imports .
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Get data file names
path = r'./data/atp_matches'
all_files = glob.glob(path + "/*.csv")


start_year_considered = 1989

tournament_old_new_columns = {
   "tourney_name": "Tourney",
    "tourney_date": "Date", 
    "winner_name": "Player1", 
    "winner_age": "Player1Age", 
    "winner_ioc": "Player1Nationality",
    "loser_name": "Player2", 
    "loser_age": "Player2Age",
    "loser_ioc": "Player2Nationality",
    "score": "Score",
    "best_of": "BestOf"
}

df_tournaments = pd.DataFrame([])

for filename in all_files:
    start_year = filename.split("/")[-1].split("_")[-1].split(".")[0]
    if int(start_year) > start_year_considered: 
        df = pd.read_csv(filename, index_col = None)        
        df_tournaments = pd.concat([df_tournaments, df])

df_tournaments = df_tournaments[tournament_old_new_columns.keys()]
df_tournaments.rename(columns = tournament_old_new_columns, inplace = True)


### 1.2 Preprocessing

- Format Date column as a DateTime object.
- Sort rows by dates (older to newest)
- Drop games from exibition tournaments (Davis Cup and Laver Cup)
- Create Target column "Player1Won" (the value that we will try to predict)

In [64]:
# PREPROCESSING
# Format Date as DateTime.
df_tournaments["Date"] = df_tournaments["Date"].apply(lambda x: str(x)[0:4] + "-" + str(x)[4:6] + "-" + str(x)[6:])
df_tournaments['Date'] =  pd.to_datetime(df_tournaments['Date'])

# Sort by Date, Reset Index and drop Na and Davis/Laver Cup
df_tournaments = df_tournaments.sort_values(by = "Date").reset_index(drop = True).dropna()
df_tournaments = df_tournaments[~(df_tournaments.Tourney.str.contains("Davis"))]
df_tournaments = df_tournaments[~(df_tournaments.Tourney.str.contains("Laver"))]

# Create target column
df_tournaments["Player1Won"] = 1

- Use Geocoders to get country from tournament name
- Make new Columns "Location" that specifies country in which the tournament is played

In [65]:
from geopy.geocoders import Nominatim

## INITIALISE ONLY WHEN RERUNNING FROM SCRATCH
# tournament_to_country_dict = {}
#############################################

# Dictionary of Countries 
for tournament in df_tournaments.Tourney.unique():
           
    if tournament in tournament_to_country_dict:
        continue
    if "Cup" in tournament or "Finals" in tournament:
        tournament_to_country_dict[tournament] = "International"
    
    else: 
        t = tournament.replace("Masters", "").replace("ATP", "").replace("Indoor", "")\
                      .replace("Outdoor", "").replace("Olympics", "").replace("Open", "")
        geolocator = Nominatim(user_agent = "tennis_predictor")
        location = geolocator.geocode(t, language = "en")

        country = location.address.split(",")[-1]
        tournament_to_country_dict[tournament] = country


In [66]:
# Standardize Locations to Countries
df_tournaments["Location"] = df_tournaments["Tourney"].apply(lambda x: tournament_to_country_dict[x])

- Make dictionary of IOC to IOS codes
- Use CountryConverter to convert players' Nationalities in countries

In [67]:
import country_converter as coco

ioc_to_ios = {"ALG": "DZA", "ANG": "AGO", "ANT": "ATG", "ARU": "ABW",
    "BAH": "BHS", "BRN": "BHR", "BAN": "BGD", "BAR": "BRB", "BIZ": "BLZ", "BER": "BMU", "BHU": "BTN", "BOT": "BWA",
    "IVB": "VGB", "BRU": "BRN", "BUL": "BGR", "BUR": "BFA", "CAM": "KHM", "CAY": "CYM", "CHA": "TCD", "CHI": "CHL",
    "CGO": "COG", "CRC": "CRI", "CRO": "HRV", "DEN": "DNK", "ESA": "SLV", "GEQ": "GNQ", "FIJ": "FJI", "GAM": "GMB",
    "GER": "DEU", "GRE": "GRC", "GRN": "GRD", "GUA": "GTM", "GUI": "GIN", "GBS": "GNB", "HAI": "HTI", "HON": "HND", 
    "INA": "IDN", "IRI": "IRN", "KUW": "KWT", "LAT": "LVA", "LIB": "LBN", "LES": "LSO", "LBA": "LBY", "MAD": "MDG", 
    "MAW": "MWI", "MAS": "MYS", "MTN": "MRT", "MRI": "MUS", "MON": "MCO", "MGL": "MNG", "MYA": "MMR", "NEP": "NPL", 
    "NED": "NLD", "NCA": "NIC", "NIG": "NER", "NGR": "NGA", "OMA": "OMN", "PLE": "PSE", "PAR": "PRY", "PHI": "PHL", 
    "POR": "PRT", "PUR": "PRI", "SKN": "KNA", "VIN": "VCT", "SAM": "WSM", "KSA": "SAU", "SEY": "SYC", "SIN": "SGP", 
    "SLO": "SVN", "SOL": "SLB", "RSA": "ZAF", "SRI": "LKA", "SUD": "SDN", "SUI": "CHE", "TPE": "TWN", "TAN": "TZA", 
    "TOG": "TGO", "TGA": "TON", "TRI": "TTO", "UAE": "ARE", "ISV": "VIR", "URU": "URY", "VAN": "VUT", "VIE": "VNM", 
    "YUG": "SRB","ZAM": "ZMB", "ZIM": "ZWE"
}

In [68]:
coco_converter = coco.CountryConverter()

df_tournaments["Player1Nationality"] = df_tournaments["Player1Nationality"].apply( \
    lambda x: ioc_to_ios[x] if x in ioc_to_ios else x \
)

df_tournaments["Player1Nationality"] = \
    coco_converter.convert(names = df_tournaments["Player1Nationality"].to_numpy(), to = "name_short")

df_tournaments["Player2Nationality"] = df_tournaments["Player2Nationality"].apply( \
    lambda x: ioc_to_ios[x] if x in ioc_to_ios else x \
)

df_tournaments["Player2Nationality"] = \
    coco_converter.convert(names = df_tournaments["Player2Nationality"].to_numpy(), to = "name_short")


- Custom-define function to parse the score (disregard anything containing alphabetic characters)
- Compute four new columns for games and sets won and lost by player 1.
- Compute two new binary columns to flag if player1 and player2 respectively are playing in their home country

In [69]:
def parseScore(score):
    sets_won = 0
    sets_lost = 0
    games_won = 0
    games_lost = 0
    
    sets = score.split(" ")
    for s in sets:
        if not(s):
            break
        games = s.split("-")
        
        games_p1 = games[0]
        games_p2 = games[1][0:2] if len(games[1]) and games[1].isnumeric() > 1 else games[1][0]
        
        if not games_p1.isnumeric() or not games_p2.isnumeric():
            break
        
        games_won += int(games_p1)
        games_lost += int(games_p2)
        if games_p1 > games_p2:
            sets_won += 1
        else:
            sets_lost += 1
    return [sets_won, sets_lost, games_won, games_lost]

In [70]:
# Remove retirements and walk overs
df_tournaments = df_tournaments[~df_tournaments.Score.str.contains(r'[a-zA-Z]')]


# Parse Scores in Games Won, Games Lost, Sets Won, Sets Lost
df_tournaments["Player1SetsWon"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[0]
)
df_tournaments["Player1SetsLost"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[1]                                                       
)
df_tournaments["Player1GamesWon"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[2]
)
df_tournaments["Player1GamesLost"] = df_tournaments["Score"].apply( \
    lambda x: parseScore(x)[3]                                                       
)

In [71]:
df_tournaments["Player1PlaysHome"] = df_tournaments["Player1Nationality"] == df_tournaments["Location"]
df_tournaments["Player2PlaysHome"] = df_tournaments["Player2Nationality"] == df_tournaments["Location"]

- Reorder columns for easier readability
- Drop score column
- Randomly select 50% of the rows and swap data for player1 and player2
- For these rows, the target column also has to be changed to 0, instead of 1.

In [72]:
## Order Columns and restructure like this:
# winner = player1
# loser = player2
# shuffle winner-loser if rand(0, 1) > 0.5
# create a target Player1Won that is 1 if player1 won, 0 if they lost.
# drop score column, not needed anymore

columns_final_order = ["Date", "Tourney", "Location", "BestOf",
                       "Player1", "Player1Age", "Player1Nationality", "Player1PlaysHome",
                       "Player2", "Player2Age", "Player2Nationality", "Player2PlaysHome",
                       "Player1SetsWon", "Player1SetsLost", "Player1GamesWon", "Player1GamesLost",
                       "Player1Won"]

df_tournaments = df_tournaments[columns_final_order]

In [73]:
# swap 50% of the players to balance dataset.
random_indexes = np.array(df.sample(frac = 0.50, axis = 'rows').index)

pairs_cols_toswap = [[4, 8], [5, 9], [6, 10], [7, 11], [12, 13], [14, 15]]
for pair in pairs_cols_toswap:
    tmp = df_tournaments.iloc[random_indexes, pair[0]].copy()
    df_tournaments.iloc[random_indexes, pair[0]] = df_tournaments.iloc[random_indexes, pair[1]]
    df_tournaments.iloc[random_indexes, pair[1]] = tmp

# Player 1 when swapped lost
df_tournaments.iloc[random_indexes, 16] = 0

Unnamed: 0,Date,Tourney,Location,BestOf,Player1,Player1Age,Player1Nationality,Player1PlaysHome,Player2,Player2Age,Player2Nationality,Player2PlaysHome,Player1SetsWon,Player1SetsLost,Player1GamesWon,Player1GamesLost,Player1Won
0,1990-01-01,Wellington,New Zealand,3,Thomas Hogstedt,26.2,Sweden,False,Shuzo Matsuoka,22.1,Japan,False,2,1,18,15,1
1,1990-01-01,Wellington,New Zealand,3,Lars Jonsson,19.5,Sweden,False,Brett Steven,20.6,New Zealand,False,2,0,13,9,1
2,1990-01-01,Wellington,New Zealand,3,Richard Fromberg,19.6,Australia,False,Olivier Delaitre,22.5,France,False,2,0,13,7,1
3,1990-01-01,Wellington,New Zealand,3,Paul Chamberlin,27.7,United States,False,Jens Woehrmann,22.3,Germany,False,2,1,18,10,1
4,1990-01-01,Wellington,New Zealand,3,Gilad Bloom,22.8,Israel,False,Magnus Gustafsson,22.9,Sweden,False,2,0,13,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105356,2022-08-29,Us Open,United States,5,Grigor Dimitrov,31.2,Bulgaria,False,Steve Johnson,32.6,United States,True,3,0,18,7,1
105357,2022-08-29,Us Open,United States,5,Lorenzo Musetti,20.4,Italy,False,David Goffin,31.7,Belgium,False,3,2,26,27,1
105358,2022-08-29,Us Open,United States,5,Gijs Brouwer,26.4,Netherlands,False,Adrian Mannarino,34.1,France,False,3,0,18,11,1
105359,2022-08-29,Us Open,United States,5,Daniil Medvedev,26.5,Russia,False,Arthur Rinderknech,27.1,France,False,3,0,19,10,1


### 1.3 Data Exporting
- Display final dataset
- Export dataset in CSV for immediate future use.

In [75]:
display(df_tournaments)
df_tournaments.to_csv("data/processed_data.csv", index = False)