## About

This notebook estimates the relationship between a player's ranking and their probability of participating in various tournaments.

In [1]:
using DataFrames, CSV, HTTP, Plots, StatsPlots

In [2]:
# Common URL prefix of the yearly ATP doubles match files; the download loop
# appends "<year>.csv" to it.
doubles_data_url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_doubles_"

# Columns kept from each yearly file: three tournament columns followed by an
# (id, name, rank) triple for each of the four players in a doubles match.
cols = String[
    "tourney_name", "tourney_level", "tourney_date",
    "winner1_id", "winner1_name", "winner1_rank",
    "winner2_id", "winner2_name", "winner2_rank",
    "loser1_id", "loser1_name", "loser1_rank",
    "loser2_id", "loser2_name", "loser2_rank",
]

# Maps a tournament name to the level label it is aggregated under.
# Lookups are exact string matches, so several tournaments are listed under
# more than one spelling (e.g. "US Open" / "Us Open", "St.Petersburg" /
# "St. Petersburg"). Each key must appear only once.
tournament_encodings = Dict{String, String}(
    # Grand Slams
    "Australian Open" => "Grandslam",
    "French Open" => "Grandslam",
    "Roland Garros" => "Grandslam",
    "Wimbledon" => "Grandslam",
    "US Open" => "Grandslam",
    "Us Open" => "Grandslam",
    # ATP 1000
    "Indian Wells Masters" => "ATP1000",
    "Miami Masters" => "ATP1000",
    "Monte Carlo Masters" => "ATP1000",
    "Rome Masters" => "ATP1000",
    "Madrid Masters" => "ATP1000",
    "Montreal Masters" => "ATP1000",
    "Cincinnati Masters" => "ATP1000",
    "Shanghai Masters" => "ATP1000",
    "Paris Masters" => "ATP1000",
    "Toronto Masters" => "ATP1000",
    "Canada Masters" => "ATP1000",
    # ATP 500
    "Queen's Club" => "ATP500",
    "Vienna" => "ATP500",
    "Rio De Janeiro" => "ATP500",
    "Dubai" => "ATP500",
    "Halle" => "ATP500",
    "Rio de Janeiro" => "ATP500",
    "Rotterdam" => "ATP500",
    "Washington" => "ATP500",
    "Barcelona" => "ATP500",
    "Hamburg" => "ATP500",
    "Tokyo" => "ATP500",
    "Beijing" => "ATP500",
    "Basel" => "ATP500",
    "Acapulco" => "ATP500",
    # ATP 250
    "Istanbul" => "ATP250",
    "Estoril" => "ATP250",
    "Chengdu" => "ATP250",
    "Stockholm" => "ATP250",
    "Buenos Aires" => "ATP250",
    "Auckland" => "ATP250",
    "Delray Beach" => "ATP250",
    "Nottingham" => "ATP250",
    "Brisbane" => "ATP250",
    "Shenzhen" => "ATP250",
    "Stuttgart" => "ATP250",
    "Lyon" => "ATP250",
    "Nice" => "ATP250",
    "Atlanta" => "ATP250",
    "Antalya" => "ATP250",
    "Sydney" => "ATP250",
    "Geneva" => "ATP250",
    "Los Angeles" => "ATP250",
    "Metz" => "ATP250",
    "Quito" => "ATP250",
    "Eastbourne" => "ATP250",
    "Dusseldorf" => "ATP250",
    "Casablanca" => "ATP250",
    "Newport" => "ATP250",
    "Houston" => "ATP250",
    "Sofia" => "ATP250",
    "Marrakech" => "ATP250",
    "s-Hertogenbosch" => "ATP250",
    "Costa Do Sauipe" => "ATP250",
    "St.Petersburg" => "ATP250",
    "S-Hertogenbosch" => "ATP250",
    "Gstaad" => "ATP250",
    "Munich" => "ATP250",
    "Bastad" => "ATP250",
    "Bogota" => "ATP250",
    "Santiago" => "ATP250",
    "Pune" => "ATP250",
    "Moscow" => "ATP250",
    "San Jose" => "ATP250",
    "Bangkok" => "ATP250",
    "Kitzbuhel" => "ATP250",
    "New York" => "ATP250",
    "Montpellier" => "ATP250",
    "St. Petersburg" => "ATP250",
    "Bucharest" => "ATP250",
    "Budapest" => "ATP250",
    "Johannesburg" => "ATP250",
    "Sao Paulo" => "ATP250",
    "Los Cabos" => "ATP250",
    "Umag" => "ATP250",
    "Memphis" => "ATP250",
    "Belgrade" => "ATP250",
    "Valencia" => "ATP250",
    "Doha" => "ATP250",
    "Winston-Salem" => "ATP250",
    "Vina del Mar" => "ATP250",
    "New Haven" => "ATP250",
    "Winston Salem" => "ATP250",
    "Chennai" => "ATP250",
    "Marseille" => "ATP250",
    "London" => "ATP250",
    "Zagreb" => "ATP250",
    "Antwerp" => "ATP250",
    "Cabo San Lucas" => "ATP250",
    "Kuala Lumpur" => "ATP250",
)

# End the cell on `nothing` so the notebook does not echo the whole Dict.
nothing

In [3]:
# results[level][band] collects one participation share per (year, tournament):
# the fraction of that tournament's distinct player rows whose rank falls in
# the band. `level` is a value of `tournament_encodings`; `band` is a
# "<lo>-to-<hi>" string built from `rank_bands`.
results = Dict{String, Dict{String, Vector{Float64}}}()

# Inclusive ranking bands. Ranks outside every band (including missing ranks,
# which are replaced by 0 below) are counted in the denominator only.
rank_bands = [
    (1, 50), (51, 100), (101, 200), (201, 300),
    (301, 400), (401, 500), (501, 600), (601, 700),
]

# Column-name prefixes of the four players that take part in a doubles match.
player_roles = ["winner1", "winner2", "loser1", "loser2"]

"""
    player_rows(matches, role)

Return a copy of the tournament columns of `matches` together with the
`<role>_id`, `<role>_name` and `<role>_rank` columns, renamed to `player_id`,
`player_name` and `player_rank`.

Columns are selected by name rather than by position, so the result does not
depend on the column order of `matches`.
"""
function player_rows(matches, role)
    id = Symbol(role, "_id")
    name = Symbol(role, "_name")
    rank = Symbol(role, "_rank")
    selected = matches[:, [:tourney_name, :tourney_level, :tourney_date, id, name, rank]]
    return rename(selected, id => :player_id, name => :player_name, rank => :player_rank)
end

"""
    record_participation!(results, players, encodings, bands)

For every tournament in `players` whose name is a key of `encodings`, push the
share of its player rows falling into each `(lo, hi)` rank band onto
`results[level]["lo-to-hi"]`, creating missing entries on demand.
Tournaments without an encoding are skipped. Returns `results`.
"""
function record_participation!(results, players, encodings, bands)
    for tournament in unique(players[:, "tourney_name"])
        level = get(encodings, tournament, nothing)
        level === nothing && continue

        in_tournament = players[!, :tourney_name] .== tournament
        # A missing rank becomes 0, which lies below every band.
        ranks = coalesce.(players[in_tournament, :player_rank], 0)
        player_count = length(ranks)

        level_results = get!(() -> Dict{String, Vector{Float64}}(), results, level)
        for (lo, hi) in bands
            proportion = count(r -> lo <= r <= hi, ranks) / player_count
            push!(get!(() -> Float64[], level_results, "$lo-to-$hi"), proportion)
        end
    end
    return results
end

for year in 2015:2020
    url = string(doubles_data_url, "$year.csv")
    http_response = HTTP.get(url)
    df = DataFrame(CSV.File(http_response.body))

    filtered = df[:, cols]

    # Stack the four per-role views into one table and drop repeated rows, so
    # a player who played several matches in a tournament is counted once.
    players = unique(reduce(vcat, [player_rows(filtered, role) for role in player_roles]))

    record_participation!(results, players, tournament_encodings, rank_bands)
end

└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:635
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:635
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:635
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:635
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:635
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.julia\packages\CSV\9LsxT\src\file.jl:634
└ @ CSV C:\Users\alida\.j

In [11]:
# Levels in the order they are drawn. `results` is a Dict, whose iteration
# order is unspecified, so each level is looked up by name instead of relying
# on `keys(results)` happening to match the panel titles.
level_order = ["Grandslam", "ATP1000", "ATP500", "ATP250"]

# Keys of results[level], from the best-ranked band to the worst-ranked one.
band_keys = [
    "1-to-50", "51-to-100", "101-to-200", "201-to-300",
    "301-to-400", "401-to-500", "501-to-600", "601-to-700",
]

# matrix[i] holds, for level_order[i], one vector of shares per rank band.
matrix = [[results[level][band] for band in band_keys] for level in level_order]

# Row matrix (1x8): one legend label per series.
labels = ["1-50" "51-100" "101-200" "201-300" "301-400" "401-500" "501-600" "601-700"]

"""
    level_boxplot(series, title; labels)

Draw one box per element of `series` (a vector of share vectors), using the
styling shared by all four panels.
"""
function level_boxplot(series, title; labels)
    # NOTE(review): the plotted values are the shares stored in `results`
    # (count / player_count), not percentages, despite the axis label.
    return boxplot(
        series;
        title=title,
        labels=labels,
        ylabel="Participation %",
        titlefont=font(20, "Computer Modern"),
        palette=:YlGn_8,
        xticks=false,
        legendtitle="Ranking",
        yguidefont=font(10, "Computer Modern"),
    )
end

p1, p2, p3, p4 = [
    level_boxplot(series, level; labels=labels) for
    (series, level) in zip(matrix, level_order)
]

combined = plot(p1, p2, p3, p4, layout=(2, 2), size=(800, 600))
yticks!(combined, collect(0:0.05:0.8))
savefig(combined, "probabilities.png")