In [1]:
from wn.data import prepare_matches, DataInterface, tr
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset
from torch import nn

import pickle
import os

### First we'll construct the pure per-match tabular features.

In [2]:
# Build (or reload) the cleaned `players` and `matches` frames.
#
# The cleaned pair is cached as a pickle so the CSV parsing and date
# conversion below only have to run once.
processed_path = "data/processed_data.pkl"

if os.path.exists(processed_path):

    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook wrote itself.
    with open(processed_path, "rb") as f:
        players, matches = pickle.load(f)

else:

    print("Loading match list")
    match_list = [f"../tennis_atp/atp_matches_{year}.csv" for year in range(1968, 2018)]
    matches = prepare_matches(match_list)

    print("Loading players")
    players = pd.read_csv("../tennis_atp/atp_players.csv")

    # Reference date for every "days elapsed" feature
    epoch = pd.to_datetime("19000101")

    # Add days elapsed from 1900
    print("Transforming match dates")
    matches.tourney_date = pd.to_datetime(matches.tourney_date.astype("str"))
    matches["days_elapsed_date"] = (matches.tourney_date - epoch).dt.days

    # Removing missing birthday players for now.
    # errors="coerce" turns unparseable birth dates into NaT so they are
    # dropped together with the missing ones.
    print("Transforming birth dates and removing missing")
    players.dob = pd.to_datetime(players.dob.astype("str"), errors="coerce")
    players = players[~players.dob.isna()].reset_index(drop=True)
    players["days_elapsed_dob"] = (players.dob - epoch).dt.days

    # Find last match dates
    # TKTK: There's a better way to do this
    # players["last_match_date"] = [
    #     matches[matches.winner_id.eq(r.player_id) | matches.loser_id.eq(r.player_id)].days_elapsed_date.max()
    #     for r in players.itertuples()
    # ]

    # Remove matches with players with unknown birthdays
    matches = matches.loc[
        matches.winner_id.isin(players.player_id)
        & matches.loser_id.isin(players.player_id)
    ].reset_index(drop=True)

    # (Hopefully temporarily) remove matches where neither player has
    # a known rank
    # print("Removing missing ranks")
    # matches = matches.loc[
    #     ~matches.winner_rank.isna() | ~matches.loser_rank.isna()
    # ].reset_index(drop=True)

    print("Saving processed data")
    # Make sure the cache directory exists; otherwise a fresh checkout would
    # fail here, after all of the slow work above has already been done.
    os.makedirs(os.path.dirname(processed_path), exist_ok=True)
    with open(processed_path, "wb") as f:
        pickle.dump((players, matches), f)

In [3]:
# Match-level columns that are carried through both player joins.
desired_cols = [
    "winner_rank",
    "winner_hand",
    "loser_rank",
    "loser_hand",
    "surface",
    "tourney_level",
    "days_elapsed_date",
]

# Step 1: join on the winner to pick up their birth-date feature. `loser_id`
# is kept because the second join still needs it.
with_winner_dob = matches.merge(
    players, how="inner", left_on="winner_id", right_on="player_id"
)
with_winner_dob = with_winner_dob[
    desired_cols + ["days_elapsed_dob", "loser_id"]
].rename(columns={"days_elapsed_dob": "winner_dob"})

# Step 2: same join for the loser, then drop the id column.
with_both_dobs = with_winner_dob.merge(
    players, how="inner", left_on="loser_id", right_on="player_id"
)
augmented_matches = with_both_dobs[
    desired_cols + ["winner_dob", "days_elapsed_dob"]
].rename(columns={"days_elapsed_dob": "loser_dob"})

In [4]:
# Every match is emitted twice: once from the winner's point of view
# (won=1) and once from the loser's (won=0). In both views the player the row
# describes is "p1" and the opponent is "p2".

# Columns describing the match itself; identical in both views.
match_context_cols = ["surface", "tourney_level", "days_elapsed_date"]

# Winner's view: winner -> p1, loser -> p2.
winner_view_cols = [
    "winner_rank",
    "winner_hand",
    "winner_dob",
    "loser_rank",
    "loser_hand",
    "loser_dob",
] + match_context_cols
winner_view_names = {
    "winner_rank": "p1_rank",
    "winner_hand": "p1_hand",
    "winner_dob": "p1_dob",
    "loser_rank": "p2_rank",
    "loser_hand": "p2_hand",
    "loser_dob": "p2_dob",
}
# Missing values are replaced with the sentinel -1 before labelling.
winner_matches = augmented_matches[winner_view_cols].fillna(-1)
winner_matches = winner_matches.assign(won=1).rename(columns=winner_view_names)

# Loser's view: loser -> p1, winner -> p2.
loser_view_cols = [
    "loser_rank",
    "loser_hand",
    "loser_dob",
    "winner_rank",
    "winner_hand",
    "winner_dob",
] + match_context_cols
loser_view_names = {
    "loser_rank": "p1_rank",
    "loser_hand": "p1_hand",
    "loser_dob": "p1_dob",
    "winner_rank": "p2_rank",
    "winner_hand": "p2_hand",
    "winner_dob": "p2_dob",
}
loser_matches = augmented_matches[loser_view_cols].fillna(-1)
loser_matches = loser_matches.assign(won=0).rename(columns=loser_view_names)

# Stack the two views; the original row labels are kept as-is.
condensed_matches = pd.concat([winner_matches, loser_matches], axis=0)

In [5]:
# Encoding type for every per-match input column. The key order is kept
# because later cells iterate over `match_interface.type_map`.
match_type_map = {
    # player one
    "p1_rank": "numeric",
    "p1_hand": "categorical",
    "p1_dob": "time",
    # player two
    "p2_rank": "numeric",
    "p2_hand": "categorical",
    "p2_dob": "time",
    # match context
    "surface": "categorical",
    "tourney_level": "categorical",
    "days_elapsed_date": "time",
}
match_interface = DataInterface(match_type_map)

# NOTE(review): presumably `complete` derives the per-column encoding state
# from the data; confirm against wn.data.DataInterface.
match_interface.complete(condensed_matches)

In [6]:
# Persist the completed interface next to the other processed data.
match_interface_path = "data/match_interface.pkl"
with open(match_interface_path, mode="wb") as interface_file:
    pickle.dump(match_interface, interface_file)

In [7]:
# Encoding is slow, so the result is pickled in the next cell.
# One encoded entry per column declared in the interface, keyed by column name.
input_data = {}
for col in match_interface.type_map:
    input_data[col] = tr(condensed_matches[col], col, match_interface)

# Labels: the `won` flag as a float tensor with a trailing dimension of size 1.
won_values = condensed_matches["won"].to_numpy()
y = torch.tensor(won_values, dtype=torch.float).unsqueeze(dim=1)

Encoding p1_rank
Encoding p1_hand
Encoding p1_dob
Encoding p2_rank
Encoding p2_hand
Encoding p2_dob
Encoding surface
Encoding tourney_level
Encoding days_elapsed_date


In [8]:
# Write the encoded inputs and their labels together as one (dict, tensor) pair.
tensor_list_path = "data/tensor_list.pkl"
with open(tensor_list_path, mode="wb") as tensor_file:
    pickle.dump((input_data, y), tensor_file)

### Now we'll construct the historical match features.

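This mirrors the per-match construction above, with two differences: each row
also carries the ID of the player it describes (`p1_id`), and the rows are
sorted by player and match date so that a player's history can be assembled
efficiently while training.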

In [9]:
# Match-level columns that are carried through both player joins.
desired_cols = [
    "winner_rank",
    "winner_hand",
    "loser_rank",
    "loser_hand",
    "surface",
    "tourney_level",
    "days_elapsed_date",
]

# Step 1: join on the winner to pick up their birth-date feature. Both player
# ids are kept this time: `loser_id` drives the second join and both ids are
# needed in the final frame.
with_winner_dob = matches.merge(
    players, how="inner", left_on="winner_id", right_on="player_id"
)
with_winner_dob = with_winner_dob[
    desired_cols + ["days_elapsed_dob", "loser_id", "winner_id"]
].rename(columns={"days_elapsed_dob": "winner_dob"})

# Step 2: same join for the loser.
with_both_dobs = with_winner_dob.merge(
    players, how="inner", left_on="loser_id", right_on="player_id"
)
augmented_matches = with_both_dobs[
    desired_cols + ["winner_dob", "days_elapsed_dob", "winner_id", "loser_id"]
].rename(columns={"days_elapsed_dob": "loser_dob"})

In [10]:
# History rows: one row per (player, match). "p1" is the player whose history
# the row belongs to and "p2" is the opponent. Only rank is kept for p1, while
# the opponent keeps rank, hand and birth date.

# Columns describing the match itself; identical in both views.
match_context_cols = ["surface", "tourney_level", "days_elapsed_date"]

# Rows owned by the winner (won=1).
winner_view_cols = [
    "winner_id",
    "winner_rank",
    "loser_rank",
    "loser_hand",
    "loser_dob",
] + match_context_cols
winner_view_names = {
    "winner_id": "p1_id",
    "winner_rank": "p1_rank",
    "loser_rank": "p2_rank",
    "loser_hand": "p2_hand",
    "loser_dob": "p2_dob",
}
# Missing values are replaced with the sentinel -1 before labelling.
winner_matches = augmented_matches[winner_view_cols].fillna(-1)
winner_matches = winner_matches.assign(won=1).rename(columns=winner_view_names)

# Rows owned by the loser (won=0).
loser_view_cols = [
    "loser_id",
    "loser_rank",
    "winner_rank",
    "winner_hand",
    "winner_dob",
] + match_context_cols
loser_view_names = {
    "loser_id": "p1_id",
    "loser_rank": "p1_rank",
    "winner_rank": "p2_rank",
    "winner_hand": "p2_hand",
    "winner_dob": "p2_dob",
}
loser_matches = augmented_matches[loser_view_cols].fillna(-1)
loser_matches = loser_matches.assign(won=0).rename(columns=loser_view_names)

# The ordering below matters: grouping rows by player and then by date is what
# lets the training code build per-player histories on the fly relatively
# cheaply later on.
stacked_matches = pd.concat([winner_matches, loser_matches])
condensed_matches = stacked_matches.sort_values(
    by=["p1_id", "days_elapsed_date"]
).reset_index(drop=True)

In [11]:
# Encoding type for every column of a history row. Unlike the per-match
# interface, the match outcome (`won`) is an input feature here, and p1 only
# contributes its rank.
history_interface = DataInterface(
    {
        "p1_rank": "numeric",
        "p2_rank": "numeric",
        "p2_hand": "categorical",
        "p2_dob": "time",
        "surface": "categorical",
        "tourney_level": "categorical",
        "days_elapsed_date": "time",
        "won": "categorical",
    }
)

# NOTE(review): presumably `complete` derives the per-column encoding state
# from the data; confirm against wn.data.DataInterface.
history_interface.complete(condensed_matches)

# Encode data to save: one encoded entry per declared column, keyed by name
input_data = {
    k: tr(condensed_matches[k], k, history_interface)
    for k in history_interface.type_map
}

# And the player IDs, as a 32-bit integer tensor with a trailing dimension of
# size 1. Convert to a NumPy array first (as is done for the labels of the
# per-match data): handing the Series itself to torch.tensor makes torch walk
# it element by element, and that only works while the index is 0..n-1.
pid = torch.tensor(condensed_matches.p1_id.to_numpy(), dtype=torch.int).unsqueeze(1)

Encoding p1_rank
Encoding p2_rank
Encoding p2_hand
Encoding p2_dob
Encoding surface
Encoding tourney_level
Encoding days_elapsed_date
Encoding won


In [13]:
# Persist the history artefacts: the encoded inputs together with the player
# IDs first, then the interface that produced the encoding.
history_outputs = [
    ("data/history_tensor_list.pkl", (input_data, pid)),
    ("data/history_interface.pkl", history_interface),
]

for out_path, payload in history_outputs:
    with open(out_path, "wb") as out_file:
        pickle.dump(payload, out_file)