In [None]:
from wn.data import prepare_matches, DataInterface, tr
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset
from torch import nn

import pickle
import os

# TODO: this notebook still needs to be organized.

In [None]:
# Load the cached (players, matches) pair when it exists; otherwise rebuild it
# from the raw CSV files and cache the result for subsequent runs.
if os.path.exists("data/processed_data.pkl"):

    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache
    # file that was produced by this notebook.
    with open("data/processed_data.pkl", "rb") as f:
        players, matches = pickle.load(f)

else:

    # One CSV per season, 1968 through 2017 inclusive.
    print("Loading match list")
    match_list = [f"../tennis_atp/atp_matches_{year}.csv" for year in range(1968, 2018)]
    matches = prepare_matches(match_list)

    print("Loading players")
    players = pd.read_csv("../tennis_atp/atp_players.csv")

    # Add days elapsed from 1900
    print("Transforming match dates")
    matches.tourney_date = pd.to_datetime(matches.tourney_date.astype("str"))
    matches["days_elapsed_date"] = (matches.tourney_date - pd.to_datetime("19000101")).dt.days

    # Removing missing birthday players for now.
    # Unparseable birth dates become NaT (errors="coerce") and are dropped.
    print("Transforming birth dates and removing missing")
    players.dob = pd.to_datetime(players.dob.astype("str"), errors="coerce")
    players = players[~players.dob.isna()].reset_index(drop=True)
    players["days_elapsed_dob"] = (players.dob - pd.to_datetime("19000101")).dt.days
    # players["last_match_date"] = [
    #     matches[matches.winner_id.eq(r.player_id) | matches.loser_id.eq(r.player_id)].days_elapsed_date.max()
    #     for r in players.itertuples()
    # ]

    # Remove matches with players with unknown birthdays
    # (i.e. players that were filtered out of `players` above).
    matches = matches.loc[
        matches.winner_id.isin(players.player_id)
        & matches.loser_id.isin(players.player_id)
    ].reset_index(drop=True)

    # (Hopefully temporarily) remove matches where neither player has
    # a known rank; a match is kept when at least one rank is present.
    print("Removing missing ranks")
    matches = matches.loc[
        ~matches.winner_rank.isna() | ~matches.loser_rank.isna()
    ].reset_index(drop=True)

    print("Saving processed data")
    # Make sure the cache directory exists so a fresh checkout does not fail
    # with FileNotFoundError after all of the preprocessing has already run.
    os.makedirs("data", exist_ok=True)
    with open("data/processed_data.pkl", "wb") as f:
        pickle.dump((players, matches), f)

In [None]:
# Match-level columns that are carried through both joins unchanged.
desired_cols = [
    "winner_rank", "winner_hand",
    "loser_rank", "loser_hand",
    "surface", "tourney_level",
    "days_elapsed_date",
]

# Attach each player's `days_elapsed_dob` to the match rows, first for the
# winner and then for the loser. Both joins are inner joins, so a match row
# survives only if both of its players are present in `players`.
augmented_matches = (
    matches
    # Winner: join on winner_id and keep loser_id for the second join.
    .merge(players, how="inner", left_on="winner_id", right_on="player_id")
    [desired_cols + ["days_elapsed_dob", "loser_id"]]
    .rename(columns={"days_elapsed_dob": "winner_dob"})
    # Loser: join on loser_id; the id column is no longer needed afterwards.
    .merge(players, how="inner", left_on="loser_id", right_on="player_id")
    [desired_cols + ["winner_dob", "days_elapsed_dob"]]
    .rename(columns={"days_elapsed_dob": "loser_dob"})
)

In [None]:
def _player_view(frame, first, second, won):
    """Re-express match rows with `first` as player 1 and `second` as player 2.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `{role}_rank`, `{role}_hand` and `{role}_dob` columns for
        both roles, plus `surface`, `tourney_level` and `days_elapsed_date`.
    first, second : str
        Column prefixes ("winner" / "loser") mapped to `p1_*` and `p2_*`.
    won : int
        Constant label stored in the new `won` column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: p1_rank, p1_hand, p1_dob, p2_rank, p2_hand, p2_dob,
        surface, tourney_level, days_elapsed_date, won. Missing values are
        replaced with -1.
    """
    attrs = ("rank", "hand", "dob")
    shared = ["surface", "tourney_level", "days_elapsed_date"]

    # Player-1 columns come first so both perspectives share one column order.
    ordered = [f"{first}_{attr}" for attr in attrs] + [f"{second}_{attr}" for attr in attrs]

    renames = {f"{first}_{attr}": f"p1_{attr}" for attr in attrs}
    renames.update({f"{second}_{attr}": f"p2_{attr}" for attr in attrs})

    return (
        frame[ordered + shared]
        .fillna(-1)
        .assign(won=won)
        .rename(columns=renames)
    )


# Every match is represented twice: once from the winner's perspective
# (won=1) and once from the loser's perspective (won=0).
winner_matches = _player_view(augmented_matches, "winner", "loser", won=1)
loser_matches = _player_view(augmented_matches, "loser", "winner", won=0)

# Stack the two perspectives. Each half keeps its own index (no ignore_index),
# so index labels repeat in the result.
condensed_matches = pd.concat([winner_matches, loser_matches])

In [None]:
# Declare the kind tag for every model input column. The mapping keeps the
# same key order as before: player 1, player 2, then match context.
match_interface = DataInterface(dict((
    # Player 1
    ("p1_rank", "numeric"),
    ("p1_hand", "categorical"),
    ("p1_dob", "time"),
    # Player 2
    ("p2_rank", "numeric"),
    ("p2_hand", "categorical"),
    ("p2_dob", "time"),
    # Match context
    ("surface", "categorical"),
    ("tourney_level", "categorical"),
    ("days_elapsed_date", "time"),
)))

# NOTE(review): presumably this fits the per-column encoders on the data;
# confirm against wn.data.DataInterface.complete.
match_interface.complete(condensed_matches)

In [None]:
# Persist the completed interface so it can be reloaded without recomputing it.
with open("data/match_interface.pkl", "wb") as interface_file:
    pickle.dump(match_interface, interface_file)

In [None]:
# Encoding is expensive, so the result is pickled further down.
# One entry per column declared in the interface, each produced by `tr`.
input_data = dict(
    (column, tr(condensed_matches[column], column, match_interface))
    for column in match_interface.type_map
)

# Labels: the `won` column as a float tensor with a trailing unit axis,
# i.e. shape (number of rows, 1).
y = torch.tensor(condensed_matches["won"].to_numpy(), dtype=torch.float32)[:, None]

In [None]:
# Persist the encoded inputs together with their labels as one (inputs, labels) tuple.
with open("data/tensor_list.pkl", "wb") as tensor_file:
    pickle.dump((input_data, y), tensor_file)