In [1]:
from wn.data import prepare_matches, DataInterface
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset

import pickle
import os

So the idea is to:

  1) Sample a random match
  2) Identify both players
  3) Retrieve their fixed tabular features
  4) Add the match time to the tabular features
  5) Retrieve the players' last *n* matches
  6) Predict for both players (separately)

In [2]:
# Load the cleaned `players` / `matches` tables, caching them on disk so a
# re-run of the notebook skips the CSV preparation below.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# "processed_data.pkl" that this notebook wrote itself.
if os.path.exists("processed_data.pkl"):

    with open("processed_data.pkl", "rb") as f:
        players, matches = pickle.load(f)

else:

    match_list = [f"../tennis_atp/atp_matches_{year}.csv" for year in range(1968, 2018)]
    matches = prepare_matches(match_list)

    players = pd.read_csv("../tennis_atp/atp_players.csv")

    # Add days elapsed from 1900
    matches["tourney_date"] = pd.to_datetime(matches.tourney_date.astype("str"))
    matches["days_elapsed_date"] = (matches.tourney_date - pd.to_datetime("19000101")).dt.days

    # Removing missing birthday players for now
    players["dob"] = pd.to_datetime(players.dob.astype("str"), errors="coerce")
    players = players[players.dob.notna()].reset_index(drop=True)
    players["days_elapsed_dob"] = (players.dob - pd.to_datetime("19000101")).dt.days

    # Latest match date per player, computed in one pass: stack the winner and
    # loser appearances, take the max date per player id, then look each player
    # up. Players with no recorded match get NaN (same as an empty-selection max).
    appearances = pd.concat(
        [
            matches[["winner_id", "days_elapsed_date"]].rename(columns={"winner_id": "player_id"}),
            matches[["loser_id", "days_elapsed_date"]].rename(columns={"loser_id": "player_id"}),
        ],
        ignore_index=True,
    )
    last_match_by_player = appearances.groupby("player_id")["days_elapsed_date"].max()
    players["last_match_date"] = players.player_id.map(last_match_by_player)

    # Remove matches with players with unknown birthdays
    matches = matches.loc[
        matches.winner_id.isin(players.player_id)
        & matches.loser_id.isin(players.player_id)
    ].reset_index(drop=True)

    with open("processed_data.pkl", "wb") as f:
        pickle.dump((players, matches), f)

In [3]:
# Declare which player columns are used and how each is typed:
# `hand` as a categorical feature and `days_elapsed_dob` as a time feature.
pi = DataInterface({"hand": "categorical", "days_elapsed_dob": "time"})
# NOTE(review): presumably `complete` derives the per-column mappings from the
# players table — confirm against wn.data.DataInterface.
pi.complete(players)

In [13]:
# Player hand mapping
# Encode every player's hand through the interface's "hand" lookup
# (element [1] of the type_map entry), treating a missing hand as "U".
torch.tensor([pi.type_map["hand"][1][x] for x in players.hand.fillna("U")], dtype=torch.int)



tensor([0, 0, 0,  ..., 2, 2, 2])

In [10]:
class MatchDataset(Dataset):
    """Dataset with one sample per (match, participant) pair.

    Samples are ordered with every match's winner first, followed by every
    match's loser, so ``len(dataset) == 2 * len(matches)``.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match table; must provide ``winner_id``, ``loser_id`` and
        ``days_elapsed_date`` columns.
    players : pandas.DataFrame
        Player table; must provide ``player_id`` plus every column named by
        ``t_interface``.
    t_interface, m_interface, s_interface
        Interface objects describing which columns to extract and how. Only
        ``t_interface`` is used at the moment; it must expose ``numeric()``,
        ``time()`` and ``categorical()``.
    history_days : int, default 365
        Length, in days, of the look-back window used to collect a player's
        previous matches.
    """

    def __init__(self, matches, players, t_interface, m_interface, s_interface, history_days=365):

        super().__init__()

        self.matches = matches
        self.players = players
        self.history_days = history_days

        # The idea here is to specify what data you want and in what form.
        self.t_interface = t_interface
        self.m_interface = m_interface
        self.s_interface = s_interface

        # Make an index: materialise the rows once and share each row tuple
        # between its winner entry and its loser entry.
        rows = list(self.matches.itertuples())
        self.index = (
            [(row, row.winner_id) for row in rows]
            + [(row, row.loser_id) for row in rows]
        )

    def __len__(self):
        """Return the number of (match, participant) samples."""
        return len(self.index)

    def _apply_interface(self, interface, dt):
        """Turn a single-row selection into ``[numeric, time, categorical]`` tensors.

        ``dt[col]`` must yield an object with ``.item()`` (for example a
        one-element Series). Each returned tensor has a leading dimension of 1.
        """

        # v[1] is indexed as a pair (v[1][0], v[1][1]) used to rescale the value.
        # NOTE(review): presumably (min, max) — confirm against DataInterface;
        # equal bounds would divide by zero.
        numeric_values = [
            (v[1][1] - dt[col].item()) / (v[1][1] - v[1][0])
            for col, v in interface.numeric().items()
        ]
        time_values = [dt[col].item() for col in interface.time()]
        # v[1] maps a raw category value to its integer code.
        # NOTE(review): a missing (NaN) value is looked up as-is, which only
        # matches a NaN key by object identity — confirm upstream handling.
        categorical_codes = [
            v[1][dt[col].item()] for col, v in interface.categorical().items()
        ]

        return [
            torch.tensor(numeric_values, dtype=torch.float).unsqueeze(0),
            torch.tensor(time_values, dtype=torch.float).unsqueeze(0),
            torch.tensor(categorical_codes, dtype=torch.int).unsqueeze(0),
        ]

    def __getitem__(self, idx):
        """Return the match row and the player's prior matches for sample ``idx``.

        The history contains the matches the player took part in during the
        ``history_days`` days before the match date (the match date itself is
        excluded).
        """

        match, player_id = self.index[idx]
        match_date = match.days_elapsed_date

        # TKTK better ordering on matches.
        involves_player = (
            self.matches.winner_id.eq(player_id) | self.matches.loser_id.eq(player_id)
        )
        in_window = (
            self.matches.days_elapsed_date.ge(match_date - self.history_days)
            & self.matches.days_elapsed_date.lt(match_date)
        )
        player_matches = self.matches.loc[involves_player & in_window]

        # Get the player features. The mask is built from self.players (not a
        # notebook-level global) so it always aligns with the frame it filters.
        pfs = self.players.loc[self.players.player_id.eq(player_id)]
        # Computed but not returned yet (see the commented-out key below).
        p_features = self._apply_interface(self.t_interface, pfs)

        # Get the match features
        # m_features = self._apply_interface(self.m_interface, match)

        return {
            "match_features": match,
            "match_history": player_matches,
            # "fixed_features": player_fixed_features,
        }

In [15]:
# List the columns available on the matches table.
matches.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'days_elapsed_date'],
      dtype='object')

In [None]:
# Player interface: `hand` is categorical, `days_elapsed_dob` is a time feature.
# This re-creates `pi` without calling `complete`, which the earlier cell did.
pi = DataInterface({"hand": "categorical", "days_elapsed_dob": "time"})
# Match interface: three categorical columns plus the match date as a time feature.
mi = DataInterface(
    {
        "surface": "categorical",
        "tourney_level": "categorical",
        "round": "categorical",
        "days_elapsed_date": "time"
    }
)

In [11]:
# Build the dataset with only the player interface; the other two interface
# slots (m_interface, s_interface) are left as None.
ds = MatchDataset(matches, players, pi, None, None)

In [12]:
# Fetch a single sample to inspect what __getitem__ returns.
ds[20_001]

{'match_features': Pandas(Index=20001, tourney_id='1974-411', tourney_name='Chicago', surface='Carpet', draw_size=32, tourney_level='A', tourney_date=Timestamp('1974-07-22 00:00:00'), match_num=18, winner_id=100261, winner_seed=nan, winner_entry=nan, winner_name='Brian Gottfried', winner_hand='R', winner_ht=183.0, winner_ioc='USA', winner_age=22.4, loser_id=100205, loser_seed=nan, loser_entry=nan, loser_name='Paul Gerken', loser_hand='R', loser_ht=185.0, loser_ioc='USA', loser_age=24.3, score='6-4 6-3', best_of=3, round='R16', minutes=nan, w_ace=nan, w_df=nan, w_svpt=nan, w_1stIn=nan, w_1stWon=nan, w_2ndWon=nan, w_SvGms=nan, w_bpSaved=nan, w_bpFaced=nan, l_ace=nan, l_df=nan, l_svpt=nan, l_1stIn=nan, l_1stWon=nan, l_2ndWon=nan, l_SvGms=nan, l_bpSaved=nan, l_bpFaced=nan, winner_rank=31.0, winner_rank_points=0.0, loser_rank=52.0, loser_rank_points=0.0, days_elapsed_date=27230),
 'match_history':       tourney_id   tourney_name surface  draw_size tourney_level tourney_date  \
 16268   1973

In [6]:
# Same call as in the earlier `pi` setup cell, repeated here.
# NOTE(review): presumably needed because `pi` was re-created in between — confirm and keep only one.
pi.complete(players)

In [16]:
# Show the interface's size per feature type (numeric / time / categorical).
pi.type_sizes()

{'numeric': 0, 'time': 1, 'categorical': 1}

In [18]:
# Inspect the numeric entries of the player interface (none are declared for `pi`).
pi.numeric()

{}

In [20]:
# Inspect the categorical entries of the player interface and their value-to-code lookup.
pi.categorical()

{'hand': ('categorical', {'R': 0, 'L': 1, 'U': 2, 'A': 3, nan: 4})}