In [None]:
import pandas as pd
import numpy as np
from io import StringIO
import requests
import warnings

# Notebook-wide display settings: lift pandas' truncation limits so wide
# frames are rendered in full (None means "no limit" for each option).
for _display_option in ("max_columns", "max_rows", "max_colwidth"):
    pd.set_option(f"display.{_display_option}", None)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

## ATP Matches File

In [6]:
# Function to list files in a GitHub repository
def list_github_files(repo_name: str) -> list | None:
    url = f"https://api.github.com/repos/{repo_name}/contents/"
    response = requests.get(url)
    if response.status_code == 200:
        files = response.json()
        file_names = [file["name"] for file in files]
        return file_names
    else:
        print(f"Failed to retrieve files: {response.status_code}")


def read_csv_from_github(
    repo_name: str,
    file_name: str,
    branch: str = "master",
    verify: bool = True,
    timeout: float = 60.0,
) -> pd.DataFrame:
    """
    Reads a CSV file from a GitHub repository into a Pandas DataFrame.

    The file is fetched from raw.githubusercontent.com, decoded as UTF-8 and
    parsed with ``pd.read_csv``.

    Parameters:
        repo_name (str): The name of the GitHub repository (e.g., "owner/repo").
        file_name (str): The name of the CSV file.
        branch (str): Branch to read from. Defaults to "master".
        verify (bool): Whether to verify the server's TLS certificate.
            Defaults to True; pass False only when certificate checking
            genuinely cannot work in your environment, since it exposes the
            download to tampering.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        pd.DataFrame: A Pandas DataFrame containing the data from the CSV file.
    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    file_url = f"https://raw.githubusercontent.com/{repo_name}/{branch}/{file_name}"
    response = requests.get(file_url, verify=verify, timeout=timeout)

    # Without this check an error page (e.g. the body "404: Not Found") would
    # be parsed as if it were CSV and come back as a bogus DataFrame.
    response.raise_for_status()

    csv_content = response.content.decode("utf-8")
    return pd.read_csv(StringIO(csv_content))

In [7]:
# repo_name = "JeffSackmann/tennis_atp"
# file_names = list_github_files(repo_name)

# # Filter files to only include CSV files
# file_names = [file for file in file_names if file.endswith(".csv")]

# # Filter files to only include those that follow the "atp_matches_YYYY" pattern
# # (needs `import re`; the imports cell at the top does not import it)
# atp_matches = [file for file in file_names if re.match(r"atp_matches_\d{4}\.csv", file)]

# # Read and aggregate ATP matches data
# atp_matches_dfs = [read_csv_from_github(repo_name, file) for file in atp_matches]
# atp_matches_df = pd.concat(atp_matches_dfs, ignore_index=True)

# # Display df info for ATP matches
# print(
#     "ATP Matches DataFrame Info:",
#     "\nShape:",
#     atp_matches_df.shape,
#     "\nColumns:",
#     atp_matches_df.columns,
# )

In [8]:
# # Convert the 'tourney_date' column to datetime format
# atp_matches_df["tourney_date"] = pd.to_datetime(
#     atp_matches_df["tourney_date"], format="%Y%m%d"
# )
# # Create tourney_year column
# atp_matches_df["tourney_year"] = atp_matches_df["tourney_date"].dt.year
# # Filter for matches after selected year
# selected_year = 1991
# atp_matches_df = atp_matches_df[
#     atp_matches_df["tourney_year"] >= selected_year
# ].reset_index(drop=True)
# # Filter for tourney levels in ['G', 'F', 'M', 'A']
# atp_matches_df = atp_matches_df[
#     atp_matches_df["tourney_level"].isin(["G", "F", "M", "A"])
# ].reset_index(drop=True)
# # Exclude Laver Cup matches
# atp_matches_df = atp_matches_df[
#     ~atp_matches_df["tourney_name"].str.contains("Laver Cup")
# ].reset_index(drop=True)

# # Count the number of NaN values for each column in the ATP matches DataFrame
# nan_counts_matches = atp_matches_df.isna().sum() / atp_matches_df.shape[0] * 100
# print("Dataframe shape: ", atp_matches_df.shape)
# print("NaN % in ATP matches DataFrame from {}:".format(selected_year))
# print(nan_counts_matches)

In [9]:
# atp_matches_df.head()

In [10]:
# # Save the file in the data folder as pickle
# atp_matches_df.to_pickle("../data/matches_results.pkl")

In [11]:
# Load the cached matches frame from disk rather than re-downloading it; this
# is the same path the commented-out save cell above writes to.
atp_matches_df = pd.read_pickle("../data/matches_results.pkl")
# Print the (rows, columns) shape as a quick sanity check on the load.
print("Dataframe shape: ", atp_matches_df.shape)

Dataframe shape:  (98355, 50)


### Save atp_matches file
TODO:
1. Save the file.
2. Create a loader that checks for the latest files and updates the saved data when new files appear.

## Player Stats Historical

In [12]:
# cols = [
#     "tourney_id",
#     "tourney_name",
#     "surface",
#     "draw_size",
#     "tourney_level",
#     "tourney_date",
#     "tourney_year",
#     "match_num",  # tournament info
#     "winner_id",
#     "winner_seed",
#     "winner_entry",
#     "winner_name",
#     "winner_hand",
#     "winner_ht",
#     "winner_ioc",
#     "winner_age",
#     "winner_rank",
#     "winner_rank_points",  # winner info
#     "w_ace",
#     "w_df",
#     "w_svpt",
#     "w_1stIn",
#     "w_1stWon",
#     "w_2ndWon",
#     "w_SvGms",
#     "w_bpSaved",
#     "w_bpFaced",  # winner stats
#     "loser_id",
#     "loser_seed",
#     "loser_entry",
#     "loser_name",
#     "loser_hand",
#     "loser_ht",
#     "loser_ioc",
#     "loser_age",
#     "loser_rank",
#     "loser_rank_points",  # loser info
#     "l_ace",
#     "l_df",
#     "l_svpt",
#     "l_1stIn",
#     "l_1stWon",
#     "l_2ndWon",
#     "l_SvGms",
#     "l_bpSaved",
#     "l_bpFaced",
#     "score",
#     "best_of",
#     "round",
#     "minutes",  # match info
# ]

In [13]:
# player_stats_hist_df = atp_matches_df[cols].copy()
# player_stats_hist_df_l = atp_matches_df[cols].copy()

# # add a column to flag winner and loser
# player_stats_hist_df["results"] = 1
# player_stats_hist_df_l["results"] = 0
# player_stats_hist_df.head()

In [14]:
# # Rename columns to remove "winner_" prefix and "w_" prefix only if they start the column name
# player_stats_hist_df.columns = [
#     col.replace("winner_", "player_", 1)
#     if col.startswith("winner_")
#     else col.replace("w_", "p_", 1)
#     if col.startswith("w_")
#     else col.replace("loser_", "opponent_", 1)
#     if col.startswith("loser_")
#     else col.replace("l_", "o_", 1)
#     if col.startswith("l_")
#     else col
#     for col in player_stats_hist_df.columns
# ]

# player_stats_hist_df_l.columns = [
#     col.replace("loser_", "player_", 1)
#     if col.startswith("loser_")
#     else col.replace("l_", "p_", 1)
#     if col.startswith("l_")
#     else col.replace("winner_", "opponent_", 1)
#     if col.startswith("winner_")
#     else col.replace("w_", "o_", 1)
#     if col.startswith("w_")
#     else col
#     for col in player_stats_hist_df_l.columns
# ]

In [15]:
# # Append the loser stats to the winner stats
# player_stats_hist_df = pd.concat(
#     [player_stats_hist_df, player_stats_hist_df_l], ignore_index=True
# )
# player_stats_hist_df.head()

In [16]:
# player_stats_hist_df.shape

In [17]:
# # First ensure the DataFrame is properly sorted by the specified order
# player_stats_hist_df = player_stats_hist_df.sort_values(
#     by=["player_id", "tourney_date", "tourney_id", "match_num"]
# ).reset_index(drop=True)

# player_stats_hist_df.head()

In [18]:
# from typing import Union, List
# import pandas as pd


# def add_rolling_stats(
#     df: pd.DataFrame,
#     stats_columns: Union[str, List[str]],
#     agg_type: str = "mean",
#     window: int = 5,
#     min_periods: int = 1,
#     shift_periods: int = 1,
#     group_col: str = "player_id",
#     sort_cols: List[str] = ["player_id", "tourney_date", "tourney_id", "match_num"],
# ) -> pd.DataFrame:
#     """
#     Add rolling statistics to a DataFrame for each player.

#     Parameters:
#     -----------
#     df : pd.DataFrame
#         The input DataFrame
#     group_col : str
#         Column name identifying the player; rolling windows are computed per group
#     sort_cols : List[str]
#         Columns to sort by to ensure proper chronological order
#     stats_columns : Union[str, List[str], None]
#         Column(s) to calculate rolling statistics for
#     agg_type : str
#         Type of aggregation ('mean', 'sum', 'std', 'min', 'max', 'median')
#     window : int
#         Number of periods for the rolling window
#     min_periods : int
#         Minimum number of observations required to have a value
#     shift_periods : int
#         Number of periods to shift (1 = exclude current match, 0 = include current match)

#     Returns:
#     --------
#     pd.DataFrame
#         DataFrame with added rolling statistics columns
#     """

#     # Make a copy to avoid modifying the original DataFrame
#     result_df = df.copy()

#     # Ensure proper sorting
#     result_df = result_df.sort_values(by=sort_cols).reset_index(drop=True)

#     # Convert single column to list
#     if isinstance(stats_columns, str):
#         stats_columns = [stats_columns]

#     # Calculate rolling statistics for each column
#     for col in stats_columns:
#         if col in result_df.columns:
#             # Create column name for the new statistic
#             new_col_name = f"{col}_{agg_type}_last{window}"

#             # Calculate rolling statistic
#             rolling_obj = result_df.groupby(group_col)[col].rolling(
#                 window=window, min_periods=min_periods
#             )

#             # Apply the specified aggregation
#             if agg_type == "mean":
#                 rolling_stat = rolling_obj.mean()
#             elif agg_type == "sum":
#                 rolling_stat = rolling_obj.sum()
#             elif agg_type == "std":
#                 rolling_stat = rolling_obj.std()
#             elif agg_type == "min":
#                 rolling_stat = rolling_obj.min()
#             elif agg_type == "max":
#                 rolling_stat = rolling_obj.max()
#             elif agg_type == "median":
#                 rolling_stat = rolling_obj.median()
#             else:
#                 raise ValueError(f"Unsupported aggregation type: {agg_type}")

#             # Apply shift and add to DataFrame
#             result_df[new_col_name] = rolling_stat.shift(shift_periods).reset_index(
#                 level=0, drop=True
#             )

#             # Set first record for each player to NaN
#             first_records = result_df.groupby(group_col).head(1).index
#             result_df.loc[first_records, new_col_name] = pd.NA
#         else:
#             print(f"Warning: Column '{col}' not found in DataFrame")

#     return result_df

In [19]:
# To Test the function:

# # Add rolling averages for multiple stats
# stats_columns = ["p_ace", "p_df", "p_svpt", "p_1stIn", "p_1stWon", "p_2ndWon", "results"]
# player_stats_hist_df = add_rolling_stats(
#     player_stats_hist_df, stats_columns=stats_columns, agg_type="mean", window=5
# )
#
# # Add rolling minimum of serve points over the last 2 matches
# player_stats_hist_df = add_rolling_stats(
#     player_stats_hist_df, stats_columns="p_svpt", agg_type="min", window=2
# )
#
# # Add rolling sum of results (wins) over the last 3 matches
# player_stats_hist_df = add_rolling_stats(
#     player_stats_hist_df, stats_columns="results", agg_type="sum", window=3
# )
#
# # Verify the results
# player_stats_hist_df[
#     [
#         "player_id",
#         "tourney_date",
#         "tourney_id",
#         "match_num",
#         "p_df",
#         "p_df_mean_last5",
#         "p_svpt",
#         "p_svpt_min_last2",
#         "results",
#         "results_sum_last3"
#     ]
# ].head(10)

In [20]:
# cols_to_mean = [
#     "p_ace",
#     "p_df",
#     "p_svpt",
#     "p_1stIn",
#     "p_1stWon",
#     "p_2ndWon",
#     "p_SvGms",
#     "p_bpSaved",
#     "p_bpFaced",  # player stats
#     "opponent_rank_points",  # opponent info
#     "o_ace",
#     "o_df",
#     "o_svpt",
#     "o_1stIn",
#     "o_1stWon",
#     "o_2ndWon",
#     "o_SvGms",
#     "o_bpSaved",
#     "o_bpFaced",  # opponent stats
#     "minutes",  # match info
# ]

# cols_to_sum = [
#     "minutes",  # match info
#     "results",  # match wins
# ]

In [21]:
# player_stats_hist_df = add_rolling_stats(
#     player_stats_hist_df, stats_columns=cols_to_mean, agg_type="mean", window=5
# )

# player_stats_hist_df = add_rolling_stats(
#     player_stats_hist_df, stats_columns=cols_to_sum, agg_type="sum", window=5
# )

In [22]:
# player_stats_hist_df.shape

In [23]:
# player_stats_hist_df.head(10)

In [24]:
# # Save player_stats_hist_df info DataFrame as pickle
# player_stats_hist_df.to_pickle("../data/player_stats_hist.pkl")

In [25]:
# Read the per-player historical stats from the pickle file; this is the same
# path the commented-out save cell above writes to.
player_stats_hist_df = pd.read_pickle("../data/player_stats_hist.pkl")
# Print the (rows, columns) shape, then preview the first rows.
print("Dataframe shape: ", player_stats_hist_df.shape)
player_stats_hist_df.head()

Dataframe shape:  (196710, 73)


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,tourney_year,match_num,player_id,player_seed,player_entry,player_name,player_hand,player_ht,player_ioc,player_age,player_rank,player_rank_points,p_ace,p_df,p_svpt,p_1stIn,p_1stWon,p_2ndWon,p_SvGms,p_bpSaved,p_bpFaced,opponent_id,opponent_seed,opponent_entry,opponent_name,opponent_hand,opponent_ht,opponent_ioc,opponent_age,opponent_rank,opponent_rank_points,o_ace,o_df,o_svpt,o_1stIn,o_1stWon,o_2ndWon,o_SvGms,o_bpSaved,o_bpFaced,score,best_of,round,minutes,results,p_ace_mean_last5,p_df_mean_last5,p_svpt_mean_last5,p_1stIn_mean_last5,p_1stWon_mean_last5,p_2ndWon_mean_last5,p_SvGms_mean_last5,p_bpSaved_mean_last5,p_bpFaced_mean_last5,opponent_rank_points_mean_last5,o_ace_mean_last5,o_df_mean_last5,o_svpt_mean_last5,o_1stIn_mean_last5,o_1stWon_mean_last5,o_2ndWon_mean_last5,o_SvGms_mean_last5,o_bpSaved_mean_last5,o_bpFaced_mean_last5,minutes_mean_last5,minutes_sum_last5,results_sum_last5
0,1992-409,Atlanta,Clay,32.0,A,1992-04-27,1992,12,100282,,WC,Guillermo Vilas,L,180.0,ARG,39.6,444.0,38.0,0.0,2.0,111.0,94.0,55.0,9.0,17.0,2.0,8.0,101334,3.0,,Alexander Volkov,L,188.0,RUS,25.1,18.0,1247.0,2.0,3.0,126.0,71.0,51.0,20.0,16.0,10.0,15.0,5-7 7-6(6) 6-3,3,R32,167.0,0,,,,,,,,,,,,,,,,,,,,,,
1,1992-323,Bordeaux,Clay,32.0,A,1992-09-14,1992,7,100282,,WC,Guillermo Vilas,L,180.0,ARG,40.0,410.0,47.0,0.0,3.0,81.0,41.0,26.0,20.0,13.0,5.0,9.0,102001,,,German Lopez,R,193.0,ESP,20.7,99.0,425.0,11.0,2.0,79.0,46.0,35.0,17.0,12.0,5.0,6.0,6-4 3-6 6-0,3,R32,109.0,0,0.0,2.0,111.0,94.0,55.0,9.0,17.0,2.0,8.0,1247.0,2.0,3.0,126.0,71.0,51.0,20.0,16.0,10.0,15.0,167.0,167.0,0.0
2,1991-411,Chicago,Carpet,32.0,A,1991-02-25,1991,10,100284,,WC,Jimmy Connors,L,178.0,USA,38.4,990.0,2.0,0.0,1.0,41.0,33.0,13.0,3.0,7.0,2.0,7.0,101409,,,Jaime Yzaga,R,170.0,PER,23.3,65.0,502.0,0.0,2.0,49.0,33.0,24.0,7.0,8.0,1.0,2.0,6-3 6-0,3,R32,58.0,0,,,,,,,,,,,,,,,,,,,,,,
3,1991-403,Miami Masters,Hard,96.0,M,1991-03-15,1991,15,100284,,WC,Jimmy Connors,L,178.0,USA,38.5,961.0,3.0,0.0,2.0,69.0,47.0,27.0,10.0,10.0,5.0,9.0,101274,,,Udo Riglewski,R,185.0,GER,24.6,98.0,364.0,3.0,4.0,70.0,33.0,21.0,13.0,10.0,5.0,11.0,6-4 6-4,3,R128,107.0,1,0.0,1.0,41.0,33.0,13.0,3.0,7.0,2.0,7.0,502.0,0.0,2.0,49.0,33.0,24.0,7.0,8.0,1.0,2.0,58.0,58.0,0.0
4,1991-403,Miami Masters,Hard,96.0,M,1991-03-15,1991,47,100284,,WC,Jimmy Connors,L,178.0,USA,38.5,961.0,3.0,0.0,3.0,50.0,33.0,15.0,8.0,9.0,2.0,7.0,101750,28.0,,Cristiano Caratti,R,178.0,ITA,20.7,37.0,767.0,3.0,3.0,66.0,43.0,25.0,12.0,10.0,3.0,6.0,6-4 6-3,3,R64,80.0,0,0.0,1.5,55.0,40.0,20.0,6.5,8.5,3.5,8.0,433.0,1.5,3.0,59.5,33.0,22.5,10.0,9.0,3.0,6.5,82.5,165.0,1.0


### Todo on player stats
todo:
1. surface stats
2. elo
3. other interesting stats features?
4. also create a most-recent-stats file per player (to be used for inference!)

## Tournaments File

In [26]:
# tournament_info_df = atp_matches_df[
#     [
#         "tourney_id",
#         "tourney_name",
#         "surface",
#         "draw_size",
#         "tourney_level",
#         "tourney_date",
#         "tourney_year",
#     ]
# ].copy()
# tournament_info_df = tournament_info_df.drop_duplicates().reset_index(drop=True)
# tournament_info_df.shape

In [27]:
# tournament_info_df.head()

In [28]:
# assert tournament_info_df.tourney_id.nunique() == tournament_info_df.shape[0], (
#     "tourney_id should be unique in tournament_info_df"
# )

In [29]:
# # write the tournament info to a pickle file
# tournament_info_df.to_pickle("../data/tournament_info.pkl")

In [30]:
# Read the tournament info from the pickle file; this is the same path the
# commented-out save cell above writes to.
tournament_info_df = pd.read_pickle("../data/tournament_info.pkl")
# Print the (rows, columns) shape, then preview the first rows.
print("Dataframe shape: ", tournament_info_df.shape)
tournament_info_df.head()

Dataframe shape:  (2396, 7)


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,tourney_year
0,1991-301,Auckland,Hard,32.0,A,1991-01-07,1991
1,1991-338,Sydney Outdoor,Hard,32.0,A,1991-01-07,1991
2,1991-580,Australian Open,Hard,128.0,G,1991-01-14,1991
3,1991-201,Guaruja,Hard,32.0,A,1991-02-04,1991
4,1991-408,Milan,Carpet,32.0,A,1991-02-04,1991


## ML Dataset (Training, Val and Test)

In [32]:
# Sanity check: (rows, columns) of the matches frame before the ML dataset is built from it.
atp_matches_df.shape

(98355, 50)

In [None]:
# Randomly place the winner in the player_1 or player_2 slot for every match,
# so the label cannot be read off the column order.
# The generator is seeded so Restart & Run All reproduces the same dataset.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Keep the [winner_id, loser_id] pair of each row for inspection.
atp_matches_df["player_ids"] = pd.Series(
    atp_matches_df[["winner_id", "loser_id"]].to_numpy().tolist(),
    index=atp_matches_df.index,
)

# One draw per row: 0 -> winner goes to player_1, 1 -> loser goes to player_1.
# The draws are applied positionally, so this works for any index, not only a
# default RangeIndex.
rand_choice = rng.integers(0, 2, size=len(atp_matches_df))
loser_first = rand_choice.astype(bool)

atp_matches_df["player_1"] = np.where(
    loser_first, atp_matches_df["loser_id"], atp_matches_df["winner_id"]
)
atp_matches_df["player_2"] = np.where(
    loser_first, atp_matches_df["winner_id"], atp_matches_df["loser_id"]
)

# Label "winner": 0 if player_1 is the match winner, else 1.
atp_matches_df["winner"] = (
    atp_matches_df["player_1"] != atp_matches_df["winner_id"]
).astype(int)

atp_matches_df[["player_ids", "player_1", "player_2", "winner"]].head()

Unnamed: 0,player_ids,player_1,player_2,winner
0,"[101142, 101746]",101746,101142,1
1,"[101613, 100587]",101613,100587,0
2,"[101179, 101601]",101179,101601,0
3,"[101117, 101332]",101117,101332,0
4,"[101901, 101735]",101735,101901,1


In [45]:
# List the columns now present on the matches frame.
# NOTE(review): the saved output of this cell includes `col_1` and `col_2`,
# which no visible cell creates — presumably left over from a deleted cell.
# Confirm with Restart Kernel -> Run All.
atp_matches_df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_year', 'player_ids', 'col_1', 'col_2', 'player_1', 'player_2',
       'winner'],
      dtype='object')

In [None]:
# Start the ML dataset from the match keys, the two player slots and the label.
ml_key_columns = [
    "tourney_id",
    "tourney_date",
    "match_num",
    "player_1",
    "player_2",
    "winner",
]
ml_dataset = atp_matches_df.loc[:, ml_key_columns].copy()


In [51]:
# Preview the key and label columns before any stats are joined on.
ml_dataset.head()

Unnamed: 0,tourney_id,tourney_date,match_num,player_1,player_2,winner
0,1991-301,1991-01-07,1,101746,101142,1
1,1991-301,1991-01-07,2,101613,100587,0
2,1991-301,1991-01-07,3,101179,101601,0
3,1991-301,1991-01-07,4,101117,101332,0
4,1991-301,1991-01-07,5,101735,101901,1


In [52]:
# Preview the per-player stats rows that the next cell joins onto ml_dataset.
player_stats_hist_df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,tourney_year,match_num,player_id,player_seed,player_entry,player_name,player_hand,player_ht,player_ioc,player_age,player_rank,player_rank_points,p_ace,p_df,p_svpt,p_1stIn,p_1stWon,p_2ndWon,p_SvGms,p_bpSaved,p_bpFaced,opponent_id,opponent_seed,opponent_entry,opponent_name,opponent_hand,opponent_ht,opponent_ioc,opponent_age,opponent_rank,opponent_rank_points,o_ace,o_df,o_svpt,o_1stIn,o_1stWon,o_2ndWon,o_SvGms,o_bpSaved,o_bpFaced,score,best_of,round,minutes,results,p_ace_mean_last5,p_df_mean_last5,p_svpt_mean_last5,p_1stIn_mean_last5,p_1stWon_mean_last5,p_2ndWon_mean_last5,p_SvGms_mean_last5,p_bpSaved_mean_last5,p_bpFaced_mean_last5,opponent_rank_points_mean_last5,o_ace_mean_last5,o_df_mean_last5,o_svpt_mean_last5,o_1stIn_mean_last5,o_1stWon_mean_last5,o_2ndWon_mean_last5,o_SvGms_mean_last5,o_bpSaved_mean_last5,o_bpFaced_mean_last5,minutes_mean_last5,minutes_sum_last5,results_sum_last5
0,1992-409,Atlanta,Clay,32.0,A,1992-04-27,1992,12,100282,,WC,Guillermo Vilas,L,180.0,ARG,39.6,444.0,38.0,0.0,2.0,111.0,94.0,55.0,9.0,17.0,2.0,8.0,101334,3.0,,Alexander Volkov,L,188.0,RUS,25.1,18.0,1247.0,2.0,3.0,126.0,71.0,51.0,20.0,16.0,10.0,15.0,5-7 7-6(6) 6-3,3,R32,167.0,0,,,,,,,,,,,,,,,,,,,,,,
1,1992-323,Bordeaux,Clay,32.0,A,1992-09-14,1992,7,100282,,WC,Guillermo Vilas,L,180.0,ARG,40.0,410.0,47.0,0.0,3.0,81.0,41.0,26.0,20.0,13.0,5.0,9.0,102001,,,German Lopez,R,193.0,ESP,20.7,99.0,425.0,11.0,2.0,79.0,46.0,35.0,17.0,12.0,5.0,6.0,6-4 3-6 6-0,3,R32,109.0,0,0.0,2.0,111.0,94.0,55.0,9.0,17.0,2.0,8.0,1247.0,2.0,3.0,126.0,71.0,51.0,20.0,16.0,10.0,15.0,167.0,167.0,0.0
2,1991-411,Chicago,Carpet,32.0,A,1991-02-25,1991,10,100284,,WC,Jimmy Connors,L,178.0,USA,38.4,990.0,2.0,0.0,1.0,41.0,33.0,13.0,3.0,7.0,2.0,7.0,101409,,,Jaime Yzaga,R,170.0,PER,23.3,65.0,502.0,0.0,2.0,49.0,33.0,24.0,7.0,8.0,1.0,2.0,6-3 6-0,3,R32,58.0,0,,,,,,,,,,,,,,,,,,,,,,
3,1991-403,Miami Masters,Hard,96.0,M,1991-03-15,1991,15,100284,,WC,Jimmy Connors,L,178.0,USA,38.5,961.0,3.0,0.0,2.0,69.0,47.0,27.0,10.0,10.0,5.0,9.0,101274,,,Udo Riglewski,R,185.0,GER,24.6,98.0,364.0,3.0,4.0,70.0,33.0,21.0,13.0,10.0,5.0,11.0,6-4 6-4,3,R128,107.0,1,0.0,1.0,41.0,33.0,13.0,3.0,7.0,2.0,7.0,502.0,0.0,2.0,49.0,33.0,24.0,7.0,8.0,1.0,2.0,58.0,58.0,0.0
4,1991-403,Miami Masters,Hard,96.0,M,1991-03-15,1991,47,100284,,WC,Jimmy Connors,L,178.0,USA,38.5,961.0,3.0,0.0,3.0,50.0,33.0,15.0,8.0,9.0,2.0,7.0,101750,28.0,,Cristiano Caratti,R,178.0,ITA,20.7,37.0,767.0,3.0,3.0,66.0,43.0,25.0,12.0,10.0,3.0,6.0,6-4 6-3,3,R64,80.0,0,0.0,1.5,55.0,40.0,20.0,6.5,8.5,3.5,8.0,433.0,1.5,3.0,59.5,33.0,22.5,10.0,9.0,3.0,6.5,82.5,165.0,1.0


In [None]:
# Join the stats row of each player for this exact match, then tournament info.
# Every merge is validated as many-to-one: if the right-hand keys are not
# unique, pandas raises MergeError instead of silently multiplying match rows.

# Left join with player_stats_hist_df for player_1.
# NOTE: the "_p1" suffix is never applied here, because at this point only the
# join keys overlap; player_1's columns therefore keep their bare names.
ml_dataset = ml_dataset.merge(
    player_stats_hist_df,
    how="left",
    left_on=["player_1", "tourney_id", "tourney_date", "match_num"],
    right_on=["player_id", "tourney_id", "tourney_date", "match_num"],
    suffixes=("", "_p1"),
    validate="m:1",
)

# Add player_2's stats as well; every non-key column now collides with
# player_1's and receives the "_p2" suffix.
ml_dataset = ml_dataset.merge(
    player_stats_hist_df,
    how="left",
    left_on=["player_2", "tourney_id", "tourney_date", "match_num"],
    right_on=["player_id", "tourney_id", "tourney_date", "match_num"],
    suffixes=("", "_p2"),
    validate="m:1",
)

# Add tournament info; columns already present get the "_t" suffix.
ml_dataset = ml_dataset.merge(
    tournament_info_df,
    how="left",
    left_on="tourney_id",
    right_on="tourney_id",
    suffixes=("", "_t"),
    validate="m:1",
)

# clean columns: TODO
# NOTE(review): the joined rows carry same-match columns (e.g. `results`,
# `score`, `minutes`, `p_*`/`o_*`) for both players. These describe the match
# being predicted and reveal the label, so drop them before training;
# presumably only the `*_last5` rolling columns are meant as features — confirm.

ml_dataset.head()

Unnamed: 0,tourney_id,tourney_date,match_num,player_1,player_2,winner,tourney_name,surface,draw_size,tourney_level,tourney_year,player_id,player_seed,player_entry,player_name,player_hand,player_ht,player_ioc,player_age,player_rank,player_rank_points,p_ace,p_df,p_svpt,p_1stIn,p_1stWon,p_2ndWon,p_SvGms,p_bpSaved,p_bpFaced,opponent_id,opponent_seed,opponent_entry,opponent_name,opponent_hand,opponent_ht,opponent_ioc,opponent_age,opponent_rank,opponent_rank_points,o_ace,o_df,o_svpt,o_1stIn,o_1stWon,o_2ndWon,o_SvGms,o_bpSaved,o_bpFaced,score,best_of,round,minutes,results,p_ace_mean_last5,p_df_mean_last5,p_svpt_mean_last5,p_1stIn_mean_last5,p_1stWon_mean_last5,p_2ndWon_mean_last5,p_SvGms_mean_last5,p_bpSaved_mean_last5,p_bpFaced_mean_last5,opponent_rank_points_mean_last5,o_ace_mean_last5,o_df_mean_last5,o_svpt_mean_last5,o_1stIn_mean_last5,o_1stWon_mean_last5,o_2ndWon_mean_last5,o_SvGms_mean_last5,o_bpSaved_mean_last5,o_bpFaced_mean_last5,minutes_mean_last5,minutes_sum_last5,results_sum_last5,tourney_name_p2,surface_p2,draw_size_p2,tourney_level_p2,tourney_year_p2,player_id_p2,player_seed_p2,player_entry_p2,player_name_p2,player_hand_p2,player_ht_p2,player_ioc_p2,player_age_p2,player_rank_p2,player_rank_points_p2,p_ace_p2,p_df_p2,p_svpt_p2,p_1stIn_p2,p_1stWon_p2,p_2ndWon_p2,p_SvGms_p2,p_bpSaved_p2,p_bpFaced_p2,opponent_id_p2,opponent_seed_p2,opponent_entry_p2,opponent_name_p2,opponent_hand_p2,opponent_ht_p2,opponent_ioc_p2,opponent_age_p2,opponent_rank_p2,opponent_rank_points_p2,o_ace_p2,o_df_p2,o_svpt_p2,o_1stIn_p2,o_1stWon_p2,o_2ndWon_p2,o_SvGms_p2,o_bpSaved_p2,o_bpFaced_p2,score_p2,best_of_p2,round_p2,minutes_p2,results_p2,p_ace_mean_last5_p2,p_df_mean_last5_p2,p_svpt_mean_last5_p2,p_1stIn_mean_last5_p2,p_1stWon_mean_last5_p2,p_2ndWon_mean_last5_p2,p_SvGms_mean_last5_p2,p_bpSaved_mean_last5_p2,p_bpFaced_mean_last5_p2,opponent_rank_points_mean_last5_p2,o_ace_mean_last5_p2,o_df_mean_last5_p2,o_svpt_mean_last5_p2,o_1stIn_mean_last5_p2,o_1stWon_mean_last5_p2,o_2ndWon_mean_l
ast5_p2,o_SvGms_mean_last5_p2,o_bpSaved_mean_last5_p2,o_bpFaced_mean_last5_p2,minutes_mean_last5_p2,minutes_sum_last5_p2,results_sum_last5_p2,tourney_name_t,surface_t,draw_size_t,tourney_level_t,tourney_date_t,tourney_year_t
0,1991-301,1991-01-07,1,101746,101142,1,Auckland,Hard,32.0,A,1991,101746,,,Renzo Furlan,R,175.0,ITA,20.6,78.0,459.0,3.0,0.0,46.0,30.0,17.0,7.0,8.0,2.0,6.0,101142,1.0,,Emilio Sanchez,R,180.0,ESP,25.6,9.0,1487.0,1.0,0.0,53.0,37.0,30.0,7.0,9.0,5.0,6.0,6-4 6-1,3,R32,63.0,0,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991,101142,1.0,,Emilio Sanchez,R,180.0,ESP,25.6,9.0,1487.0,1.0,0.0,53.0,37.0,30.0,7.0,9.0,5.0,6.0,101746,,,Renzo Furlan,R,175.0,ITA,20.6,78.0,459.0,3.0,0.0,46.0,30.0,17.0,7.0,8.0,2.0,6.0,6-4 6-1,3,R32,63.0,1,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991-01-07,1991
1,1991-301,1991-01-07,2,101613,100587,0,Auckland,Hard,32.0,A,1991,101613,,Q,Malivai Washington,R,180.0,USA,21.5,94.0,371.0,5.0,1.0,56.0,25.0,17.0,20.0,9.0,1.0,2.0,100587,,WC,Steve Guy,R,188.0,NZL,31.8,220.0,114.0,4.0,7.0,56.0,30.0,22.0,6.0,8.0,7.0,11.0,6-3 6-2,3,R32,72.0,1,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991,100587,,WC,Steve Guy,R,188.0,NZL,31.8,220.0,114.0,4.0,7.0,56.0,30.0,22.0,6.0,8.0,7.0,11.0,101613,,Q,Malivai Washington,R,180.0,USA,21.5,94.0,371.0,5.0,1.0,56.0,25.0,17.0,20.0,9.0,1.0,2.0,6-3 6-2,3,R32,72.0,0,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991-01-07,1991
2,1991-301,1991-01-07,3,101179,101601,0,Auckland,Hard,32.0,A,1991,101179,,,Jean Philippe Fleurian,R,185.0,FRA,25.3,77.0,468.0,2.0,4.0,80.0,55.0,35.0,16.0,12.0,2.0,4.0,101601,,WC,Brett Steven,R,185.0,NZL,21.6,212.0,116.0,1.0,3.0,68.0,43.0,24.0,14.0,11.0,4.0,8.0,2-6 6-1 6-2,3,R32,101.0,1,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991,101601,,WC,Brett Steven,R,185.0,NZL,21.6,212.0,116.0,1.0,3.0,68.0,43.0,24.0,14.0,11.0,4.0,8.0,101179,,,Jean Philippe Fleurian,R,185.0,FRA,25.3,77.0,468.0,2.0,4.0,80.0,55.0,35.0,16.0,12.0,2.0,4.0,2-6 6-1 6-2,3,R32,101.0,0,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991-01-07,1991
3,1991-301,1991-01-07,4,101117,101332,0,Auckland,Hard,32.0,A,1991,101117,,,Eric Jelen,R,180.0,GER,25.8,65.0,502.0,0.0,1.0,82.0,55.0,35.0,14.0,13.0,6.0,10.0,101332,8.0,,Gilad Bloom,L,173.0,ISR,23.8,72.0,483.0,3.0,2.0,96.0,61.0,38.0,15.0,13.0,8.0,12.0,6-3 1-6 6-4,3,R32,108.0,1,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991,101332,8.0,,Gilad Bloom,L,173.0,ISR,23.8,72.0,483.0,3.0,2.0,96.0,61.0,38.0,15.0,13.0,8.0,12.0,101117,,,Eric Jelen,R,180.0,GER,25.8,65.0,502.0,0.0,1.0,82.0,55.0,35.0,14.0,13.0,6.0,10.0,6-3 1-6 6-4,3,R32,108.0,0,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991-01-07,1991
4,1991-301,1991-01-07,5,101735,101901,1,Auckland,Hard,32.0,A,1991,101735,3.0,,Richard Fromberg,R,196.0,AUS,20.6,28.0,876.0,1.0,3.0,49.0,25.0,21.0,12.0,9.0,4.0,6.0,101901,,Q,Chuck Adams,R,185.0,USA,19.7,190.0,142.0,4.0,4.0,65.0,46.0,34.0,12.0,10.0,2.0,2.0,6-3 6-4,3,R32,65.0,0,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991,101901,,Q,Chuck Adams,R,185.0,USA,19.7,190.0,142.0,4.0,4.0,65.0,46.0,34.0,12.0,10.0,2.0,2.0,101735,3.0,,Richard Fromberg,R,196.0,AUS,20.6,28.0,876.0,1.0,3.0,49.0,25.0,21.0,12.0,9.0,4.0,6.0,6-3 6-4,3,R32,65.0,1,,,,,,,,,,,,,,,,,,,,,,,Auckland,Hard,32.0,A,1991-01-07,1991


In [54]:
# Save the ml_dataset DataFrame to a pickle file.
# NOTE(review): this is written before the "clean columns" TODO is done, so
# the saved frame still holds every joined column.
ml_dataset.to_pickle("../data/ml_dataset.pkl")