In [1]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from rapidfuzz import process, fuzz
import numpy as np

In [134]:
df_matches = pd.read_parquet("s3://matchedge-pipeline/data/clean/merged_matches.parquet")

## 1. Clean Columns

---
### 1.0. Drop Columns

```**Will Need This For Script**```

In [136]:
df_matches.drop(columns=["stats_link", "p1_max_speed", 
                         "p2_max_speed", "p2_1st_serve_average_speed",
                         "p1_1st_serve_average_speed", "p1_2nd_serve_average_speed",
                         "p2_2nd_serve_average_speed"
                         ], inplace=True)   # Dont need the stats are mostly NaN
df_matches.drop(columns=["player_1_scores", "player_2_scores"], inplace=True)     # Redundant
df_matches.drop(columns=["p1_net_points_won", "p2_net_points_played", "p1_net_points_played", "p2_net_points_won"], inplace=True)   # Dont need the stats are mostly NaN
df_matches.drop(columns=['p1_service_points_won', 'p1_return_points_won', 'p1_total_points_won',
                'p2_service_points_won', 'p2_return_points_won', 'p2_total_points_won'], inplace=True)  # Dont need the stats are mostly NaN

---
### 2.0. Clean Surfaces
#### Missing Tournament info from tournaments that start in 2024 end in 2025
Fill in missing tournament info. All coming from tourn_id = 339, 336


In [137]:
df_tournament = pd.read_csv('/Users/samueleferrucci/Documents/Coding/Projects/Tennis ML/data/clean/all_tournaments.csv')

# Create a lookup DataFrame with the tournament_id as index
df_tournament_sub = df_tournament[(df_tournament['id']==339) | (df_tournament['id']==336)]
tournament_lookup = df_tournament_sub.set_index('id')[['level', 'location', 'surface']]

# Fill missing values in df_matches by mapping from lookup
for col in ['level', 'location', 'surface']:
    df_matches[col] = df_matches[col].fillna(
        df_matches['tournament_id'].map(tournament_lookup[col])
    )

#### Standardise Surfaces
Make all lower case then turn back into correct data types

```**Will Need This For Script**```

In [138]:
df_matches["surface"] = df_matches["surface"].apply(lambda x: x.lower())

tournament_order = [
    'Next Gen ATP Finals',
    'ATP 250',
    'ATP 500',
    'ATP 1000',
    'Nitto ATP Finals',
    'Grand Slam'
]
df_matches['level'] = pd.Categorical(df_matches['level'], categories=tournament_order, ordered=True)
df_matches['location'] = df_matches['location'].astype('string')
df_matches['surface'] = df_matches['surface'].astype('category')

---

### 3.0. Fix Result

```**Will Need This For Script**```

In [139]:
# If Set 1 is not completed --> Retired
mask = (df_matches["p1_set1"] != 6) & (df_matches["p1_set1"] != 7) &\
    (df_matches["p2_set1"]!=6) & (df_matches["p2_set1"]!=7) & (df_matches["result"] == "Completed") &\
        ~(df_matches["p1_set1"].isna())
df_matches.loc[mask, "result"] = "RET"


# If set 2 is missing --> Retired
mask = (df_matches["result"]=="Completed") & df_matches["p1_set2"].isna()
df_matches.loc[mask, "result"] = "RET"


# If set 2 is not completed --> Retired
mask = (df_matches["p1_set2"] != 6) & (df_matches["p1_set2"] != 7) &\
    (df_matches["p2_set2"]!=6) & (df_matches["p2_set2"]!=7) & (df_matches["result"] == "Completed")
df_matches.loc[mask, "result"] = "RET"


# If Set 3 is not completed --> Retired
mask = (df_matches["p1_set3"] != 6) & (df_matches["p1_set3"] != 7) &\
    (df_matches["p2_set3"]!=6) & (df_matches["p2_set3"]!=7) & (df_matches["result"] == "Completed") &\
        ~(df_matches["p1_set3"].isna())
df_matches.loc[mask, "result"] = "RET"


# If Set 4 is not completed --> Retired
mask = (df_matches["p1_set4"] != 6) & (df_matches["p1_set4"] != 7) &\
    (df_matches["p2_set4"]!=6) & (df_matches["p2_set4"]!=7) & (df_matches["result"] == "Completed") &\
        ~(df_matches["p1_set4"].isna())
df_matches.loc[mask, "result"] = "RET"


# If Set 5 is not completed --> Retired
mask = (df_matches["p1_set5"] != 6) & (df_matches["p1_set5"] != 7) &\
    (df_matches["p2_set5"]!=6) & (df_matches["p2_set5"]!=7) & (df_matches["result"] == "Completed") &\
        ~(df_matches["p1_set5"].isna())
df_matches.loc[mask, "result"] = "RET"


# Best of 5 and set 3 is NaN and not Qualification --> Retired
mask = (df_matches["result"]=="Completed") & df_matches["p1_set3"].isna() & (df_matches["best_of"] == 5) &\
    ~(df_matches["match_round"].str.contains('Qual'))
df_matches.loc[mask, "result"] = "RET"

---
### 4.0. Change Qualifiers to best of 3 and 3rd round Qualification to best of 5

Only wimbledon 3rd round Qualification if best-of-five.
Change ALL 1st, 2nd round of Qualification where best-of-five --> best-of-three

```**Will Need This For Script**```

In [140]:
# 1st and 2nd round Qual change to best-of-3
mask = (df_matches["match_round"].str.contains('Qual')) & ~(df_matches["match_round"].str.contains('3rd Round Qualifying')) &\
    (df_matches["best_of"] == 5)
df_matches.loc[mask, "best_of"] = 3

# 3rd round Qual but NOT Wimby change to best-of-3
mask = ~(df_matches["tournament_id"] == 540) & (df_matches["match_round"].str.contains('3rd Round Qualifying')) &\
    (df_matches["best_of"] == 5)
df_matches.loc[mask, "best_of"] = 3

---
---

## 2. Split Matches

```**Will Need This For Script**```
---
### 1.0. Primary key

In [141]:
df_matches["match_uid"] = (
    df_matches["tournament_id"].astype(str) + "_" +
    df_matches["match_date"].astype(str) + "_" +
    df_matches["match_id"].astype(str)
)

In [142]:
# Reorder so match_uid is #1 column
df_matches = df_matches.loc[:, ['match_uid', 'match_date', 'player_1', 'player_2', 'duration', 'match_round',
       'winner', 'result', 'match_id', 'tournament_id', 'p1_id', 'p2_id',
       'p1_set1', 'p1_set2', 'p1_set3', 'p1_set4', 'p1_set5', 'p2_set1',
       'p2_set2', 'p2_set3', 'p2_set4', 'p2_set5', 'best_of', 'winner_id',
       'year', 'level', 'location', 'surface', 'p1_serve_rating', 'p1_aces',
       'p1_double_faults', 'p1_first_serve', 'p1_1st_serve_points_won',
       'p1_2nd_serve_points_won', 'p1_break_points_saved',
       'p1_service_games_played', 'p1_return_rating',
       'p1_1st_serve_return_points_won', 'p1_2nd_serve_return_points_won',
       'p1_break_points_converted', 'p1_return_games_played', 'p1_winners',
       'p1_unforced_errors', 'p2_serve_rating', 'p2_aces', 'p2_double_faults',
       'p2_first_serve', 'p2_1st_serve_points_won', 'p2_2nd_serve_points_won',
       'p2_break_points_saved', 'p2_service_games_played', 'p2_return_rating',
       'p2_1st_serve_return_points_won', 'p2_2nd_serve_return_points_won',
       'p2_break_points_converted', 'p2_return_games_played', 'p2_winners',
       'p2_unforced_errors', 'p2_break_point_opportunities',
       'p1_break_point_opportunities']]

---
### 2.0 Split Players

In [143]:
df_matches.columns

Index(['match_uid', 'match_date', 'player_1', 'player_2', 'duration',
       'match_round', 'winner', 'result', 'match_id', 'tournament_id', 'p1_id',
       'p2_id', 'p1_set1', 'p1_set2', 'p1_set3', 'p1_set4', 'p1_set5',
       'p2_set1', 'p2_set2', 'p2_set3', 'p2_set4', 'p2_set5', 'best_of',
       'winner_id', 'year', 'level', 'location', 'surface', 'p1_serve_rating',
       'p1_aces', 'p1_double_faults', 'p1_first_serve',
       'p1_1st_serve_points_won', 'p1_2nd_serve_points_won',
       'p1_break_points_saved', 'p1_service_games_played', 'p1_return_rating',
       'p1_1st_serve_return_points_won', 'p1_2nd_serve_return_points_won',
       'p1_break_points_converted', 'p1_return_games_played', 'p1_winners',
       'p1_unforced_errors', 'p2_serve_rating', 'p2_aces', 'p2_double_faults',
       'p2_first_serve', 'p2_1st_serve_points_won', 'p2_2nd_serve_points_won',
       'p2_break_points_saved', 'p2_service_games_played', 'p2_return_rating',
       'p2_1st_serve_return_points_won', 'p

In [148]:
p1_df = df_matches.copy()
p2_df = df_matches.copy()

p1_df.rename(columns={
    "player_1": "player",
    "p1_id": "id",
    "p1_set1": "set1",
    "p1_set2": "set2",
    "p1_set3": "set3",
    "p1_set4": "set4",
    "p1_set5": "set5",
    "p1_serve_rating": "serve_rating",
    "p1_aces": "aces",
    "p1_double_faults": "double_faults",
    "p1_first_serve": "first_serve",
    "p1_1st_serve_points_won": "1st_serve_points_won",
    "p1_2nd_serve_points_won": "2nd_serve_points_won",
    "p1_break_points_saved": "break_points_saved",
    "p1_service_games_played": "service_games_played",
    "p1_return_rating": "return_rating",
    "p1_1st_serve_return_points_won": "1st_serve_return_points_won",
    "p1_2nd_serve_return_points_won": "2nd_serve_return_points_won",
    "p1_break_points_converted": "break_points_converted",
    "p1_return_games_played": "return_games_played",
    "p1_winners": "winners",
    "p1_unforced_errors": "unforced_errors",
    "p1_break_point_opportunities": "break_point_opportunities",

    # Player 2 / opponent
    "player_2": "opponent",
    "player_2_scores": "opponent_scores",
    "p2_id": "opponent_id",
    "p2_set1": "opponent_set1",
    "p2_set2": "opponent_set2",
    "p2_set3": "opponent_set3",
    "p2_set4": "opponent_set4",
    "p2_set5": "opponent_set5",
    "p2_serve_rating": "opponent_serve_rating",
    "p2_aces": "opponent_aces",
    "p2_double_faults": "opponent_double_faults",
    "p2_first_serve": "opponent_first_serve",
    "p2_1st_serve_points_won": "opponent_1st_serve_points_won",
    "p2_2nd_serve_points_won": "opponent_2nd_serve_points_won",
    "p2_break_points_saved": "opponent_break_points_saved",
    "p2_service_games_played": "opponent_service_games_played",
    "p2_return_rating": "opponent_return_rating",
    "p2_1st_serve_return_points_won": "opponent_1st_serve_return_points_won",
    "p2_2nd_serve_return_points_won": "opponent_2nd_serve_return_points_won",
    "p2_break_points_converted": "opponent_break_points_converted",
    "p2_return_games_played": "opponent_return_games_played",
    "p2_winners": "opponent_winners",
    "p2_unforced_errors": "opponent_unforced_errors",
    "p2_break_point_opportunities": "opponent_break_point_opportunities"
}, inplace=True)

p2_df.rename(columns={
    # Player 2 / main player
    "player_2": "player",
    "player_2_scores": "player_scores",
    "p2_id": "id",
    "p2_set1": "set1",
    "p2_set2": "set2",
    "p2_set3": "set3",
    "p2_set4": "set4",
    "p2_set5": "set5",
    "p2_serve_rating": "serve_rating",
    "p2_aces": "aces",
    "p2_double_faults": "double_faults",
    "p2_first_serve": "first_serve",
    "p2_1st_serve_points_won": "1st_serve_points_won",
    "p2_2nd_serve_points_won": "2nd_serve_points_won",
    "p2_break_points_saved": "break_points_saved",
    "p2_service_games_played": "service_games_played",
    "p2_return_rating": "return_rating",
    "p2_1st_serve_return_points_won": "1st_serve_return_points_won",
    "p2_2nd_serve_return_points_won": "2nd_serve_return_points_won",
    "p2_break_points_converted": "break_points_converted",
    "p2_return_games_played": "return_games_played",
    "p2_winners": "winners",
    "p2_unforced_errors": "unforced_errors",
    "p2_break_point_opportunities": "break_point_opportunities",

    # Player 1 / opponent
    "player_1": "opponent",
    "player_1_scores": "opponent_scores",
    "p1_id": "opponent_id",
    "p1_set1": "opponent_set1",
    "p1_set2": "opponent_set2",
    "p1_set3": "opponent_set3",
    "p1_set4": "opponent_set4",
    "p1_set5": "opponent_set5",
    "p1_serve_rating": "opponent_serve_rating",
    "p1_aces": "opponent_aces",
    "p1_double_faults": "opponent_double_faults",
    "p1_first_serve": "opponent_first_serve",
    "p1_1st_serve_points_won": "opponent_1st_serve_points_won",
    "p1_2nd_serve_points_won": "opponent_2nd_serve_points_won",
    "p1_break_points_saved": "opponent_break_points_saved",
    "p1_service_games_played": "opponent_service_games_played",
    "p1_return_rating": "opponent_return_rating",
    "p1_1st_serve_return_points_won": "opponent_1st_serve_return_points_won",
    "p1_2nd_serve_return_points_won": "opponent_2nd_serve_return_points_won",
    "p1_break_points_converted": "opponent_break_points_converted",
    "p1_return_games_played": "opponent_return_games_played",
    "p1_winners": "opponent_winners",
    "p1_unforced_errors": "opponent_unforced_errors",
    "p1_break_point_opportunities": "opponent_break_point_opportunities",
}, inplace=True)

In [149]:
df_matches = pd.concat([p1_df, p2_df], ignore_index=True)

In [152]:
df_matches.rename(columns={
    'first_serve': 'first_serve_percentage', 
    '1st_serve_points_won': '1st_serve_percentage_won', 
    '2nd_serve_points_won': '2nd_serve_percentage_won', 
    'break_points_saved': 'break_points_percentage_saved', 
    '1st_serve_return_points_won': '1st_serve_return_percentage_won',
    '2nd_serve_return_points_won': '2nd_serve_return_percentage_won', 
    'break_points_converted': 'break_points_converted_percentage'
}, inplace=True)
df_matches.rename(columns={
    'opponent_first_serve': 'opponent_first_serve_percentage',
    'opponent_1st_serve_points_won': 'opponent_1st_serve_percentage_won',
    'opponent_2nd_serve_points_won': 'opponent_2nd_serve_percentage_won',
    'opponent_break_points_saved': 'opponent_break_points_percentage_saved',
    'opponent_1st_serve_return_points_won': 'opponent_1st_serve_return_percentage_won',
    'opponent_2nd_serve_return_points_won': 'opponent_2nd_serve_return_percentage_won',
    'opponent_break_points_converted': 'opponent_break_points_converted_percentage'
}, inplace=True)


---
---
## 3. Create Some New Features

```**Will Need This For Script**```
---
### 1.0. Target

Column as 0 if player lost or 1 if player won

In [153]:
df_matches["target"] = (df_matches["id"] == df_matches["winner_id"]).astype(int)

---
### 2.0. Straight Sets Won: bool

In [154]:
def straight_sets_win(row):
    if row["target"] == 0:
        return 0 
    if row["best_of"] == 3 and pd.isna(row["set3"]):
        return 1
    if row["best_of"] == 5 and pd.isna(row["set4"]):
        return 1
    return 0

df_matches["straight_sets_win"] = df_matches.apply(straight_sets_win, axis=1)

---
### 3.0. Straight Sets Lost: bool

In [155]:
def straight_sets_loss(row):
    if row["target"] == 1:
        return 0  # Player won, can't be a straight-sets loss
    if row["best_of"] == 3 and pd.isna(row["opponent_set3"]):
        return 1
    if row["best_of"] == 5 and pd.isna(row["opponent_set4"]):
        return 1
    return 0

df_matches["straight_sets_loss"] = df_matches.apply(straight_sets_loss, axis=1)

---
### 4.0. Num of Sets Won/Lost

In [156]:
# List of player and opponent set columns in order
player_sets = ['set1', 'set2', 'set3', 'set4', 'set5']
opponent_sets = ['opponent_set1', 'opponent_set2', 'opponent_set3', 'opponent_set4', 'opponent_set5']

# Compute per-set win booleans for player
set_wins = pd.DataFrame({
    p: ((df_matches[p] == 6) & (df_matches[o] <= 4)) |  # normal win
       ((df_matches[p] == 7) & (df_matches[o] >= 5))    # tiebreak win
    for p, o in zip(player_sets, opponent_sets)
})

# Count wins per row for player
df_matches["sets_won"] = set_wins.sum(axis=1)

# Compute per-set win booleans for opponent
set_loses = pd.DataFrame({
    o: ((df_matches[o] == 6) & (df_matches[p] <= 4)) |  # normal win
       ((df_matches[o] == 7) & (df_matches[p] >= 5))    # tiebreak win
    for p, o in zip(player_sets, opponent_sets)
})

# Count wins for opponent per row
df_matches["sets_lost"] = set_loses.sum(axis=1)

---
### 5.0. Num of Sets

In [157]:
df_matches["sets_played"] = df_matches["sets_won"] + df_matches["sets_lost"]
# ensure duration is numeric minutes
df_matches["duration"] = pd.to_timedelta(df_matches["duration"]).dt.total_seconds() / 60  

---
---
## 4.0. Investigate Missing Values
```**Will Need This For Script**```


#### We Will imput 0 -Reasoning for imputing 0 in break point stats

---
#### Explanation for identical distributions of missing break point stats

Upon inspecting the missing values for `break_points_percentage_saved` and `break_points_converted_percentage`, we observed that the **distributions across tournaments are exactly the same**.  

This is due to the way the dataset was structured: each match has been **duplicated** so that the player and opponent are swapped in separate rows. In other words, every match appears twice — once with Player A vs Player B, and once with Player B vs Player A.  

Because of this duplication, any match where **no break point opportunities occurred** (i.e., 0/0) will appear as missing values in **both rows**. As a result, the missing value pattern is **identical for both statistics** across the dataset, even though each missing value corresponds to a legitimate 0/0 situation rather than a true NaN.  

This justifies our approach to **impute 0 for these columns**, reflecting that no break points were available to be saved or converted in these matches.

---


After inspecting the distribution of missing values in the break point statistics, we observed the following:

1. 'break_points_percentage_saved' and 'break_points_converted_percentage' are missing across a wide range of tournaments (59 tournaments), unlike other stats such as 'first_serve_percentage' which have missing values concentrated in only a few tournaments.

2. This pattern suggests that the missing values are not due to missing data collection, but rather because the player had **zero break point opportunities** in those matches (i.e., denominator was effectively 0 or 0/0). I fixed the cleaning script now to return 0 instead of NaN if 0/0 and NaN if NaN.

3. Matches where the player retired early could also cause missing values, but these are rare and constitute only a small fraction of the missing cases.

Based on this reasoning, it is appropriate to **impute 0** for these columns, since a player with zero opportunities could not have saved or converted any break points. Imputing 0 preserves the semantic meaning of the statistic and ensures consistency for downstream analysis or modeling.






In [158]:
df_matches['break_points_percentage_saved'] = df_matches['break_points_percentage_saved'].fillna(0)
df_matches['break_points_converted_percentage'] = df_matches['break_points_converted_percentage'].fillna(0)
df_matches['opponent_break_points_percentage_saved'] = df_matches['break_points_percentage_saved'].fillna(0)
df_matches['opponent_break_points_converted_percentage'] = df_matches['break_points_converted_percentage'].fillna(0)

---
### 1.0. Impute Duration 

Calculate median duration for how many sets played

**For US Open**

In [159]:
# Step 1: median durations by sets played in US Open (560)
median_duration_GS_hard = df_matches.loc[df_matches["tournament_id"] == 560] \
    .assign(sets_played=lambda x: x["sets_won"] + x["sets_lost"]) \
    .groupby("sets_played")["duration"].median()

**For Wimby**

In [160]:
median_duration_wimby = df_matches.loc[(df_matches["tournament_id"] == 540) & (df_matches["year"]==2024)] \
    .assign(sets_played=lambda x: x[["sets_won", "sets_lost"]].sum(axis=1)) \
    .groupby("sets_played")["duration"].median()

**For Clay Courts**

In [161]:
median_duration_clay = (
    df_matches.loc[df_matches["surface"] == "clay"]
    .assign(sets_played=lambda x: x[["sets_won", "sets_lost"]].sum(axis=1))
    .groupby("sets_played")["duration"]
    .median()
)
median_per_set = median_duration_clay / median_duration_clay.index  # get median duration per set
median_duration_clay.loc[4] = median_per_set.median() * 4
median_duration_clay.loc[5] = median_per_set.median() * 5

Define imputing function

In [162]:
def impute_tournament_duration(df, tournament_id, year, median_durations):
    """
    Impute missing duration values for a specific tournament and year
    based on precomputed median durations per sets played.

    Parameters:
        df (pd.DataFrame): The matches DataFrame
        tournament_id: The tournament identifier
        year: The tournament year
        median_durations (pd.Series or dict): median durations indexed by sets_played

    Returns:
        pd.DataFrame: DataFrame with missing durations imputed
    """
    df = df.copy()
    mask = (df["tournament_id"] == tournament_id) &\
           (df["year"] == year) &\
           (df["duration"].isna() | df["duration"].isnull() | df["duration"].eq("Missing value"))
   
    df.loc[mask, "imputed_duration"] = 1 
    df.loc[mask, "duration"] = df.loc[mask, "sets_played"].map(median_durations)
    
    return df


create ```imputed_duration``` where 0 mean scraped duration and 1, imouted duration

In [163]:
df_matches["imputed_duration"] = np.where(df_matches["duration"].notna(), 0, np.nan)

Impute durations

In [164]:
df_match_imputed = impute_tournament_duration(df_matches, 580, 2025, median_duration_GS_hard)
df_match_imputed = impute_tournament_duration(df_match_imputed, 520, 2025, median_duration_clay)
df_match_imputed = impute_tournament_duration(df_match_imputed, 540, 2025, median_duration_wimby)

---
### 2.0 Impute Winners & Unforced Errors

In [165]:
# Assume df has columns: 'player', 'match_date', 'winners', 'unforced_errors'
df = df_match_imputed.sort_values(['player', 'match_date'])

# Compute rolling mean over the last N matches
df['winners_roll'] = df.groupby('player')['winners'].transform(lambda x: x.shift().rolling(20, min_periods=1).median())
df['unforced_errors_roll'] = df.groupby('player')['unforced_errors'].transform(lambda x: x.shift().rolling(20, min_periods=1).median())
df['opponent_winners_roll'] = df.groupby('player')['opponent_winners'].transform(lambda x: x.shift().rolling(20, min_periods=1).median())
df['opponent_unforced_errors_roll'] = df.groupby('player')['opponent_unforced_errors'].transform(lambda x: x.shift().rolling(20, min_periods=1).median())


# Remove rows without winners_rolling
df = df.loc[~((df["winners_roll"].isna()) & (df["winners"].isna()))].copy()
df = df.loc[~((df["opponent_winners_roll"].isna()) & (df["opponent_winners"].isna()))].copy()

Create imputed ```bool``` column

In [166]:
df["imputed_winners"] = 0
df["imputed_unforced_errors"] = 0
df["imputed_opponent_winners"] = 0
df["imputed_opponent_unforced_errors"] = 0

Now impute winners and unfocred with rolling **IF** they are missing 

In [167]:
mask = df['winners'].isna()                                 # record which rows are missing
df.loc[mask, 'imputed_winners'] = 1                         # flag those rows as imputed
df['winners'] = df['winners'].fillna(df['winners_roll'])    # fill NaNs

mask = df['unforced_errors'].isna()                                 
df.loc[mask, 'imputed_unforced_errors'] = 1 
df['unforced_errors'] = df['unforced_errors'].fillna(df['unforced_errors_roll'])

mask = df['opponent_winners'].isna()                                 
df.loc[mask, 'imputed_opponent_winners'] = 1 
df['opponent_winners'] = df['opponent_winners'].fillna(df['opponent_winners_roll'])

mask = df['opponent_unforced_errors'].isna()                                 
df.loc[mask, 'imputed_opponent_unforced_errors'] = 1 
df['opponent_unforced_errors'] = df['opponent_unforced_errors'].fillna(df['opponent_unforced_errors_roll'])


df.drop(columns=["unforced_errors_roll", "opponent_winners_roll", "opponent_unforced_errors_roll", 
        "winners_roll", "winners_roll"], inplace=True)

---
### Impute 1st Serve Percentage

In [168]:
df.loc[df["first_serve_percentage"].isna()].sort_values("match_date", ascending=True)


df_imputed_serve = df.sort_values(['player', 'match_date'])

# Compute rolling mean over the last N matches
df_imputed_serve['first_serve_percentage_roll'] = df_imputed_serve.groupby('player')['first_serve_percentage'].transform(lambda x: x.shift().rolling(20, min_periods=1).median())
df_imputed_serve['opponent_first_serve_percentage_roll'] = df_imputed_serve.groupby('player')['opponent_first_serve_percentage'].transform(lambda x: x.shift().rolling(20, min_periods=1).median())


df_imputed_serve["imputed_first_serve_percentage"] = 0
df_imputed_serve["imputed_opponent_first_serve_percentage"] = 0


mask = df_imputed_serve['first_serve_percentage'].isna()                                                 # record which rows are missing
df_imputed_serve.loc[mask, 'imputed_first_serve_percentage'] = 1                                                       # flag those rows as imputed
df_imputed_serve['first_serve_percentage'] = df_imputed_serve['first_serve_percentage'].fillna(df_imputed_serve['first_serve_percentage_roll'])    # fill NaNs

mask = df_imputed_serve['opponent_first_serve_percentage'].isna()                                 
df_imputed_serve.loc[mask, 'imputed_opponent_first_serve_percentage'] = 1 
df_imputed_serve['opponent_first_serve_percentage'] = df_imputed_serve['opponent_first_serve_percentage'].fillna(df_imputed_serve['opponent_first_serve_percentage_roll'])


df_imputed_serve.drop(columns=["first_serve_percentage_roll", "opponent_first_serve_percentage_roll"], inplace=True)

---
### Remove Final Match 

In [291]:
df_matches = df_imputed_serve[(~df_imputed_serve["2nd_serve_percentage_won"].isna()) & (~df_imputed_serve["2nd_serve_return_percentage_won"].isna())].copy()

In [292]:
# 1. Compute rolling stats for players only
def compute_player_rolling(df, rolling_features, window=5):
    
    df = df.sort_values(["id", "match_date", "match_round"])
    
    for col in rolling_features:
        df[col + "_shifted"] = df.groupby("id")[col].shift(1)
    
    
    # 1b. Compute rolling median over last 5 previous matches
    for col in rolling_features:
        shifted_col = col + "_shifted"
        new_col = col + f"_last_{window}_matches"
        df[new_col] = (
            df.groupby("id")[shifted_col]
            .rolling(window=window, min_periods=1)
            .median()
            .reset_index(level=0, drop=True)
        )
        # 1c. Fill missing values (no prior matches) with 0
        df[new_col] = df[new_col].fillna(0)
    
    # 1d. Clean up shifted helper columns
    shifted_columns = [col + "_shifted" for col in rolling_features]
    df.drop(columns=shifted_columns, inplace=True, errors='ignore')
    return df
    
    

rolling_features = [
    "aces", "double_faults", "first_serve_percentage",
    "1st_serve_percentage_won", "2nd_serve_percentage_won",
    "break_points_percentage_saved", "service_games_played",
    "return_rating", "1st_serve_return_percentage_won",
    "2nd_serve_return_percentage_won",
    "break_points_converted_percentage", "return_games_played",
    "winners", "unforced_errors", "break_point_opportunities"
]

df_matches = compute_player_rolling(df_matches, rolling_features, window=5)

# ================================================================
#  STEP 2: Last match date feature
# ================================================================
df_matches["last_match_date"] = df_matches.groupby("id")["match_date"].shift(1)
df_matches["last_match_date"] = df_matches["last_match_date"].fillna("2024-01-01 00:00:00")

# ================================================================
#  STEP 3: One-hot encode surface + imputed features
# ================================================================
df_matches = pd.get_dummies(df_matches, columns=["surface"], dtype=int)
surface_cols = ["surface_clay", "surface_grass", "surface_hard", "surface_hard (indoor)", 
       'imputed_duration', 'imputed_winners', 'imputed_unforced_errors', 'imputed_first_serve_percentage']

# ================================================================
#  STEP 4: Rolling surface/imputed features (last 10 matches)
# ================================================================
# 4a. Shift each column to exclude current match
for col in surface_cols:
    df_matches[col + "_shifted"] = df_matches.groupby("id")[col].shift(1)

# 4b. Compute rolling sum over last 10 previous matches
for col in surface_cols:
    shifted_col = col + "_shifted"
    new_col = col.replace("surface_", "") + f"_last_{10}_matches"
    df_matches[new_col] = (
        df_matches.groupby("player")[shifted_col]
        .rolling(window=10, min_periods=1)
        .sum()
        .reset_index(level=0, drop=True)
    )
    # 4c. Fill missing with 0
    df_matches[new_col] = df_matches[new_col].fillna(0)

# 4d. Clean up shifted helper columns
shifted_columns = [col + "_shifted" for col in surface_cols]
df_matches.drop(columns=shifted_columns, inplace=True, errors='ignore')

# ================================================================
#  STEP 5: Cumulative match counter
# ================================================================
# This gives the number of matches already played by each player
df_matches['match_number'] = df_matches.groupby('player').cumcount()

---
#### Matches Won On Each Surface

In [293]:
####################
# Clay
####################
# Clay matches played
df_matches['shift_rolling_clay_match'] = df_matches.groupby(['player'])['surface_clay'].shift(1)
df_matches['rolling_clay_match'] = (
    df_matches.groupby("player")['shift_rolling_clay_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_clay_matches_won'] = df_matches.loc[df_matches['surface_clay'] == 1].groupby("player")['target'].shift(1)
df_matches['clay_matches_won'] = (
    df_matches.loc[df_matches['surface_clay'] == 1].groupby("player")['shift_rolling_clay_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_clay'] = df_matches["clay_matches_won"] / df_matches["rolling_clay_match"]
# Drop matches won
df_matches.drop(columns=["clay_matches_won", "shift_rolling_clay_match", "shift_rolling_clay_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_clay_match', 'rolling_win_rate_clay']] = df_matches[['rolling_clay_match', 'rolling_win_rate_clay']].fillna(0)


####################
# Hard
####################
# Hard matches played
df_matches['shift_rolling_hard_match'] = df_matches.groupby(['player'])['surface_hard'].shift(1)
df_matches['rolling_hard_match'] = (
    df_matches.groupby("player")['shift_rolling_hard_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_hard_matches_won'] = df_matches.loc[df_matches['surface_hard'] == 1].groupby("player")['target'].shift(1)
df_matches['hard_matches_won'] = (
    df_matches.loc[df_matches['surface_hard'] == 1].groupby("player")['shift_rolling_hard_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_hard'] = df_matches["hard_matches_won"] / df_matches["rolling_hard_match"]


# Drop matches won
df_matches.drop(columns=["hard_matches_won", "shift_rolling_hard_match", "shift_rolling_hard_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_hard_match', 'rolling_win_rate_hard']] = df_matches[['rolling_hard_match', 'rolling_win_rate_hard']].fillna(0)


####################
# Grass
####################
# Grass matches played
df_matches['shift_rolling_grass_match'] = df_matches.groupby(['player'])['surface_grass'].shift(1)
df_matches['rolling_grass_match'] = (
    df_matches.groupby("player")['shift_rolling_grass_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_grass_matches_won'] = df_matches.loc[df_matches['surface_grass'] == 1].groupby("player")['target'].shift(1)
df_matches['grass_matches_won'] = (
    df_matches.loc[df_matches['surface_grass'] == 1].groupby("player")['shift_rolling_grass_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_grass'] = df_matches["grass_matches_won"] / df_matches["rolling_grass_match"]


# Drop matches won
df_matches.drop(columns=["grass_matches_won", "shift_rolling_grass_match", "shift_rolling_grass_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_grass_match', 'rolling_win_rate_grass']] = df_matches[['rolling_grass_match', 'rolling_win_rate_grass']].fillna(0)


####################
# Hard (Indoor)
####################
# Hard matches played
df_matches['shift_rolling_hard (indoor)_match'] = df_matches.groupby(['player'])['surface_hard (indoor)'].shift(1)
df_matches['rolling_hard (indoor)_match'] = (
    df_matches.groupby("player")['shift_rolling_hard (indoor)_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_hard (indoor)_matches_won'] = df_matches.loc[df_matches['surface_hard (indoor)'] == 1].groupby("player")['target'].shift(1)
df_matches['hard (indoor)_matches_won'] = (
    df_matches.loc[df_matches['surface_hard (indoor)'] == 1].groupby("player")['shift_rolling_hard (indoor)_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_hard (indoor)'] = df_matches["hard (indoor)_matches_won"] / df_matches["rolling_hard (indoor)_match"]

# Drop matches won
df_matches.drop(columns=["hard (indoor)_matches_won", "shift_rolling_hard (indoor)_match", "shift_rolling_hard (indoor)_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_hard (indoor)_match', 'rolling_win_rate_hard (indoor)']] = df_matches[['rolling_hard (indoor)_match', 'rolling_win_rate_hard (indoor)']].fillna(0)

In [294]:
# 2. Keep only the player side (drop the old opponent_X columns)
cols_to_drop = [c for c in df_matches.columns 
                if c != "opponent_id" and c.startswith("opponent_")]

df_matches = df_matches.drop(columns=cols_to_drop)


In [295]:
# 3. Build an opponent-feature dataframe from player stats
df_opponent = df_matches.copy()

# Define patterns of columns we want to treat as player stats
patterns = ["rolling_", "imputed_", "_last_", "_match_number", "surface_", "serve_rating",
            "aces", "double_faults", "first_serve_percentage",
            "1st_serve_percentage_won", "2nd_serve_percentage_won",
            "break_points_percentage_saved", "service_games_played",
            "return_rating", "1st_serve_return_percentage_won",
            "2nd_serve_return_percentage_won",
            "break_points_converted_percentage", "return_games_played",
            "winners", "unforced_errors", "break_point_opportunities",
            "straight_sets_win", "straight_sets_loss", "sets_won",
            "sets_lost", "sets_played", "match_number", "last_match_date"]


rename_map = {}
for col in df_opponent.columns:
    if any(p in col for p in patterns):
        rename_map[col] = f"opponent_{col}"

# Rename player stats -> opponent stats
df_opponent = df_opponent.rename(columns=rename_map)

# Keep only the opponent stats + IDs for merging
df_opponent = df_opponent[["match_uid", "id"] + list(rename_map.values())]
# df_opponent = df_opponent.rename(columns={"id": "opponent_id"})

# 4. Merge opponent stats back into the main match dataframe
df_matches = df_matches.merge(
    df_opponent,
    left_on=["match_uid", "opponent_id"],
    right_on=["match_uid", "id"],
    how="left",
    suffixes=("", "_drop")
)

# Drop the duplicate id from the right side
df_matches = df_matches.drop(columns=["id_drop"], errors="ignore")

In [296]:
# Now calculate days since last match
df_matches['days_since_last_match'] = (df_matches['match_date'] - df_matches['last_match_date']).dt.days
df_matches['opponent_days_since_last_match'] = (df_matches['match_date'] - df_matches['opponent_last_match_date']).dt.days

---
#### Matches Won On Each Surface

In [None]:
####################
# Clay
####################
# Clay matches played
df_matches['shift_rolling_clay_match'] = df_matches.groupby(['id'])['surface_clay'].shift(1)
df_matches['rolling_clay_match'] = (
    df_matches.groupby("id")['shift_rolling_clay_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_clay_matches_won'] = df_matches.loc[df_matches['surface_clay'] == 1].groupby("id")['target'].shift(1)
df_matches['clay_matches_won'] = (
    df_matches.loc[df_matches['surface_clay'] == 1].groupby("id")['shift_rolling_clay_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_clay'] = df_matches["clay_matches_won"] / df_matches["rolling_clay_match"]
# Drop matches won
df_matches.drop(columns=["clay_matches_won", "shift_rolling_clay_match", "shift_rolling_clay_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_clay_match', 'rolling_win_rate_clay']] = df_matches[['rolling_clay_match', 'rolling_win_rate_clay']].fillna(0)


####################
# Hard
####################
# Hard matches played
df_matches['shift_rolling_hard_match'] = df_matches.groupby(['id'])['surface_hard'].shift(1)
df_matches['rolling_hard_match'] = (
    df_matches.groupby("id")['shift_rolling_hard_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_hard_matches_won'] = df_matches.loc[df_matches['surface_hard'] == 1].groupby("id")['target'].shift(1)
df_matches['hard_matches_won'] = (
    df_matches.loc[df_matches['surface_hard'] == 1].groupby("id")['shift_rolling_hard_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_hard'] = df_matches["hard_matches_won"] / df_matches["rolling_hard_match"]


# Drop matches won
df_matches.drop(columns=["hard_matches_won", "shift_rolling_hard_match", "shift_rolling_hard_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_hard_match', 'rolling_win_rate_hard']] = df_matches[['rolling_hard_match', 'rolling_win_rate_hard']].fillna(0)


####################
# Grass
####################
# Grass matches played
df_matches['shift_rolling_grass_match'] = df_matches.groupby(['id'])['surface_grass'].shift(1)
df_matches['rolling_grass_match'] = (
    df_matches.groupby("id")['shift_rolling_grass_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_grass_matches_won'] = df_matches.loc[df_matches['surface_grass'] == 1].groupby("id")['target'].shift(1)
df_matches['grass_matches_won'] = (
    df_matches.loc[df_matches['surface_grass'] == 1].groupby("id")['shift_rolling_grass_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_grass'] = df_matches["grass_matches_won"] / df_matches["rolling_grass_match"]


# Drop matches won
df_matches.drop(columns=["grass_matches_won", "shift_rolling_grass_match", "shift_rolling_grass_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_grass_match', 'rolling_win_rate_grass']] = df_matches[['rolling_grass_match', 'rolling_win_rate_grass']].fillna(0)


####################
# Hard (Indoor)
####################
# Hard matches played
df_matches['shift_rolling_hard (indoor)_match'] = df_matches.groupby(['id'])['surface_hard (indoor)'].shift(1)
df_matches['rolling_hard (indoor)_match'] = (
    df_matches.groupby("id")['shift_rolling_hard (indoor)_match']
    .cumsum()
)
# Matches won
df_matches['shift_rolling_hard (indoor)_matches_won'] = df_matches.loc[df_matches['surface_hard (indoor)'] == 1].groupby("id")['target'].shift(1)
df_matches['hard (indoor)_matches_won'] = (
    df_matches.loc[df_matches['surface_hard (indoor)'] == 1].groupby("id")['shift_rolling_hard (indoor)_matches_won']
    .cumsum()
)
# Calculate ratio win
df_matches['rolling_win_rate_hard (indoor)'] = df_matches["hard (indoor)_matches_won"] / df_matches["rolling_hard (indoor)_match"]

# Drop matches won
df_matches.drop(columns=["hard (indoor)_matches_won", "shift_rolling_hard (indoor)_match", "shift_rolling_hard (indoor)_matches_won"], inplace=True)
# # 1c. Fill missing values (no prior matches) with 0
df_matches[['rolling_hard (indoor)_match', 'rolling_win_rate_hard (indoor)']] = df_matches[['rolling_hard (indoor)_match', 'rolling_win_rate_hard (indoor)']].fillna(0)

### Remove double ```match_uid```

In [297]:
df = df_matches.drop_duplicates(subset=["match_uid"])

In [298]:
df

Unnamed: 0,match_uid,match_date,player,opponent,duration,match_round,winner,result,match_id,tournament_id,...,opponent_rolling_clay_match,opponent_rolling_win_rate_clay,opponent_rolling_hard_match,opponent_rolling_win_rate_hard,opponent_rolling_grass_match,opponent_rolling_win_rate_grass,opponent_rolling_hard (indoor)_match,opponent_rolling_win_rate_hard (indoor),days_since_last_match,opponent_days_since_last_match
0,429_2024-10-20_qs012,2024-10-20,A. Andreev,L. Djere,92.000000,1st Round Qualifying,L. Djere,Completed,qs012,429,...,2.0,0.000000,3.0,0.000000,0.0,0.0,0.0,0.0,293,42.0
1,339_2024-12-28_qs019,2024-12-28,A. Andreev,C. Tseng,172.616667,1st Round Qualifying,A. Andreev,Completed,qs019,339,...,2.0,0.000000,4.0,0.250000,0.0,0.0,0.0,0.0,69,76.0
2,339_2024-12-29_qs009,2024-12-29,A. Andreev,B. Bonzi,20.350000,2nd Round Qualifying,B. Bonzi,RET,qs009,339,...,0.0,0.000000,4.0,0.500000,0.0,0.0,7.0,0.0,1,1.0
3,580_2025-01-06_qs105,2025-01-06,A. Andreev,A. Daniel Vallejo,82.000000,1st Round Qualifying,A. Daniel Vallejo,Completed,qs105,580,...,,,,,,,,,8,
4,416_2025-05-05_qs042,2025-05-05,F. Arnaboldi,T. Monteiro,186.016667,1st Round Qualifying,F. Arnaboldi,Completed,qs042,416,...,11.0,0.454545,6.0,0.000000,0.0,0.0,2.0,0.0,490,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7334,4713_2024-09-24_ms024,2024-09-24,D. Yevseyev,Z. Zhang,122.000000,Round of 32,Z. Zhang,Completed,ms024,4713,...,3.0,0.000000,3.0,0.333333,0.0,0.0,0.0,0.0,0,16.0
7344,560_2024-09-08_qs081,2024-09-08,E. Ymer,P. Kypson,132.000000,1st Round Qualifying,P. Kypson,Completed,qs081,560,...,,,,,,,,,35,
7377,540_2025-06-26_qs024,2025-06-26,B. Zhukayev,L. Pavlovic,166.500000,3rd Round Qualifying,B. Zhukayev,Completed,qs024,540,...,,,,,,,,,3,
7388,520_2025-05-20_qs121,2025-05-20,G. Zeppieri,J. Choinski,89.300000,1st Round Qualifying,G. Zeppieri,Completed,qs121,520,...,,,,,,,,,14,


---
---
## Feature Selection

---
### 1.0 Features Being Used

In [288]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [304]:
features = ['duration',
    'aces_last_5_matches',
    'double_faults_last_5_matches',
    'first_serve_percentage_last_5_matches',
    '1st_serve_percentage_won_last_5_matches',
    '2nd_serve_percentage_won_last_5_matches',
    'break_points_percentage_saved_last_5_matches',
    'service_games_played_last_5_matches',
    '1st_serve_return_percentage_won_last_5_matches',
    '2nd_serve_return_percentage_won_last_5_matches',
    'break_points_converted_percentage_last_5_matches',
    'return_games_played_last_5_matches',
    'winners_last_5_matches',
    'unforced_errors_last_5_matches',
    'opponent_aces_last_5_matches',
    'opponent_double_faults_last_5_matches',
    'opponent_first_serve_percentage_last_5_matches',
    'opponent_1st_serve_percentage_won_last_5_matches',
    'opponent_2nd_serve_percentage_won_last_5_matches',
    'opponent_break_points_percentage_saved_last_5_matches',
    'opponent_service_games_played_last_5_matches',
    'opponent_1st_serve_return_percentage_won_last_5_matches',
    'opponent_2nd_serve_return_percentage_won_last_5_matches',
    'opponent_break_points_converted_percentage_last_5_matches',
    'opponent_return_games_played_last_5_matches',
    'opponent_winners_last_5_matches',
    'opponent_unforced_errors_last_5_matches',
    'opponent_break_point_opportunities_last_5_matches',
    'break_point_opportunities_last_5_matches',

    
    # Rolling imputed flags
    'imputed_duration_last_10_matches',
    'opponent_imputed_opponent_winners',
    'imputed_unforced_errors_last_10_matches',
    'opponent_imputed_winners_last_10_matches',
    'opponent_imputed_unforced_errors_last_10_matches',
    'imputed_first_serve_percentage_last_10_matches',
    'opponent_imputed_first_serve_percentage_last_10_matches',
    
    # Rolling surface counts
    'clay_last_10_matches',
    'opponent_clay_last_10_matches',
    'grass_last_10_matches',
    'opponent_grass_last_10_matches',
    'hard_last_10_matches',
    'opponent_hard_last_10_matches',
    'hard (indoor)_last_10_matches',
    'opponent_hard (indoor)_last_10_matches',
    
    # Other engineered features
    'match_number',            # cumulative number of matches played (experience)
    'opponent_match_number',
    'days_since_last_match',          # time since last match (could be transformed)
    'opponent_days_since_last_match',
    
    'rolling_clay_match', 'rolling_win_rate_clay', 'rolling_hard_match',
    'rolling_win_rate_hard', 'rolling_grass_match',
    'rolling_win_rate_grass', 'rolling_hard (indoor)_match',
    'rolling_win_rate_hard (indoor)',
    'opponent_rolling_clay_match',
    'opponent_rolling_win_rate_clay',
    "opponent_rolling_hard_match",
    "opponent_rolling_win_rate_hard",
    "opponent_rolling_grass_match",
    "opponent_rolling_win_rate_grass",
    "opponent_rolling_hard (indoor)_match",
    "opponent_rolling_win_rate_hard (indoor)",
    
    'best_of',
    'surface_clay',
    'surface_grass',
    'surface_hard',
    'surface_hard (indoor)',
    'match_round',
    'level'
]
target_col = 'target'

---
### 2.0 Random Forest

In [311]:
# Also seperate by round in case comes across a tournament where all round where imputed same date
train = df[df["match_date"] < "2025-05-25"].copy()
test  = df[df["match_date"] >= "2025-05-25"].copy()

# Keep metadata for later inspection
meta_train = train[["player", "opponent", "match_date"]].copy()
meta_test  = test[["player", "opponent", "match_date"]].copy()


X_train = train[features].copy()
y_train = train[target_col].copy()

X_test = test[features].copy()
y_test = test[target_col].copy()

from sklearn.preprocessing import OrdinalEncoder
round_cols = ['level', 'match_round']
encoder = OrdinalEncoder()
X_train[round_cols] = encoder.fit_transform(X_train[round_cols])
X_test[round_cols] = encoder.transform(X_test[round_cols])


best_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
best_model.fit(X_train, y_train)

In [312]:
accuracy_before = best_model.score(X_test,y_test)
print(f"Accuracy before feature selection: {accuracy_before:.2f}")

Accuracy before feature selection: 0.55


Lets see the importance of certain features

In [314]:
# Extract feature importances
importances = best_model.feature_importances_
feature_names = df[features].columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Rank features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Select top N features (example selecting top 10 features)
top_features = feature_importance_df['Feature'][:10].values
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]
feature_importance_df

                                             Feature  Importance
0                                           duration    0.046870
13                    unforced_errors_last_5_matches    0.025931
18  opponent_2nd_serve_percentage_won_last_5_matches    0.025093
8     1st_serve_return_percentage_won_last_5_matches    0.024681
4            1st_serve_percentage_won_last_5_matches    0.024629
..                                               ...         ...
64                                           best_of    0.002018
68                             surface_hard (indoor)    0.001900
66                                     surface_grass    0.000850
53                            rolling_win_rate_grass    0.000282
61                   opponent_rolling_win_rate_grass    0.000157

[71 rows x 2 columns]


Unnamed: 0,Feature,Importance
0,duration,0.046870
13,unforced_errors_last_5_matches,0.025931
18,opponent_2nd_serve_percentage_won_last_5_matches,0.025093
8,1st_serve_return_percentage_won_last_5_matches,0.024681
4,1st_serve_percentage_won_last_5_matches,0.024629
...,...,...
64,best_of,0.002018
68,surface_hard (indoor),0.001900
66,surface_grass,0.000850
53,rolling_win_rate_grass,0.000282


---
### 3.0 Select Features

In [315]:
top_features = ["duration", "days_since_last_match",
     "opponent_days_since_last_match",
    "opponent_2nd_serve_percentage_won_last_5_matches",
    "2nd_serve_percentage_won_last_5_matches",
    "opponent_first_serve_percentage_last_5_matches" ,
    "opponent_unforced_errors_last_5_matches",
    "first_serve_percentage_last_5_matches",
    "1st_serve_return_percentage_won_last_5_matches",
    "1st_serve_percentage_won_last_5_matches",
    "opponent_1st_serve_return_percentage_won_last_5_matches",
    "opponent_1st_serve_percentage_won_last_5_matches",
    "unforced_errors_last_5_matches",
    "winners_last_5_matches",
    "2nd_serve_return_percentage_won_last_5_matches",
    "opponent_1st_serve_return_percentage_won_last_5_matches",
    "1st_serve_return_percentage_won_last_5_matches",
    "opponent_break_points_percentage_saved_last_5_matches",
    "break_points_percentage_saved_last_5_matches",
    "2nd_serve_percentage_won_last_5_matches",
    "opponent_winners_last_5_matches",
    "opponent_2nd_serve_return_percentage_won_last_5_matches",
    "opponent_break_points_converted_percentage_last_5_matches",
    "break_points_converted_percentage_last_5_matches",
    "match_number", "opponent_match_number",
    "aces_last_5_matches", "opponent_aces_last_5_matches",
    "opponent_break_point_opportunities_last_5_matches", "break_point_opportunities_last_5_matches",
    'imputed_duration_last_10_matches',
    'opponent_imputed_opponent_winners',
    'imputed_unforced_errors_last_10_matches',
    'opponent_imputed_winners_last_10_matches',
    'opponent_imputed_unforced_errors_last_10_matches',
    'imputed_first_serve_percentage_last_10_matches',
    'opponent_imputed_first_serve_percentage_last_10_matches',
    'aces_last_5_matches', 'opponent_aces_last_5_matches',
    'surface_clay',
    'surface_grass',
    'surface_hard',
    'surface_hard (indoor)',
    'clay_last_10_matches',
    'opponent_clay_last_10_matches',
    'grass_last_10_matches',
    'opponent_grass_last_10_matches',
    'hard_last_10_matches',
    'opponent_hard_last_10_matches',
    'hard (indoor)_last_10_matches',
    'opponent_hard (indoor)_last_10_matches',
    'rolling_clay_match', 'rolling_win_rate_clay', 'rolling_hard_match',
    'rolling_win_rate_hard', 'rolling_grass_match',
    'rolling_win_rate_grass', 'rolling_hard (indoor)_match',
    'rolling_win_rate_hard (indoor)',
    'opponent_rolling_clay_match',
    'opponent_rolling_win_rate_clay',
    "opponent_rolling_hard_match",
    "opponent_rolling_win_rate_hard",
    "opponent_rolling_grass_match",
    "opponent_rolling_win_rate_grass",
    "opponent_rolling_hard (indoor)_match",
    "opponent_rolling_win_rate_hard (indoor)",]
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

In [320]:
best_model_selected = RandomForestClassifier(
    n_estimators=100,
    random_state=42)
best_model_selected.fit(X_train_selected, y_train)

accuracy_after = best_model_selected.score(
    X_test_selected, y_test)
print(f"Accuracy before feature selection: {accuracy_after:.2f}")

Accuracy before feature selection: 0.58


Accuracy is a bit better

---
### 4.0 Grid Search Hyperparameter Tuning

In [321]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [500, 1000],
    'max_depth': [10, 15, 20],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='roc_auc')
grid.fit(X_train_selected, y_train)

best_model = grid.best_estimator_
print(grid.best_params_)


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

best_model = RandomForestClassifier(
    n_estimators=grid.best_params_['n_estimators'],
    max_depth=grid.best_params_['max_depth'],
    min_samples_leaf=grid.best_params_['min_samples_leaf'],
    max_features=grid.best_params_['max_features'],
    random_state=42
)
best_model.fit(X_train_selected, y_train)




y_pred = best_model.predict(X_test_selected) 
y_proba = best_model.predict_proba(X_test_selected)[:, 1] # probability of win 
# Metrics 
acc = accuracy_score(y_test, y_pred) 
auc = roc_auc_score(y_test, y_proba) 
print(f"Test Accuracy: {acc:.3f}") 
print(f"Test AUC: {auc:.3f}")

{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 5, 'n_estimators': 1000}
Test Accuracy: 0.573
Test AUC: 0.625
