DATA PREPARATION, IMPORTATION AND GENERATION OF NEW COLUMNS

In [None]:
# DATA PREPARATION ALREADY DONE AND SAVED IN TENNIS_MATCHES_WITH_FEATURES
#
# The whole preparation pipeline below is wrapped in a triple-quoted string, so
# none of it executes when this cell runs; the cell only evaluates (and echoes)
# the string. It is kept as a record of how the feature table was built.
#
# What the disabled code does, in order:
#   1. reads "atp_tennis.csv" and adds 'date', category codes for court,
#      surface and both players, and a 0/1 'target' (Player_1 won);
#   2. defines calculate_additional_features(), which for every match looks at
#      strictly earlier matches to compute head-to-head counts / win rates
#      (overall and on the same surface) and per-player "recent form"
#      (match count, win rate, mean opponent rank);
#   3. runs it over the full table and displays the first rows.
#
# NOTE(review): points to fix before re-enabling the code inside the string:
#   - `timedelta` is used but no import for it is visible in this cell;
#   - the "RECENT FORM CALCULATION (last 14 days)" comment does not match the
#     1000-day window actually passed to `timedelta`;
#   - the comment above the `playerCode` assignment says "opponent" although
#     the column is derived from Player_1;
#   - every row re-filters the whole frame (and uses iterrows for the rank
#     averages), so runtime grows roughly quadratically with the row count.

"""FILENAME_TENNIS = "atp_tennis.csv"
matches = pd.read_csv(FILENAME_TENNIS)
display(matches.head(5))

#code from https://www.kaggle.com/code/sabahao/my-tennis-randomforest


matches["date"] = pd.to_datetime(matches["Date"])

# Create a new column representing the Indoor or Outdoor value
matches["courtType"] = matches["Court"].astype("category").cat.codes

# Create a new column representing the kind of surface
matches["groundType"] = matches["Surface"].astype("category").cat.codes

# Create a new column representing a code for each opponent
matches["opponentCode"] = matches["Player_2"].astype("category").cat.codes

# Create a new column representing a code for each opponent
matches["playerCode"] = matches["Player_1"].astype("category").cat.codes


# Create a new column representing if the player1 won the game with a 1 or if it didn't with a 0
matches["target"] = (matches["Winner"] == matches["Player_1"]).astype("int")

display(matches.head(5))

def calculate_additional_features(matches_df):

    # Make a copy to avoid modifying the original
    df = matches_df.copy()
    
    # Convert date to datetime if not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Sort by date to ensure chronological order
    df = df.sort_values('date')
    
    # Initialize new columns
    df['days_since_match'] = 0
    df['h2h_matches'] = 0
    df['h2h_win_pct'] = 0.5  # Default to neutral when no history
    df['h2h_surface_matches'] = 0
    df['h2h_surface_win_pct'] = 0.5
    df['recent_matches_p1'] = 0
    df['recent_win_pct_p1'] = 0.5
    df['recent_avg_opp_rank_p1'] = 0
    df['recent_matches_p2'] = 0
    df['recent_win_pct_p2'] = 0.5
    df['recent_avg_opp_rank_p2'] = 0
    df['surface_hard'] = (df['Surface'] == 'Hard').astype(int)
    df['surface_clay'] = (df['Surface'] == 'Clay').astype(int)
    df['surface_grass'] = (df['Surface'] == 'Grass').astype(int)
    
    # Calculate days since match (relative to latest date in dataset)
    latest_date = df['date'].max()
    df['days_since_match'] = (latest_date - df['date']).dt.days
    
    # Create player codes if not already present
    if 'playerCode' not in df.columns:
        df['playerCode'] = df['Player_1'].astype('category').cat.codes
    if 'opponentCode' not in df.columns:
        df['opponentCode'] = df['Player_2'].astype('category').cat.codes
    
    # Create target variable if not present
    if 'target' not in df.columns:
        df['target'] = (df['Winner'] == df['Player_1']).astype(int)
    
    # Progress bar for the calculation
    from tqdm import tqdm
    tqdm.pandas(desc="Calculating features")
    
    # We'll process each match and look backward in time for historical data
    def calculate_features(row):
        # Get current match date
        current_date = row['date']
        current_surface = row['Surface']
        player1 = row['Player_1']
        player2 = row['Player_2']
        
        # Filter matches before current date
        historical_matches = df[df['date'] < current_date]
        
        # HEAD-TO-HEAD CALCULATION
        # Find all previous matches between these players
        h2h_matches = historical_matches[
            ((historical_matches['Player_1'] == player1) & (historical_matches['Player_2'] == player2)) |
            ((historical_matches['Player_1'] == player2) & (historical_matches['Player_2'] == player1))
        ]
        
        if len(h2h_matches) > 0:
            # Calculate overall head-to-head
            player1_wins = len(h2h_matches[h2h_matches['Winner'] == player1])
            total_matches = len(h2h_matches)
            row['h2h_matches'] = total_matches
            row['h2h_win_pct'] = player1_wins / total_matches
            
            # Calculate surface-specific head-to-head
            surface_matches = h2h_matches[h2h_matches['Surface'] == current_surface]
            if len(surface_matches) > 0:
                player1_surface_wins = len(surface_matches[surface_matches['Winner'] == player1])
                row['h2h_surface_matches'] = len(surface_matches)
                row['h2h_surface_win_pct'] = player1_surface_wins / len(surface_matches)
        
        # RECENT FORM CALCULATION (last 14 days)
        recent_cutoff = current_date - timedelta(days=1000)
        
        # Player 1 recent form
        p1_matches = historical_matches[
            ((historical_matches['Player_1'] == player1) | (historical_matches['Player_2'] == player1)) &
            (historical_matches['date'] >= recent_cutoff)
        ]
        
        if len(p1_matches) > 0:
            p1_wins = len(p1_matches[p1_matches['Winner'] == player1])
            row['recent_matches_p1'] = len(p1_matches)
            row['recent_win_pct_p1'] = p1_wins / len(p1_matches)
            
            # Calculate average opponent rank
            opp_ranks = []
            for _, match in p1_matches.iterrows():
                if match['Player_1'] == player1:
                    opp_ranks.append(match['Rank_2'])
                else:
                    opp_ranks.append(match['Rank_1'])
            row['recent_avg_opp_rank_p1'] = sum(opp_ranks) / len(opp_ranks) if opp_ranks else 0
        
        # Player 2 recent form
        p2_matches = historical_matches[
            ((historical_matches['Player_1'] == player2) | (historical_matches['Player_2'] == player2)) &
            (historical_matches['date'] >= recent_cutoff)
        ]
        
        if len(p2_matches) > 0:
            p2_wins = len(p2_matches[p2_matches['Winner'] == player2])
            row['recent_matches_p2'] = len(p2_matches)
            row['recent_win_pct_p2'] = p2_wins / len(p2_matches)
            
            # Calculate average opponent rank
            opp_ranks = []
            for _, match in p2_matches.iterrows():
                if match['Player_1'] == player2:
                    opp_ranks.append(match['Rank_2'])
                else:
                    opp_ranks.append(match['Rank_1'])
            row['recent_avg_opp_rank_p2'] = sum(opp_ranks) / len(opp_ranks) if opp_ranks else 0
        
        return row
    
    # Apply the feature calculation to each row
    df = df.progress_apply(calculate_features, axis=1)
    
    return df

# Usage example:
# Load your dataset if not already loaded
# matches = pd.read_csv('your_tennis_data.csv')

# Precalculate all features (this may take some time for large datasets)
print("Precalculating features...")
matches_with_features = calculate_additional_features(matches)

display(matches_with_features.head(5))"""



# Disabled export of the prepared table. Presumably this is how
# 'tennis_matches_with_features.csv' (read by the next cell) was produced;
# confirm before relying on it.
#matches_with_features.to_csv('tennis_matches_with_features.csv', index=False)

'FILENAME_TENNIS = "atp_tennis.csv"\nmatches = pd.read_csv(FILENAME_TENNIS)\ndisplay(matches.head(5))\n\n#code from https://www.kaggle.com/code/sabahao/my-tennis-randomforest\n\n\nmatches["date"] = pd.to_datetime(matches["Date"])\n\n# Create a new column representing the Indoor or Outdoor value\nmatches["courtType"] = matches["Court"].astype("category").cat.codes\n\n# Create a new column representing the kind of surface\nmatches["groundType"] = matches["Surface"].astype("category").cat.codes\n\n# Create a new column representing a code for each opponent\nmatches["opponentCode"] = matches["Player_2"].astype("category").cat.codes\n\n# Create a new column representing a code for each opponent\nmatches["playerCode"] = matches["Player_1"].astype("category").cat.codes\n\n\n# Create a new column representing if the player1 won the game with a 1 or if it didn\'t with a 0\nmatches["target"] = (matches["Winner"] == matches["Player_1"]).astype("int")\n\ndisplay(matches.head(5))\n\ndef calculate_a

ADD MORE COLUMNS FOR A MORE PRECISE PREDICTION

In [None]:
import pandas as pd
from tqdm import tqdm

# Enrich the prepared match table with odds-, tournament- and history-based
# features, then write the result to "tennis_matches_enriched.csv".
#
# Columns read:  Date, Odd_1, Odd_2, Series, Round, Best of, Surface,
#                Player_1, Player_2, Winner
# Columns added: date, days_since_match, implied_prob_p1/p2, series_level,
#                round_num, best_of, win_pct_surface_p1/p2,
#                days_since_last_match_p1/p2

# Load the table produced by the data-preparation step.
df = pd.read_csv("tennis_matches_with_features.csv")

# Parse the raw date column.
df['date'] = pd.to_datetime(df['Date'])

# Process matches oldest-first. A stable sort is requested so that matches
# sharing a date keep the order they have in the CSV instead of being
# reshuffled, which matters because the history features below depend on
# processing order.
df = df.sort_values('date', kind='mergesort')

# Age of every match in days, measured from the newest match in the table.
df['days_since_match'] = (df['date'].max() - df['date']).dt.days

# Bookmaker-implied probabilities: invert the odds, then rescale the pair so
# the two probabilities sum to 1.
# NOTE(review): odds are not validated here; zero, negative or missing odds
# yield NaN or meaningless probabilities. Confirm the input has no such values.
df['implied_prob_p1'] = 1 / df['Odd_1']
df['implied_prob_p2'] = 1 / df['Odd_2']
df['implied_prob_p1'] /= (df['implied_prob_p1'] + df['implied_prob_p2'])
df['implied_prob_p2'] = 1 - df['implied_prob_p1']

# Ordinal tournament tier; series names missing from the map become 0.
series_map = {'Grand Slam': 4, 'Masters': 3, 'ATP 500': 2, 'ATP 250': 1, 'International': 1}
df['series_level'] = df['Series'].map(series_map).fillna(0)

# Ordinal round depth; round names missing from the map become 0.
# NOTE(review): '3rd Round' and '4th Round' share the value 2 - confirm intended.
round_map = {
    'Final': 5, 'Semi-Final': 4, 'Quarter-Final': 3,
    '4th Round': 2, '3rd Round': 2, '2nd Round': 1, '1st Round': 0
}
df['round_num'] = df['Round'].map(round_map).fillna(0)

# Snake-case alias of the existing 'Best of' column.
df['best_of'] = df['Best of']

# Running per-player state, updated match by match in chronological order:
#   player_matches[player]                -> date of the player's latest match
#   player_surface_total[player][surface] -> matches played on that surface
#   player_surface_wins[player][surface]  -> matches won on that surface
player_matches = {}
player_surface_wins = {}
player_surface_total = {}

# Registers pandas' progress_apply; kept from the original cell although this
# cell itself only uses the plain tqdm iterator below.
tqdm.pandas(desc="Calculating player features")

# Per-row results are collected in lists and assigned once after the loop,
# which avoids a per-cell DataFrame write for every match.
win_pct_surface_p1 = []
win_pct_surface_p2 = []
days_since_last_match_p1 = []
days_since_last_match_p2 = []

match_columns = zip(df['Player_1'], df['Player_2'], df['Surface'], df['date'], df['Winner'])
for p1, p2, surface, date, winner in tqdm(match_columns, total=len(df)):
    # Days since each player's previous match; 0 when the player has no
    # earlier match in the table.
    days_since_last_match_p1.append(
        (date - player_matches[p1]).days if p1 in player_matches else 0
    )
    days_since_last_match_p2.append(
        (date - player_matches[p2]).days if p2 in player_matches else 0
    )
    player_matches[p1] = date
    player_matches[p2] = date

    # Win rate on this surface using only matches processed so far (the
    # current match is added afterwards); 0.5 is the neutral default when the
    # player has no history on the surface.
    for player, results in ((p1, win_pct_surface_p1), (p2, win_pct_surface_p2)):
        played = player_surface_total.get(player, {}).get(surface, 0)
        if played > 0:
            results.append(player_surface_wins[player].get(surface, 0) / played)
        else:
            results.append(0.5)

    # Record the current match in the surface totals of both players.
    for player in (p1, p2):
        totals = player_surface_total.setdefault(player, {})
        player_surface_wins.setdefault(player, {})
        totals[surface] = totals.get(surface, 0) + 1

    # Credit the win only to a player actually named as winner. A missing or
    # non-matching 'Winner' value credits nobody (previously such rows were
    # counted as a win for Player_2).
    if winner == p1:
        player_surface_wins[p1][surface] = player_surface_wins[p1].get(surface, 0) + 1
    elif winner == p2:
        player_surface_wins[p2][surface] = player_surface_wins[p2].get(surface, 0) + 1

# Lists are in the same (sorted) row order as df, so assignment is positional.
df['win_pct_surface_p1'] = win_pct_surface_p1
df['win_pct_surface_p2'] = win_pct_surface_p2
df['days_since_last_match_p1'] = days_since_last_match_p1
df['days_since_last_match_p2'] = days_since_last_match_p2

# Save the enriched dataset
df.to_csv("tennis_matches_enriched.csv", index=False)
