# Useful Functions

In [1]:
def make_p1_sameplayer(dataframe, p1_id, p2_id, max_rows=30):
    """Return the head-to-head matches of two players with a fixed player in the p1 slot.

    Rows of ``dataframe`` are kept when both ``p1_id`` and ``p2_id`` occur in
    its ``p1_id``/``p2_id`` columns. On the rows where the ``p2_id`` argument
    is not already stored in the ``p1_id`` column, every ``p1_*`` column that
    has a matching ``p2_*`` column is swapped with it. After the call the
    ``p1_*`` columns therefore always describe the player passed as ``p2_id``
    and the ``p2_*`` columns describe the player passed as ``p1_id``.

    A ``result`` column is added: 1 on rows that were left untouched and 0 on
    rows that were swapped. When the ``p1_*`` columns of the input hold the
    match winner (as they do for ``df_subset`` in this notebook), ``result``
    is 1 when the player now in the p1 slot won the match.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Match data containing at least the ``p1_id`` and ``p2_id`` columns.
        It is not modified; the function works on a copy.
    p1_id, p2_id :
        Identifiers of the two players.
    max_rows : int or None, default 30
        Maximum number of rows returned (the first ``max_rows`` rows).
        Pass ``None`` to return every head-to-head match.

    Returns
    -------
    pandas.DataFrame
        The re-oriented head-to-head rows plus the ``result`` column.

    Raises
    ------
    KeyError
        If ``dataframe`` has no ``p1_id`` or ``p2_id`` column.
    """
    # Keep only the matches in which both players took part
    involves_first = (dataframe['p1_id'] == p1_id) | (dataframe['p2_id'] == p1_id)
    involves_second = (dataframe['p1_id'] == p2_id) | (dataframe['p2_id'] == p2_id)
    df_h2h = dataframe[involves_first & involves_second].copy()
    df_h2h['result'] = 1

    # Pairs of (p1_<suffix>, p2_<suffix>) columns that exist in the input
    col_list = dataframe.columns
    cols_to_swap = [
        (col, 'p2_' + col[3:])
        for col in col_list
        if isinstance(col, str) and col.startswith('p1_') and 'p2_' + col[3:] in col_list
    ]

    # Rows where the p2_id player is not yet in the p1 slot need swapping
    swap_mask = ~((df_h2h['p1_id'] == p2_id) | (df_h2h['p2_id'] == p1_id))

    # Swap one pair at a time; the copy keeps the old p1 values alive while
    # the p1 column is being overwritten
    for col1, col2 in cols_to_swap:
        temp = df_h2h.loc[swap_mask, col1].copy()
        df_h2h.loc[swap_mask, col1] = df_h2h.loc[swap_mask, col2]
        df_h2h.loc[swap_mask, col2] = temp

    # On swapped rows the original p1 player now sits in the p2 slot
    df_h2h.loc[swap_mask, "result"] = 0

    if max_rows is None:
        return df_h2h
    return df_h2h.head(max_rows)

#make_p1_sameplayer(df_subset, 104925, 103819)

In [2]:
def df_player(dataframe, p1_id):
    """Return every match of one player, re-oriented so that player is always p1.

    Rows of ``dataframe`` whose ``p1_id`` or ``p2_id`` column equals ``p1_id``
    are copied. On the rows where the player is stored in the ``p2_id``
    column, each ``p1_*`` column that has a matching ``p2_*`` column is
    swapped with it. The input frame is left unchanged.
    """
    involved = (dataframe['p1_id'] == p1_id) | (dataframe['p2_id'] == p1_id)
    matches = dataframe[involved].copy()

    # (p1_<suffix>, p2_<suffix>) column pairs present in the input frame
    all_cols = dataframe.columns
    pairs = [
        (col, 'p2_' + col[3:])
        for col in all_cols
        if str(col).startswith('p1_') and 'p2_' + col[3:] in all_cols
    ]

    # Rows where the requested player is not in the p1 slot
    needs_swap = matches['p1_id'] != p1_id

    for left, right in pairs:
        # Take both sides before writing so neither is lost
        left_vals = matches.loc[needs_swap, left].copy()
        right_vals = matches.loc[needs_swap, right]
        matches.loc[needs_swap, left] = right_vals
        matches.loc[needs_swap, right] = left_vals

    return matches

#df_player(df_subset, 104925)

# Cleaning and Reducing Dataset

In [3]:
import csv
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# Display all columns
pd.set_option('display.max_columns', None)

# Display all rows
pd.set_option('display.max_rows', None)


# Reads all csv files in this folder and concatenates them.
# The file names are sorted so the concatenation order (and therefore the
# order of matches that share a tournament date) is the same on every run.
csv_files = sorted(glob.glob('*.csv'))
if not csv_files:
    raise FileNotFoundError("No CSV files found in the current working directory")
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)


# Rename useful columns: winner_* becomes p1_*, loser_* becomes p2_*
df = df.rename(columns={
    "tourney_id": "tournament_id",
    "tourney_name": "tournament_name",
    "tourney_date": "tournament_date",
    "winner_ht": "p1_height",
    "winner_id": "p1_id",
    "winner_name": "p1_name",
    "winner_age": "p1_age",
    "winner_hand": "p1_hand",
    "loser_ht": "p2_height",
    "loser_id": "p2_id",
    "loser_name": "p2_name",
    "loser_age": "p2_age",
    "loser_hand": "p2_hand",
})

# Create subset of dataframe with useful columns
df_subset = df[['tournament_date','tournament_name', 'surface', 'p1_name', 'p1_id', 'p1_age', 'p1_height', 'p2_name', 'p2_id', 'p2_age', 'p2_height']]

# Remove rows with a missing surface, age or height
df_subset = df_subset.dropna(subset=['surface', 'p1_age', 'p2_age', 'p1_height', 'p2_height'])

# Sort rows by tournament date. The features computed further down (head-to-head,
# ELO, match counters) walk the rows in order, so a stable sort is used to keep
# rows with the same date in a reproducible order.
df_subset = df_subset.sort_values(by='tournament_date', kind='stable')

# Adding Difference Columns

#### Player Age - Cleaning age data (did not remove any data)

In [4]:
#df_subset['p1_age'].describe()

In [5]:
#df_subset[(df_subset['p1_age'] >= 40) | (df_subset['p2_age'] >= 40)]

In [6]:
# Age difference seen from each player's side, computed column-wise:
# p1_age_diff = p1_age - p2_age and p2_age_diff is the same value with the sign flipped
df_subset['p1_age_diff'] = df_subset['p1_age'] - df_subset['p2_age']
df_subset['p2_age_diff'] = df_subset['p2_age'] - df_subset['p1_age']

#df_subset.head()

#### Player Height - Cleaning height data

In [7]:
#df_subset['p1_height'].describe()
#df_subset['p2_height'].describe()

In [8]:
# Find anomalously short players
#max_height = 75
#new_subset = df_subset[(df_subset['p1_height'] <= max_height) | (df_subset['p2_height'] <= max_height)]

# Find anomalously tall players
#min_height = 211
#df_subset[(df_subset['p1_height'] >= min_height) | (df_subset['p2_height'] >= min_height)]


# List of anomalous players to remove
anom_heights_list = [
    'Jorge Brian Panta Herreros',
    'Johannes Ingildsen',
    'Viacheslav Bielinskyi',
]


# Remove every match in which one of the listed players appears, in a single
# pass over the two name columns. The explicit copy makes df_subset an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning on older pandas versions.
is_anomalous = df_subset[['p1_name', 'p2_name']].isin(anom_heights_list).any(axis=1)
df_subset = df_subset[~is_anomalous].copy()

In [9]:
# Height difference seen from each player's side, computed column-wise:
# p1_height_diff = p1_height - p2_height and p2_height_diff is the same value with the sign flipped
df_subset['p1_height_diff'] = df_subset['p1_height'] - df_subset['p2_height']
df_subset['p2_height_diff'] = df_subset['p2_height'] - df_subset['p1_height']

#df_subset.head()

# Calculating Previous Wins Against Opponent Columns

In [10]:
# Running head-to-head state, filled while walking the matches in row order.
# h2h_wins_dict[(a, b)] counts the wins of a over b so far and returns 0 if
# the key does not exist (players have never played before).
# h2h_history_dict[sorted pair] lists the winner id of each earlier meeting, oldest first.
h2h_wins_dict = defaultdict(int)
h2h_history_dict = defaultdict(list)

# Sizes of the "last N meetings" windows for which a win difference is stored
H2H_WINDOWS = (1, 2, 3, 4, 5, 10)

# Values as they stood BEFORE each match, one entry per row of df_subset
p1_h2h_wins_before = []
p2_h2h_wins_before = []

p1_h2h_wins_total_diff_before = []
p2_h2h_wins_total_diff_before = []

# window size -> list of win differences over the last N meetings
p1_h2h_wins_last_diff_before = {n: [] for n in H2H_WINDOWS}
p2_h2h_wins_last_diff_before = {n: [] for n in H2H_WINDOWS}


# Iterate through each match in df_subset, returning player_ids as pairs i.e [(100,200), (300, 250)...]
for p1, p2 in zip(df_subset['p1_id'], df_subset['p2_id']):

    # Creates keys for head-to-head matches
    wins_key1 = (p1, p2)
    wins_key2 = (p2, p1)
    wins_match_key = tuple(sorted([p1, p2]))

    # Gets wins before this match
    p1_h2h_wins = h2h_wins_dict[wins_key1]
    p2_h2h_wins = h2h_wins_dict[wins_key2]
    p1_h2h_wins_before.append(p1_h2h_wins)
    p2_h2h_wins_before.append(p2_h2h_wins)

    # All-time wins difference, from each player's side
    p1_h2h_wins_total_diff_before.append(p1_h2h_wins - p2_h2h_wins)
    p2_h2h_wins_total_diff_before.append(p2_h2h_wins - p1_h2h_wins)

    # Wins difference restricted to the most recent N meetings of this pair
    history = h2h_history_dict[wins_match_key]
    for n in H2H_WINDOWS:
        recent = history[-n:]
        p1_recent_wins = recent.count(p1)
        p2_recent_wins = recent.count(p2)
        p1_h2h_wins_last_diff_before[n].append(p1_recent_wins - p2_recent_wins)
        p2_h2h_wins_last_diff_before[n].append(p2_recent_wins - p1_recent_wins)

    # Player 1 always wins in this df_subset, updates head-to-head
    h2h_wins_dict[wins_key1] += 1

    # Updates h2h history dictionary
    h2h_history_dict[wins_match_key].append(p1)

# Assign to dataframe
df_subset['p1_h2h_wins'] = p1_h2h_wins_before
df_subset['p2_h2h_wins'] = p2_h2h_wins_before
df_subset['p1_h2h_wins_before_total_diff'] = p1_h2h_wins_total_diff_before
df_subset['p2_h2h_wins_before_total_diff'] = p2_h2h_wins_total_diff_before
for n in H2H_WINDOWS:
    df_subset[f'p1_h2h_wins_before_last{n}_diff'] = p1_h2h_wins_last_diff_before[n]
    df_subset[f'p2_h2h_wins_before_last{n}_diff'] = p2_h2h_wins_last_diff_before[n]

# Calculating ELO

In [11]:
# Dictionary that stores players ELO. Returns 1500 if the key does not exist (player has not played before)
elo_dict = defaultdict(lambda: 1500)

# Maximum number of rating points that can change hands in one match
K = 32

# ELO of each player as it stood BEFORE the match, one entry per row
p1_elo_before = []
p2_elo_before = []

# Iterate through each match in df_subset, returning player_ids as pairs i.e [(100,200), (300, 250)...]
for p1, p2 in zip(df_subset['p1_id'], df_subset['p2_id']):

    # Checks dictionary for ELO and stores the ELO as px_elo
    p1_elo = elo_dict[p1]
    p2_elo = elo_dict[p2]

    # Adds ELO to the list px_elo_before
    p1_elo_before.append(p1_elo)
    p2_elo_before.append(p2_elo)

    # Expected score of p1; p2's expected score is 1 minus this value
    p1_expected_score = 1 / (1 + 10 ** ((p2_elo - p1_elo) / 400))

    # p1 is the winner of every row. The points p1 gains are exactly the
    # points p2 loses. The amount is rounded once and then moved from p2 to
    # p1: truncating each new rating with int() separately would give the
    # winner floor(change) but take ceil(change) from the loser, slowly
    # draining points from the whole rating pool.
    elo_change = round(K * (1 - p1_expected_score))

    # Stores new ELO in the dictionary
    elo_dict[p1] = p1_elo + elo_change
    elo_dict[p2] = p2_elo - elo_change

# Assign to dataframe
df_subset['p1_elo_before'] = p1_elo_before
df_subset['p2_elo_before'] = p2_elo_before

#### Calculating ELO difference

In [12]:
# Pre-match ELO difference seen from each player's side, computed column-wise
df_subset['p1_elo_diff_before'] = df_subset['p1_elo_before'] - df_subset['p2_elo_before']
df_subset['p2_elo_diff_before'] = df_subset['p2_elo_before'] - df_subset['p1_elo_before']

#df_subset.head(1000)

#### Sum of Surfaces

In [13]:
# Number of matches in df_subset played on each surface
surface_dict = {}

for court in df_subset['surface']:
    if court in surface_dict:
        surface_dict[court] += 1
    else:
        surface_dict[court] = 1

#print(surface_dict)

#### Calculating surface ELO

In [14]:
from collections import defaultdict

# 1. Create a nested defaultdict: elo_dict[player_id][surface] = ELO
#    (1500 for a player/surface combination that has not been seen yet).
#    NOTE: this rebinds the name elo_dict, replacing the overall-ELO dictionary.
elo_dict = defaultdict(lambda: defaultdict(lambda: 1500))

# Maximum number of rating points that can change hands in one match
K = 32

# 2. Lists to store ELOs before the match
p1_surface_elo_before = []
p2_surface_elo_before = []

# 3. Loop through df_subset row by row
for surface, p1, p2 in zip(df_subset['surface'], df_subset['p1_id'], df_subset['p2_id']):

    p1_elo = elo_dict[p1][surface]
    p2_elo = elo_dict[p2][surface]

    # Store ELOs before match
    p1_surface_elo_before.append(p1_elo)
    p2_surface_elo_before.append(p2_elo)

    # Expected score of p1; p2's expected score is 1 minus this value
    p1_expected = 1 / (1 + 10 ** ((p2_elo - p1_elo) / 400))

    # Update ELOs assuming p1 wins. The change is rounded once and moved from
    # p2 to p1 so the update stays zero-sum; truncating each new rating with
    # int() separately would take more from the loser than the winner gains.
    elo_change = round(K * (1 - p1_expected))
    elo_dict[p1][surface] = p1_elo + elo_change
    elo_dict[p2][surface] = p2_elo - elo_change

# 4. Add columns to df_subset
df_subset['p1_surface_elo_before'] = p1_surface_elo_before
df_subset['p2_surface_elo_before'] = p2_surface_elo_before

#### Calculating surface ELO difference

In [15]:
# Pre-match surface ELO difference seen from each player's side, computed column-wise
df_subset['p1_surface_elo_diff_before'] = df_subset['p1_surface_elo_before'] - df_subset['p2_surface_elo_before']
df_subset['p2_surface_elo_diff_before'] = df_subset['p2_surface_elo_before'] - df_subset['p1_surface_elo_before']

#df_subset.head(1000)

# Calculating Total Number of matches a Player Has Played

In [16]:
# Running count of matches played per player id (0 for a player not seen yet)
total_matches_dict = defaultdict(int)

# Number of matches each player had played BEFORE the row's match
p1_total_matches_before = []
p2_total_matches_before = []

for first_id, second_id in zip(df_subset['p1_id'], df_subset['p2_id']):
    # Read both counters before touching either of them
    played_by_first = total_matches_dict[first_id]
    played_by_second = total_matches_dict[second_id]

    p1_total_matches_before.append(played_by_first)
    p2_total_matches_before.append(played_by_second)

    # This match counts as one more for both players
    total_matches_dict[first_id] = played_by_first + 1
    total_matches_dict[second_id] = played_by_second + 1

df_subset['p1_total_matches_before'] = p1_total_matches_before
df_subset['p2_total_matches_before'] = p2_total_matches_before

#### Calculating the difference in total matches played

In [17]:
# Difference in matches played before this match, seen from each player's side, computed column-wise
df_subset['p1_total_matches_before_diff'] = df_subset['p1_total_matches_before'] - df_subset['p2_total_matches_before']
df_subset['p2_total_matches_before_diff'] = df_subset['p2_total_matches_before'] - df_subset['p1_total_matches_before']

#df_subset

# Calculating Total Wins

In [18]:
from collections import defaultdict

# Running career state, filled while walking the matches in row order.
# career_wins_dict[player] counts wins so far; career_wins_history_dict[player]
# lists 1 (win) / 0 (loss) for each earlier match, oldest first.
career_wins_dict = defaultdict(int)
career_wins_history_dict = defaultdict(list)

# Sizes of the "last N matches" windows for which a win percentage is stored
CAREER_WINDOWS = (3, 5, 10)


def _recent_win_pct(history, n):
    """Win percentage (one decimal) over the last n entries of a 1/0 history.

    Returns 0 when the history holds fewer than n entries, so a 0 can mean
    either "lost all of the last n" or "not enough matches yet".
    """
    recent = history[-n:]
    if len(recent) != n:
        return 0
    return round((sum(recent) / n) * 100, 1)


# Lists to store computed features (values BEFORE each match, one entry per row)
p1_career_wins_before = []
p2_career_wins_before = []

# window size -> list of recent win percentages
p1_career_wins_last_pct_before = {n: [] for n in CAREER_WINDOWS}
p2_career_wins_last_pct_before = {n: [] for n in CAREER_WINDOWS}

for p1, p2 in zip(df_subset['p1_id'], df_subset['p2_id']):
    # Store total wins before this match
    p1_career_wins_before.append(career_wins_dict[p1])
    p2_career_wins_before.append(career_wins_dict[p2])

    # Recent win % over each window, taken from the history before this match
    for n in CAREER_WINDOWS:
        p1_career_wins_last_pct_before[n].append(_recent_win_pct(career_wins_history_dict[p1], n))
        p2_career_wins_last_pct_before[n].append(_recent_win_pct(career_wins_history_dict[p2], n))

    # Update total wins
    career_wins_dict[p1] += 1

    # Update recent win/loss history
    career_wins_history_dict[p1].append(1)  # p1 won
    career_wins_history_dict[p2].append(0)  # p2 lost

# Assign to DataFrame
df_subset['p1_career_wins_before'] = p1_career_wins_before
df_subset['p2_career_wins_before'] = p2_career_wins_before
for n in CAREER_WINDOWS:
    df_subset[f'p1_career_wins_last{n}_pct_before'] = p1_career_wins_last_pct_before[n]
    df_subset[f'p2_career_wins_last{n}_pct_before'] = p2_career_wins_last_pct_before[n]