# Notebook used to preprocess the data for the classification task

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import re
from tqdm.notebook import tqdm

In [2]:
# Read the cleaned ATP matches CSV into a DataFrame and show its dimensions.
data_path = '../data/preprocessed_data/atp_matches_68_24_cleaned.csv'
data = pd.read_csv(filepath_or_buffer=data_path)
data.shape

(317049, 23)

In [3]:
# Collapse every specific Davis tourney name into the single label 'Davis Cup'
is_davis = data['tourney_name'].str.contains('Davis')
data.loc[is_davis, 'tourney_name'] = 'Davis Cup'

### Create the target column

In [4]:
# Create columns for the total number of games won by the winner and by the loser.
# Every "<winner games>-<loser games>" pair found in the score string is one set.
# \d+ (instead of a single \d) keeps multi-digit set scores intact: with one digit
# per side, "12-10" was parsed as "2-1" and "10-8" as "0-8".
p = r'(\d+)-(\d+)'
games = data['score'].str.extractall(p).astype(float)
games.columns = ['winner', 'loser']
# extractall returns one row per matched set under a (row label, match number)
# MultiIndex; summing over level 0 collapses it back to one row per match.
game_sums = games.groupby(level=0).sum()
# Index-aligned assignment: matches whose score contains no parsable set get NaN.
data['winner_games'] = game_sums['winner']
data['loser_games'] = game_sums['loser']
# NOTE: dropna() removes rows with a missing value in ANY column, not only the
# two game-count columns created above.
data = data.dropna()

In [5]:
# Game ratio score: share of all games in the match that the winner took
games_played = data['winner_games'] + data['loser_games']
winner_share = data['winner_games'].div(games_played)
# If the match was won with fewer games than the loser, use a score just above 0.5
won_with_fewer_games = data['winner_games'] < data['loser_games']
data['game_ratio'] = winner_share.mask(won_with_fewer_games, 0.51)

### Anonymize Winner and Loser

In [6]:
# Randomly decide, per match, which player becomes "player 1".
# The generator is seeded so the assignment (and the saved dataset) is
# reproducible across Restart & Run All.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)
shuffled = rng.random(len(data)) < 0.5  # True -> winner is player 1, loser is player 2

# Attributes that exist as a winner_*/loser_* pair; the order fixes the order of
# the player_1_*/player_2_* columns appended to the frame.
swapped_attributes = ['age', 'ht', 'seed', 'rank', 'rank_points', 'games', 'ioc', 'hand']
for attribute in swapped_attributes:
    winner_values = data[f'winner_{attribute}']
    loser_values = data[f'loser_{attribute}']
    data[f'player_1_{attribute}'] = np.where(shuffled, winner_values, loser_values)
    data[f'player_2_{attribute}'] = np.where(shuffled, loser_values, winner_values)

# Modify score for player 1 and 2.
# Flip the ratio exactly on the rows where the winner became player 1, using the
# shuffle mask itself. Re-deriving those rows by comparing player_2_* with loser_*
# also flipped unshuffled rows whose two players share height, rank points, seed
# and age. After this step game_ratio is expressed from player 2's side.
data.loc[shuffled, 'game_ratio'] = 1 - data.loc[shuffled, 'game_ratio']

# Remove the winner/loser columns so the outcome cannot be read from column roles.
winner_loser_columns = [
    f'{role}_{attribute}'
    for attribute in swapped_attributes
    for role in ('winner', 'loser')
]
data = data.drop(columns=winner_loser_columns)

### Encode columns

In [7]:
# Categorical columns that get expanded into one indicator column per value.
one_hot_columns = [
    'tourney_name',
    'surface',
    'player_1_ioc',
    'player_2_ioc',
    'tourney_level',
    'player_1_hand',
    'player_2_hand',
]
#binary_columns = []
# Columns that are replaced by an integer code.
rankable_columns = ['round']

# Round codes listed in the order of their integer code (0 for 'F' up to 9 for 'ER').
round_order = ['F', 'BR', 'SF', 'QF', 'R16', 'R32', 'R64', 'R128', 'RR', 'ER']
round_dict = {round_code: position for position, round_code in enumerate(round_order)}

In [8]:
# One-hot encode all categorical columns in one call; the indicator columns are
# appended after the remaining columns, in the order given by one_hot_columns.
data = pd.get_dummies(data, columns=one_hot_columns)
# Replace each round code by its integer from round_dict
# (codes missing from the mapping become NaN).
data['round'] = data['round'].map(round_dict)
data.shape

(315519, 467)

# Save Dataset

In [9]:
# Report the final dimensions, then write the dataset to CSV without the index.
output_path = '../data/preprocessed_data/atp_matches_68_24_preprocessed.csv'
print(data.shape)
data.to_csv(output_path, index=False)

(315519, 467)
