# ATP Players Initial Dataset Cleaning

In [None]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
from collections import defaultdict
import re
import warnings

# Read the raw player table from disk and preview its first rows.
atp_players = pd.read_csv(filepath_or_buffer='./data/Players/atp_players.csv')
atp_players.head()

Unnamed: 0,player_id,player_name,hand,dob,height,source_name
0,116971,?? Baillie,U,,,0
1,204608,A Aguilar,R,,,0
2,152006,A Alexander,U,,,0
3,151280,A Allendorfer,U,,,0
4,202778,A Alvarez,U,,,0


In [9]:
# Jeff Sackmann ATP match datasets: one CSV per season, 1991 through 2024.
# Every season is read first and the frames are concatenated in a single call;
# calling pd.concat inside the loop would re-copy all previously loaded rows
# on every iteration.
yearly_frames = [
    pd.read_csv("./data/all/atp_matches_" + str(year) + ".csv")
    for year in range(1991, 2025)
]

# axis=0 stacks the seasons row-wise; each season keeps its own row labels.
jeff_data = pd.concat(yearly_frames, axis=0)

# Parse tourney_date with the %Y%m%d format into datetime values.
jeff_data['tourney_date'] = pd.to_datetime(jeff_data['tourney_date'], format='%Y%m%d')

### Remove Irrelevant Players

In [10]:
# Find players in atp_players that don't appear in jeff_data.
# Every id that shows up as a match winner or loser, as unique ints.
jeff_player_ids = pd.concat([jeff_data['winner_id'], jeff_data['loser_id']]).dropna().astype(int).unique()
atp_player_ids = atp_players['player_id'].dropna().astype(int)

# atp_player_ids has had missing ids dropped, so a mask built from it can be
# shorter than atp_players. Re-align it to the full frame before indexing;
# rows without a player_id cannot be matched and therefore count as
# "not in jeff_data".
in_jeff_data = atp_player_ids.isin(jeff_player_ids).reindex(atp_players.index, fill_value=False)

players_not_in_jeff = atp_players[~in_jeff_data]

print(f"Players in atp_players not found in jeff_data: {len(players_not_in_jeff)}")
players_not_in_jeff.head()

Players in atp_players not found in jeff_data: 62420


Unnamed: 0,player_id,player_name,hand,dob,height,source_name
0,116971,?? Baillie,U,,,0
1,204608,A Aguilar,R,,,0
2,152006,A Alexander,U,,,0
3,151280,A Allendorfer,U,,,0
4,202778,A Alvarez,U,,,0


In [11]:
# Keep only the players whose id occurs among the ids collected from jeff_data.
appears_in_matches = atp_players['player_id'].astype(int).isin(jeff_player_ids)
atp_players = atp_players.loc[appears_in_matches]

remaining_count = len(atp_players)
print(f"Remaining players in atp_players: {remaining_count}")
atp_players.head()

Remaining players in atp_players: 3562


Unnamed: 0,player_id,player_name,hand,dob,height,source_name
457,101381,Aaron Krickstein,R,19670802.0,183.0,0
504,108961,Abd Hazli Bin Zainuddin,R,19801104.0,,0
524,102033,Abdelhak Hameurlaine,R,19720319.0,,0
548,104464,Abdul Mumin Babalola,L,19841215.0,,0
556,102778,Abdul Hamid Makhkamov,R,19760419.0,,0


### Amend Data Quality Issues
1. Null Names

In [12]:
# Turn empty or whitespace-only string cells anywhere in the frame into NaN,
# then collect the rows whose player_name is missing.
atp_players = atp_players.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
missing_name_mask = atp_players['player_name'].isna()
empty_names = atp_players[missing_name_mask]
print(len(empty_names))
empty_names.head()

0


Unnamed: 0,player_id,player_name,hand,dob,height,source_name


2. Duplicate Names

In [13]:
# Rows whose player_name repeats an earlier row's name, ordered by name.
# duplicated() defaults to keep='first', so the first occurrence of each name
# is NOT included here -- only the later repeats are.
names_duplicate = (
    atp_players
    .loc[atp_players.duplicated(subset=['player_name'])]
    .sort_values(by=['player_name'])
)
print(len(names_duplicate))
names_duplicate.head()

1


Unnamed: 0,player_id,player_name,hand,dob,height,source_name
23763,211326,Gunawan Trismuwantara,U,20030109.0,,0


In [18]:
# Remove the record with player_id 209870.
# NOTE(review): presumably this is the other entry sharing a name with the
# duplicate listed in the previous cell's output (211326) -- confirm.
atp_players = atp_players.loc[atp_players['player_id'].ne(209870)]

3. Irregular Names

In [22]:
# Flag names that consist solely of one to three ASCII letters, or that
# contain any of the punctuation/special characters listed in the pattern.
irregular_pattern = r'^[A-Za-z]{1,3}$|[?!/@#$%^&*()+=\[\]{};:"|<>,\\~`]'

# na=False: rows with a missing name are treated as non-matching.
has_irregular_name = atp_players['player_name'].str.contains(irregular_pattern, regex=True, na=False)
irregular_names = atp_players.loc[has_irregular_name]

print(f"Players with irregular names: {len(irregular_names)}")
irregular_names.head(20)

Players with irregular names: 0


Unnamed: 0,player_id,player_name,hand,dob,height,source_name


### Export

In [None]:
# Write the cleaned player table to disk.
# NOTE(review): to_csv writes the row index as an unnamed leading column by
# default; pass index=False if that column is not wanted in the output file.
atp_players.to_csv(path_or_buf='./data/Players/ATP_Players_Cleaned.csv')