In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [2]:
# Read the three raw CSV tables from the working directory in one step;
# the tuple order fixes which file lands in which name.
matches, players, rankings = map(
    pd.read_csv,
    ('atp_matches.csv', 'atp_players.csv', 'atp_rankings.csv'),
)

In [3]:
def preprocess_matches(matches):
    """Clean the raw match table and derive a few simple features.

    Steps, in order:

    1. Median-impute the numeric columns listed in ``numeric_cols``.
    2. Fill missing seeds with ``'Unseeded'``, missing entries with
       ``'None'`` and missing surfaces with the most frequent surface.
    3. Parse ``tourney_date`` (``YYYYMMDD``) into datetimes.
    4. Integer-encode ``surface``, ``tourney_level`` and both hand columns.
    5. Add ``rank_diff`` and ``age_diff`` (winner minus loser, computed on
       the imputed values) and drop ``winner_name`` / ``loser_name``.

    Parameters
    ----------
    matches : pandas.DataFrame
        Raw match rows containing every column referenced in the body.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is not modified, so the
        function can safely be re-run on the same raw data.

    Raises
    ------
    KeyError
        If one of the referenced columns is missing from ``matches``.
    """
    # Work on a copy: the column assignments below would otherwise write
    # straight into the caller's frame and make a second call fail or
    # silently re-process already-processed data.
    df = matches.copy()

    numeric_cols = ['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
                    'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
                    'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank', 'loser_rank',
                    'winner_rank_points', 'loser_rank_points', 'minutes', 'winner_age', 'loser_age']
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())

    # Sentinel strings for missing seed / entry information.
    for col in ('winner_seed', 'loser_seed'):
        df[col] = df[col].fillna('Unseeded')
    for col in ('winner_entry', 'loser_entry'):
        df[col] = df[col].fillna('None')

    # mode() is empty when the column has no non-null value (e.g. an empty
    # frame); indexing it unconditionally would raise.
    surface_mode = df['surface'].mode()
    if not surface_mode.empty:
        df['surface'] = df['surface'].fillna(surface_mode.iloc[0])

    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')

    # Encode categorical variables
    df['surface'] = LabelEncoder().fit_transform(df['surface'])
    df['tourney_level'] = LabelEncoder().fit_transform(df['tourney_level'])

    # Both hand columns share ONE encoder fitted on the fixed vocabulary, so
    # a given letter always maps to the same integer (L=0, R=1, U=2) even
    # when one of the columns happens not to contain every category.
    hand_encoder = LabelEncoder().fit(['L', 'R', 'U'])
    for col in ('winner_hand', 'loser_hand'):
        hand = df[col].fillna('U')
        # Anything other than R / L is folded into 'U'.
        hand = hand.where(hand.isin(['R', 'L']), 'U')
        df[col] = hand_encoder.transform(hand)

    # Feature engineering
    df['rank_diff'] = df['winner_rank'] - df['loser_rank']
    df['age_diff'] = df['winner_age'] - df['loser_age']

    return df.drop(columns=['winner_name', 'loser_name'])

In [4]:
def preprocess_players(players, reference_date=None):
    """Clean the player table and add an ``age`` column.

    Missing ``hand`` becomes ``'U'``, missing ``height`` is median-imputed,
    ``dob`` is parsed from ``YYYYMMDD`` (unparseable values become ``NaT``),
    ``hand`` and ``ioc`` are integer-encoded (missing ``ioc`` is encoded as
    ``'Unknown'``), and the name / ``wikidata_id`` columns are dropped.

    Parameters
    ----------
    players : pandas.DataFrame
        Raw player rows containing every column referenced in the body.
    reference_date : datetime-like, optional
        The date ages are measured against. Defaults to the current time,
        which matches the previous behaviour; pass a fixed (timezone-naive)
        date to make the ``age`` column reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the referenced columns is missing from ``players``.
    """
    # Copy first so the column assignments below never touch the caller's
    # frame (the original wrote into it before the final drop).
    df = players.copy()

    df['hand'] = df['hand'].fillna('U')
    df['height'] = df['height'].fillna(df['height'].median())

    # errors='coerce' turns malformed dates into NaT; their age is NaN.
    df['dob'] = pd.to_datetime(df['dob'], format='%Y%m%d', errors='coerce')
    as_of = datetime.now() if reference_date is None else pd.Timestamp(reference_date)
    df['age'] = (as_of - df['dob']).dt.days / 365.25

    df['hand'] = LabelEncoder().fit_transform(df['hand'])
    df['ioc'] = LabelEncoder().fit_transform(df['ioc'].fillna('Unknown'))

    return df.drop(columns=['name_first', 'name_last', 'wikidata_id'])

In [5]:
def preprocess_rankings(rankings):
    """Clean the rankings table.

    Missing ``points`` are set to 0 (assumption: no recorded points means
    zero points), ``ranking_date`` is parsed from ``YYYYMMDD`` into
    datetimes, and only the first row per ``(ranking_date, player)`` pair
    is kept.

    Parameters
    ----------
    rankings : pandas.DataFrame
        Raw ranking rows with at least ``points``, ``ranking_date`` and
        ``player`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the original row index labels of the kept
        rows are preserved. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the referenced columns is missing from ``rankings``.
    """
    # assign() builds a new frame, so the caller's data is never written
    # to (the original overwrote both columns in place).
    cleaned = rankings.assign(
        points=rankings['points'].fillna(0),  # Assume 0 points for missing
        ranking_date=pd.to_datetime(rankings['ranking_date'], format='%Y%m%d'),
    )
    return cleaned.drop_duplicates(subset=['ranking_date', 'player'])

In [6]:
def merge_datasets(matches, players, rankings):
    """Left-join player attributes and ranking rows onto every match.

    Each match row is joined four times: to ``players`` on the winner id,
    to ``players`` on the loser id, to ``rankings`` on (winner id, tourney
    date) and to ``rankings`` on (loser id, tourney date). The ranking join
    is an exact date match, so a match with no ranking row dated exactly on
    ``tourney_date`` gets missing values for that side. The helper key
    columns brought in by the ranking joins are dropped before returning.

    NOTE(review): the '_winner' suffix on the first player join only takes
    effect if ``matches`` already has a column named like one in
    ``players``. If it does not, the winner's attributes keep their bare
    names, only the loser's get '_loser', and the ``player_id`` key columns
    stay in the result -- confirm this naming is intended.
    """
    # Merge matches with players for winner and loser details.
    with_winner = pd.merge(
        matches, players,
        how='left', left_on='winner_id', right_on='player_id',
        suffixes=('', '_winner'),
    )
    with_players = pd.merge(
        with_winner, players,
        how='left', left_on='loser_id', right_on='player_id',
        suffixes=('', '_loser'),
    )

    # Merge with rankings for winner and loser ranks. The first ranking
    # join uses the default suffixes; the second one labels both sides.
    with_winner_rank = pd.merge(
        with_players, rankings,
        how='left',
        left_on=['winner_id', 'tourney_date'],
        right_on=['player', 'ranking_date'],
    )
    with_ranks = pd.merge(
        with_winner_rank, rankings,
        how='left',
        left_on=['loser_id', 'tourney_date'],
        right_on=['player', 'ranking_date'],
        suffixes=('_winner', '_loser'),
    )

    # The ranking-side join keys duplicate the match-side keys they were
    # matched against.
    join_key_cols = ['player_winner', 'player_loser', 'ranking_date_winner', 'ranking_date_loser']
    return with_ranks.drop(columns=join_key_cols)

In [7]:
# Clean each table, rebinding the raw names to the cleaned frames, then join
# the three cleaned tables into a single frame.
matches, players, rankings = (
    preprocess_matches(matches),
    preprocess_players(players),
    preprocess_rankings(rankings),
)
combined = merge_datasets(matches, players, rankings)

In [9]:
# Persist the merged table to the working directory; index=False keeps the
# row index out of the file.
combined.to_csv(path_or_buf='preprocessed_atp_data.csv', index=False)