In [None]:
import pandas as pd 
import numpy as np 
import glob
import os

# Load every ATP match CSV under atp_matches/ and combine them into `matches`.
# Sorting the paths makes the load order deterministic; it is chronological
# only if the file names sort by year (assumption — file names are not shown here).
files = sorted(glob.glob("atp_matches/*.csv"))

# One DataFrame per CSV; they are concatenated once, after the loop.
dfs = []

# Read each file and append to the list
print("Reading files:")
for file in files:
    print(f"Reading {file}...")
    df = pd.read_csv(file)
    dfs.append(df)

# Combine all dataframes
if dfs:  # At least one file was found
    matches = pd.concat(dfs, ignore_index=True)
    print(f"\nCombined {len(files)} files into one DataFrame with {len(matches)} rows and {len(matches.columns)} columns")

    # Parse tourney_date as YYYYMMDD; the values are cast to str first so the
    # format string applies regardless of how read_csv typed the column.
    matches['tourney_date'] = pd.to_datetime(matches['tourney_date'].astype(str), format='%Y%m%d')

    # Show distribution of years
    year_counts = matches['tourney_date'].dt.year.value_counts().sort_index()
    print("\nMatches per year:")
    print(year_counts)

    # A bare `matches.head()` nested inside this `if` is never rendered:
    # Jupyter only auto-displays the last *top-level* expression of a cell.
    # `display` is provided by IPython in every notebook session.
    display(matches.head())
else:
    # NOTE: `matches` is left undefined on this path, so any later cell that
    # uses it will raise NameError until the CSV files are available.
    print("No ATP match files found")

# *Grass DF*

In [None]:
# Columns that must be non-null for a match row to be kept.
essential_columns = [
    'winner_name', 'loser_name', 'surface', 'score', 'tourney_date', 'round'
]

# Grass subset: rows whose surface is 'Grass' and whose draw_size is at
# least 32, with any row missing an essential column removed.
# Built as one non-mutating chain instead of `dropna(..., inplace=True)` on a
# filtered frame, which emits SettingWithCopyWarning on pandas versions
# without Copy-on-Write and makes the cell harder to re-run safely.
grass_matches = (
    matches[(matches['surface'] == 'Grass') & (matches['draw_size'] >= 32)]
    .dropna(subset=essential_columns)
)

grass_matches


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
1527,2010-500,Halle,Grass,32,A,2010-06-07,1,103819,1.0,,...,40.0,30.0,8.0,10.0,2.0,4.0,2.0,8390.0,65.0,736.0
1528,2010-500,Halle,Grass,32,A,2010-06-07,2,104268,,,...,47.0,30.0,12.0,10.0,12.0,16.0,67.0,723.0,273.0,162.0
1529,2010-500,Halle,Grass,32,A,2010-06-07,3,103843,,,...,52.0,35.0,19.0,13.0,7.0,12.0,61.0,752.0,54.0,822.0
1530,2010-500,Halle,Grass,32,A,2010-06-07,4,104259,,,...,84.0,63.0,19.0,17.0,4.0,6.0,35.0,1230.0,199.0,236.0
1531,2010-500,Halle,Grass,32,A,2010-06-07,5,103747,,LL,...,46.0,33.0,11.0,10.0,2.0,4.0,203.0,235.0,17.0,2095.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41328,2024-0315,Newport,Grass,32,A,2024-07-15,276,208118,,WC,...,46.0,27.0,13.0,10.0,5.0,9.0,393.0,123.0,208.0,294.0
41329,2024-0315,Newport,Grass,32,A,2024-07-15,275,210319,,WC,...,63.0,41.0,13.0,15.0,7.0,14.0,254.0,225.0,217.0,278.0
41330,2024-0315,Newport,Grass,32,A,2024-07-15,273,206909,5.0,,...,36.0,22.0,11.0,10.0,1.0,5.0,53.0,915.0,148.0,415.0
41331,2024-0315,Newport,Grass,32,A,2024-07-15,272,106109,,Q,...,26.0,18.0,9.0,9.0,2.0,6.0,210.0,292.0,79.0,728.0


# *Main DF*

In [None]:
# Non-grass subset: rows whose surface is not 'Grass' and whose draw_size is
# at least 32, with any row missing an essential column removed.
# Rows with a missing surface pass the `!= 'Grass'` test but are dropped
# afterwards, because 'surface' is one of the essential columns.
# `essential_columns` comes from the grass cell, which must be run first.
# Built as one non-mutating chain instead of `dropna(..., inplace=True)` on a
# filtered frame, which emits SettingWithCopyWarning on pandas versions
# without Copy-on-Write.
match = (
    matches[(matches['surface'] != 'Grass') & (matches['draw_size'] >= 32)]
    .dropna(subset=essential_columns)
)
match



Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2010-339,Brisbane,Hard,32,A,2010-01-03,1,104053,1.0,,...,34.0,29.0,11.0,10.0,3.0,5.0,7.0,4410.0,77.0,598.0
1,2010-339,Brisbane,Hard,32,A,2010-01-03,2,104958,,WC,...,34.0,22.0,14.0,9.0,7.0,10.0,134.0,400.0,78.0,590.0
2,2010-339,Brisbane,Hard,32,A,2010-01-03,3,104755,,,...,58.0,38.0,14.0,14.0,7.0,11.0,52.0,850.0,88.0,568.0
3,2010-339,Brisbane,Hard,32,A,2010-01-03,4,105051,,Q,...,29.0,16.0,15.0,9.0,2.0,5.0,285.0,151.0,28.0,1260.0
4,2010-339,Brisbane,Hard,32,A,2010-01-03,5,104607,4.0,,...,41.0,26.0,14.0,9.0,6.0,9.0,20.0,1655.0,251.0,179.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42222,2024-0352,Paris Masters,Hard,56,M,2024-10-28,327,200267,,LL,...,22.0,15.0,16.0,9.0,2.0,4.0,65.0,833.0,133.0,452.0
42223,2024-0352,Paris Masters,Hard,56,M,2024-10-28,326,105173,,WC,...,39.0,26.0,10.0,11.0,6.0,11.0,58.0,929.0,12.0,3180.0
42224,2024-0352,Paris Masters,Hard,56,M,2024-10-28,325,200005,15.0,,...,48.0,35.0,21.0,14.0,2.0,4.0,18.0,2385.0,35.0,1375.0
42225,2024-0352,Paris Masters,Hard,56,M,2024-10-28,324,106218,,Q,...,45.0,35.0,20.0,12.0,1.0,2.0,49.0,1105.0,51.0,1075.0


*General Features*
General per-match features:
- rank_diff
- seed_diff
- age_diff


