In [98]:
import pandas as pd
from getpass import getuser


In [99]:
# Get the current user's login name; it is interpolated into the data paths
# built in this notebook.
user = getuser()
# Load the combined Davis Cup dataset from an Excel workbook.
# NOTE(review): the absolute "C:/Users/..." location only resolves on a Windows
# machine that has the repository under Documents/GitHub — confirm before reuse.
file_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/combined_davis.xlsx"  
df = pd.read_excel(file_path)

In [100]:
# Mark rows that merely repeat the row directly above them, then drop them.
#
# A row is treated as a duplicate of its predecessor when the predecessor
#   * has one of the players below as "Player 1" and the next row has the same
#     "Player 1" (whatever "Player 2" is), or
#   * is the pair "John Peers" / "Jordan Thompson" and the next row repeats both
#     "Player 1" and "Player 2".
# (A "Sam Groth" / "John Peers" row followed by the same pair is already covered
# by the first rule, so it needs no separate check.)
REPEATED_PLAYER_1 = ["Jordan Thompson", "Nick Kyrgios", "Sam Groth"]

player_1 = df['Player 1']
player_2 = df['Player 2']

# Compare every row with the row below it. shift(-1) leaves a missing value in the
# last position, so the last row can never be flagged as "followed by a duplicate".
same_player_1_below = player_1 == player_1.shift(-1)
same_player_2_below = player_2 == player_2.shift(-1)

followed_by_duplicate = (
    (player_1.isin(REPEATED_PLAYER_1) & same_player_1_below)
    | (
        (player_1 == "John Peers")
        & (player_2 == "Jordan Thompson")
        & same_player_1_below
        & same_player_2_below
    )
)

# Positions of the rows to delete: the row *after* each flagged row
to_delete = [
    position + 1
    for position, flagged in enumerate(followed_by_duplicate.to_numpy())
    if flagged
]

# Remove the marked rows
df_cleaned = df.drop(df.index[to_delete])


In [101]:
# Keep only the columns needed for pairing and rename the two player columns
# to the names used by the paired layout (P1T1 / P2T1)
pairing_columns = ['Stage', 'Player 1', 'Player 2', 'Year', 'Date', 'Venue', 'Court Pace Rating', 'Ball', 'match']
df_extracted = df_cleaned[pairing_columns].rename(columns={'Player 1': 'P1T1', 'Player 2': 'P2T1'})

# Helper that merges each pair of consecutive rows into a single match row
def pair_rows(df):
    """Combine rows 0-1, 2-3, 4-5, ... of ``df`` into one row per match.

    Two rows are merged only when their 'match', 'Year' and 'Date' values are
    all equal; otherwise both rows are left out of the result. A trailing row
    without a partner is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'match', 'Year', 'Date', 'P1T1', 'P2T1',
        'Venue', 'Court Pace Rating' and 'Ball'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Match', 'Year', 'P1T1', 'P2T1', 'P1T2', 'P2T2', 'Date',
        'Venue', 'Court Pace Rating', 'Ball'. The T1 players come from the
        first row of a pair, the T2 players from the second row; every other
        field is taken from the first row.
    """
    output_columns = ['Match', 'Year', 'P1T1', 'P2T1', 'P1T2', 'P2T2', 'Date', 'Venue', 'Court Pace Rating', 'Ball']
    must_agree = ('match', 'Year', 'Date')

    records = []
    # Stop one short of the end so that first_pos + 1 is always a valid position
    for first_pos in range(0, len(df) - 1, 2):
        team1 = df.iloc[first_pos]
        team2 = df.iloc[first_pos + 1]

        if not all(team1[key] == team2[key] for key in must_agree):
            continue

        records.append([
            team1['match'], team1['Year'],
            team1['P1T1'], team1['P2T1'],
            team2['P1T1'], team2['P2T1'],
            team1['Date'], team1['Venue'], team1['Court Pace Rating'], team1['Ball'],
        ])

    return pd.DataFrame(records, columns=output_columns)

# Pair the rows in the extracted dataframe
df_paired = pair_rows(df_extracted)

# Davis Cup spelling -> spelling that matches the data from atp matches.
# The same corrections apply to every player column, so the mapping is written
# once instead of being repeated per column.
ATP_NAME_FIXES = {
    'Jan-Lennard Struff': 'Jan Lennard Struff',
    'Juan Martin Del Potro': 'Juan Martin del Potro',
    'Albert Ramos-Vinolas': 'Albert Ramos',
    'Felix Auger-Aliassime': 'Felix Auger Aliassime',
    'Pierre-Hugues Herbert': 'Pierre Hugues Herbert',
    'Marc-Andrea Huesler': 'Marc Andrea Huesler',
    'Roman Khassanov': 'Roman Hassanov',
}
PLAYER_COLUMNS = ['P1T1', 'P2T1', 'P1T2', 'P2T2']

# Replace specific player names (whole-cell matches only) in the four player columns
df_paired = df_paired.replace({column: ATP_NAME_FIXES for column in PLAYER_COLUMNS})


# Print the shapes of the dataframes: the loaded data, the data after the
# duplicate rows were dropped, and the paired result built by pair_rows
print("df shape:", df.shape)
print("df_cleaned shape:", df_cleaned.shape)
print("df_paired shape:", df_paired.shape)


df shape: (866, 28)
df_cleaned shape: (846, 28)
df_paired shape: (423, 10)


In [102]:
# Group by Date, Year, and Venue, then count observations.
# size() returns a Series indexed by (Date, Year, Venue) holding the number of
# rows in each group.
# NOTE(review): groupby leaves out groups whose key contains a missing value by
# default (dropna=True), so rows with a missing Date, Year or Venue are not
# counted here — confirm that is acceptable.
grouped_counts = df_cleaned.groupby(['Date', 'Year', 'Venue']).size()

# Rule deciding whether the number of rows found for one group is plausible
def check_count(year, count):
    """Return whether ``count`` is an accepted number of rows for ``year``.

    For years up to and including 2018 only the counts 8 and 10 are accepted;
    for any other year every count from 4 to 16 (inclusive) is accepted.
    """
    up_to_2018 = year <= 2018
    return (count == 8 or count == 10) if up_to_2018 else (4 <= count <= 16)

# Turn the group sizes into a frame with one row per (Date, Year, Venue) group and
# the number of observations in an explicitly named 'Count' column. Naming the
# column here avoids copying it row by row out of the column that reset_index()
# would otherwise label 0.
correct_counts_df = grouped_counts.reset_index(name='Count')

# Evaluate the rule for every group. The mask is built explicitly as a boolean
# Series so that it also works when there are no groups at all.
is_valid_count = pd.Series(
    [
        check_count(group_year, group_count)
        for group_year, group_count in zip(correct_counts_df['Year'], correct_counts_df['Count'])
    ],
    index=correct_counts_df.index,
    dtype=bool,
)

# Filter rows where counts do not meet the criteria
incorrect_counts_df = correct_counts_df[~is_valid_count]

# Check if the DataFrame is empty, indicating all matches are in the correct range
if incorrect_counts_df.empty:
    print("All matches are in the correct range.")
else:
    # Display Date, Year, Venue, and Count of the groups that failed the check
    print(incorrect_counts_df[['Date', 'Year', 'Venue', 'Count']])


All matches are in the correct range.


# Single matches

## Check correspondence between the data from the official website (davis matches) and the https://github.com/JeffSackmann/tennis_atp/tree/master repo

In [103]:
# keep only rows for single matches: rows where neither team has a second player
is_single_match = df_paired['P2T1'].isna() & df_paired['P2T2'].isna()

# .copy() makes davis_single_df an independent frame, so columns can be assigned
# on it later without pandas raising SettingWithCopyWarning
davis_single_df = df_paired[is_single_match].copy()
print("davis_single_df", davis_single_df.shape)

davis_single_df (331, 10)


In [104]:
# Spot check: single matches in which the renamed 'Juan Martin del Potro' is P1T2
davis_single_df[davis_single_df['P1T2'] == 'Juan Martin del Potro']

Unnamed: 0,Match,Year,P1T1,P2T1,P1T2,P2T2,Date,Venue,Court Pace Rating,Ball
195,MATCH 1,2016,Andy Murray,,Juan Martin del Potro,,16 Sep - 18 Sep 2016,"Emirates Arena, Glasgow, Great Britain",Medium Fast,Slazenger Wimbledon
206,MATCH 2,2016,Ivo Karlovic,,Juan Martin del Potro,,25 Nov - 27 Nov 2016,"Arena Zagreb, Zagreb, Croatia",Medium Slow,Wilson US Open Extra Duty
208,MATCH 4,2016,Marin Cilic,,Juan Martin del Potro,,25 Nov - 27 Nov 2016,"Arena Zagreb, Zagreb, Croatia",Medium Slow,Wilson US Open Extra Duty


In [105]:
folder_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/"

# Matching ATP rows are collected per year and concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies every previously collected row on each iteration.)
yearly_matches = []

for year in range(2014, 2024):
    # Filter Davis Cup data for the current year.
    # NOTE(review): this filters df_paired, which also contains the doubles rows,
    # so their (P1T1, P1T2) pairs end up in davis_pairs as well — confirm that
    # davis_single_df was not the intended source.
    davis_single_year_df = df_paired[df_paired['Year'] == year]

    # Create a set of tuples representing pairs of players for the current year
    davis_pairs = set(zip(davis_single_year_df['P1T1'], davis_single_year_df['P1T2']))

    # Load ATP matches data for the current year
    atp_matches_path = f"{folder_path}atp_matches_{year}.csv"
    atp_matches_df = pd.read_csv(atp_matches_path)

    # Keep ATP matches whose two players form a Davis Cup pair, in either order
    is_davis_pair = pd.Series(
        [
            (winner, loser) in davis_pairs or (loser, winner) in davis_pairs
            for winner, loser in zip(atp_matches_df['winner_name'], atp_matches_df['loser_name'])
        ],
        index=atp_matches_df.index,
        dtype=bool,
    )
    matched_atp_matches_df = atp_matches_df[is_davis_pair]

    yearly_matches.append(matched_atp_matches_df)

# Initialize DataFrame with all ATP matches where Davis Cup players took part
combined_matches_df = pd.concat(yearly_matches)

# Extract the year information before the first hyphen in 'tourney_id' and
# convert it to integer
combined_matches_df['year'] = combined_matches_df['tourney_id'].str.split('-').str[0].astype(int)

# Save the combined data to a single Excel file
final_output_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/filtered_atp_matches.xlsx"
combined_matches_df.to_excel(final_output_path, index=False)


In [106]:
# Flag the rows whose tourney_level is "D" - Davis Cup
is_davis_cup_level = combined_matches_df['tourney_level'] == 'D'

# Number of flagged rows, and number of all remaining rows
count_tourney_level_D = int(is_davis_cup_level.sum())
count_not_tourney_level_D = int((~is_davis_cup_level).sum())

print("Number of observations with tourney_level 'D':", count_tourney_level_D)
print("Number of observations without tourney_level 'D':", count_not_tourney_level_D)


Number of observations with tourney_level 'D': 331
Number of observations without tourney_level 'D': 117


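331 is the number of Davis Cup (tourney_level "D") matches in combined_matches_df, and it equals the number of rows in davis_single_df.

117 is the number of other (non-Davis Cup) ATP matches played in the same year between the same pairs of Davis Cup players.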

In [107]:
# Keep only the observations whose tourney_level is "D"
davis_cup_matches_df = combined_matches_df.loc[combined_matches_df['tourney_level'].eq('D')]


In [108]:

def _atp_match_record(row):
    """Return [year, match_num, name, name] with the two player names in alphabetical order."""
    first_name, second_name = sorted([row['winner_name'], row['loser_name']])
    return [row['year'], row['match_num'], first_name, second_name]


# One record per Davis Cup match; sorting the names makes the record independent
# of who won
matches_list = davis_cup_matches_df.apply(_atp_match_record, axis=1).tolist()

# Optionally, print the first few elements in the list to verify
print(matches_list[:5])


[[2014, 1, 'Andreas Seppi', 'Carlos Berlocq'], [2014, 2, 'Fabio Fognini', 'Juan Monaco'], [2014, 4, 'Carlos Berlocq', 'Fabio Fognini'], [2014, 1, 'Radek Stepanek', 'Robin Haase'], [2014, 2, 'Igor Sijsling', 'Tomas Berdych']]


In [109]:
# Convert 'Match' to string, strip the literal "MATCH " prefix and keep the number
# as an integer.
# assign() builds a new frame instead of writing into a filtered slice of
# df_paired (which triggers SettingWithCopyWarning); regex=False makes the prefix
# a plain-text match rather than a regular expression.
davis_single_df = davis_single_df.assign(
    Match=davis_single_df['Match'].astype(str).str.replace('MATCH ', '', regex=False).astype(int)
)

# Creating a list with Year, cleaned Match, P1T1, P1T2, where player names are alphabetically ordered
davis_single_list = davis_single_df.apply(
    lambda row: [row['Year'], row['Match'], *sorted([row['P1T1'], row['P1T2']])],
    axis=1
).tolist()

# Optionally, print the first few elements in the list to verify
print(davis_single_list[:5])




[[2014, 1, 'Radek Stepanek', 'Robin Haase'], [2014, 2, 'Igor Sijsling', 'Tomas Berdych'], [2014, 4, 'Thiemo De Bakker', 'Tomas Berdych'], [2014, 5, 'Igor Sijsling', 'Lukas Rosol'], [2014, 1, 'Kei Nishikori', 'Peter Polansky']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [110]:
def _standardize_records(records):
    """Cast every [year, match_num, name, name] record to [int, int, str, str]."""
    return [
        [int(season), int(number), str(first_name), str(second_name)]
        for season, number, first_name, second_name in records
    ]


# Bring both lists to the same element types so equal matches compare equal
standardized_matches_list = _standardize_records(matches_list)
standardized_davis_single_list = _standardize_records(davis_single_list)

# Lists are not hashable, so each record becomes a tuple before going into a set
standardized_matches_set = set(map(tuple, standardized_matches_list))
standardized_davis_single_set = set(map(tuple, standardized_davis_single_list))

# Records present on one side only
non_matching_in_standardized_matches = standardized_matches_set.difference(standardized_davis_single_set)
non_matching_in_standardized_davis_single = standardized_davis_single_set.difference(standardized_matches_set)

print("Non-matching elements in standardized_matches_list:", non_matching_in_standardized_matches)
print("Non-matching elements in standardized_davis_single_list:", non_matching_in_standardized_davis_single)

# How many records are unmatched on each side
num_unmatched_in_standardized_matches = len(non_matching_in_standardized_matches)
num_unmatched_in_standardized_davis_single = len(non_matching_in_standardized_davis_single)

print("Number of unmatched elements in standardized_matches_list:", num_unmatched_in_standardized_matches)
print("Number of unmatched elements in standardized_davis_single_list:", num_unmatched_in_standardized_davis_single)


Non-matching elements in standardized_matches_list: {(2014, 5, 'Daniel Brands', 'Roberto Bautista Agut')}
Non-matching elements in standardized_davis_single_list: {(2014, 4, 'Daniel Brands', 'Roberto Bautista Agut')}
Number of unmatched elements in standardized_matches_list: 1
Number of unmatched elements in standardized_davis_single_list: 1


# Double matches

In [111]:
# Doubles rows are the ones where all four player columns are filled in
player_columns = ['P1T1', 'P2T1', 'P1T2', 'P2T2']
has_all_four_players = df_paired[player_columns].notna().all(axis=1)
davis_double_df = df_paired[has_all_four_players]

In [112]:
# Report how many doubles rows were kept (rows, columns)
print("davis_double_df", davis_double_df.shape)

davis_double_df (92, 10)


In [125]:
# Display the doubles pairings for a visual check
davis_double_df

Unnamed: 0,Match,Year,P1T1,P2T1,P1T2,P2T2,Date,Venue,Court Pace Rating,Ball
2,MATCH 3,2014,Tomas Berdych,Radek Stepanek,Robin Haase,Jean-Julien Rojer,31 Jan - 02 Feb 2014,"Cez Arena, Ostrava, Czech Republic",Medium,Head ATP
7,MATCH 3,2014,Kei Nishikori,Yasutaka Uchiyama,Frank Dancevic,Daniel Nestor,31 Jan - 02 Feb 2014,"Ariake Coliseum, Tokyo, Japan",Medium,Srixon
12,MATCH 3,2014,Tommy Haas,Philipp Kohlschreiber,David Marrero Santana,Fernando Verdasco,31 Jan - 02 Feb 2014,"Fraport Arena, Frankfurt, Germany",Medium Fast,Dunlop Fort Tournament
16,MATCH 3,2014,Richard Gasquet,Jo-Wilfried Tsonga,Chris Guccione,Lleyton Hewitt,31 Jan - 02 Feb 2014,"Vendéspace, La Roche sur Yon, France",,Babolat French Open Roland-Garros
21,MATCH 3,2014,Bob Bryan,Mike Bryan,Colin Fleming,Dominic Inglot,31 Jan - 02 Feb 2014,"Petco Park, San Diego, CA, USA",Medium Fast,Dunlop Fort Clay Court
...,...,...,...,...,...,...,...,...,...,...
402,MATCH 3,2022,Max Purcell,Jordan Thompson,Nikola Mektic,Mate Pavic,25 Nov - 25 Nov 2022,Malaga,,
407,MATCH 3,2023,Alexis Galarneau,Vasek Pospisil,Harri Heliovaara,Otto Virtanen,21 Nov - 21 Nov 2023,Malaga,,
410,MATCH 3,2023,Jiri Lehecka,Adam Pavlasek,Matthew Ebden,Max Purcell,22 Nov - 22 Nov 2023,Malaga,,
413,MATCH 3,2023,Jannik Sinner,Lorenzo Sonego,Tallon Griekspoor,Wesley Koolhof,23 Nov - 23 Nov 2023,Malaga,,


In [149]:

folder_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/double matches/"


def _team_key(player_a, player_b):
    """Return both names as an alphabetically sorted tuple, so partner order is irrelevant."""
    return tuple(sorted([str(player_a), str(player_b)]))


# Matching ATP rows are collected per year and concatenated once after the loop
yearly_double_matches = []

# NOTE(review): only the seasons 2014-2020 are searched although davis_double_df
# also holds later years — presumably there is no doubles file after 2020; confirm.
for year in range(2014, 2021):
    davis_double_year_df = davis_double_df[davis_double_df['Year'] == year].copy()

    # One (team 1, team 2) entry per Davis Cup doubles match of this year
    davis_quads = {
        (_team_key(p1t1, p2t1), _team_key(p1t2, p2t2))
        for p1t1, p2t1, p1t2, p2t2 in zip(
            davis_double_year_df['P1T1'],
            davis_double_year_df['P2T1'],
            davis_double_year_df['P1T2'],
            davis_double_year_df['P2T2'],
        )
    }

    atp_matches_path = f"{folder_path}atp_matches_doubles_{year}.csv"
    # Rows with a missing player name are dropped; they cannot be compared reliably
    atp_matches_df = pd.read_csv(atp_matches_path).dropna(
        subset=['winner1_name', 'winner2_name', 'loser1_name', 'loser2_name']
    )

    winner_teams = [
        _team_key(first, second)
        for first, second in zip(atp_matches_df['winner1_name'], atp_matches_df['winner2_name'])
    ]
    loser_teams = [
        _team_key(first, second)
        for first, second in zip(atp_matches_df['loser1_name'], atp_matches_df['loser2_name'])
    ]

    # The ATP file lists winners first, while the Davis Cup data lists team 1
    # first, so a match must be accepted in either team order (as is done for
    # the single matches).
    is_davis_match = pd.Series(
        [
            (winners, losers) in davis_quads or (losers, winners) in davis_quads
            for winners, losers in zip(winner_teams, loser_teams)
        ],
        index=atp_matches_df.index,
        dtype=bool,
    )
    matched_atp_matches_df = atp_matches_df[is_davis_match]

    yearly_double_matches.append(matched_atp_matches_df)

double_matches_df = pd.concat(yearly_double_matches, ignore_index=True)

if not double_matches_df.empty:
    # The year is the part of 'tourney_id' before the first hyphen
    double_matches_df['year'] = double_matches_df['tourney_id'].astype(str).str.split('-').str[0].astype(int)
    # Built from folder_path so that the user name is actually filled in
    final_output_path = f"{folder_path}filtered_atp_matches_doubles.xlsx"
    double_matches_df.to_excel(final_output_path, index=False)
else:
    print("No matches found or 'tourney_id' column missing in all years.")


{(('Kei Nishikori', 'Yasutaka Uchiyama'), ('Daniel Nestor', 'Frank Dancevic')), (('Fabio Fognini', 'Simone Bolelli'), ('Andy Murray', 'Colin Fleming')), (('Philipp Kohlschreiber', 'Tommy Haas'), ('David Marrero Santana', 'Fernando Verdasco')), (('Jo-Wilfried Tsonga', 'Richard Gasquet'), ('Chris Guccione', 'Lleyton Hewitt')), (('Julien Benneteau', 'Richard Gasquet'), ('Roger Federer', 'Stan Wawrinka')), (('Roger Federer', 'Stan Wawrinka'), ('Aleksandr Nedovyesov', 'Andrey Golubev')), (('Evgeny Korolev', 'Mikhail Kukushkin'), ('Olivier Rochus', 'Ruben Bemelmans')), (('Marco Chiudinelli', 'Stan Wawrinka'), ('Fabio Fognini', 'Simone Bolelli')), (('Bob Bryan', 'Mike Bryan'), ('Colin Fleming', 'Dominic Inglot')), (('Radek Stepanek', 'Tomas Berdych'), ('Jean-Julien Rojer', 'Robin Haase')), (('Julien Benneteau', 'Michael Llodra'), ('Andre Begemann', 'Tobias Kamke')), (('Eduardo Schwank', 'Horacio Zeballos'), ('Fabio Fognini', 'Simone Bolelli')), (('Filip Krajinovic', 'Nenad Zimonjic'), ('Marco

In [128]:
# Display the ATP doubles rows that were matched to Davis Cup pairings
double_matches_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner1_id,winner2_id,winner_seed,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced


# Exporting dataframes

In [None]:
# Destination workbook for the paired matches.
# NOTE(review): df_paired holds the doubles rows as well as the singles rows,
# although the file is named "singles_paired" — confirm the intended content.
file_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/singles_paired.xlsx"

# Export the DataFrame to an Excel file (index=False leaves the row index out)
df_paired.to_excel(file_path, index=False)

print(f"DataFrame exported to {file_path}")


filtered_atp_matches.xlsx in the "single matches" folder is the final DataFrame for single matches.