# Libraries

In [1]:
import pandas as pd
from getpass import getuser


In [2]:
user = getuser()

# input datasets

In [3]:
github_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/"
website_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/website_df.xlsx"
website_df = pd.read_excel(website_path)

In [8]:
# Create the single_matches_web dataframe by selecting rows where p2t1 and p2t2 are empty
single_matches_web = website_df[website_df['p2t1'].isna() & website_df['p2t2'].isna()]
double_matches_df = website_df[website_df['p2t1'].notna() & website_df['p2t2'].notna()]
print(website_df.shape)
print(single_matches_web.shape)
print(double_matches_df.shape)

(494, 19)
(387, 19)
(107, 19)


In [10]:

# Initialize a list to store all ATP matches where Davis Cup players took part
combined_matches_list = []

for year in range(2014, 2024):
    # Filter Davis Cup data for the current year
    davis_single_year_df = single_matches_web[single_matches_web['year'] == year]

    # Create a set of tuples representing pairs of players for the current year
    davis_pairs = set(zip(davis_single_year_df['p1t1'], davis_single_year_df['p1t2']))

    # Load ATP matches data for the current year
    atp_matches_path = f"{github_path}atp_matches_{year}.csv"
    atp_matches_df = pd.read_csv(atp_matches_path)

    # Filter ATP matches data to match Davis Cup pairs
    matched_atp_matches_df = atp_matches_df[
        atp_matches_df.apply(lambda row: (row['winner_name'], row['loser_name']) in davis_pairs or
                                           (row['loser_name'], row['winner_name']) in davis_pairs, axis=1)
    ]

    # Append the filtered data to the list
    combined_matches_list.append(matched_atp_matches_df)

# Concatenate all the DataFrames in the list into one combined DataFrame
combined_matches_df = pd.concat(combined_matches_list, ignore_index=True)

# Extract the year information before the first hyphen in 'tourney_id'
combined_matches_df['year'] = combined_matches_df['tourney_id'].str.split('-').str[0]

# Convert 'year' column to integer
combined_matches_df['year'] = combined_matches_df['year'].astype(int)

# Save the combined data to a single Excel file
final_output_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/filtered_atp_matches.xlsx"
combined_matches_df.to_excel(final_output_path, index=False)


In [11]:
# Count the number of observations where tourney_level is "D" - Davis Cup
count_tourney_level_D = combined_matches_df[combined_matches_df['tourney_level'] == 'D'].shape[0]

# Count the number of observations where tourney_level is not "D"
count_not_tourney_level_D = combined_matches_df[combined_matches_df['tourney_level'] != 'D'].shape[0]

print("Number of observations with tourney_level 'D':", count_tourney_level_D)
print("Number of observations without tourney_level 'D':", count_not_tourney_level_D)


Number of observations with tourney_level 'D': 328
Number of observations without tourney_level 'D': 110


331 corresponds in both datasets combined_matches_df and davis_single_df

117 is the number of games played in other atp matches by davis cup players

In [113]:
# Creating a new DataFrame with observations where tourney_level is "D"
davis_cup_matches_df = combined_matches_df[combined_matches_df['tourney_level'] == 'D']


In [114]:

# Creating a list with year, match_num, winner_name, loser_name, where names are alphabetically ordered
matches_list = davis_cup_matches_df.apply(
    lambda row: [row['year'],row['match_num'], *sorted([row['winner_name'], row['loser_name']])], 
    axis=1
).tolist()

# Optionally, print the first few elements in the list to verify
print(matches_list[:5])


[[2014, 1, 'Andreas Seppi', 'Carlos Berlocq'], [2014, 2, 'Fabio Fognini', 'Juan Monaco'], [2014, 4, 'Carlos Berlocq', 'Fabio Fognini'], [2014, 1, 'Radek Stepanek', 'Robin Haase'], [2014, 2, 'Igor Sijsling', 'Tomas Berdych']]


In [115]:
# Convert 'Match' column to string type first, then clean it to keep only the number after "MATCH"
davis_single_df['Match'] = davis_single_df['Match'].astype(str).str.replace('MATCH ', '').astype(int)

# Creating a list with Year, cleaned Match, P1T1, P1T2, where player names are alphabetically ordered
davis_single_list = davis_single_df.apply(
    lambda row: [row['Year'], row['Match'], *sorted([row['P1T1'], row['P1T2']])],
    axis=1
).tolist()

# Optionally, print the first few elements in the list to verify
print(davis_single_list[:5])




[[2014, 1, 'Radek Stepanek', 'Robin Haase'], [2014, 2, 'Igor Sijsling', 'Tomas Berdych'], [2014, 4, 'Thiemo De Bakker', 'Tomas Berdych'], [2014, 5, 'Igor Sijsling', 'Lukas Rosol'], [2014, 1, 'Kei Nishikori', 'Peter Polansky']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  davis_single_df['Match'] = davis_single_df['Match'].astype(str).str.replace('MATCH ', '').astype(int)


In [116]:
# Standardizing formats in matches_list and davis_single_list
# Assuming year and match number are integers, and player names are strings

standardized_matches_list = [
    [int(year), int(match_num), str(winner_name), str(loser_name)]
    for year, match_num, winner_name, loser_name in matches_list
]

standardized_davis_single_list = [
    [int(year),int(match_num), str(player1), str(player2)]
    for year, match_num, player1, player2 in davis_single_list
]

# Convert to sets of tuples for comparison
standardized_matches_set = set(tuple(item) for item in standardized_matches_list)
standardized_davis_single_set = set(tuple(item) for item in standardized_davis_single_list)

# Find non-matching elements in both sets
non_matching_in_standardized_matches = standardized_matches_set - standardized_davis_single_set
non_matching_in_standardized_davis_single = standardized_davis_single_set - standardized_matches_set

print("Non-matching elements in standardized_matches_list:", non_matching_in_standardized_matches)
print("Non-matching elements in standardized_davis_single_list:", non_matching_in_standardized_davis_single)

# Count the number of non-matching tuples in standardized_matches_list
num_unmatched_in_standardized_matches = len(non_matching_in_standardized_matches)

# Count the number of non-matching tuples in standardized_davis_single_list
num_unmatched_in_standardized_davis_single = len(non_matching_in_standardized_davis_single)

print("Number of unmatched elements in standardized_matches_list:", num_unmatched_in_standardized_matches)
print("Number of unmatched elements in standardized_davis_single_list:", num_unmatched_in_standardized_davis_single)


Non-matching elements in standardized_matches_list: {(2014, 5, 'Daniel Brands', 'Roberto Bautista Agut')}
Non-matching elements in standardized_davis_single_list: {(2014, 4, 'Daniel Brands', 'Roberto Bautista Agut')}
Number of unmatched elements in standardized_matches_list: 1
Number of unmatched elements in standardized_davis_single_list: 1


In [117]:
# Change missmatched names in davis_single_df
# Define the conditions to locate the specific observation
condition = (
    (single_matches_web['Year'] == 2014) &
    (single_matches_web['P1T1'] == 'Daniel Brands') &
    (single_matches_web['P1T2'] == 'Roberto Bautista Agut') &
    (single_matches_web['Match'] == 'MATCH 4')
)

# Update the 'Match' column for the rows that match the condition
single_matches_web.loc[condition, 'Match'] = 'MATCH 5'

# Verify the change
print(single_matches_web[condition])


      Match  Year           P1T1 P2T1                   P1T2 P2T2  \
13  MATCH 5  2014  Daniel Brands  NaN  Roberto Bautista Agut  NaN   

                     Date                               Venue  \
13   31 Jan - 02 Feb 2014   Fraport Arena, Frankfurt, Germany   

   Court Pace Rating                     Ball  
13       Medium Fast   Dunlop Fort Tournament  


In [118]:
# Convert all column names to lowercase
single_matches_web.columns = single_matches_web.columns.str.lower()
davis_cup_matches_df.columns = davis_cup_matches_df.columns.str.lower()
# Ensure all relevant columns are standardized formats
davis_cup_matches_df['year'] = davis_cup_matches_df['year'].astype(int)
single_matches_web['year'] = single_matches_web['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  davis_cup_matches_df['year'] = davis_cup_matches_df['year'].astype(int)


In [119]:
# Direct match merge
merged_df = pd.merge(
    davis_cup_matches_df, 
    single_matches_web, 
    how='left', 
    left_on=['year', 'winner_name', 'loser_name'], 
    right_on=['year', 'p1t1', 'p1t2'],
    indicator=True  # This will add a column to indicate if the merge was successful
)

# Reverse match merge (loser_name vs. winner_name)
merged_df_reverse = pd.merge(
    davis_cup_matches_df, 
    single_matches_web, 
    how='left', 
    left_on=['year', 'loser_name', 'winner_name'], 
    right_on=['year', 'p1t1', 'p1t2'],
    indicator=True  # This will add a column to indicate if the merge was successful
)

# Combine both merged DataFrames
combined_merged_df = pd.concat([merged_df, merged_df_reverse], ignore_index=True)

# Drop duplicates if any
combined_merged_df = combined_merged_df.drop_duplicates()

# Check for unmatched rows in davis_cup_matches_df
unmatched_rows_df = combined_merged_df[combined_merged_df['_merge'] == 'left_only']

# Count the number of unmatched rows
num_unmatched = unmatched_rows_df.shape[0]

print(f"Number of unmatched rows in davis_cup_matches_df: {num_unmatched}")

# If needed, you can inspect these unmatched rows
if num_unmatched > 0:
    print("Unmatched rows:")
    print(unmatched_rows_df)


Number of unmatched rows in davis_cup_matches_df: 331
Unmatched rows:
                          tourney_id                     tourney_name surface  \
1                          2014-D006      Davis Cup WG R1: ARG vs ITA    Clay   
2                          2014-D006      Davis Cup WG R1: ARG vs ITA    Clay   
3                          2014-D001      Davis Cup WG R1: CZE vs NED    Hard   
6                          2014-D001      Davis Cup WG R1: CZE vs NED    Hard   
15                         2014-D002      Davis Cup WG R1: JPN vs CAN    Hard   
..                               ...                              ...     ...   
663  2023-M-DC-2023-FLS-M-CZE-AUS-01  Davis Cup Finals QF: CZE vs AUS    Hard   
668  2023-M-DC-2023-FLS-M-ITA-NED-01  Davis Cup Finals QF: ITA vs NED    Hard   
670  2023-M-DC-2023-FLS-M-ITA-SRB-01  Davis Cup Finals SF: ITA vs SRB    Hard   
671  2023-M-DC-2023-FLS-M-SRB-GBR-01  Davis Cup Finals QF: SRB vs GBR    Hard   
672  2023-M-DC-2023-FLS-M-SRB-GBR-01  D

In [120]:
unmatched_rows_df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,match,p1t1,p2t1,p1t2,p2t2,date,venue,court pace rating,ball,_merge
1,2014-D006,Davis Cup WG R1: ARG vs ITA,Clay,4,D,20140131,2,104926,,,...,,,,,,,,,,left_only
2,2014-D006,Davis Cup WG R1: ARG vs ITA,Clay,4,D,20140131,4,104926,,,...,,,,,,,,,,left_only
3,2014-D001,Davis Cup WG R1: CZE vs NED,Hard,4,D,20140131,1,104898,,,...,,,,,,,,,,left_only
6,2014-D001,Davis Cup WG R1: CZE vs NED,Hard,4,D,20140131,5,104997,,,...,,,,,,,,,,left_only
15,2014-D002,Davis Cup WG R1: JPN vs CAN,Hard,4,D,20140131,2,104433,,,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,2023-M-DC-2023-FLS-M-CZE-AUS-01,Davis Cup Finals QF: CZE vs AUS,Hard,2,D,20231122,1,207830,,,...,,,,,,,,,,left_only
668,2023-M-DC-2023-FLS-M-ITA-NED-01,Davis Cup Finals QF: ITA vs NED,Hard,2,D,20231123,2,206173,,,...,,,,,,,,,,left_only
670,2023-M-DC-2023-FLS-M-ITA-SRB-01,Davis Cup Finals SF: ITA vs SRB,Hard,2,D,20231125,2,206173,,,...,,,,,,,,,,left_only
671,2023-M-DC-2023-FLS-M-SRB-GBR-01,Davis Cup Finals QF: SRB vs GBR,Hard,2,D,20231123,1,200175,,,...,,,,,,,,,,left_only


# Exporting dataframes

In [121]:
file_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/unmatched_rows_df.xlsx"

# Export the DataFrame to an Excel file
unmatched_rows_df.to_excel(file_path, index=False)

print(f"DataFrame exported to {file_path}")

file_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single_matches_web.xlsx"

# Export the DataFrame to an Excel file
single_matches_web.to_excel(file_path, index=False)

print(f"DataFrame exported to {file_path}")


DataFrame exported to C:/Users/aldi/Documents/GitHub/davis-cup/data/unmatched_rows_df.xlsx
DataFrame exported to C:/Users/aldi/Documents/GitHub/davis-cup/data/df_paired.xlsx


filtered_atp_matches.xlsx in single matches folder = final df for single matches

# Double matches

In [125]:

# Sample setup - ensure to replace this with actual data loading logic
# df_paired = pd.read_csv('path_to_davis_cup_data.csv')

folder_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/double matches/"

double_matches_df = pd.DataFrame()

for year in range(2014, 2021):
    davis_double_year_df = davis_double_df[davis_double_df['year'] == year].copy()

    # Generate sorted tuples for each Davis Cup match
    davis_quads = set(
        davis_double_year_df.apply(
            lambda x: (
                tuple(sorted([str(x['p1t1']), str(x['p2t1'])])),
                tuple(sorted([str(x['p1t2']), str(x['p2t2'])]))
            ), axis=1
        )
    )

    atp_matches_path = f"{folder_path}atp_matches_doubles_{year}.csv"
    atp_matches_df = pd.read_csv(atp_matches_path)

    # Filter out any rows that might have NaN values for player names to avoid type errors
    atp_matches_df.dropna(subset=['winner1_name', 'winner2_name', 'loser1_name', 'loser2_name'], inplace=True)

    # Create sorted tuples for each ATP match
    matched_atp_matches_df = atp_matches_df[
        atp_matches_df.apply(
            lambda row: (
                tuple(sorted([str(row['winner1_name']), str(row['winner2_name'])])),
                tuple(sorted([str(row['loser1_name']), str(row['loser2_name'])]))
            ) in davis_quads, axis=1
        )
    ]

    # Append matched ATP matches to the final DataFrame
    double_matches_df = pd.concat([double_matches_df, matched_atp_matches_df], ignore_index=True)

if not double_matches_df.empty:
    double_matches_df['year'] = double_matches_df['tourney_id'].astype(str).str.split('-').str[0].astype(int)
    final_output_path = "C:/Users/{}/Documents/GitHub/davis-cup/data/double matches/filtered_atp_matches_doubles.xlsx"
    double_matches_df.to_excel(final_output_path, index=False)
else:
    print("No matches found or 'tourney_id' column missing in all years.")


No matches found or 'tourney_id' column missing in all years.
