# Libraries

In [38]:
import pandas as pd
from getpass import getuser


In [39]:
user = getuser()

# input datasets

In [40]:
github_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/"
website_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/website_df.xlsx"
website_df = pd.read_excel(website_path)

In [41]:
# Create the single_matches_web dataframe by selecting rows where p2t1 and p2t2 are empty
single_matches_web_df = website_df[website_df['p2t1'].isna() & website_df['p2t2'].isna()]
double_matches_web_df = website_df[website_df['p2t1'].notna() & website_df['p2t2'].notna()]
print(website_df.shape)
print(single_matches_web_df.shape)
print(double_matches_web_df.shape)

(494, 19)
(387, 19)
(107, 19)


# single_matches_web_df

Daniel Brands (Germany) vs Roberto Bautista Agut (Spain) is recorded as match number 4 in website_df since in the match before Feliciano Lopez (Spain) walked over Philipp Kohlschreiber (Germany). In website_df, we therefore consider it as a match not played whereas in github_df the number of match is preserved. We therefore change match number in website_df to make it consistent. 

In [42]:
# Change the number of match  
single_matches_web_df.loc[
    (single_matches_web_df['year'] == 2014) &
    (single_matches_web_df['team1'] == 'germany') &
    (single_matches_web_df['team2'] == 'spain') &
    (single_matches_web_df['match'] == 4), 'match'] = 5



# github data

In [43]:
# Range of years to include
years = range(2013, 2024)
dfs = []

for year in years:
    atp_matches_path = f"{github_path}atp_matches_{year}.csv"
    df = pd.read_csv(atp_matches_path)
    dfs.append(df)

# Concatenating all the DataFrames into one
atp_matches_df = pd.concat(dfs, ignore_index=True)

# Extract the year from the 'tourney_id' by splitting the string at the first hyphen and taking the first part
atp_matches_df['year'] = atp_matches_df['tourney_id'].str.split('-').str[0]

# Convert the 'year' column to an integer
atp_matches_df['year'] = atp_matches_df['year'].astype(int)

# Replacing "Frances Tiafoe" with "Franklin Tiafoe" in both the 'winner_name' and 'loser_name' columns
atp_matches_df['winner_name'] = atp_matches_df['winner_name'].replace('Frances Tiafoe', 'Franklin Tiafoe')
atp_matches_df['loser_name'] = atp_matches_df['loser_name'].replace('Frances Tiafoe', 'Franklin Tiafoe')

# Replacing "Andreas Haider Maurer" with "Andreas Haider-Maurer" in both the 'winner_name' and 'loser_name' columns
atp_matches_df['winner_name'] = atp_matches_df['winner_name'].replace('Andreas Haider Maurer', 'Andreas Haider-Maurer')
atp_matches_df['loser_name'] = atp_matches_df['loser_name'].replace('Andreas Haider Maurer', 'Andreas Haider-Maurer')

# Replacing "Guillermo Garcia Lopez" with "Guillermo Garcia-Lopez" in both the 'winner_name' and 'loser_name' columns
atp_matches_df['winner_name'] = atp_matches_df['winner_name'].replace('Guillermo Garcia Lopez', 'Guillermo Garcia-Lopez')
atp_matches_df['loser_name'] = atp_matches_df['loser_name'].replace('Guillermo Garcia Lopez', 'Guillermo Garcia-Lopez')

# Save the combined data to a single Excel file
final_output_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/atp_matches_df.xlsx"
atp_matches_df.to_excel(final_output_path, index=False)


# github repo matched observations with website df players

In [44]:
# Initialize a list to store all ATP matches where Davis Cup players took part
combined_matches_list = []

# Iterate through each year to filter the matches
for year in range(2013, 2024):
    # Filter Davis Cup data for the current year
    davis_single_year_df = single_matches_web_df[single_matches_web_df['year'] == year]
    
    # Create a set of tuples representing pairs of players for the current year
    davis_pairs = set(zip(davis_single_year_df['p1t1'], davis_single_year_df['p1t2']))
    
    # Filter ATP matches data to match Davis Cup pairs for the current year
    matched_atp_matches_df = atp_matches_df[
        (atp_matches_df['year'] == year) &
        atp_matches_df.apply(lambda row: (row['winner_name'], row['loser_name']) in davis_pairs or
                                           (row['loser_name'], row['winner_name']) in davis_pairs, axis=1)
    ]
    
    # Append the filtered data to the list
    combined_matches_list.append(matched_atp_matches_df)

# Concatenate all the DataFrames in the list into one combined DataFrame
combined_matches_df = pd.concat(combined_matches_list, ignore_index=True)

# Save the combined data to a single Excel file
final_output_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/single matches/davis_cup_players.xlsx"
combined_matches_df.to_excel(final_output_path, index=False)


In [45]:
# Count the number of observations where tourney_level is "D" - Davis Cup
davis_single_github_count = combined_matches_df[combined_matches_df['tourney_level'] == 'D'].shape[0]

# Count the number of observations where tourney_level is not "D"
extra_matches_single_count = combined_matches_df[combined_matches_df['tourney_level'] != 'D'].shape[0]

print("Number of observations for davis cup matches from github", davis_single_github_count)
print("Number of observations for matches played by davis cup players outside davis cup:", extra_matches_single_count)


Number of observations for davis cup matches from github 387
Number of observations for matches played by davis cup players outside davis cup: 131


130 is the number of games played in other atp matches by davis cup players

# github dataset single match players

In [46]:
# Creating a new DataFrame with observations where tourney_level is "D"
single_github_df = combined_matches_df[combined_matches_df['tourney_level'] == 'D']
single_github_df.shape

(387, 50)

we have 387 observations for single davis matches for both single_github_df and single_matches_web_df

# list from github dataset

In [47]:

# Creating a list with year, match_num, winner_name, loser_name, where names are alphabetically ordered
github_list = single_github_df.apply(
    lambda row: [row['year'],row['match_num'], *sorted([row['winner_name'], row['loser_name']])], 
    axis=1
).tolist()

# Optionally, print the first few elements in the list to verify
print(github_list[:5])


[[2013, 1, 'Carlos Berlocq', 'Philipp Kohlschreiber'], [2013, 2, 'Florian Mayer', 'Juan Monaco'], [2013, 4, 'Juan Monaco', 'Tobias Kamke'], [2013, 5, 'Carlos Berlocq', 'Christopher Kas'], [2013, 1, 'David Goffin', 'Viktor Troicki']]


# list from website df

In [48]:


# Creating a list with Year, cleaned Match, P1T1, P1T2, where player names are alphabetically ordered
website_list = single_matches_web_df.apply(
    lambda row: [row['year'], row['match'], *sorted([row['p1t1'], row['p1t2']])],
    axis=1
).tolist()

# Optionally, print the first few elements in the list to verify
print(website_list[:5])




[[2013, 1, 'Albert Ramos', 'Milos Raonic'], [2013, 2, 'Frank Dancevic', 'Marcel Granollers'], [2013, 4, 'Guillermo Garcia-Lopez', 'Milos Raonic'], [2013, 5, 'Albert Ramos', 'Frank Dancevic'], [2013, 1, 'Marin Cilic', 'Paolo Lorenzi']]


In [49]:
# Standardizing formats in matches_list and davis_single_list
# Assuming year and match number are integers, and player names are strings

standardized_github_list = [
    [int(year), int(match_num), str(winner_name), str(loser_name)]
    for year, match_num, winner_name, loser_name in github_list
]

standardized_web_list = [
    [int(year),int(match_num), str(player1), str(player2)]
    for year, match_num, player1, player2 in website_list
]

# Convert to sets of tuples for comparison
standardized_github_set = set(tuple(item) for item in standardized_github_list)
standardized_web_set = set(tuple(item) for item in standardized_web_list)

# Find non-matching elements in both sets
yes_github_no_web = standardized_github_set - standardized_web_set
no_github_yes_web = standardized_web_set - standardized_github_set

print("Non-matching elements in standardized_web_list:", yes_github_no_web)
print("Non-matching elements in standardized_github_list:", no_github_yes_web)

# Count the number of non-matching tuples in standardized_github_list
num_yes_github_no_web = len(yes_github_no_web)

# Count the number of non-matching tuples in standardized_web_list
num_no_github_yes_web = len(no_github_yes_web)

print("Number of unmatched elements in num_yes_github_no_web:", num_yes_github_no_web)
print("Number of unmatched elements in num_no_github_yes_web:", num_no_github_yes_web)


Non-matching elements in standardized_web_list: set()
Non-matching elements in standardized_github_list: set()
Number of unmatched elements in num_yes_github_no_web: 0
Number of unmatched elements in num_no_github_yes_web: 0


# merge two datasets single_github_df and single_matches_web_df

In [50]:
# Ensure 'year' is an integer in single_github_df
single_github_df['year'] = single_github_df['year'].astype(int)

# Ensure 'year' is an integer in single_matches_web_df
single_matches_web_df['year'] = single_matches_web_df['year'].astype(int)

# Standardize case and remove leading/trailing spaces in single_github_df
single_github_df['winner_name'] = single_github_df['winner_name'].str.lower().str.strip()
single_github_df['loser_name'] = single_github_df['loser_name'].str.lower().str.strip()

# Standardize case and remove leading/trailing spaces in single_matches_web_df
single_matches_web_df['p1t1'] = single_matches_web_df['p1t1'].str.lower().str.strip()
single_matches_web_df['p1t2'] = single_matches_web_df['p1t2'].str.lower().str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_github_df['year'] = single_github_df['year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_matches_web_df['year'] = single_matches_web_df['year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_github_df['winner_name'] = single_github_df['winner_name'].s

In [51]:
print(single_github_df.dtypes)

tourney_id             object
tourney_name           object
surface                object
draw_size               int64
tourney_level          object
tourney_date            int64
match_num               int64
winner_id               int64
winner_seed           float64
winner_entry           object
winner_name            object
winner_hand            object
winner_ht             float64
winner_ioc             object
winner_age            float64
loser_id                int64
loser_seed            float64
loser_entry            object
loser_name             object
loser_hand             object
loser_ht              float64
loser_ioc              object
loser_age             float64
score                  object
best_of                 int64
round                  object
minutes               float64
w_ace                 float64
w_df                  float64
w_svpt                float64
w_1stIn               float64
w_1stWon              float64
w_2ndWon              float64
w_SvGms   

In [52]:
print(single_matches_web_df.dtypes)

ball                 object
stage                object
match                 int64
year                  int32
p1t1                 object
p2t1                 object
p1t2                 object
p2t2                 object
team1                object
team2                object
date_start           object
date_end             object
venue_name           object
city                 object
country              object
main_surface         object
specific_surface     object
indoor_outdoor       object
court_pace_rating    object
dtype: object


In [53]:
# Direct match merge
merged_df = pd.merge(
    single_github_df, 
    single_matches_web_df, 
    how='left', 
    left_on=['year', 'winner_name', 'loser_name'], 
    right_on=['year', 'p1t1', 'p1t2'],
    indicator=True  # Adds a column to indicate if the merge was successful
)

# Reverse match merge (loser_name vs. winner_name) on the resulting DataFrame
merged_df_reverse = pd.merge(
    merged_df, 
    single_matches_web_df, 
    how='left', 
    left_on=['year', 'loser_name', 'winner_name'], 
    right_on=['year', 'p1t1', 'p1t2'],
    indicator='merge_reverse',  # Renames the indicator column for this merge
    suffixes=('', '_reverse')  # To distinguish the columns from the second merge
)

# Drop duplicates if any
merged_df_final = merged_df_reverse.drop_duplicates()

# Remove all rows that have only empty cells
merged_df_final = merged_df_final.dropna(how='all')

# Check for unmatched rows after both merges
unmatched_rows_df = merged_df_final[
    (merged_df_final['_merge'] == 'left_only') & 
    (merged_df_final['merge_reverse'] == 'left_only')
]

# Count the number of unmatched rows
num_unmatched = unmatched_rows_df.shape[0]

print(f"Number of unmatched rows in the final merged DataFrame: {num_unmatched}")

# If needed, you can inspect these unmatched rows
if num_unmatched > 0:
    print("Unmatched rows:")
    print(unmatched_rows_df)


Number of unmatched rows in the final merged DataFrame: 0


In [54]:
merged_df_final.shape

(387, 88)

# Exporting dataframes

In [55]:

file_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/davis_singles.xlsx"

# Export the DataFrame to an Excel file
merged_df_final.to_excel(file_path, index=False)

print(f"DataFrame exported to {file_path}")


DataFrame exported to C:/Users/ALESSANDRO/Documents/GitHub/davis-cup/data/davis_singles.xlsx


filtered_atp_matches.xlsx in single matches folder = final df for single matches

# matches played by davis players outside davis cup

# Double matches