# Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from selenium.common.exceptions import TimeoutException
import pandas as pd
import glob
from getpass import getuser


In [2]:
# Get the current user's name
user = getuser()

## import external functions

In [3]:
%run functions.ipynb

In [4]:
# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"

# Retrieve all years links till 2018

In [5]:
# webpage to scrape
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Initialize the WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Click on Accept All Cookies button
acceptCookie_Btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", acceptCookie_Btn)

# Initialize an empty DataFrame to store all the data
final_df = pd.DataFrame()

# Loop through selected years
for year in range(2013,2014): 
    # Click on the dropdown arrow
    wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='selected']//div[@class='arrow']"))).click()
    time.sleep(4)  # Introduce a delay before clicking the element
    year_xpath = f"//a[text()='{year}']"
    print(year_xpath)
    
    try:
        # Locate the element for the specific year and click it
        year_element = wait.until(EC.presence_of_element_located((By.XPATH, year_xpath)))
        print(year_element)        
        # Click on the element
        year_element.click()
        
        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed
        time.sleep(10)

        # Find all links with class "tie-link" within the tables
        tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

        # Initialize a list to store the extracted links
        links = []

        # Extract the links
        for link in tie_links:
            href = link.get_attribute("href")
            links.append(href)

        # Call the function to scrape and transform the data
        matches_df, players_df = to_df(links)
        # Cleaning DataFrames
        cleaned_matches_df = clean_matches_df(matches_df)
        cleaned_players_df = clean_players_df(players_df)

        # Merging DataFrames
        merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)
        # Add a column with the respective year information
        merged_df['Year'] = year

        # Append data to the final DataFrame
        final_df = pd.concat([final_df, merged_df], ignore_index=True)
        # Save the current year's data to a separate Excel file
        year_file_name = f"davis_{year}.xlsx"
        final_df.to_excel(year_file_name, index=False)

    except TimeoutException:
        print(f"TimeoutException occurred while locating element for year {year}. Skipping...")
        continue  # Skip to the next iteration if element not found within the timeout
        
# Save the final DataFrame to an Excel file
# final_df.to_excel("old_system.xlsx", index=False)


//a[text()='2013']
<selenium.webdriver.remote.webelement.WebElement (session="0612b11ed18cd0c71cf7b16357eb9f98", element="f.95A41B3E8D1B76A7DAA5EF9745312B11.d.DC9C9EAD0D77693482DAE14DA0822B6B.e.2427")>
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2013-WG-M-ESP-CAN-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2013-WG-M-CRO-ITA-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2013-WG-M-SRB-BEL-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded


KeyboardInterrupt: 

In a year I have 15 matches (number of link), each match has a max number of 5 matches and on avg 4 matches. The number of single matches should be therefore between 60 and 70. We have two observations (one for each player or team) for each match for a total of 120/140 rows (max 75*2 = 150).

We have 5 players in each team, two team in each match, and 15 matches for a total of (5 * 2 * 15) 150 players/teams.

players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations.

# Retrieve all years links from 2019

In [None]:
# webpage to scrape
url = ("https://www.daviscup.com/en/draws-results/finals/2019.aspx")
# Initialize the WebDriver
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 15)


# Initialize an empty DataFrame to store all the data
final_df = pd.DataFrame()

# Loop through selected years (2019 to 2023)
for year in range(2023, 2024):  # Change the range accordingly
    current_url = f"https://www.daviscup.com/en/draws-results/finals/{year}.aspx"
    
    try:
        driver.get(current_url)
        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed
        time.sleep(10)

        # Find all links with class "tie-link" within the tables
        tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")
        
        # Initialize a list to store the extracted links
        links = []

        # Extract the links
        for link in tie_links:
            href = link.get_attribute("href")
            links.append(href)

        # Call the function to scrape and transform the data
        matches_df, players_df = to_df(links)
        # Cleaning DataFrames
        cleaned_matches_df = clean_matches_df(matches_df)
        cleaned_players_df = clean_players_df(players_df)

        # Merging DataFrames
        merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)
        # Add a column with the respective year information
        merged_df['Year'] = year

        # Append data to the final DataFrame
        final_df = pd.concat([final_df, merged_df], ignore_index=True)
        # Save the current year's data to a separate Excel file
        year_file_name = f"davis_{year}.xlsx"
        final_df.to_excel(year_file_name, index=False)

    except TimeoutException:
        print(f"TimeoutException occurred while processing year {year}. Skipping...")
        continue  # Skip to the next iteration if page not loaded within the timeout

    except Exception as e:
        print(f"An error occurred for year {year}: {str(e)}")
        continue  # Continue to the next iteration if an error occurs

# Close the WebDriver
driver.quit()



  driver = webdriver.Chrome(executable_path=chrome_driver_path)
  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2023-FLS-M-CAN-FIN-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2023-FLS-M-CZE-AUS-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2023-FLS-M-ITA-NED-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2023-FLS-M-SRB-GBR-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
Skipping match 3
Skipping match 3
matches df downloaded
players df downloaded
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2023-FLS-M-FIN-AUS-01
Initia

In a year I have 7 matches (number of link), each match has a max number of 3 matches. The number of single matches should be 21. We have two observations (one for each player or team) for each match for a total of 42 (21*2) rows.

We have 5 players in each team, two team in each match, and 7 matches for a total of (5 * 2 * 7) 70 players/teams.
We have on avg 5 player per team and 8 teams competing from the quarter finals, for a total of 40 (8*5) players.

players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations.

In [None]:
# check if each team has, on average, 5 players

# Group by 'Team Name' and count unique players
unique_players_per_team = cleaned_players_df.groupby('Team Name')['Player'].nunique()

# Display the counts
print(unique_players_per_team)



Team Name
AUSTRALIA        5
CANADA           5
CZECHIA          4
FINLAND          5
GREAT BRITAIN    5
ITALY            5
NETHERLANDS      5
SERBIA           5
Name: Player, dtype: int64


## concatenate all xlsx in a unique file

In [None]:


# File path pattern to match all 'davis_{year}.xlsx' files
file_pattern = 'davis_*.xlsx'

# Get a list of all file paths matching the pattern
file_paths = glob.glob(file_pattern)

# Create an empty list to store DataFrame objects
dataframes = []

# Read each Excel file into a DataFrame and append to the list
for file_path in file_paths:
    df = pd.read_excel(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the concatenated DataFrame to a new Excel file
combined_df.to_excel('combined_davis.xlsx', index=False)


In [None]:
# Assuming combined_df is the DataFrame with the 'Team Name' column
combined_df.rename(columns={'Team Name': 'team'}, inplace=True)


In [None]:
# Group by 'team' and 'Year', then count the number of unique 'Player 1'
unique_players_per_team_year = combined_df.groupby(['team', 'Year'])['Player 1'].nunique()

# Convert the Series to a DataFrame
result_table = unique_players_per_team_year.reset_index(name='Unique_Player_Count')

# Pivot the DataFrame to create the desired table
pivot_table = result_table.pivot(index='team', columns='Year', values='Unique_Player_Count')

# Reindex the columns to include all years from 2014 to 2023
all_years = range(2014, 2024)
pivot_table = pivot_table.reindex(columns=all_years)

# Display the resulting table
pivot_table



Year,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ARGENTINA,3.0,4.0,7.0,2.0,,3.0,,,,
AUSTRALIA,4.0,5.0,3.0,5.0,3.0,3.0,,,4.0,4.0
BELGIUM,2.0,4.0,3.0,5.0,5.0,,,,,
BRAZIL,,3.0,,,,,,,,
CANADA,2.0,5.0,3.0,3.0,3.0,3.0,,,3.0,3.0
CROATIA,,4.0,5.0,4.0,4.0,,,3.0,3.0,
CZECHIA,4.0,4.0,3.0,2.0,,,,,,2.0
FINLAND,,,,,,,,,,4.0
FRANCE,4.0,5.0,6.0,8.0,9.0,,,,,
GERMANY,7.0,3.0,2.0,,4.0,2.0,,4.0,3.0,


In [None]:
# Highlight values greater than 5
def highlight(value):
    if pd.notnull(value) and value > 5:
        return 'background-color: yellow'
    return ''

styled_table = pivot_table.style.applymap(lambda x: highlight(x))

# Display the styled table
styled_table


Year,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ARGENTINA,3.0,4.0,7.0,2.0,,3.0,,,,
AUSTRALIA,4.0,5.0,3.0,5.0,3.0,3.0,,,4.0,4.0
BELGIUM,2.0,4.0,3.0,5.0,5.0,,,,,
BRAZIL,,3.0,,,,,,,,
CANADA,2.0,5.0,3.0,3.0,3.0,3.0,,,3.0,3.0
CROATIA,,4.0,5.0,4.0,4.0,,,3.0,3.0,
CZECHIA,4.0,4.0,3.0,2.0,,,,,,2.0
FINLAND,,,,,,,,,,4.0
FRANCE,4.0,5.0,6.0,8.0,9.0,,,,,
GERMANY,7.0,3.0,2.0,,4.0,2.0,,4.0,3.0,


In [None]:
filtered_df = combined_df[(combined_df['Year'] == 2018) & (combined_df['team'] == 'FRANCE')]

unique_players_team = filtered_df['Player 1'].unique()

# Display the unique players for 2018 and France
print(unique_players_team)


['Adrian Mannarino' 'Richard Gasquet' 'Pierre-Hugues Herbert'
 'Lucas Pouille' 'Jeremy Chardy' 'Benoit Paire' 'Julien Benneteau'
 'Nicolas Mahut' 'Jo-Wilfried Tsonga']
