# Libraries

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from selenium.common.exceptions import TimeoutException
import pandas as pd
import glob
from getpass import getuser
import os


In [5]:
# Get the current user's name
user = getuser()

# import external functions

In [6]:
%run scraping_functions.ipynb

# Input Directories

In [7]:
# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"
# Define the directory where the files should be saved
save_directory = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/davis matches/"
# Define the directory where the 'davis_{year}.xlsx' files are saved
directory = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/davis matches/"

# Retrieve all years links till 2018

In [8]:
# webpage to scrape
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"


# Initialize the WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Click on Accept All Cookies button
acceptCookie_Btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", acceptCookie_Btn)

# Initialize an empty DataFrame to store all the data
final_df = pd.DataFrame()

# Loop through selected years
for year in range(2013,2019): 
    # Click on the dropdown arrow
    wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='selected']//div[@class='arrow']"))).click()
    time.sleep(4)  # Introduce a delay before clicking the element
    year_xpath = f"//a[text()='{year}']"
    print(year_xpath)
    
    try:
        # Locate the element for the specific year and click it
        year_element = wait.until(EC.presence_of_element_located((By.XPATH, year_xpath)))
        print(year_element)        
        # Click on the element
        year_element.click()
        
        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed
        time.sleep(10)

        # Find all links with class "tie-link" within the tables
        tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

        # Initialize a list to store the extracted links
        links = []

        # Extract the links
        for link in tie_links:
            href = link.get_attribute("href")
            links.append(href)

        # Call the function to scrape and transform the data
        matches_df, players_df = to_df(links)
        # Cleaning DataFrames
        cleaned_matches_df = clean_matches_df(matches_df)
        cleaned_players_df = clean_players_df(players_df)

        # Merging DataFrames
        merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)
        # Add a column with the respective year information
        merged_df['year'] = year

        # Append data to the final DataFrame
        final_df = pd.concat([final_df, merged_df], ignore_index=True)
        
        # Save the current year's data to a separate Excel file in the specified directory
        year_file_name = f"{save_directory}davis_{year}.xlsx"
        merged_df.to_excel(year_file_name, index=False)

    except TimeoutException:
        print(f"TimeoutException occurred while locating element for year {year}. Skipping...")
        continue  # Skip to the next iteration if element not found within the timeout
        

//a[text()='2013']
<selenium.webdriver.remote.webelement.WebElement (session="4546d096440948196ff8840786440e4e", element="f.93236DE5E9AA0F71E855F58ABA54CDB2.d.D4ABC8160F8B9B63176CD752D70F5A49.e.1579")>
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
[ERROR] Exception occurred: Message: no such element: Unable to locate element: {"method":"css selector","selector":".component-title"}
  (Session info: chrome=128.0.6613.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF60B30B632+29090]
	(No symbol) [0x00007FF60B27E6E9]
	(No symbol) [0x00007FF60B13B1CA]
	(No symbol) [0x00007FF60B18EFD7]
	(No symbol) [0x00007FF60B18F22C]
	(No symbol) [0x00007FF60B1827CC]
	(No symbol) [0x00007FF60B1B672F]
	(No symbol) [0x00007FF60B1826A

In a year I have 15 matches (number of link), each match has a max number of 5 matches and on avg 4 matches. The number of single matches should be therefore between 60 and 70. We have two observations (one for each player or team) for each match for a total of 120/140 rows (max 75*2 = 150).

We have 5 players in each team, two team in each match, and 15 matches for a total of (5 * 2 * 15) 150 players/teams.

players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations.

# Retrieve all years links from 2019

In [9]:
# webpage to scrape
url = ("https://www.daviscup.com/en/draws-results/finals/2019.aspx")
# Initialize the WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Initialize an empty DataFrame to store all the data
final_df = pd.DataFrame()

# Loop through selected years (2019 to 2023)
for year in range(2019, 2024):  # Change the range accordingly
    current_url = f"https://www.daviscup.com/en/draws-results/finals/{year}.aspx"
    
    try:
        driver.get(current_url)
        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed
        time.sleep(10)

        # Find all links with class "tie-link" within the tables
        tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")
        
        # Initialize a list to store the extracted links
        links = []

        # Extract the links
        for link in tie_links:
            href = link.get_attribute("href")
            links.append(href)

        # Call the function to scrape and transform the data
        matches_df, players_df = to_df(links)
        # Cleaning DataFrames
        cleaned_matches_df = clean_matches_df(matches_df)
        cleaned_players_df = clean_players_df(players_df)

        # Merging DataFrames
        merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)
        # Add a column with the respective year information
        merged_df['year'] = year

        # Append data to the final DataFrame
        final_df = pd.concat([final_df, merged_df], ignore_index=True)
        # Save the current year's data to a separate Excel file in the specified directory
        year_file_name = f"{save_directory}davis_{year}.xlsx"
        final_df.to_excel(year_file_name, index=False)

    except TimeoutException:
        print(f"TimeoutException occurred while processing year {year}. Skipping...")
        continue  # Skip to the next iteration if page not loaded within the timeout

    except Exception as e:
        print(f"An error occurred for year {year}: {str(e)}")
        continue  # Continue to the next iteration if an error occurs

# Close the WebDriver
driver.quit()



Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2019-FLS-M-SRB-RUS-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2019-FLS-M-AUS-CAN-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2019-FLS-M-GBR-GER-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
Skipping match 3
Skipping match 3
matches df downloaded
players df downloaded
Processing match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2019-FLS-M-ARG-ESP-01
Initializing Selenium...
Navigating to the webpage...
Waiting for the page to load...
matches df downloaded
players df downloaded
Processing match: https://www.daviscup.com/en/draw

In a year I have 7 matches (number of link), each match has a max number of 3 matches. The number of single matches should be 21. We have two observations (one for each player or team) for each match for a total of 42 (21*2) rows.

We have 5 players in each team, two team in each match, and 7 matches for a total of (5 * 2 * 7) 70 players/teams.
We have on avg 5 player per team and 8 teams competing from the quarter finals, for a total of 40 (8*5) players.

players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations.

In [10]:
# check if each team has, on average, 5 players

# Group by 'Team Name' and count unique players
unique_players_per_team = cleaned_players_df.groupby('team_name')['player'].nunique()

# Display the counts
print(unique_players_per_team)



team_name
AUSTRALIA        5
CANADA           5
CZECHIA          4
FINLAND          5
GREAT BRITAIN    5
ITALY            5
NETHERLANDS      5
SERBIA           5
Name: player, dtype: int64


## concatenate all xlsx in a unique file

In [11]:


# File path pattern to match all 'davis_{year}.xlsx' files in the specified directory
file_pattern = os.path.join(directory, 'davis_*.xlsx')

# Get a list of all file paths matching the pattern
file_paths = glob.glob(file_pattern)

# Create an empty list to store DataFrame objects
dataframes = []

# Read each Excel file into a DataFrame, standardize column names to lowercase, and append to the list
for file_path in file_paths:
    df = pd.read_excel(file_path)
    
    # Convert all column names to lowercase
    df.columns = df.columns.str.lower()
    
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the concatenated DataFrame to a new Excel file
combined_df.to_excel(os.path.join(directory, 'combined_davis.xlsx'), index=False)

# Verify the operation
print("All files have been successfully combined and saved to 'combined_davis.xlsx'.")


All files have been successfully combined and saved to 'combined_davis.xlsx'.
