# Libraries

In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time

In [6]:
user = "aldi"

# Functions to extract matches and players ' information

In [7]:
def extract_matches_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to be loaded
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")
        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "main"
        main_element = driver.find_element(By.CLASS_NAME, "main")

        # Include the component_title_text in the stage variable
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # ---------------------------------------------------------------------------------------------

        # Now, let's find the div element with class "tie" within the main element
        tie_element = main_element.find_element(By.CLASS_NAME, "details")

        # Find all sub div elements within the "tie" element
        sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

        # Initialize variables to store data
        column_data = {}

        for sub_div_element in sub_div_elements:
            sub_div_text = sub_div_element.text.strip()
            if ":" in sub_div_text:
                column_name, column_value = sub_div_text.split(":", 1)
                column_data[column_name] = [column_value]

        # Create a DataFrame from the collected data
        df = pd.DataFrame(column_data)

        # Add the "Stage" column with the component_title_text
        df["Stage"] = stage

        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "rubber-header"
        rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
        match_num = []
        match_status = []

        for rubber_header_element in rubber_header_elements:

            # Extract "match" and "match status" from the span elements
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")

            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())
            #       match = span_element.text.strip()

        # ---------------------------------------------------------------------------------------------
        # Now, let's find the div element with class "rubber-body" within the main element
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []

        match_idx = -1
        for rubber_body_element in rubber_body_elements:
            match_idx += 1

            # Find all tables with class "dc" within the rubber-body
            table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")

            for table_element in table_elements:
                # Initialize data for each table
                table_data = {
                    "Player": [],
                    "Set 1": [],
                    "Set 2": [],
                    "Set 3": [],
                    "Tie-Break 1": [],
                    "Tie-Break 2": [],
                    "Tie-Break 3": []
                }

                # Find the table body
                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

                # Find all rows (tr elements) within the tbody
                rows = tbody_element.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    # Find all td elements within the row
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    # Extract and store the information starting from td_elements[1]
                    player = td_elements[1].text.strip()

                    # Skip set and tie-break infos if match hasn't been played
                    if match_status[match_idx] == "NOT PLAYED":
                        print(f"Skipping match {match_idx + 1}")
                        continue

                    # Extract results from td class "results"
                    results = td_elements[2]
                    set_scores = results.find_elements(By.TAG_NAME, "span")

                    set_results = []
                    tie_breaks = []

                    for set_score in set_scores:
                        set_result = set_score.text.strip()
                        tie_break = ""

                        # Use regular expressions to extract the first number in set_result
                        match = re.search(r'\d+', set_result)
                        if match:
                            set_result = match.group()
                        else:
                            set_result = ""

                        if set_score.find_elements(By.TAG_NAME, "sup"):
                            tie_break = set_score.find_element(By.TAG_NAME, "sup").text.strip()
                            # Use regular expressions to extract the first number in tie_break
                            match = re.search(r'\d+', tie_break)
                            if match:
                                tie_break = match.group()
                            else:
                                tie_break = ""

                        set_results.append(set_result)
                        tie_breaks.append(tie_break)

                    # Ensure there are at most 3 sets
                    set_results = set_results[:3]
                    tie_breaks = tie_breaks[:3]

                    # Assign the extracted values to the dictionary
                    table_data["Player"].append(player)
                    table_data["Set 1"].append(set_results[0])
                    table_data["Set 2"].append(set_results[1])
                    table_data["Set 3"].append(set_results[2])

                    # Keep only the first element in the list for tie-breaks
                    for i, tie_break in enumerate(tie_breaks):
                        if i == 0 and tie_break:
                            table_data["Tie-Break 1"].append(tie_break)
                        else:
                            table_data[f"Tie-Break {i + 1}"].append(None)

                # Append the table data to the list
                tables_data.append(table_data)
                print(table_data)

        # Close the Selenium WebDriver
        driver.quit()

        # Create a DataFrame from the collected data
        tables_df = pd.DataFrame(tables_data)

        # Combine the information from both DataFrames
        matches_df = pd.concat([df] * len(tables_df), ignore_index=True)
        matches_df = pd.concat([matches_df, tables_df], axis=1)

        # Add match and match status columns
        matches_df["match status"] = ""
        matches_df["match"] = ""
        for i in range(len(match_status)):
            matches_df.loc[i * 2:(i * 2) + 1, "match status"] = match_status[i]
            matches_df.loc[i * 2:(i * 2) + 1, "match"] = match_num[i]

        # Display the combined DataFrame
        print("Combined DataFrame:")
        print(matches_df)
        return matches_df
    except Exception as e:
        print("Error:", str(e))
        return None


In [8]:
def extract_players_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService("C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to load
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")

        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed

        # Find all div elements with class "team-nominations-col"
        team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

        # Initialize a list to store the paired data
        paired_data = []

        # Initialize a list to store the column names
        column_names = set()

        # Loop through each "team-nominations-col" element
        for team_nominations_col_element in team_nominations_col_elements:
            # Extract the team name
            team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
            team_name = team_name_element.text.strip()

            # Find "players-info" elements and extract text from "ng-binding" elements
            players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")

            for players_info_element in players_info_elements:
                ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")

                # Create a dictionary for the row
                row_data = {"Team Name": team_name}

                for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                    row_data[f"Info {i}"] = ng_binding_element.text.strip()
                    column_names.add(f"Info {i}")

                paired_data.append(row_data)

        # Create a Pandas DataFrame from the paired data
        players_df = pd.DataFrame(paired_data)

        # Reorder columns to match the column names
        players_df = players_df[["Team Name"] + sorted(column_names)]

        # Now you have a DataFrame with team names and player information in separate columns
        print(players_df)

        return players_df

    except Exception as e:
        print("Error:", str(e))
        return None


# Extract links for all matches in a year and save them in lists

In [9]:

# Initialize Selenium WebDriver
driver = webdriver.Chrome(executable_path="C:/Users/{user}/Downloads/chromedriver.exe")

# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Create an empty list to store the results
    match_results = []
    player_results = []

    # Iterate through the links and apply the functions
    for link in links:
        matches_df = extract_matches_df(link)
        players_df = extract_players_df(link)
        
        if matches_df is not None:
            match_results.append(matches_df)
        if players_df is not None:
            player_results.append(players_df)

    # Close the Selenium WebDriver
    driver.quit()
    

except Exception as e:
    print("Error:", str(e))
finally:
    driver.quit()


  


WebDriverException: Message: 'chromedriver.exe' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home


## output

In [None]:
match_results

In [None]:
player_results

# Create and clean dataframe

In [None]:


def to_df(links):
    # Initialize ChromeOptions with headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Initialize Selenium WebDriver with the provided executable path and headless option
    driver = webdriver.Chrome(executable_path="C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)

    try:
        # Create empty lists to store the results
        match_results = []
        player_results = []

        for link in links:
            # Apply the functions to extract match and player data
            matches_df = extract_matches_df(link)
            players_df = extract_players_df(link)

            if matches_df is not None:
                match_results.append(matches_df)
            if players_df is not None:
                player_results.append(players_df)

        # Close the Selenium WebDriver
        driver.quit()

        # Create DataFrames from the lists
        matches_df = pd.concat(match_results, ignore_index=True)
        players_df = pd.concat(player_results, ignore_index=True)

        return matches_df, players_df

    except Exception as e:
        print("Error:", str(e))
        driver.quit()

    return None, None  # Return None if there was an error




# Main function to scrape data for a single year and save the data in a dataframe format

In [None]:
# Usage example:
# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver = webdriver.Chrome(executable_path="C:/Users/{user}/Downloads/chromedriver.exe")
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Call the function to scrape and transform the data
    matches_df, players_df = to_df(links)

finally:
    driver.quit()

## outputs

In [None]:
matches_df.shape

In [None]:
matches_df.iloc[90:110]

In [None]:
players_df.shape

In [None]:
players_df.iloc[90:110]

In [None]:
duplicate_rows = players_df.duplicated()
duplicate_rows_df = players_df[duplicate_rows]
duplicate_rows_df

# Clean players and matches dataframes

In [None]:
def clean_players_df(players_df):
    # Remove rows where Info 1 is equal to "captain"
    players_df = players_df[players_df["Info 1"] != "Captain"]
    
    # Rename the columns
    players_df = players_df.rename(columns={"Info 1": "Player", "Info 2": "DOB", "Info 3": "Single Ranking", "Info 4": "Doubles Ranking"})
    
    # Remove text before ":" in the specified columns
    players_df["DOB"] = players_df["DOB"].str.split(":", expand=True)[1].str.strip()
    players_df["Single Ranking"] = players_df["Single Ranking"].str.split(":", expand=True)[1].str.strip()
    players_df["Doubles Ranking"] = players_df["Doubles Ranking"].str.split(":", expand=True)[1].str.strip()
    
    # Keep the first letter in each word in uppercase for the "Player" column in players_df
    players_df['Player'] = players_df['Player'].str.title()
    
    return players_df



In [None]:
# Usage example:
# Call the clean_df_players function with your players_df DataFrame
cleaned_players_df = clean_players_df(players_df)


In [None]:
cleaned_players_df.iloc[130:145]

In [None]:

def clean_matches_df(matches_df):
    # Convert "Player" and set columns to string
    columns_to_convert = ["Player", "Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]
    matches_df[columns_to_convert] = matches_df[columns_to_convert].astype(str)

    # Define a function to apply regular expression replacements
    def apply_regex_replacements(df, columns):
        for col in columns:
            df[col] = df[col].str.replace(r'[\[\]\']+| and ', '', regex=True)

    # Apply the function to the specified columns
    columns_to_clean = ["Player", "Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]
    apply_regex_replacements(matches_df, columns_to_clean)

    # Apply the str.extract method with the specified regular expression
    split_players = matches_df['Player'].str.extract(r'^(.*?)\\n(.*)$')
    
    # Create 'Player 1' and 'Player 2' columns
    matches_df['Player 1'] = split_players[0].fillna(matches_df['Player'])
    matches_df['Player 2'] = split_players[1].fillna('')
    
    # Drop the original 'Player' column
    matches_df.drop('Player', axis=1, inplace=True)
    
    # Keep the first letter in each word in uppercase for the "Player 1" and "Player 2" columns
    matches_df["Player 1"] = matches_df["Player 1"].str.title()
    matches_df["Player 2"] = matches_df["Player 2"].str.title()
    
    # Define a function to convert specific columns from string to numeric
    def convert_columns_to_numeric(df, columns):
        for col in columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Specify the columns to convert
    columns_to_convert = ["Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]

    # Apply the function to convert the specified columns
    convert_columns_to_numeric(matches_df, columns_to_convert)
    
    return matches_df



In [None]:
# Usage example:
# Call the clean_df_combined function with your combined_df DataFrame
cleaned_matches_df = clean_matches_df(matches_df)


In [None]:
cleaned_matches_df.iloc[80:110]

# Merge players and matches dataframes

In [None]:
def merge_data_frames(df1, df2):
    # First merge based on "Player" matching "Player 1"
    merged_df = df1.merge(df2, left_on='Player 1', right_on='Player', how='left')

    # Second merge based on "Player" matching "Player 2"
    merged_df = merged_df.merge(df2, left_on='Player 2', right_on='Player', how='left')

    # Rename columns if needed
    merged_df.rename(columns={'Name_x': 'Player_1_Name', 'Country_x': 'Player_1_Country',
                             'Name_y': 'Player_2_Name', 'Country_y': 'Player_2_Country'}, inplace=True)

    # Drop the duplicated columns
    merged_df.drop(['Player_x', 'Player_y'], axis=1, inplace=True)

    return merged_df


In [None]:
merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)

In [None]:
merged_df.iloc[260:290]

# Export dataframe

In [None]:
# Specify the Excel file path
excel_file_path = 'merged_data.xlsx'

# Export merged_df to Excel
merged_df.to_excel(excel_file_path, index=False)

print(f"Data has been exported to {excel_file_path}")


# Retrieve all years links

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Path to the ChromeDriver
chrome_driver_path = "C:/Users/{user}/Downloads/chromedriver.exe"

# Initialize the WebDriver
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get("https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx")
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Click on Accept All Cookies button
acceptCookie_Btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", acceptCookie_Btn)

# Click on the dropdown arrow
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='selected']//div[@class='arrow']"))).click()
# Click on dropdown items from 2015 to 2018
for year in range(2015, 2019):
    year_xpath = f"//a[text()='{year}']"
    year_element = wait.until(EC.element_to_be_clickable((By.XPATH, year_xpath)))
    year_element.click()