# Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time
import sys

In [2]:
user = "ALESSANDRO"

# Lists functions to extract matches and players ' information

## matches 

In [3]:
def extract_matches_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to be loaded
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")
        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "main"
        main_element = driver.find_element(By.CLASS_NAME, "main")

        # Include the component_title_text in the stage variable
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # ---------------------------------------------------------------------------------------------

        # Now, let's find the div element with class "tie" within the main element
        tie_element = main_element.find_element(By.CLASS_NAME, "details")

        # Find all sub div elements within the "tie" element
        sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

        # Initialize variables to store data
        column_data = {}

        for sub_div_element in sub_div_elements:
            sub_div_text = sub_div_element.text.strip()
            if ":" in sub_div_text:
                column_name, column_value = sub_div_text.split(":", 1)
                column_data[column_name] = [column_value]

        # Create a DataFrame from the collected data
        df = pd.DataFrame(column_data)

        # Add the "Stage" column with the component_title_text
        df["Stage"] = stage

        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "rubber-header"
        rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
        match_num = []
        match_status = []

        for rubber_header_element in rubber_header_elements:

            # Extract "match" and "match status" from the span elements
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")

            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())
            #       match = span_element.text.strip()

        # ---------------------------------------------------------------------------------------------
        # Now, let's find the div element with class "rubber-body" within the main element
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []

        match_idx = -1
        for rubber_body_element in rubber_body_elements:
            match_idx += 1

            # Find all tables with class "dc" within the rubber-body
            table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")

            for table_element in table_elements:
                # Initialize data for each table
                table_data = {
                    "Player": [],
                    "Set 1": [],
                    "Set 2": [],
                    "Set 3": [],
                    "Set 4": [],
                    "Set 5": [],
                    "Tie-Break 1": [],
                    "Tie-Break 2": [],
                    "Tie-Break 3": [],
                    "Tie-Break 4": [],
                    "Tie-Break 5": []
                }

                # Find the table body
                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

                # Find all rows (tr elements) within the tbody
                rows = tbody_element.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    # Find all td elements within the row
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    # Extract and store the information starting from td_elements[1]
                    player = td_elements[1].text.strip()

                    # Skip set and tie-break infos if match hasn't been played
                    if match_status[match_idx] == "NOT PLAYED":
#                         print(f"Skipping match {match_idx + 1}")
                        continue

                    # Extract results from td class "results"
                    results = td_elements[2]
                    set_scores = results.find_elements(By.TAG_NAME, "span")

                    set_results = []
                    tie_breaks = []

                    for set_score in set_scores:
                        scores = set_score.text.strip().split()  # Split by whitespace to extract numbers
#                         print(scores)
                        set_result = scores[0] if scores else ""
#                         print(set_result)
                        tie_break = scores[1] if len(scores) > 1 else ""
#                         print(tie_break)
                        
                      # Append the extracted values to respective lists
                        set_results.append(set_result)
                        tie_breaks.append(tie_break)

                    # Ensure there are at most 5 sets
                    set_results = set_results[:5]
                    tie_breaks = tie_breaks[:5]
                    
                    # check lists' length
#                     print(f"Length of set_results: {len(set_results)}")
#                     print(f"Length of tie_breaks: {len(tie_breaks)}")
     

                    
                    # Assign the extracted values to the dictionary
                    table_data["Player"].append(player)
                    table_data["Set 1"].append(set_results[0])
                    table_data["Set 2"].append(set_results[1])
                    table_data["Set 3"].append(set_results[2] if len(set_results) > 2 else None)
                    table_data["Set 4"].append(set_results[3] if len(set_results) > 3 else None)  
                    table_data["Set 5"].append(set_results[4] if len(set_results) > 4 else None)
                    table_data["Tie-Break 1"].append(tie_breaks[0])
                    table_data["Tie-Break 2"].append(tie_breaks[1])
                    table_data["Tie-Break 3"].append(tie_breaks[2] if len(tie_breaks) > 2 else None)
                    table_data["Tie-Break 4"].append(tie_breaks[3] if len(tie_breaks) > 3 else None)  
                    table_data["Tie-Break 5"].append(tie_breaks[4] if len(tie_breaks) > 4 else None)



                # Append the table data to the list
                tables_data.append(table_data)

        # Close the Selenium WebDriver
        driver.quit()

        # Create a DataFrame from the collected data
        tables_df = pd.DataFrame(tables_data)

        # Combine the information from both DataFrames
        matches_df = pd.concat([df] * len(tables_df), ignore_index=True)
        matches_df = pd.concat([matches_df, tables_df], axis=1)

        # Add match and match status columns
        matches_df["match status"] = ""
        matches_df["match"] = ""
        for i in range(len(match_status)):
            matches_df.loc[i * 2:(i * 2) + 1, "match status"] = match_status[i]
            matches_df.loc[i * 2:(i * 2) + 1, "match"] = match_num[i]

        return matches_df
    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(f"[ERROR] Line {exc_traceback.tb_lineno}: str(e)")
        return None


## players

In [4]:
def extract_players_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to load
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")

        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed

        # Find all div elements with class "team-nominations-col"
        team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

        # Initialize a list to store the paired data
        paired_data = []

        # Initialize a list to store the column names
        column_names = set()

        # Loop through each "team-nominations-col" element
        for team_nominations_col_element in team_nominations_col_elements:
            # Extract the team name
            team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
            team_name = team_name_element.text.strip()

            # Find "players-info" elements and extract text from "ng-binding" elements
            players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")

            for players_info_element in players_info_elements:
                ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")

                # Create a dictionary for the row
                row_data = {"Team Name": team_name}

                for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                    row_data[f"Info {i}"] = ng_binding_element.text.strip()
                    column_names.add(f"Info {i}")

                paired_data.append(row_data)

        # Create a Pandas DataFrame from the paired data
        players_df = pd.DataFrame(paired_data)

        # Reorder columns to match the column names
        players_df = players_df[["Team Name"] + sorted(column_names)]

        # Now you have a DataFrame with team names and player information in separate columns

        return players_df

    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(f"[ERROR] Line {exc_traceback.tb_lineno}: str(e)")
        return None


# Create and clean dataframe function

In [5]:


def to_df(links):
    # Initialize ChromeOptions with headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Initialize Selenium WebDriver with the provided executable path and headless option
    driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)

    try:
        # Create empty lists to store the results
        match_results = []
        player_results = []

        for link in links:
            
            print(f"match: {link}")  # Print the link being analyzed

            # Apply the functions to extract match and player data
            matches_df = extract_matches_df(link)
            players_df = extract_players_df(link)

            if matches_df is not None:
                match_results.append(matches_df)
            if players_df is not None:
                player_results.append(players_df)

        # Close the Selenium WebDriver
        driver.quit()

        # Create DataFrames from the lists
        matches_df = pd.concat(match_results, ignore_index=True)
        players_df = pd.concat(player_results, ignore_index=True)
        
        # Check the number of rows in the DataFrames
        print("Total number of rows in matches_df:", len(matches_df))
        print("Total number of rows in players_df:", len(players_df))

        return matches_df, players_df

    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(f"[ERROR] Line {exc_traceback.tb_lineno}: str(e)")
        driver.quit()

    return None, None  # Return None if there was an error




# Scrape data for a single year and save the data in a dataframe format

In [6]:
# Usage example:
# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe")
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Call the function to scrape and transform the data
    matches_df, players_df = to_df(links)

finally:
    driver.quit()

  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe")
  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-ITA-JPN-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-GBR-ESP-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-AUS-GER-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-KAZ-SUI-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-CAN-CRO-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-USA-SRB-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-HUN-BEL-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-ITA-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG

## outputs

In a year I have 15 matches (number of link), each match has a max number of 5 matches and on avg 4 matches. The number of single matches should be therefore between 60 and 70. We have two observations (one for each player or team) for each match for a total of 120/140 rows.

We have 5 players in each team, two team in each match, and 15 matches for a total of (5 * 2 * 15) 150 players/teams.

players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations. 

In [7]:
duplicate_rows = players_df.duplicated()
duplicate_rows_df = players_df[duplicate_rows]
duplicate_rows_df.head(20)

Unnamed: 0,Team Name,Info 1,Info 2,Info 3,Info 4
93,ITALY,Fabio FOGNINI,Date of birth: 24 May 1987,Singles ranking: 131,Doubles ranking: 133
94,ITALY,Paolo LORENZI,Date of birth: 15 Dec 1981,Singles ranking:,Doubles ranking:
95,ITALY,Andreas SEPPI,Date of birth: 21 Feb 1984,Singles ranking:,Doubles ranking:
97,ITALY,Simone BOLELLI,Date of birth: 08 Oct 1985,Singles ranking:,Doubles ranking: 55
98,ITALY,Captain,Corrado BARAZZUTTI,,
99,FRANCE,Lucas POUILLE,Date of birth: 23 Feb 1994,Singles ranking: 335,Doubles ranking: 840
100,FRANCE,Adrian MANNARINO,Date of birth: 29 Jun 1988,Singles ranking: 22,Doubles ranking: 282
101,FRANCE,Pierre-Hugues HERBERT,Date of birth: 18 Mar 1991,Singles ranking: 250,Doubles ranking: 77
103,FRANCE,Nicolas MAHUT,Date of birth: 21 Jan 1982,Singles ranking:,Doubles ranking: 38
104,FRANCE,Captain,Yannick NOAH,,


# Clean players and matches dataframes

## clean players dataframe

In [8]:
def clean_players_df(players_df):
    # Remove rows where Info 1 is equal to "captain"
    players_df = players_df[players_df["Info 1"] != "Captain"]
    
    # Rename the columns
    players_df = players_df.rename(columns={"Info 1": "Player", "Info 2": "DOB", "Info 3": "Single Ranking", "Info 4": "Doubles Ranking"})
    
    # Remove text before ":" in the specified columns
    players_df["DOB"] = players_df["DOB"].str.split(":", expand=True)[1].str.strip()
    players_df["Single Ranking"] = players_df["Single Ranking"].str.split(":", expand=True)[1].str.strip()
    players_df["Doubles Ranking"] = players_df["Doubles Ranking"].str.split(":", expand=True)[1].str.strip()
    
    # Keep the first letter in each word in uppercase for the "Player" column in players_df
    players_df['Player'] = players_df['Player'].str.title()
    
    return players_df



In [9]:
# Usage example:
# Call the clean_df_players function with your players_df DataFrame
cleaned_players_df = clean_players_df(players_df)


In [10]:
cleaned_players_df.iloc[130:145]

Unnamed: 0,Team Name,Player,DOB,Single Ranking,Doubles Ranking
157,USA,Steve Johnson,24 Dec 1989,239.0,
158,USA,Frances Tiafoe,20 Jan 1998,16.0,194.0
159,USA,Ryan Harrison,07 May 1992,836.0,839.0
160,USA,Sam Querrey,07 Oct 1987,,
161,USA,Mike Bryan,29 Apr 1978,,
163,FRANCE,Lucas Pouille,23 Feb 1994,335.0,840.0
164,FRANCE,Jeremy Chardy,12 Feb 1987,508.0,113.0
165,FRANCE,Pierre-Hugues Herbert,18 Mar 1991,250.0,77.0
166,FRANCE,Nicolas Mahut,21 Jan 1982,,38.0
167,FRANCE,Jo-Wilfried Tsonga,17 Apr 1985,,


## clean matches dataframe

In [11]:

def clean_matches_df(matches_df):
    # Convert "Player" and set columns to string
    columns_to_convert = ["Player", "Set 1", "Set 2", "Set 3", "Set 4", "Set 5",
                          "Tie-Break 1", "Tie-Break 2", "Tie-Break 3", "Tie-Break 4", "Tie-Break 5"]
    matches_df[columns_to_convert] = matches_df[columns_to_convert].astype(str)

    # Define a function to apply regular expression replacements
    def apply_regex_replacements(df, columns):
        for col in columns:
            df[col] = df[col].str.replace(r'[\[\]\']+| and ', '', regex=True)

    # Apply the function to the specified columns
    columns_to_clean = ["Player", "Set 1", "Set 2", "Set 3", "Set 4", "Set 5",
                          "Tie-Break 1", "Tie-Break 2", "Tie-Break 3", "Tie-Break 4", "Tie-Break 5"]
    apply_regex_replacements(matches_df, columns_to_clean)

    # Apply the str.extract method with the specified regular expression
    split_players = matches_df['Player'].str.extract(r'^(.*?)\\n(.*)$')
    
    # Create 'Player 1' and 'Player 2' columns
    matches_df['Player 1'] = split_players[0].fillna(matches_df['Player'])
    matches_df['Player 2'] = split_players[1].fillna('')
    
    # Drop the original 'Player' column
    matches_df.drop('Player', axis=1, inplace=True)
    
    # Keep the first letter in each word in uppercase for the "Player 1" and "Player 2" columns
    matches_df["Player 1"] = matches_df["Player 1"].str.title()
    matches_df["Player 2"] = matches_df["Player 2"].str.title()
    
    # Define a function to convert specific columns from string to numeric
    def convert_columns_to_numeric(df, columns):
        for col in columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Specify the columns to convert
    columns_to_convert = ["Set 1", "Set 2", "Set 3", "Set 4", "Set 5",
                          "Tie-Break 1", "Tie-Break 2", "Tie-Break 3", "Tie-Break 4", "Tie-Break 5"]

    # Apply the function to convert the specified columns
    convert_columns_to_numeric(matches_df, columns_to_convert)
    
    return matches_df



In [12]:
# Usage example:
# Call the clean_df_combined function with your combined_df DataFrame
cleaned_matches_df = clean_matches_df(matches_df)


In [13]:
cleaned_matches_df.iloc[80:110]

Unnamed: 0,Date,Venue,Surface,Ball,Stage,Set 1,Set 2,Set 3,Set 4,Set 5,Tie-Break 1,Tie-Break 2,Tie-Break 3,Tie-Break 4,Tie-Break 5,match status,match,Court Pace Rating,Player 1,Player 2
80,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,3.0,2.0,6.0,6.0,1.0,,,,,,PLAYED & COMPLETED,MATCH 1,,Andreas Seppi,
81,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,6.0,6.0,4.0,3.0,6.0,,,,,,PLAYED & COMPLETED,MATCH 1,,Lucas Pouille,
82,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,6.0,6.0,6.0,6.0,,6.0,,,,,PLAYED & COMPLETED,MATCH 2,,Fabio Fognini,
83,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,7.0,2.0,2.0,3.0,,8.0,,,,,PLAYED & COMPLETED,MATCH 2,,Jeremy Chardy,
84,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,4.0,3.0,1.0,,,,,,,,PLAYED & COMPLETED,MATCH 3,,Simone Bolelli,Fabio Fognini
85,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,6.0,6.0,6.0,,,,,,,,PLAYED & COMPLETED,MATCH 3,,Pierre-Hugues Herbert,Nicolas Mahut
86,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,6.0,1.0,6.0,3.0,,,,3.0,,,PLAYED & COMPLETED,MATCH 4,,Fabio Fognini,
87,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,2.0,6.0,7.0,6.0,,,,7.0,,,PLAYED & COMPLETED,MATCH 4,,Lucas Pouille,
88,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,,,,,,,,,,,NOT PLAYED,MATCH 5,,,
89,06 Apr - 08 Apr 2018,"Valletta Cambiaso ASD, Genoa, Italy","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP QUARTERFINAL,,,,,,,,,,,NOT PLAYED,MATCH 5,,,


# Merge players and matches dataframes

In [14]:
def merge_data_frames(df1, df2):
    merged_df = df1.merge(df2, left_on='Player 1', right_on='Player', how='left')
    merged_df = merged_df.drop_duplicates()

    # Second merge based on 'Player 2'
    final_merged = merged_df.merge(df2, left_on='Player 2', right_on='Player', how='left')
    final_merged = final_merged.drop_duplicates()

    # Dropping specified columns
    columns_to_drop = ['Player_x', 'Player_y', 'Team Name_y']
    final_merged = final_merged.drop(columns_to_drop, axis=1)

    # Renaming columns based on suffixes
    final_merged = final_merged.rename(columns=lambda x: x.replace('_x', '_Player1').replace('_y', '_Player2'))

    # Renaming 'Team Name_Player1' to 'Team Name'
    final_merged = final_merged.rename(columns={'Team Name_Player1': 'Team Name'})

    return final_merged


In [15]:
merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)

In [16]:
merged_df.shape

(140, 27)

In [17]:
merged_df.head()

Unnamed: 0,Date,Venue,Surface,Ball,Stage,Set 1,Set 2,Set 3,Set 4,Set 5,...,Court Pace Rating,Player 1,Player 2,Team Name,DOB_Player1,Single Ranking_Player1,Doubles Ranking_Player1,DOB_Player2,Single Ranking_Player2,Doubles Ranking_Player2
0,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,6.0,3.0,3.0,,,...,,Adrian Mannarino,,FRANCE,29 Jun 1988,22,282.0,,,
1,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,7.0,6.0,6.0,,,...,,Thiemo De Bakker,,NETHERLANDS,19 Sep 1988,707,686.0,,,
2,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,6.0,7.0,3.0,7.0,,...,,Richard Gasquet,,FRANCE,18 Jun 1986,76,,,,
3,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,4.0,6.0,6.0,5.0,,...,,Robin Haase,,NETHERLANDS,06 Apr 1987,693,41.0,,,
4,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,7.0,6.0,6.0,7.0,,...,,Pierre-Hugues Herbert,Nicolas Mahut,FRANCE,18 Mar 1991,250,77.0,21 Jan 1982,,38.0


# Export dataframe

In [18]:
# Specify the Excel file path
excel_file_path = 'merged_data.xlsx'

# Export merged_df to Excel
merged_df.to_excel(excel_file_path, index=False)

print(f"Data has been exported to {excel_file_path}")


Data has been exported to merged_data.xlsx


# Retrieve all years links

In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import TimeoutException
import pandas as pd

# webpage to scrape
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"

# Initialize the WebDriver
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Click on Accept All Cookies button
acceptCookie_Btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", acceptCookie_Btn)

# Dictionary to store matches_df and players_df for each year
data_dict = {}
# Loop through the years in reverse (from 2018 to 2013)
for year in range(2018, 2012, -1):  # Loop from 2018 to 2013 (exclusive) in reverse
    # Click on the dropdown arrow
    wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='selected']//div[@class='arrow']"))).click()
    time.sleep(4)  # Introduce a delay before clicking the element
    year_xpath = f"//a[text()='{year}']"
    print(year_xpath)
    
    try:
        # Locate the element for the specific year and click it
        year_element = wait.until(EC.presence_of_element_located((By.XPATH, year_xpath)))
        print(year_element)        
        # Click on the element
        year_element.click()
        
        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed
        time.sleep(10)

        # Find all links with class "tie-link" within the tables
        tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

        # Initialize a list to store the extracted links
        links = []

        # Extract the links
        for link in tie_links:
            href = link.get_attribute("href")
            links.append(href)

        # Call the function to scrape and transform the data
        matches_df, players_df = to_df(links)
        #print(links)

        
    except TimeoutException:
        print(f"TimeoutException occurred while locating element for year {year}. Skipping...")
        continue  # Skip to the next iteration if element not found within the timeout

  driver = webdriver.Chrome(executable_path=chrome_driver_path)


//a[text()='2018']
<selenium.webdriver.remote.webelement.WebElement (session="788ddcf468030ff7a8e20ee6f3b7e8c8", element="A09E31D706995521AE05C8E48BB43D0E_element_97")>


  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-ITA-JPN-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-GBR-ESP-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-AUS-GER-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-KAZ-SUI-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-CAN-CRO-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-USA-SRB-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-HUN-BEL-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-ITA-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG

  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-ARG-ITA-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-BEL-GER-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-CZE-AUS-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-SUI-USA-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-JPN-FRA-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-CAN-GBR-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-RUS-SRB-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2017-WG-M-ESP-CRO-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
m

  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-GBR-JPN-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-SRB-KAZ-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-SUI-ITA-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-ARG-POL-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-CAN-FRA-01
Loaded!
[ERROR] Line 149: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-GER-CZE-01
Loaded!
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-USA-AUS-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2016-WG-M-CRO-BEL-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
m

  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-FRA-GER-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-USA-GBR-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-CZE-AUS-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-ITA-KAZ-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-BRA-ARG-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-CRO-SRB-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-JPN-CAN-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2015-WG-M-BEL-SUI-01
Loaded!
[ERROR] L

  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-CZE-NED-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-CAN-JPN-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-ESP-GER-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-FRA-AUS-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-GBR-USA-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!


Exception ignored in: <function Service.__del__ at 0x000002349F723A60>
Traceback (most recent call last):
  File "C:\Users\ALESSANDRO\anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 172, in __del__
    self.stop()
  File "C:\Users\ALESSANDRO\anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 147, in stop
    self.send_remote_shutdown_command()
  File "C:\Users\ALESSANDRO\anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 123, in send_remote_shutdown_command
    url_request.urlopen("%s/shutdown" % self.service_url)
  File "C:\Users\ALESSANDRO\anaconda3\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\ALESSANDRO\anaconda3\lib\urllib\request.py", line 525, in open
    response = self._open(req, data)
  File "C:\Users\ALESSANDRO\anaconda3\lib\urllib\request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "C:\Users\ALES

match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-ITA-ARG-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-BEL-KAZ-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-SUI-SRB-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-CZE-JPN-01
Loaded!
[ERROR] Line 171: str(e)
Loaded!
match: https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2014-WG-M-GER-FRA-01
Loaded!


KeyboardInterrupt: 