# Libraries

In [150]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time

In [151]:
user = "aldi"

# Lists functions to extract matches and players ' information

## matches 

In [152]:
def extract_matches_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to be loaded
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")
        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "main"
        main_element = driver.find_element(By.CLASS_NAME, "main")

        # Include the component_title_text in the stage variable
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # ---------------------------------------------------------------------------------------------

        # Now, let's find the div element with class "tie" within the main element
        tie_element = main_element.find_element(By.CLASS_NAME, "details")

        # Find all sub div elements within the "tie" element
        sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

        # Initialize variables to store data
        column_data = {}

        for sub_div_element in sub_div_elements:
            sub_div_text = sub_div_element.text.strip()
            if ":" in sub_div_text:
                column_name, column_value = sub_div_text.split(":", 1)
                column_data[column_name] = [column_value]

        # Create a DataFrame from the collected data
        df = pd.DataFrame(column_data)

        # Add the "Stage" column with the component_title_text
        df["Stage"] = stage

        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "rubber-header"
        rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
        match_num = []
        match_status = []

        for rubber_header_element in rubber_header_elements:

            # Extract "match" and "match status" from the span elements
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")

            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())
            #       match = span_element.text.strip()

        # ---------------------------------------------------------------------------------------------
        # Now, let's find the div element with class "rubber-body" within the main element
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []

        match_idx = -1
        for rubber_body_element in rubber_body_elements:
            match_idx += 1

            # Find all tables with class "dc" within the rubber-body
            table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")

            for table_element in table_elements:
                # Initialize data for each table
                table_data = {
                    "Player": [],
                    "Set 1": [],
                    "Set 2": [],
                    "Set 3": [],
                    "Tie-Break 1": [],
                    "Tie-Break 2": [],
                    "Tie-Break 3": []
                }

                # Find the table body
                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

                # Find all rows (tr elements) within the tbody
                rows = tbody_element.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    # Find all td elements within the row
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    # Extract and store the information starting from td_elements[1]
                    player = td_elements[1].text.strip()

                    # Skip set and tie-break infos if match hasn't been played
                    if match_status[match_idx] == "NOT PLAYED":
                        print(f"Skipping match {match_idx + 1}")
                        continue

                    # Extract results from td class "results"
                    results = td_elements[2]
                    set_scores = results.find_elements(By.TAG_NAME, "span")

                    set_results = []
                    tie_breaks = []

                    for set_score in set_scores:
                        set_result = set_score.text.strip()
                        tie_break = ""

                        # Use regular expressions to extract the first number in set_result
                        match = re.search(r'\d+', set_result)
                        if match:
                            set_result = match.group()
                        else:
                            set_result = ""

                        if set_score.find_elements(By.TAG_NAME, "sup"):
                            tie_break = set_score.find_element(By.TAG_NAME, "sup").text.strip()
                            # Use regular expressions to extract the first number in tie_break
                            match = re.search(r'\d+', tie_break)
                            if match:
                                tie_break = match.group()
                            else:
                                tie_break = ""

                        set_results.append(set_result)
                        tie_breaks.append(tie_break)

                    # Ensure there are at most 3 sets
                    set_results = set_results[:3]
                    tie_breaks = tie_breaks[:3]

                    # Assign the extracted values to the dictionary
                    table_data["Player"].append(player)
                    table_data["Set 1"].append(set_results[0])
                    table_data["Set 2"].append(set_results[1])
                    table_data["Set 3"].append(set_results[2])

                    # Keep only the first element in the list for tie-breaks
                    for i, tie_break in enumerate(tie_breaks):
                        if i == 0 and tie_break:
                            table_data["Tie-Break 1"].append(tie_break)
                        else:
                            table_data[f"Tie-Break {i + 1}"].append(None)

                # Append the table data to the list
                tables_data.append(table_data)
                print(table_data)

        # Close the Selenium WebDriver
        driver.quit()

        # Create a DataFrame from the collected data
        tables_df = pd.DataFrame(tables_data)

        # Combine the information from both DataFrames
        matches_df = pd.concat([df] * len(tables_df), ignore_index=True)
        matches_df = pd.concat([matches_df, tables_df], axis=1)

        # Add match and match status columns
        matches_df["match status"] = ""
        matches_df["match"] = ""
        for i in range(len(match_status)):
            matches_df.loc[i * 2:(i * 2) + 1, "match status"] = match_status[i]
            matches_df.loc[i * 2:(i * 2) + 1, "match"] = match_num[i]

        # Display the combined DataFrame
        print("Combined DataFrame:")
        print(matches_df)
        return matches_df
    except Exception as e:
        print("Error:", str(e))
        return None


## players

In [153]:
def extract_players_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to load
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")

        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed

        # Find all div elements with class "team-nominations-col"
        team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

        # Initialize a list to store the paired data
        paired_data = []

        # Initialize a list to store the column names
        column_names = set()

        # Loop through each "team-nominations-col" element
        for team_nominations_col_element in team_nominations_col_elements:
            # Extract the team name
            team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
            team_name = team_name_element.text.strip()

            # Find "players-info" elements and extract text from "ng-binding" elements
            players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")

            for players_info_element in players_info_elements:
                ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")

                # Create a dictionary for the row
                row_data = {"Team Name": team_name}

                for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                    row_data[f"Info {i}"] = ng_binding_element.text.strip()
                    column_names.add(f"Info {i}")

                paired_data.append(row_data)

        # Create a Pandas DataFrame from the paired data
        players_df = pd.DataFrame(paired_data)

        # Reorder columns to match the column names
        players_df = players_df[["Team Name"] + sorted(column_names)]

        # Now you have a DataFrame with team names and player information in separate columns
        print(players_df)

        return players_df

    except Exception as e:
        print("Error:", str(e))
        return None


# Create and clean dataframe function

In [154]:


def to_df(links):
    # Initialize ChromeOptions with headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Initialize Selenium WebDriver with the provided executable path and headless option
    driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)

    try:
        # Create empty lists to store the results
        match_results = []
        player_results = []

        for link in links:
            # Apply the functions to extract match and player data
            matches_df = extract_matches_df(link)
            players_df = extract_players_df(link)

            if matches_df is not None:
                match_results.append(matches_df)
            if players_df is not None:
                player_results.append(players_df)

        # Close the Selenium WebDriver
        driver.quit()

        # Create DataFrames from the lists
        matches_df = pd.concat(match_results, ignore_index=True)
        players_df = pd.concat(player_results, ignore_index=True)

        return matches_df, players_df

    except Exception as e:
        print("Error:", str(e))
        driver.quit()

    return None, None  # Return None if there was an error




# Scrape data for a single year and save the data in a dataframe format

In [155]:
# Usage example:
# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe")
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Call the function to scrape and transform the data
    matches_df, players_df = to_df(links)

finally:
    driver.quit()

  
  import sys


Loaded!
{'Player': ['Adrian MANNARINO'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['3'], 'Tie-Break 1': ['4'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Thiemo DE BAKKER'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Richard GASQUET'], 'Set 1': ['6'], 'Set 2': ['7'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Robin HAASE'], 'Set 1': ['4'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Pierre-Hugues HERBERT\nNicolas MAHUT'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['8'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Robin HAASE\nJean-Julien ROJER'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['7'], 'Tie-Break 1': ['6'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Adrian MANNARINO'], 'Set 1': ['4'], 'Set 2': ['7'], 'Se

Loaded!
   Team Name             Info 1                      Info 2  \
0      JAPAN      Yuichi SUGITA  Date of birth: 18 Sep 1988   
1      JAPAN        Taro DANIEL  Date of birth: 27 Jan 1993   
2      JAPAN           Go SOEDA  Date of birth: 05 Sep 1984   
3      JAPAN  Yasutaka UCHIYAMA  Date of birth: 05 Aug 1992   
4      JAPAN      Ben MCLACHLAN  Date of birth: 10 May 1992   
5      JAPAN            Captain            Satoshi IWABUCHI   
6      ITALY      Fabio FOGNINI  Date of birth: 24 May 1987   
7      ITALY      Paolo LORENZI  Date of birth: 15 Dec 1981   
8      ITALY      Andreas SEPPI  Date of birth: 21 Feb 1984   
9      ITALY    Thomas FABBIANO  Date of birth: 26 May 1989   
10     ITALY     Simone BOLELLI  Date of birth: 08 Oct 1985   
11     ITALY            Captain          Corrado BARAZZUTTI   

                  Info 3                Info 4  
0   Singles ranking: 939      Doubles ranking:  
1    Singles ranking: 75      Doubles ranking:  
2       Singles ranking: 

{'Player': ['Tim PUETZ\nJan-Lennard STRUFF'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Nick KYRGIOS'], 'Set 1': ['2'], 'Set 2': ['6'], 'Set 3': ['2'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Alexander ZVEREV'], 'Set 1': ['6'], 'Set 2': ['7'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Combined DataFrame:
                    Date                                   Venue  \
0   02 Feb - 04 Feb 2018   Pat Rafter Arena, Brisbane, Australia   
1   02 Feb - 04 Feb 2018   Pat Rafter Arena, Brisbane, Australia   
2   02 Feb - 04 Feb 2018   Pat Rafter Arena, Brisbane, Austra

{'Player': ['Vasek POSPISIL'], 'Set 1': ['6'], 'Set 2': ['2'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Viktor GALOVIC'], 'Set 1': ['4'], 'Set 2': ['4'], 'Set 3': ['2'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Denis SHAPOVALOV'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Marin CILIC\nIvan DODIG'], 'Set 1': ['2'], 'Set 2': ['3'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Daniel NESTOR\nVasek POSPISIL'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['4'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Borna CORIC'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Denis SHAPOVALOV'], 'Set 1': ['4'], 'Set 2': ['4'], 'Set 3': ['4'], 'Tie-Bre

{'Player': ['David GOFFIN'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Attila BALAZS'], 'Set 1': ['4'], 'Set 2': ['4'], 'Set 3': ['0'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Ruben BEMELMANS\nJoris DE LOORE'], 'Set 1': ['3'], 'Set 2': ['4'], 'Set 3': ['7'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Attila BALAZS\nMarton FUCSOVICS'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['David GOFFIN'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Marton FUCSOVICS'], 'Set 1': ['5'], 'Set 2': ['4'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Error: list index out of range
Loaded!
   Team Name            Info 1            

Loaded!
{'Player': ['David FERRER'], 'Set 1': ['4'], 'Set 2': ['2'], 'Set 3': ['2'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Alexander ZVEREV'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Rafael NADAL'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Philipp KOHLSCHREIBER'], 'Set 1': ['2'], 'Set 2': ['2'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Feliciano LOPEZ\nMarc LOPEZ'], 'Set 1': ['3'], 'Set 2': ['4'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Tim PUETZ\nJan-Lennard STRUFF'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Rafael NADAL'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': [

Loaded!
     Team Name                Info 1                      Info 2  \
0      CROATIA           Marin CILIC  Date of birth: 28 Sep 1988   
1      CROATIA           Borna CORIC  Date of birth: 14 Nov 1996   
2      CROATIA        Viktor GALOVIC  Date of birth: 19 Sep 1990   
3      CROATIA            Ivan DODIG  Date of birth: 02 Jan 1985   
4      CROATIA         Nikola MEKTIC  Date of birth: 24 Dec 1988   
5      CROATIA               Captain               Zeljko KRAJAN   
6   KAZAKHSTAN     Mikhail KUKUSHKIN  Date of birth: 26 Dec 1987   
7   KAZAKHSTAN  Aleksandr NEDOVYESOV  Date of birth: 15 Feb 1987   
8   KAZAKHSTAN          Dmitry POPKO  Date of birth: 24 Oct 1996   
9   KAZAKHSTAN        Denis YEVSEYEV  Date of birth: 22 May 1993   
10  KAZAKHSTAN      Timur KHABIBULIN  Date of birth: 02 Aug 1995   
11  KAZAKHSTAN               Captain             Dias DOSKARAYEV   

                  Info 3                Info 4  
0   Singles ranking: 664      Doubles ranking:  
1    Sing

Loaded!
   Team Name                 Info 1                      Info 2  \
0     FRANCE          Lucas POUILLE  Date of birth: 23 Feb 1994   
1     FRANCE        Richard GASQUET  Date of birth: 18 Jun 1986   
2     FRANCE           Benoit PAIRE  Date of birth: 08 May 1989   
3     FRANCE       Julien BENNETEAU  Date of birth: 20 Dec 1981   
4     FRANCE          Nicolas MAHUT  Date of birth: 21 Jan 1982   
5     FRANCE                Captain                Yannick NOAH   
6      SPAIN    Pablo CARRENO BUSTA  Date of birth: 12 Jul 1991   
7      SPAIN  Roberto BAUTISTA AGUT  Date of birth: 14 Apr 1988   
8      SPAIN   Albert RAMOS-VINOLAS  Date of birth: 17 Jan 1988   
9      SPAIN        Feliciano LOPEZ  Date of birth: 20 Sep 1981   
10     SPAIN      Marcel GRANOLLERS  Date of birth: 12 Apr 1986   
11     SPAIN                Captain              Sergi BRUGUERA   

                  Info 3                Info 4  
0   Singles ranking: 335  Doubles ranking: 840  
1    Singles ranking: 

{'Player': ['Ivan DODIG\nMate PAVIC'], 'Set 1': ['4'], 'Set 2': ['4'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Lucas POUILLE'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['3'], 'Tie-Break 1': ['3'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Marin CILIC'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Combined DataFrame:
                    Date                                Venue  \
0   23 Nov - 25 Nov 2018   Stade Pierre Mauroy, Lille, France   
1   23 Nov - 25 Nov 2018   Stade Pierre Mauroy, Lille, France   
2   23 Nov - 25 Nov 2018   Stade Pierre Mauroy, Lille, France   
3   23 Nov - 25 No

## outputs

In [156]:
matches_df.shape

(110, 15)

In [157]:
matches_df.iloc[90:110]

Unnamed: 0,Date,Venue,Surface,Ball,Stage,Player,Set 1,Set 2,Set 3,Tie-Break 1,Tie-Break 2,Tie-Break 3,match status,match,Court Pace Rating
90,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Borna CORIC],[6],[7],[6],[None],[None],[None],PLAYED & COMPLETED,MATCH 1,
91,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Steve JOHNSON],[4],[6],[3],[None],[None],[None],PLAYED & COMPLETED,MATCH 1,
92,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Marin CILIC],[6],[6],[7],[None],[None],[None],PLAYED & COMPLETED,MATCH 2,
93,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Frances TIAFOE],[1],[3],[6],[None],[None],[None],PLAYED & COMPLETED,MATCH 2,
94,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Ivan DODIG\nMate PAVIC],[5],[6],[6],[None],[None],[None],PLAYED & COMPLETED,MATCH 3,
95,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Mike BRYAN\nRyan HARRISON],[7],[7],[1],[None],[None],[None],PLAYED & COMPLETED,MATCH 3,
96,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Marin CILIC],[7],[6],[3],[7],[None],[None],PLAYED & COMPLETED,MATCH 4,
97,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Sam QUERREY],[6],[7],[6],[2],[None],[None],PLAYED & COMPLETED,MATCH 4,
98,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Borna CORIC],[6],[6],[6],[0],[None],[None],PLAYED & COMPLETED,MATCH 5,
99,14 Sep - 16 Sep 2018,"Sportski centar Visnjik, Zadar, Croatia","Clay - Red Clay, Outdoor",Dunlop Fort Clay Court,WORLD GROUP SEMIFINAL,[Frances TIAFOE],[7],[1],[7],[7],[None],[None],PLAYED & COMPLETED,MATCH 5,


In [158]:
players_df.shape

(175, 5)

In [159]:
players_df.iloc[90:110]

Unnamed: 0,Team Name,Info 1,Info 2,Info 3,Info 4
90,HUNGARY,Mate VALKUSZ,Date of birth: 13 Aug 1998,Singles ranking: 217,Doubles ranking:
91,HUNGARY,Gabor BORSOS,Date of birth: 30 Jun 1991,Singles ranking:,Doubles ranking: 1625=
92,HUNGARY,Captain,Gabor KOVES,,
93,ITALY,Fabio FOGNINI,Date of birth: 24 May 1987,Singles ranking: 131,Doubles ranking: 133
94,ITALY,Paolo LORENZI,Date of birth: 15 Dec 1981,Singles ranking:,Doubles ranking:
95,ITALY,Andreas SEPPI,Date of birth: 21 Feb 1984,Singles ranking:,Doubles ranking:
96,ITALY,Matteo BERRETTINI,Date of birth: 12 Apr 1996,Singles ranking: 90,Doubles ranking:
97,ITALY,Simone BOLELLI,Date of birth: 08 Oct 1985,Singles ranking:,Doubles ranking: 55
98,ITALY,Captain,Corrado BARAZZUTTI,,
99,FRANCE,Lucas POUILLE,Date of birth: 23 Feb 1994,Singles ranking: 335,Doubles ranking: 840


players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations. 

In [160]:
duplicate_rows = players_df.duplicated()
duplicate_rows_df = players_df[duplicate_rows]
duplicate_rows_df.head(20)

Unnamed: 0,Team Name,Info 1,Info 2,Info 3,Info 4
93,ITALY,Fabio FOGNINI,Date of birth: 24 May 1987,Singles ranking: 131,Doubles ranking: 133
94,ITALY,Paolo LORENZI,Date of birth: 15 Dec 1981,Singles ranking:,Doubles ranking:
95,ITALY,Andreas SEPPI,Date of birth: 21 Feb 1984,Singles ranking:,Doubles ranking:
97,ITALY,Simone BOLELLI,Date of birth: 08 Oct 1985,Singles ranking:,Doubles ranking: 55
98,ITALY,Captain,Corrado BARAZZUTTI,,
99,FRANCE,Lucas POUILLE,Date of birth: 23 Feb 1994,Singles ranking: 335,Doubles ranking: 840
100,FRANCE,Adrian MANNARINO,Date of birth: 29 Jun 1988,Singles ranking: 22,Doubles ranking: 282
101,FRANCE,Pierre-Hugues HERBERT,Date of birth: 18 Mar 1991,Singles ranking: 250,Doubles ranking: 77
103,FRANCE,Nicolas MAHUT,Date of birth: 21 Jan 1982,Singles ranking:,Doubles ranking: 38
104,FRANCE,Captain,Yannick NOAH,,


# Clean players and matches dataframes

## clean players dataframe

In [161]:
def clean_players_df(players_df):
    # Remove rows where Info 1 is equal to "captain"
    players_df = players_df[players_df["Info 1"] != "Captain"]
    
    # Rename the columns
    players_df = players_df.rename(columns={"Info 1": "Player", "Info 2": "DOB", "Info 3": "Single Ranking", "Info 4": "Doubles Ranking"})
    
    # Remove text before ":" in the specified columns
    players_df["DOB"] = players_df["DOB"].str.split(":", expand=True)[1].str.strip()
    players_df["Single Ranking"] = players_df["Single Ranking"].str.split(":", expand=True)[1].str.strip()
    players_df["Doubles Ranking"] = players_df["Doubles Ranking"].str.split(":", expand=True)[1].str.strip()
    
    # Keep the first letter in each word in uppercase for the "Player" column in players_df
    players_df['Player'] = players_df['Player'].str.title()
    
    return players_df



In [162]:
# Usage example:
# Call the clean_df_players function with your players_df DataFrame
cleaned_players_df = clean_players_df(players_df)


In [163]:
cleaned_players_df.iloc[130:145]

Unnamed: 0,Team Name,Player,DOB,Single Ranking,Doubles Ranking
157,USA,Steve Johnson,24 Dec 1989,239.0,
158,USA,Frances Tiafoe,20 Jan 1998,16.0,194.0
159,USA,Ryan Harrison,07 May 1992,836.0,839.0
160,USA,Sam Querrey,07 Oct 1987,,
161,USA,Mike Bryan,29 Apr 1978,,
163,FRANCE,Lucas Pouille,23 Feb 1994,335.0,840.0
164,FRANCE,Jeremy Chardy,12 Feb 1987,508.0,113.0
165,FRANCE,Pierre-Hugues Herbert,18 Mar 1991,250.0,77.0
166,FRANCE,Nicolas Mahut,21 Jan 1982,,38.0
167,FRANCE,Jo-Wilfried Tsonga,17 Apr 1985,,


## clean matches dataframe

In [164]:

def clean_matches_df(matches_df):
    # Convert "Player" and set columns to string
    columns_to_convert = ["Player", "Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]
    matches_df[columns_to_convert] = matches_df[columns_to_convert].astype(str)

    # Define a function to apply regular expression replacements
    def apply_regex_replacements(df, columns):
        for col in columns:
            df[col] = df[col].str.replace(r'[\[\]\']+| and ', '', regex=True)

    # Apply the function to the specified columns
    columns_to_clean = ["Player", "Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]
    apply_regex_replacements(matches_df, columns_to_clean)

    # Apply the str.extract method with the specified regular expression
    split_players = matches_df['Player'].str.extract(r'^(.*?)\\n(.*)$')
    
    # Create 'Player 1' and 'Player 2' columns
    matches_df['Player 1'] = split_players[0].fillna(matches_df['Player'])
    matches_df['Player 2'] = split_players[1].fillna('')
    
    # Drop the original 'Player' column
    matches_df.drop('Player', axis=1, inplace=True)
    
    # Keep the first letter in each word in uppercase for the "Player 1" and "Player 2" columns
    matches_df["Player 1"] = matches_df["Player 1"].str.title()
    matches_df["Player 2"] = matches_df["Player 2"].str.title()
    
    # Define a function to convert specific columns from string to numeric
    def convert_columns_to_numeric(df, columns):
        for col in columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Specify the columns to convert
    columns_to_convert = ["Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]

    # Apply the function to convert the specified columns
    convert_columns_to_numeric(matches_df, columns_to_convert)
    
    return matches_df



In [165]:
# Usage example:
# Call the clean_df_combined function with your combined_df DataFrame
cleaned_matches_df = clean_matches_df(matches_df)


In [166]:
cleaned_matches_df.iloc[80:110]

Unnamed: 0,Date,Venue,Surface,Ball,Stage,Set 1,Set 2,Set 3,Tie-Break 1,Tie-Break 2,Tie-Break 3,match status,match,Court Pace Rating,Player 1,Player 2
80,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,7.0,6.0,6.0,,,,PLAYED & COMPLETED,MATCH 1,,Benoit Paire,
81,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,5.0,1.0,0.0,,,,PLAYED & COMPLETED,MATCH 1,,Pablo Carreno Busta,
82,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,3.0,7.0,6.0,,,,PLAYED & COMPLETED,MATCH 2,,Lucas Pouille,
83,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,6.0,6.0,4.0,,,,PLAYED & COMPLETED,MATCH 2,,Roberto Bautista Agut,
84,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,6.0,6.0,7.0,,,,PLAYED & COMPLETED,MATCH 3,,Julien Benneteau,Nicolas Mahut
85,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,0.0,4.0,6.0,,,,PLAYED & COMPLETED,MATCH 3,,Marcel Granollers,Feliciano Lopez
86,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,6.0,4.0,12.0,,,,PLAYED & COMPLETED,MATCH 4,,Richard Gasquet,
87,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,1.0,6.0,14.0,,,,PLAYED & COMPLETED,MATCH 4,,Albert Ramos-Vinolas,
88,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,7.0,3.0,11.0,7.0,,,PLAYED & COMPLETED,MATCH 5,,Nicolas Mahut,
89,14 Sep - 16 Sep 2018,"Stade Pierre Mauroy, Lille, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP SEMIFINAL,6.0,6.0,13.0,2.0,,,PLAYED & COMPLETED,MATCH 5,,Marcel Granollers,


# Merge players and matches dataframes

In [167]:
player_name = 'Marton Fucsovics'  # Replace with any player name you want to check

# Check if the specified player exists in 'cleaned_matches_df'
is_in_matches = player_name in cleaned_matches_df['Player 1'].values

# Check if the specified player exists in 'cleaned_players_df'
is_in_players = player_name in cleaned_players_df['Player'].values

print(f"Is {player_name} in cleaned_matches_df? {is_in_matches}")
print(f"Is {player_name} in cleaned_players_df? {is_in_players}")


Is Marton Fucsovics in cleaned_matches_df? False
Is Marton Fucsovics in cleaned_players_df? True


In [206]:
def merge_data_frames(df1, df2):
    merged_df = df1.merge(df2, left_on='Player 1', right_on='Player', how='left')
    merged_df = merged_df.drop_duplicates()

    # Second merge based on 'Player 2'
    final_merged = merged_df.merge(df2, left_on='Player 2', right_on='Player', how='left')
    final_merged = final_merged.drop_duplicates()

    # Dropping specified columns
    columns_to_drop = ['Player_x', 'Player_y', 'Team Name_y']
    final_merged = final_merged.drop(columns_to_drop, axis=1)

    # Renaming columns based on suffixes
    final_merged = final_merged.rename(columns=lambda x: x.replace('_x', '_Player1').replace('_y', '_Player2'))

    # Renaming 'Team Name_Player1' to 'Team Name'
    final_merged = final_merged.rename(columns={'Team Name_Player1': 'Team Name'})

    return final_merged


In [207]:
merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)

In [208]:
merged_df.shape

(102, 23)

In [209]:
merged_df.head()

Unnamed: 0,Date,Venue,Surface,Ball,Stage,Set 1,Set 2,Set 3,Tie-Break 1,Tie-Break 2,...,Court Pace Rating,Player 1,Player 2,Team Name,DOB_Player1,Single Ranking_Player1,Doubles Ranking_Player1,DOB_Player2,Single Ranking_Player2,Doubles Ranking_Player2
0,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,6.0,3.0,3.0,4.0,,...,,Adrian Mannarino,,FRANCE,29 Jun 1988,22,282.0,,,
1,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,7.0,6.0,6.0,7.0,,...,,Thiemo De Bakker,,NETHERLANDS,19 Sep 1988,707,686.0,,,
2,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,6.0,7.0,3.0,,,...,,Richard Gasquet,,FRANCE,18 Jun 1986,76,,,,
3,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,4.0,6.0,6.0,,,...,,Robin Haase,,NETHERLANDS,06 Apr 1987,693,41.0,,,
4,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,7.0,6.0,6.0,8.0,,...,,Pierre-Hugues Herbert,Nicolas Mahut,FRANCE,18 Mar 1991,250,77.0,21 Jan 1982,,38.0


# Export dataframe

In [210]:
# Specify the Excel file path
excel_file_path = 'merged_data.xlsx'

# Export merged_df to Excel
merged_df.to_excel(excel_file_path, index=False)

print(f"Data has been exported to {excel_file_path}")


Data has been exported to merged_data.xlsx


# Retrieve all years links

In [172]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"

# Initialize the WebDriver
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get("https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx")
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Click on Accept All Cookies button
acceptCookie_Btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", acceptCookie_Btn)

# Click on the dropdown arrow
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='selected']//div[@class='arrow']"))).click()
# Click on dropdown items from 2015 to 2018
for year in range(2015, 2019):
    year_xpath = f"//a[text()='{year}']"
    year_element = wait.until(EC.element_to_be_clickable((By.XPATH, year_xpath)))
    year_element.click()

  # This is added back by InteractiveShellApp.init_path()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6DDF182B2+55298]
	(No symbol) [0x00007FF6DDE85E02]
	(No symbol) [0x00007FF6DDD405AB]
	(No symbol) [0x00007FF6DDD8175C]
	(No symbol) [0x00007FF6DDD818DC]
	(No symbol) [0x00007FF6DDDBCBC7]
	(No symbol) [0x00007FF6DDDA20EF]
	(No symbol) [0x00007FF6DDDBAAA4]
	(No symbol) [0x00007FF6DDDA1E83]
	(No symbol) [0x00007FF6DDD7670A]
	(No symbol) [0x00007FF6DDD77964]
	GetHandleVerifier [0x00007FF6DE290AAB+3694587]
	GetHandleVerifier [0x00007FF6DE2E728E+4048862]
	GetHandleVerifier [0x00007FF6DE2DF173+4015811]
	GetHandleVerifier [0x00007FF6DDFB47D6+695590]
	(No symbol) [0x00007FF6DDE90CE8]
	(No symbol) [0x00007FF6DDE8CF34]
	(No symbol) [0x00007FF6DDE8D062]
	(No symbol) [0x00007FF6DDE7D3A3]
	BaseThreadInitThunk [0x00007FFC7ED27344+20]
	RtlUserThreadStart [0x00007FFC7EF626B1+33]
