# Libraries

In [60]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time

In [61]:
user = "ALESSANDRO"

# Lists functions to extract matches and players ' information

## matches 

def extract_matches_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to be loaded
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")
        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "main"
        main_element = driver.find_element(By.CLASS_NAME, "main")

        # Include the component_title_text in the stage variable
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # ---------------------------------------------------------------------------------------------

        # Now, let's find the div element with class "tie" within the main element
        tie_element = main_element.find_element(By.CLASS_NAME, "details")

        # Find all sub div elements within the "tie" element
        sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

        # Initialize variables to store data
        column_data = {}

        for sub_div_element in sub_div_elements:
            sub_div_text = sub_div_element.text.strip()
            if ":" in sub_div_text:
                column_name, column_value = sub_div_text.split(":", 1)
                column_data[column_name] = [column_value]

        # Create a DataFrame from the collected data
        df = pd.DataFrame(column_data)

        # Add the "Stage" column with the component_title_text
        df["Stage"] = stage

        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "rubber-header"
        rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
        match_num = []
        match_status = []

        for rubber_header_element in rubber_header_elements:

            # Extract "match" and "match status" from the span elements
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")

            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())
            #       match = span_element.text.strip()

        # ---------------------------------------------------------------------------------------------
        # Now, let's find the div element with class "rubber-body" within the main element
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []

        match_idx = -1
        for rubber_body_element in rubber_body_elements:
            match_idx += 1

            # Find all tables with class "dc" within the rubber-body
            table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")

            for table_element in table_elements:
                # Initialize data for each table
                table_data = {
                    "Player": [],
                    "Set 1": [],
                    "Set 2": [],
                    "Set 3": [],
                    "Tie-Break 1": [],
                    "Tie-Break 2": [],
                    "Tie-Break 3": []
                }

                # Find the table body
                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

                # Find all rows (tr elements) within the tbody
                rows = tbody_element.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    # Find all td elements within the row
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    # Extract and store the information starting from td_elements[1]
                    player = td_elements[1].text.strip()

                    # Skip set and tie-break infos if match hasn't been played
                    if match_status[match_idx] == "NOT PLAYED":
                        print(f"Skipping match {match_idx + 1}")
                        continue

                    # Extract results from td class "results"
                    results = td_elements[2]
                    set_scores = results.find_elements(By.TAG_NAME, "span")

                    set_results = []
                    tie_breaks = []

                    for set_score in set_scores:
                        set_result = set_score.text.strip()
                        tie_break = ""

                        # Use regular expressions to extract the first number in set_result
                        match = re.search(r'\d+', set_result)
                        if match:
                            set_result = match.group()
                        else:
                            set_result = ""

                        if set_score.find_elements(By.TAG_NAME, "sup"):
                            tie_break = set_score.find_element(By.TAG_NAME, "sup").text.strip()
                            # Use regular expressions to extract the first number in tie_break
                            match = re.search(r'\d+', tie_break)
                            if match:
                                tie_break = match.group()
                            else:
                                tie_break = ""

                        set_results.append(set_result)
                        tie_breaks.append(tie_break)

                    # Ensure there are at most 3 sets
                    set_results = set_results[:3]
                    tie_breaks = tie_breaks[:3]

                    # Assign the extracted values to the dictionary
                    table_data["Player"].append(player)
                    table_data["Set 1"].append(set_results[0])
                    table_data["Set 2"].append(set_results[1])
                    table_data["Set 3"].append(set_results[2])

                    # Keep only the first element in the list for tie-breaks
                    for i, tie_break in enumerate(tie_breaks):
                        if i == 0 and tie_break:
                            table_data["Tie-Break 1"].append(tie_break)
                        else:
                            table_data[f"Tie-Break {i + 1}"].append(None)

                # Append the table data to the list
                tables_data.append(table_data)
                print(table_data)

        # Close the Selenium WebDriver
        driver.quit()

        # Create a DataFrame from the collected data
        tables_df = pd.DataFrame(tables_data)

        # Combine the information from both DataFrames
        matches_df = pd.concat([df] * len(tables_df), ignore_index=True)
        matches_df = pd.concat([matches_df, tables_df], axis=1)

        # Add match and match status columns
        matches_df["match status"] = ""
        matches_df["match"] = ""
        for i in range(len(match_status)):
            matches_df.loc[i * 2:(i * 2) + 1, "match status"] = match_status[i]
            matches_df.loc[i * 2:(i * 2) + 1, "match"] = match_num[i]

        # Display the combined DataFrame
        print("Combined DataFrame:")
        print(matches_df)
        return matches_df
    except Exception as e:
        print("Error:", str(e))
        return None


In [62]:
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def extract_matches_df(url):
    try:
        # Initialize Selenium
        chrome_service = Service(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to be loaded
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")

        # Find the div element with class "main"
        main_element = driver.find_element(By.CLASS_NAME, "main")

        # Extract the stage from the component-title
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # Extract match numbers and statuses
        rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
        match_num = []
        match_status = []

        for rubber_header_element in rubber_header_elements:
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")

            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())

        # Extract match details
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []

        for match_idx, rubber_body_element in enumerate(rubber_body_elements):
            table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")

            for table_element in table_elements:
                table_data = {
                    "Player": [],
                    "Set 1": [],
                    "Set 2": [],
                    "Set 3": [],
                    "Tie-Break 1": [],
                    "Tie-Break 2": [],
                    "Tie-Break 3": []
                }

                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")
                rows = tbody_element.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    if len(td_elements) >= 3:
                        player = td_elements[1].text.strip()
                        results = td_elements[2]
                        set_scores = results.find_elements(By.TAG_NAME, "span")
                        
                        set_results = []
                        tie_breaks = []

                        for set_score in set_scores:
                            set_result = set_score.text.strip()
                            tie_break = ""

                            match = re.search(r'\d+', set_result)
                            if match:
                                set_result = match.group()
                            else:
                                set_result = ""

                            if set_score.find_elements(By.TAG_NAME, "sup"):
                                tie_break = set_score.find_element(By.TAG_NAME, "sup").text.strip()

                                match = re.search(r'\d+', tie_break)
                                if match:
                                    tie_break = match.group()
                                else:
                                    tie_break = ""

                            set_results.append(set_result)
                            tie_breaks.append(tie_break)

                        set_results = set_results[:3]
                        tie_breaks = tie_breaks[:3]

                        table_data["Player"].append(player)
                        table_data["Set 1"].append(set_results[0])
                        table_data["Set 2"].append(set_results[1])
                        table_data["Set 3"].append(set_results[2])

                        for i, tie_break in enumerate(tie_breaks):
                            if i == 0 and tie_break:
                                table_data["Tie-Break 1"].append(tie_break)
                            else:
                                table_data[f"Tie-Break {i + 1}"].append(None)

                tables_data.append(table_data)

        driver.quit()

        tables_df = pd.DataFrame(tables_data)
        matches_df = pd.DataFrame(columns=["Stage"])

        matches_df["Stage"] = [stage] * len(tables_df)

        matches_df["Match Status"] = ""
        matches_df["Match"] = ""

        for i in range(len(match_status)):
            matches_df.loc[i * 2:(i * 2) + 1, "Match Status"] = match_status[i]
            matches_df.loc[i * 2:(i * 2) + 1, "Match"] = match_num[i]

        matches_df = pd.concat([matches_df, tables_df], axis=1)

        print("Combined DataFrame:")
        print(matches_df)
        return matches_df

    except Exception as e:
        print("Error:", str(e))
        return None


## players

In [63]:
def extract_players_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to load
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")

        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed

        # Find all div elements with class "team-nominations-col"
        team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

        # Initialize a list to store the paired data
        paired_data = []

        # Initialize a list to store the column names
        column_names = set()

        # Loop through each "team-nominations-col" element
        for team_nominations_col_element in team_nominations_col_elements:
            # Extract the team name
            team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
            team_name = team_name_element.text.strip()

            # Find "players-info" elements and extract text from "ng-binding" elements
            players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")

            for players_info_element in players_info_elements:
                ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")

                # Create a dictionary for the row
                row_data = {"Team Name": team_name}

                for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                    row_data[f"Info {i}"] = ng_binding_element.text.strip()
                    column_names.add(f"Info {i}")

                paired_data.append(row_data)

        # Create a Pandas DataFrame from the paired data
        players_df = pd.DataFrame(paired_data)

        # Reorder columns to match the column names
        players_df = players_df[["Team Name"] + sorted(column_names)]

        # Now you have a DataFrame with team names and player information in separate columns
        print(players_df)

        return players_df

    except Exception as e:
        print("Error:", str(e))
        return None


# Create and clean dataframe function

In [64]:


def to_df(links):
    # Initialize ChromeOptions with headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Initialize Selenium WebDriver with the provided executable path and headless option
    driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)

    try:
        # Create empty lists to store the results
        match_results = []
        player_results = []

        for link in links:
            # Apply the functions to extract match and player data
            matches_df = extract_matches_df(link)
            players_df = extract_players_df(link)

            if matches_df is not None:
                match_results.append(matches_df)
            if players_df is not None:
                player_results.append(players_df)

        # Close the Selenium WebDriver
        driver.quit()

        # Create DataFrames from the lists
        matches_df = pd.concat(match_results, ignore_index=True)
        players_df = pd.concat(player_results, ignore_index=True)
        
        # Check the number of rows in the DataFrames
        print("Total number of rows in matches_df:", len(matches_df))
        print("Total number of rows in players_df:", len(players_df))

        return matches_df, players_df

    except Exception as e:
        print("Error:", str(e))
        driver.quit()

    return None, None  # Return None if there was an error




# Scrape data for a single year and save the data in a dataframe format

In [65]:
# Usage example:
# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe")
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Call the function to scrape and transform the data
    matches_df, players_df = to_df(links)

finally:
    driver.quit()

  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe")
  driver = webdriver.Chrome(executable_path=f"C:/Users/{user}/Downloads/chromedriver.exe", options=chrome_options)


Loaded!
Error: list index out of range
Loaded!
      Team Name                 Info 1                      Info 2  \
0        FRANCE          Lucas POUILLE  Date of birth: 23 Feb 1994   
1        FRANCE       Adrian MANNARINO  Date of birth: 29 Jun 1988   
2        FRANCE        Richard GASQUET  Date of birth: 18 Jun 1986   
3        FRANCE  Pierre-Hugues HERBERT  Date of birth: 18 Mar 1991   
4        FRANCE          Nicolas MAHUT  Date of birth: 21 Jan 1982   
5        FRANCE                Captain                Yannick NOAH   
6   NETHERLANDS            Robin HAASE  Date of birth: 06 Apr 1987   
7   NETHERLANDS      Tallon GRIEKSPOOR  Date of birth: 02 Jul 1996   
8   NETHERLANDS       Thiemo DE BAKKER  Date of birth: 19 Sep 1988   
9   NETHERLANDS       Matwe MIDDELKOOP  Date of birth: 03 Sep 1983   
10  NETHERLANDS      Jean-Julien ROJER  Date of birth: 25 Aug 1981   
11  NETHERLANDS                Captain               Paul HAARHUIS   

                  Info 3                In

Loaded!
Error: list index out of range
Loaded!
   Team Name            Info 1                      Info 2  \
0     SERBIA     Dusan LAJOVIC  Date of birth: 30 Jun 1990   
1     SERBIA       Laslo DJERE  Date of birth: 02 Jun 1995   
2     SERBIA  Nikola MILOJEVIC  Date of birth: 19 Jun 1995   
3     SERBIA      Pedja KRSTIN  Date of birth: 03 Sep 1994   
4     SERBIA      Miljan ZEKIC  Date of birth: 12 Jul 1988   
5     SERBIA           Captain              Nenad ZIMONJIC   
6        USA       Sam QUERREY  Date of birth: 07 Oct 1987   
7        USA        John ISNER  Date of birth: 26 Apr 1985   
8        USA     Ryan HARRISON  Date of birth: 07 May 1992   
9        USA     Steve JOHNSON  Date of birth: 24 Dec 1989   
10       USA           Captain                 Jim COURIER   

                    Info 3                 Info 4  
0      Singles ranking: 46  Doubles ranking: 851=  
1      Singles ranking: 33  Doubles ranking: 570=  
2     Singles ranking: 587       Doubles ranking:  


Loaded!
Error: list index out of range
Loaded!
   Team Name           Info 1                      Info 2  \
0        USA       John ISNER  Date of birth: 26 Apr 1985   
1        USA      Sam QUERREY  Date of birth: 07 Oct 1987   
2        USA        Jack SOCK  Date of birth: 24 Sep 1992   
3        USA    Steve JOHNSON  Date of birth: 24 Dec 1989   
4        USA    Ryan HARRISON  Date of birth: 07 May 1992   
5        USA          Captain                 Jim COURIER   
6    BELGIUM  Ruben BEMELMANS  Date of birth: 14 Jan 1988   
7    BELGIUM   Joris DE LOORE  Date of birth: 21 Apr 1993   
8    BELGIUM     Sander GILLE  Date of birth: 15 Jan 1991   
9    BELGIUM    Joran VLIEGEN  Date of birth: 07 Jul 1993   
10   BELGIUM          Captain             Johan VAN HERCK   

                  Info 3                 Info 4  
0   Singles ranking: 174   Doubles ranking: 192  
1       Singles ranking:       Doubles ranking:  
2   Singles ranking: 663   Doubles ranking: 190  
3   Singles ranking:

Total number of rows in matches_df: 30
Total number of rows in players_df: 175


## outputs

In [66]:
len(links)

15

In [67]:
links

['https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-ITA-JPN-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-GBR-ESP-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-AUS-GER-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-KAZ-SUI-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-CAN-CRO-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-USA-SRB-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-HUN-BEL-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-ITA-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-ESP-GER-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-KAZ-CRO-01',
 'https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-USA-B

In [69]:
matches_df

Unnamed: 0,Stage,Match Status,Match,Player,Set 1,Set 2,Set 3,Tie-Break 1,Tie-Break 2,Tie-Break 3
0,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 1,[David FERRER],[4],[2],[2],[None],[None],[None]
1,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 1,[Alexander ZVEREV],[6],[6],[6],[None],[None],[None]
2,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 2,[Rafael NADAL],[6],[6],[6],[None],[None],[None]
3,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 2,[Philipp KOHLSCHREIBER],[2],[2],[3],[None],[None],[None]
4,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 3,[Feliciano LOPEZ\nMarc LOPEZ],[3],[4],[6],[None],[None],[None]
5,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 3,[Tim PUETZ\nJan-Lennard STRUFF],[6],[6],[3],[None],[None],[None]
6,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 4,[Rafael NADAL],[6],[6],[6],[None],[None],[None]
7,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 4,[Alexander ZVEREV],[1],[4],[4],[None],[None],[None]
8,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 5,[David FERRER],[7],[3],[7],[7],[None],[None]
9,WORLD GROUP QUARTERFINAL,PLAYED & COMPLETED,MATCH 5,[Philipp KOHLSCHREIBER],[6],[6],[6],[1],[None],[None]


In [68]:

# Dynamic assignment of player name
player = 'Mikhail Kukushkin'  # Change this variable to search for different players

# Check if the specified player played matches
player_matches = matches_df[matches_df['Player 1'] == player]

if not player_matches.empty:
    print(f"{player} played matches.")
    # If you want to see the details of the matches involving the specified player, you can print or inspect player_matches
    print(player_matches)
else:
    print(f"{player} did not play matches in the provided data.")


KeyError: 'Player 1'

In a year I have 15 matches (number of link), each match has a max number of 5 matches and on avg 4 matches. The number of single matches should be therefore between 60 and 70. We have two observations (one for each player or team) for each match for a total of 120/140 rows.

In [None]:
matches_df.shape

In [None]:
matches_df.iloc[30:50]

We have 5 players in each team, two team in each match, and 15 matches for a total of (5 * 2 * 15) 150 players/teams.

In [None]:
players_df.shape

In [None]:
players_df.iloc[90:110]

players of teams advancing to the next stage appear more than ones with the same information. it would be efficient to not repeat the data collection for those observations. 

In [None]:
duplicate_rows = players_df.duplicated()
duplicate_rows_df = players_df[duplicate_rows]
duplicate_rows_df.head(20)

# Clean players and matches dataframes

## clean players dataframe

In [None]:
def clean_players_df(players_df):
    # Remove rows where Info 1 is equal to "captain"
    players_df = players_df[players_df["Info 1"] != "Captain"]
    
    # Rename the columns
    players_df = players_df.rename(columns={"Info 1": "Player", "Info 2": "DOB", "Info 3": "Single Ranking", "Info 4": "Doubles Ranking"})
    
    # Remove text before ":" in the specified columns
    players_df["DOB"] = players_df["DOB"].str.split(":", expand=True)[1].str.strip()
    players_df["Single Ranking"] = players_df["Single Ranking"].str.split(":", expand=True)[1].str.strip()
    players_df["Doubles Ranking"] = players_df["Doubles Ranking"].str.split(":", expand=True)[1].str.strip()
    
    # Keep the first letter in each word in uppercase for the "Player" column in players_df
    players_df['Player'] = players_df['Player'].str.title()
    
    return players_df



In [None]:
# Usage example:
# Call the clean_df_players function with your players_df DataFrame
cleaned_players_df = clean_players_df(players_df)


In [None]:
cleaned_players_df.iloc[130:145]

## clean matches dataframe

In [None]:

def clean_matches_df(matches_df):
    # Convert "Player" and set columns to string
    columns_to_convert = ["Player", "Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]
    matches_df[columns_to_convert] = matches_df[columns_to_convert].astype(str)

    # Define a function to apply regular expression replacements
    def apply_regex_replacements(df, columns):
        for col in columns:
            df[col] = df[col].str.replace(r'[\[\]\']+| and ', '', regex=True)

    # Apply the function to the specified columns
    columns_to_clean = ["Player", "Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]
    apply_regex_replacements(matches_df, columns_to_clean)

    # Apply the str.extract method with the specified regular expression
    split_players = matches_df['Player'].str.extract(r'^(.*?)\\n(.*)$')
    
    # Create 'Player 1' and 'Player 2' columns
    matches_df['Player 1'] = split_players[0].fillna(matches_df['Player'])
    matches_df['Player 2'] = split_players[1].fillna('')
    
    # Drop the original 'Player' column
    matches_df.drop('Player', axis=1, inplace=True)
    
    # Keep the first letter in each word in uppercase for the "Player 1" and "Player 2" columns
    matches_df["Player 1"] = matches_df["Player 1"].str.title()
    matches_df["Player 2"] = matches_df["Player 2"].str.title()
    
    # Define a function to convert specific columns from string to numeric
    def convert_columns_to_numeric(df, columns):
        for col in columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Specify the columns to convert
    columns_to_convert = ["Set 1", "Set 2", "Set 3", "Tie-Break 1", "Tie-Break 2", "Tie-Break 3"]

    # Apply the function to convert the specified columns
    convert_columns_to_numeric(matches_df, columns_to_convert)
    
    return matches_df



In [None]:
# Usage example:
# Call the clean_df_combined function with your combined_df DataFrame
cleaned_matches_df = clean_matches_df(matches_df)


In [None]:
cleaned_matches_df.iloc[80:110]

# Merge players and matches dataframes

In [None]:
player_name = 'Marton Fucsovics'  # Replace with any player name you want to check

# Check if the specified player exists in 'cleaned_matches_df'
is_in_matches = player_name in cleaned_matches_df['Player 1'].values

# Check if the specified player exists in 'cleaned_players_df'
is_in_players = player_name in cleaned_players_df['Player'].values

print(f"Is {player_name} in cleaned_matches_df? {is_in_matches}")
print(f"Is {player_name} in cleaned_players_df? {is_in_players}")


In [None]:
def merge_data_frames(df1, df2):
    merged_df = df1.merge(df2, left_on='Player 1', right_on='Player', how='left')
    merged_df = merged_df.drop_duplicates()

    # Second merge based on 'Player 2'
    final_merged = merged_df.merge(df2, left_on='Player 2', right_on='Player', how='left')
    final_merged = final_merged.drop_duplicates()

    # Dropping specified columns
    columns_to_drop = ['Player_x', 'Player_y', 'Team Name_y']
    final_merged = final_merged.drop(columns_to_drop, axis=1)

    # Renaming columns based on suffixes
    final_merged = final_merged.rename(columns=lambda x: x.replace('_x', '_Player1').replace('_y', '_Player2'))

    # Renaming 'Team Name_Player1' to 'Team Name'
    final_merged = final_merged.rename(columns={'Team Name_Player1': 'Team Name'})

    return final_merged


In [None]:
merged_df = merge_data_frames(cleaned_matches_df, cleaned_players_df)

In [None]:
merged_df.shape

In [None]:
merged_df.head()

# Export dataframe

In [None]:
# Specify the Excel file path
excel_file_path = 'merged_data.xlsx'

# Export merged_df to Excel
merged_df.to_excel(excel_file_path, index=False)

print(f"Data has been exported to {excel_file_path}")


# Retrieve all years links

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"

# Initialize the WebDriver
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.get("https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx")
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# Click on Accept All Cookies button
acceptCookie_Btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", acceptCookie_Btn)

# Click on the dropdown arrow
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='selected']//div[@class='arrow']"))).click()
# Click on dropdown items from 2015 to 2018
for year in range(2015, 2019):
    year_xpath = f"//a[text()='{year}']"
    year_element = wait.until(EC.element_to_be_clickable((By.XPATH, year_xpath)))
    year_element.click()