In [30]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time

In [51]:
def extract_matches_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService("C:/Users/ALESSANDRO/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to be loaded
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")
        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "main"
        main_element = driver.find_element(By.CLASS_NAME, "main")

        # Include the component_title_text in the stage variable
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # ---------------------------------------------------------------------------------------------

        # Now, let's find the div element with class "tie" within the main element
        tie_element = main_element.find_element(By.CLASS_NAME, "details")

        # Find all sub div elements within the "tie" element
        sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

        # Initialize variables to store data
        column_data = {}

        for sub_div_element in sub_div_elements:
            sub_div_text = sub_div_element.text.strip()
            if ":" in sub_div_text:
                column_name, column_value = sub_div_text.split(":", 1)
                column_data[column_name] = [column_value]

        # Create a DataFrame from the collected data
        df = pd.DataFrame(column_data)

        # Add the "Stage" column with the component_title_text
        df["Stage"] = stage

        # ---------------------------------------------------------------------------------------------

        # Find the div element with class "rubber-header"
        rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
        match_num = []
        match_status = []

        for rubber_header_element in rubber_header_elements:

            # Extract "match" and "match status" from the span elements
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")

            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())
            #       match = span_element.text.strip()

        # ---------------------------------------------------------------------------------------------
        # Now, let's find the div element with class "rubber-body" within the main element
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []

        match_idx = -1
        for rubber_body_element in rubber_body_elements:
            match_idx += 1

            # Find all tables with class "dc" within the rubber-body
            table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")

            for table_element in table_elements:
                # Initialize data for each table
                table_data = {
                    "Player": [],
                    "Set 1": [],
                    "Set 2": [],
                    "Set 3": [],
                    "Tie-Break 1": [],
                    "Tie-Break 2": [],
                    "Tie-Break 3": []
                }

                # Find the table body
                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

                # Find all rows (tr elements) within the tbody
                rows = tbody_element.find_elements(By.TAG_NAME, "tr")

                for row in rows:
                    # Find all td elements within the row
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    # Extract and store the information starting from td_elements[1]
                    player = td_elements[1].text.strip()

                    # Skip set and tie-break infos if match hasn't been played
                    if match_status[match_idx] == "NOT PLAYED":
                        print(f"Skipping match {match_idx + 1}")
                        continue

                    # Extract results from td class "results"
                    results = td_elements[2]
                    set_scores = results.find_elements(By.TAG_NAME, "span")

                    set_results = []
                    tie_breaks = []

                    for set_score in set_scores:
                        set_result = set_score.text.strip()
                        tie_break = ""

                        # Use regular expressions to extract the first number in set_result
                        match = re.search(r'\d+', set_result)
                        if match:
                            set_result = match.group()
                        else:
                            set_result = ""

                        if set_score.find_elements(By.TAG_NAME, "sup"):
                            tie_break = set_score.find_element(By.TAG_NAME, "sup").text.strip()
                            # Use regular expressions to extract the first number in tie_break
                            match = re.search(r'\d+', tie_break)
                            if match:
                                tie_break = match.group()
                            else:
                                tie_break = ""

                        set_results.append(set_result)
                        tie_breaks.append(tie_break)

                    # Ensure there are at most 3 sets
                    set_results = set_results[:3]
                    tie_breaks = tie_breaks[:3]

                    # Assign the extracted values to the dictionary
                    table_data["Player"].append(player)
                    table_data["Set 1"].append(set_results[0])
                    table_data["Set 2"].append(set_results[1])
                    table_data["Set 3"].append(set_results[2])

                    # Keep only the first element in the list for tie-breaks
                    for i, tie_break in enumerate(tie_breaks):
                        if i == 0 and tie_break:
                            table_data["Tie-Break 1"].append(tie_break)
                        else:
                            table_data[f"Tie-Break {i + 1}"].append(None)

                # Append the table data to the list
                tables_data.append(table_data)
                print(table_data)

        # Close the Selenium WebDriver
        driver.quit()

        # Create a DataFrame from the collected data
        tables_df = pd.DataFrame(tables_data)

        # Combine the information from both DataFrames
        combined_df = pd.concat([df] * len(tables_df), ignore_index=True)
        combined_df = pd.concat([combined_df, tables_df], axis=1)

        # Add match and match status columns
        combined_df["match status"] = ""
        combined_df["match"] = ""
        for i in range(len(match_status)):
            combined_df.loc[i * 2:(i * 2) + 1, "match status"] = match_status[i]
            combined_df.loc[i * 2:(i * 2) + 1, "match"] = match_num[i]

        # Display the combined DataFrame
        print("Combined DataFrame:")
        print(combined_df)
        return combined_df
    except Exception as e:
        print("Error:", str(e))
        return None


In [52]:
def extract_players_df(url):
    try:
        # Initialize Selenium
        chrome_service = ChromeService("C:/Users/ALESSANDRO/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        # Navigate to the webpage
        driver.get(url)

        # Wait for the page to load
        loaded = False
        while not loaded:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                loaded = True
                print("Loaded!")
            except:
                time.sleep(1)
                print("waiting...")

        # Wait for the page to load
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed

        # Find all div elements with class "team-nominations-col"
        team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

        # Initialize a list to store the paired data
        paired_data = []

        # Initialize a list to store the column names
        column_names = set()

        # Loop through each "team-nominations-col" element
        for team_nominations_col_element in team_nominations_col_elements:
            # Extract the team name
            team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
            team_name = team_name_element.text.strip()

            # Find "players-info" elements and extract text from "ng-binding" elements
            players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")

            for players_info_element in players_info_elements:
                ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")

                # Create a dictionary for the row
                row_data = {"Team Name": team_name}

                for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                    row_data[f"Info {i}"] = ng_binding_element.text.strip()
                    column_names.add(f"Info {i}")

                paired_data.append(row_data)

        # Create a Pandas DataFrame from the paired data
        players_df = pd.DataFrame(paired_data)

        # Reorder columns to match the column names
        players_df = players_df[["Team Name"] + sorted(column_names)]

        # Now you have a DataFrame with team names and player information in separate columns
        print(players_df)

        return players_df

    except Exception as e:
        print("Error:", str(e))
        return None


In [53]:

# Initialize Selenium WebDriver
driver = webdriver.Chrome(executable_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe")

# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Create an empty list to store the results
    match_results = []
    player_results = []

    # Iterate through the links and apply the functions
    for link in links:
        matches_df = extract_matches_df(link)
        players_df = extract_players_df(link)
        
        if matches_df is not None:
            match_results.append(matches_df)
        if players_df is not None:
            player_results.append(players_df)

    # Close the Selenium WebDriver
    driver.quit()
    

except Exception as e:
    print("Error:", str(e))
finally:
    driver.quit()


  driver = webdriver.Chrome(executable_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe")


Loaded!
{'Player': ['Adrian MANNARINO'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['3'], 'Tie-Break 1': ['4'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Thiemo DE BAKKER'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Richard GASQUET'], 'Set 1': ['6'], 'Set 2': ['7'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Robin HAASE'], 'Set 1': ['4'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Pierre-Hugues HERBERT\nNicolas MAHUT'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['8'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Robin HAASE\nJean-Julien ROJER'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['7'], 'Tie-Break 1': ['6'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Adrian MANNARINO'], 'Set 1': ['4'], 'Set 2': ['7'], 'Se

Loaded!
   Team Name             Info 1                      Info 2  \
0      JAPAN      Yuichi SUGITA  Date of birth: 18 Sep 1988   
1      JAPAN        Taro DANIEL  Date of birth: 27 Jan 1993   
2      JAPAN           Go SOEDA  Date of birth: 05 Sep 1984   
3      JAPAN  Yasutaka UCHIYAMA  Date of birth: 05 Aug 1992   
4      JAPAN      Ben MCLACHLAN  Date of birth: 10 May 1992   
5      JAPAN            Captain            Satoshi IWABUCHI   
6      ITALY      Fabio FOGNINI  Date of birth: 24 May 1987   
7      ITALY      Paolo LORENZI  Date of birth: 15 Dec 1981   
8      ITALY      Andreas SEPPI  Date of birth: 21 Feb 1984   
9      ITALY    Thomas FABBIANO  Date of birth: 26 May 1989   
10     ITALY     Simone BOLELLI  Date of birth: 08 Oct 1985   
11     ITALY            Captain          Corrado BARAZZUTTI   

                  Info 3                Info 4  
0   Singles ranking: 940      Doubles ranking:  
1   Singles ranking: 100      Doubles ranking:  
2       Singles ranking: 

Loaded!
{'Player': ['Dmitry POPKO'], 'Set 1': ['6'], 'Set 2': ['7'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Henri LAAKSONEN'], 'Set 1': ['2'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Mikhail KUKUSHKIN'], 'Set 1': ['3'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Adrian BODMER'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['2'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Timur KHABIBULIN\nAleksandr NEDOVYESOV'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Marc-Andrea HUESLER\nLuca MARGAROLI'], 'Set 1': ['4'], 'Set 2': ['4'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Error: list index out of range
Loaded!
      Team Name    

Loaded!
{'Player': ['Laslo DJERE'], 'Set 1': ['7'], 'Set 2': ['2'], 'Set 3': ['5'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Sam QUERREY'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['7'], 'Tie-Break 1': ['4'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Dusan LAJOVIC'], 'Set 1': ['4'], 'Set 2': ['7'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['John ISNER'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Nikola MILOJEVIC\nMiljan ZEKIC'], 'Set 1': ['7'], 'Set 2': ['2'], 'Set 3': ['5'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Ryan HARRISON\nSteve JOHNSON'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['7'], 'Tie-Break 1': ['3'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Error: list index out of range
Loaded!
   Team Name            Info 1                    

Loaded!
   Team Name                 Info 1                      Info 2  \
0      ITALY          Fabio FOGNINI  Date of birth: 24 May 1987   
1      ITALY          Paolo LORENZI  Date of birth: 15 Dec 1981   
2      ITALY          Andreas SEPPI  Date of birth: 21 Feb 1984   
3      ITALY      Matteo BERRETTINI  Date of birth: 12 Apr 1996   
4      ITALY         Simone BOLELLI  Date of birth: 08 Oct 1985   
5      ITALY                Captain          Corrado BARAZZUTTI   
6     FRANCE          Lucas POUILLE  Date of birth: 23 Feb 1994   
7     FRANCE       Adrian MANNARINO  Date of birth: 29 Jun 1988   
8     FRANCE  Pierre-Hugues HERBERT  Date of birth: 18 Mar 1991   
9     FRANCE          Jeremy CHARDY  Date of birth: 12 Feb 1987   
10    FRANCE          Nicolas MAHUT  Date of birth: 21 Jan 1982   
11    FRANCE                Captain                Yannick NOAH   

                  Info 3                Info 4  
0   Singles ranking: 134  Doubles ranking: 124  
1       Singles rankin

{'Player': ['Marin CILIC'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Mikhail KUKUSHKIN'], 'Set 1': ['1'], 'Set 2': ['1'], 'Set 3': ['1'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Combined DataFrame:
                    Date                               Venue  \
0   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
1   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
2   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
3   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
4   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
5   06 Apr - 08 Apr 2018   Varazdin Arena, V

{'Player': ['Albert RAMOS-VINOLAS'], 'Set 1': ['1'], 'Set 2': ['6'], 'Set 3': ['14'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Nicolas MAHUT'], 'Set 1': ['7'], 'Set 2': ['3'], 'Set 3': ['11'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Marcel GRANOLLERS'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['13'], 'Tie-Break 1': ['2'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Combined DataFrame:
                    Date                                Venue  \
0   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
1   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
2   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
3   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
4   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
5   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
6   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
7   14 

Loaded!
   Team Name          Info 1                      Info 2  \
0    CROATIA     Marin CILIC  Date of birth: 28 Sep 1988   
1    CROATIA     Borna CORIC  Date of birth: 14 Nov 1996   
2    CROATIA   Franko SKUGOR  Date of birth: 20 Sep 1987   
3    CROATIA      Mate PAVIC  Date of birth: 04 Jul 1993   
4    CROATIA      Ivan DODIG  Date of birth: 02 Jan 1985   
5    CROATIA         Captain               Zeljko KRAJAN   
6        USA   Steve JOHNSON  Date of birth: 24 Dec 1989   
7        USA  Frances TIAFOE  Date of birth: 20 Jan 1998   
8        USA   Ryan HARRISON  Date of birth: 07 May 1992   
9        USA     Sam QUERREY  Date of birth: 07 Oct 1987   
10       USA      Mike BRYAN  Date of birth: 29 Apr 1978   
11       USA         Captain                 Jim COURIER   

                  Info 3                Info 4  
0   Singles ranking: 602      Doubles ranking:  
1    Singles ranking: 33      Doubles ranking:  
2       Singles ranking:  Doubles ranking: 162  
3       Singles

In [54]:


def to_df(links):
    # Initialize ChromeOptions with headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Initialize Selenium WebDriver with the provided executable path and headless option
    driver = webdriver.Chrome(executable_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe", options=chrome_options)

    try:
        # Create empty lists to store the results
        match_results = []
        player_results = []

        for link in links:
            # Apply the functions to extract match and player data
            matches_df = extract_matches_df(link)
            players_df = extract_players_df(link)

            if matches_df is not None:
                match_results.append(matches_df)
            if players_df is not None:
                player_results.append(players_df)

        # Close the Selenium WebDriver
        driver.quit()

        # Create DataFrames from the lists
        matches_df = pd.concat(match_results, ignore_index=True)
        players_df = pd.concat(player_results, ignore_index=True)

        return matches_df, players_df

    except Exception as e:
        print("Error:", str(e))
        driver.quit()

    return None, None  # Return None if there was an error




In [55]:
# Usage example:
# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/historic-format/world-group.aspx"

# Navigate to the webpage
driver = webdriver.Chrome(executable_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe")
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all links with class "tie-link" within the tables
    tie_links = driver.find_elements(By.CSS_SELECTOR, "table.tie.ng-scope a.tie-link")

    # Initialize a list to store the extracted links
    links = []

    # Extract the links
    for link in tie_links:
        href = link.get_attribute("href")
        links.append(href)

    # Call the function to scrape and transform the data
    matches_df, players_df = to_df(links)

finally:
    driver.quit()

  driver = webdriver.Chrome(executable_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe")
  driver = webdriver.Chrome(executable_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe", options=chrome_options)


Loaded!
{'Player': ['Adrian MANNARINO'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['3'], 'Tie-Break 1': ['4'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Thiemo DE BAKKER'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Richard GASQUET'], 'Set 1': ['6'], 'Set 2': ['7'], 'Set 3': ['3'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Robin HAASE'], 'Set 1': ['4'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Pierre-Hugues HERBERT\nNicolas MAHUT'], 'Set 1': ['7'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': ['8'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Robin HAASE\nJean-Julien ROJER'], 'Set 1': ['6'], 'Set 2': ['3'], 'Set 3': ['7'], 'Tie-Break 1': ['6'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Adrian MANNARINO'], 'Set 1': ['4'], 'Set 2': ['7'], 'Se

Loaded!
   Team Name             Info 1                      Info 2  \
0      JAPAN      Yuichi SUGITA  Date of birth: 18 Sep 1988   
1      JAPAN        Taro DANIEL  Date of birth: 27 Jan 1993   
2      JAPAN           Go SOEDA  Date of birth: 05 Sep 1984   
3      JAPAN  Yasutaka UCHIYAMA  Date of birth: 05 Aug 1992   
4      JAPAN      Ben MCLACHLAN  Date of birth: 10 May 1992   
5      JAPAN            Captain            Satoshi IWABUCHI   
6      ITALY      Fabio FOGNINI  Date of birth: 24 May 1987   
7      ITALY      Paolo LORENZI  Date of birth: 15 Dec 1981   
8      ITALY      Andreas SEPPI  Date of birth: 21 Feb 1984   
9      ITALY    Thomas FABBIANO  Date of birth: 26 May 1989   
10     ITALY     Simone BOLELLI  Date of birth: 08 Oct 1985   
11     ITALY            Captain          Corrado BARAZZUTTI   

                  Info 3                Info 4  
0   Singles ranking: 940      Doubles ranking:  
1   Singles ranking: 100      Doubles ranking:  
2       Singles ranking: 

{'Player': ['Tim PUETZ\nJan-Lennard STRUFF'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Nick KYRGIOS'], 'Set 1': ['2'], 'Set 2': ['6'], 'Set 3': ['2'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Alexander ZVEREV'], 'Set 1': ['6'], 'Set 2': ['7'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Combined DataFrame:
                    Date                                   Venue  \
0   02 Feb - 04 Feb 2018   Pat Rafter Arena, Brisbane, Australia   
1   02 Feb - 04 Feb 2018   Pat Rafter Arena, Brisbane, Australia   
2   02 Feb - 04 Feb 2018   Pat Rafter Arena, Brisbane, Austra

Loaded!
   Team Name            Info 1                      Info 2  \
0    CROATIA       Marin CILIC  Date of birth: 28 Sep 1988   
1    CROATIA       Borna CORIC  Date of birth: 14 Nov 1996   
2    CROATIA    Viktor GALOVIC  Date of birth: 19 Sep 1990   
3    CROATIA     Franko SKUGOR  Date of birth: 20 Sep 1987   
4    CROATIA        Ivan DODIG  Date of birth: 02 Jan 1985   
5    CROATIA           Captain               Zeljko KRAJAN   
6     CANADA  Denis SHAPOVALOV  Date of birth: 15 Apr 1999   
7     CANADA    Vasek POSPISIL  Date of birth: 23 Jun 1990   
8     CANADA    Peter POLANSKY  Date of birth: 15 Jun 1988   
9     CANADA    Frank DANCEVIC  Date of birth: 26 Sep 1984   
10    CANADA     Daniel NESTOR  Date of birth: 04 Sep 1972   
11    CANADA           Captain              Frank DANCEVIC   

                  Info 3                 Info 4  
0   Singles ranking: 602       Doubles ranking:  
1    Singles ranking: 33       Doubles ranking:  
2       Singles ranking:       Doub

Loaded!
   Team Name                 Info 1                      Info 2  \
0      ITALY          Fabio FOGNINI  Date of birth: 24 May 1987   
1      ITALY          Paolo LORENZI  Date of birth: 15 Dec 1981   
2      ITALY          Andreas SEPPI  Date of birth: 21 Feb 1984   
3      ITALY      Matteo BERRETTINI  Date of birth: 12 Apr 1996   
4      ITALY         Simone BOLELLI  Date of birth: 08 Oct 1985   
5      ITALY                Captain          Corrado BARAZZUTTI   
6     FRANCE          Lucas POUILLE  Date of birth: 23 Feb 1994   
7     FRANCE       Adrian MANNARINO  Date of birth: 29 Jun 1988   
8     FRANCE  Pierre-Hugues HERBERT  Date of birth: 18 Mar 1991   
9     FRANCE          Jeremy CHARDY  Date of birth: 12 Feb 1987   
10    FRANCE          Nicolas MAHUT  Date of birth: 21 Jan 1982   
11    FRANCE                Captain                Yannick NOAH   

                  Info 3                Info 4  
0   Singles ranking: 134  Doubles ranking: 124  
1       Singles rankin

{'Player': ['Marin CILIC'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['6'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Mikhail KUKUSHKIN'], 'Set 1': ['1'], 'Set 2': ['1'], 'Set 3': ['1'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Skipping match 5
{'Player': [], 'Set 1': [], 'Set 2': [], 'Set 3': [], 'Tie-Break 1': [], 'Tie-Break 2': [], 'Tie-Break 3': []}
Combined DataFrame:
                    Date                               Venue  \
0   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
1   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
2   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
3   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
4   06 Apr - 08 Apr 2018   Varazdin Arena, Varazdin, Croatia   
5   06 Apr - 08 Apr 2018   Varazdin Arena, V

{'Player': ['Albert RAMOS-VINOLAS'], 'Set 1': ['1'], 'Set 2': ['6'], 'Set 3': ['14'], 'Tie-Break 1': [None], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Nicolas MAHUT'], 'Set 1': ['7'], 'Set 2': ['3'], 'Set 3': ['11'], 'Tie-Break 1': ['7'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
{'Player': ['Marcel GRANOLLERS'], 'Set 1': ['6'], 'Set 2': ['6'], 'Set 3': ['13'], 'Tie-Break 1': ['2'], 'Tie-Break 2': [None], 'Tie-Break 3': [None]}
Combined DataFrame:
                    Date                                Venue  \
0   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
1   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
2   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
3   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
4   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
5   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
6   14 Sep - 16 Sep 2018   Stade Pierre Mauroy, Lille, France   
7   14 

Loaded!
   Team Name          Info 1                      Info 2  \
0    CROATIA     Marin CILIC  Date of birth: 28 Sep 1988   
1    CROATIA     Borna CORIC  Date of birth: 14 Nov 1996   
2    CROATIA   Franko SKUGOR  Date of birth: 20 Sep 1987   
3    CROATIA      Mate PAVIC  Date of birth: 04 Jul 1993   
4    CROATIA      Ivan DODIG  Date of birth: 02 Jan 1985   
5    CROATIA         Captain               Zeljko KRAJAN   
6        USA   Steve JOHNSON  Date of birth: 24 Dec 1989   
7        USA  Frances TIAFOE  Date of birth: 20 Jan 1998   
8        USA   Ryan HARRISON  Date of birth: 07 May 1992   
9        USA     Sam QUERREY  Date of birth: 07 Oct 1987   
10       USA      Mike BRYAN  Date of birth: 29 Apr 1978   
11       USA         Captain                 Jim COURIER   

                  Info 3                Info 4  
0   Singles ranking: 602      Doubles ranking:  
1    Singles ranking: 33      Doubles ranking:  
2       Singles ranking:  Doubles ranking: 162  
3       Singles

In [57]:
def clean_df_players(players_df):
    # Remove rows where Info 1 is equal to "captain"
    players_df = players_df[players_df["Info 1"] != "Captain"]
    
    # Rename the columns
    players_df = players_df.rename(columns={"Info 1": "Player", "Info 2": "DOB", "Info 3": "Single Ranking", "Info 4": "Doubles Ranking"})
    
    # Remove text before ":" in the specified columns
    players_df["DOB"] = players_df["DOB"].str.split(":", expand=True)[1].str.strip()
    players_df["Single Ranking"] = players_df["Single Ranking"].str.split(":", expand=True)[1].str.strip()
    players_df["Doubles Ranking"] = players_df["Doubles Ranking"].str.split(":", expand=True)[1].str.strip()
    
    # Keep the first letter in each word in uppercase for the "Player" column in players_df
    players_df['Player'] = players_df['Player'].str.title()
    
    return players_df



In [58]:
# Usage example:
# Call the clean_df_players function with your players_df DataFrame
cleaned_players_df = clean_df_players(players_df)
