In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage containing the table
url = "https://www.ultimatetennisstatistics.com/tournamentEvent?tournamentEventId=4626&tab=results"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table by its ID ("resultsTable")
    table = soup.find("table", {"id": "resultsTable"})

    if table:
        # One list per player row; converted to a DataFrame after the loop
        data = []

        # Player profile links, kept strictly parallel to `data` (one entry per row,
        # None when the row has no link) so the column assignment below cannot fail
        # on a length mismatch
        href_links = []

        # Counts player rows only; rows without a "player" cell do not advance it
        row_index = 0

        for row in table.find_all("tr"):
            player_cell = row.find("td", {"class": "player"})
            if not player_cell:
                continue

            player_name = player_cell.text.strip()

            # Nationality is the title attribute of the <img> inside the player cell;
            # use None instead of crashing when the image or its title is missing
            flag_img = player_cell.find("img")
            nationality = flag_img.get("title") if flag_img else None

            # Rebuild `scores` for every row. It is an empty list when the row has no
            # score cells, so a row can never reuse the previous row's scores (or hit
            # an undefined name on the first row).
            scores = [cell.text.strip() for cell in row.find_all("td", {"class": "score"})]

            # Per-set tiebreak flag (0/1) and the integer found inside the parentheses
            # of the score text, for the first three sets only
            tiebreak_flags = [0, 0, 0]
            tiebreak_values = [None, None, None]
            for set_idx in range(min(len(scores), 3)):
                raw_score = scores[set_idx]
                if "(" in raw_score:
                    tiebreak_flags[set_idx] = 1
                    tiebreak_values[set_idx] = int(raw_score.split("(")[1].split(")")[0])
                    # Keep only the part before the parentheses as the set score
                    scores[set_idx] = raw_score.split("(")[0].strip()

            # Pad to exactly three set scores (None for sets that were not played)
            set_scores = [scores[i] if i < len(scores) else None for i in range(3)]

            # Determine the 'stage' based on the player-row index.
            # NOTE(review): this mapping is purely positional and hard-coded; it
            # presumably matches the layout of this specific page — confirm before
            # reusing it for another tournament.
            row_index += 1
            if row_index in [3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59, 60]:
                stage = "quarterfinal"
            elif row_index in [5, 6, 13, 14, 21, 22, 29, 30]:
                stage = "semifinal"
            elif row_index in [7, 8, 15, 16]:
                stage = "final"
            else:
                stage = "roundrobin"

            # First anchor in the row is taken as the player's profile link
            player_link = row.find("a")
            href = player_link.get("href") if player_link else None
            href_links.append(href)

            # Column order must match the `columns` list passed to pd.DataFrame below.
            # The "Games_SetN" columns hold the number from the parentheses, not a
            # games count.
            data.append([player_name, nationality, *set_scores, *tiebreak_flags, *tiebreak_values, stage])

        # Create a Pandas DataFrame from the data
        df = pd.DataFrame(data, columns=["Player", "Nationality", "Set1", "Set2", "Set3", "Tiebreak_Set1", "Tiebreak_Set2", "Tiebreak_Set3", "Games_Set1", "Games_Set2", "Games_Set3", "Stage"])

        # Add the href links as a new column in the DataFrame
        df["Player_Href"] = href_links

        # Display the DataFrame
        print(df)

    else:
        print("Table not found on the webpage.")
else:
    print("Failed to fetch the webpage. Status code:", response.status_code)


                   Player Nationality Set1 Set2  Set3  Tiebreak_Set1  \
0                 B. Gojo         CRO    6    3     6              0   
1                 B. Gojo         CRO    6    3     6              0   
2                 S. Baez         ARG    1    6     3              0   
3             J. Thompson         AUS    4    7     6              0   
4           T. Griekspoor         NED    6    5     3              0   
..                    ...         ...  ...  ...   ...            ...   
167         T. Griekspoor         NED    7    7  None              0   
168               T. Paul         USA    5    6  None              0   
169  B. Van De Zandschulp         NED    6    7     4              0   
170  B. Van De Zandschulp         NED    6    7  None              0   
171           T. H. Fritz         USA    4    6  None              0   

     Tiebreak_Set2  Tiebreak_Set3  Games_Set1  Games_Set2  Games_Set3  \
0                0              0         NaN         NaN     

In [13]:
# Show `df` as this cell's output (a bare last expression is rendered by the notebook).
df

Unnamed: 0,Player,Nationality,Set1,Set2,Set3,Tiebreak_Set1,Tiebreak_Set2,Tiebreak_Set3,Games_Set1,Games_Set2,Games_Set3,Stage,Player_Href
0,B. Gojo,CRO,6,3,6,0,0,0,,,,roundrobin,/playerProfile?playerId=27140
1,B. Gojo,CRO,6,3,6,0,0,0,,,,roundrobin,/playerProfile?playerId=27140
2,S. Baez,ARG,1,6,3,0,0,0,,,,quarterfinal,/playerProfile?playerId=46888
3,J. Thompson,AUS,4,7,6,0,0,0,,,,quarterfinal,/playerProfile?playerId=11415
4,T. Griekspoor,NED,6,5,3,0,0,0,,,,semifinal,/playerProfile?playerId=34651
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,T. Griekspoor,NED,7,7,,0,1,0,,7.0,,roundrobin,/playerProfile?playerId=34651
168,T. Paul,USA,5,6,,0,1,0,,3.0,,roundrobin,/playerProfile?playerId=26008
169,B. Van De Zandschulp,NED,6,7,4,0,1,0,,7.0,,roundrobin,/playerProfile?playerId=22139
170,B. Van De Zandschulp,NED,6,7,,0,1,0,,7.0,,roundrobin,/playerProfile?playerId=22139


In [37]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Function to retrieve th and td elements from the table using Selenium
def retrieve_table_data_selenium(player_url, chromedriver_path="C:/Users/ALESSANDRO/Downloads/chromedriver.exe", timeout=10):
    """Load a player's profile tab in headless Chrome and return its table as a dict.

    Args:
        player_url: Player page URL; "&tab=profile" is appended before loading.
        chromedriver_path: Path to the ChromeDriver executable. The default is the
            original hard-coded local path; pass your own path on other machines.
        timeout: Seconds to wait for the profile table to appear in the DOM.

    Returns:
        A dict mapping each <th> text to the <td> text at the same position
        (paired in document order), or None if loading the page or reading the
        table failed. Failures are reported with a printed message, not raised.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode (no GUI)

    # webdriver.Chrome starts the driver service it is given, so the service is
    # not started by hand here; driver.quit() later shuts it down again.
    chrome_service = ChromeService(chromedriver_path)
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    # Append "&tab=profile" to the player's URL
    player_url += "&tab=profile"

    try:
        # Navigation is inside the try so the browser is always closed by the
        # `finally` below, even when the page fails to load.
        driver.get(player_url)

        # Wait until the table is present in the DOM (presence, not visibility);
        # until() returns the located element.
        table_selector = "table.table.table-condensed.text-nowrap"
        table = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, table_selector))
        )

        # Collect header and value cell texts in document order
        th_values = [th.text.strip() for th in table.find_elements(By.TAG_NAME, "th")]
        td_values = [td.text.strip() for td in table.find_elements(By.TAG_NAME, "td")]

        # Pair each th with the td at the same position; zip stops at the shorter
        # list and a repeated header keeps its last value.
        return dict(zip(th_values, td_values))
    except Exception as e:
        print(f"Error retrieving table data for URL {player_url}: {str(e)}")
        return None
    finally:
        # Close the Selenium WebDriver
        driver.quit()

# URL of the webpage containing the table
url = "https://www.ultimatetennisstatistics.com/tournamentEvent?tournamentEventId=4626&tab=results"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table by its ID ("resultsTable")
    table = soup.find("table", {"id": "resultsTable"})

    if table:
        # One dict per player row (profile fields merged with match fields)
        player_data_list = []

        # Successfully retrieved profiles keyed by href. The same player appears in
        # many rows, so this avoids launching a new browser session for every row.
        # Failed lookups are not cached and are retried on the player's next row.
        profile_cache = {}

        # Counts player rows only; rows without a "player" cell do not advance it
        row_index = 0

        for row in table.find_all("tr"):
            player_cell = row.find("td", {"class": "player"})
            if not player_cell:
                continue

            player_name = player_cell.text.strip()

            # Nationality is the title attribute of the <img> inside the player cell;
            # use None instead of crashing when the image or its title is missing
            flag_img = player_cell.find("img")
            nationality = flag_img.get("title") if flag_img else None

            # Rebuild `scores` for every row. It is an empty list when the row has no
            # score cells, so a row can never reuse the previous row's scores (or hit
            # an undefined name on the first row).
            scores = [cell.text.strip() for cell in row.find_all("td", {"class": "score"})]

            # Per-set tiebreak flag (0/1) and the integer found inside the parentheses
            # of the score text, for the first three sets only
            tiebreak_flags = [0, 0, 0]
            tiebreak_values = [None, None, None]
            for set_idx in range(min(len(scores), 3)):
                raw_score = scores[set_idx]
                if "(" in raw_score:
                    tiebreak_flags[set_idx] = 1
                    tiebreak_values[set_idx] = int(raw_score.split("(")[1].split(")")[0])
                    # Keep only the part before the parentheses as the set score
                    scores[set_idx] = raw_score.split("(")[0].strip()

            # Determine the 'stage' based on the player-row index.
            # NOTE(review): this mapping is purely positional and hard-coded; it
            # presumably matches the layout of this specific page — confirm before
            # reusing it for another tournament.
            row_index += 1
            if row_index in [3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59, 60]:
                stage = "quarterfinal"
            elif row_index in [5, 6, 13, 14, 21, 22, 29, 30]:
                stage = "semifinal"
            elif row_index in [7, 8, 15, 16]:
                stage = "final"
            else:
                stage = "roundrobin"

            # First anchor in the row is taken as the player's profile link; rows
            # without a usable href are skipped, as they cannot be enriched
            player_link = row.find("a")
            href = player_link.get("href") if player_link else None
            if not href:
                continue

            # Retrieve additional table data for the player (once per href on success)
            if href not in profile_cache:
                fetched_profile = retrieve_table_data_selenium("https://www.ultimatetennisstatistics.com" + href)
                if fetched_profile is not None:
                    profile_cache[href] = fetched_profile

            # Work on a copy so row-specific fields never leak into the cached profile
            cached_profile = profile_cache.get(href)
            player_data = dict(cached_profile) if cached_profile is not None else None

            # Rows whose profile could not be retrieved are left out of the result
            if player_data is not None:
                player_data.update({
                    "Player": player_name,
                    "Nationality": nationality,
                    "Set1": scores[0] if len(scores) > 0 else None,
                    "Set2": scores[1] if len(scores) > 1 else None,
                    "Set3": scores[2] if len(scores) > 2 else None,
                    "Tiebreak_Set1": tiebreak_flags[0],
                    "Tiebreak_Set2": tiebreak_flags[1],
                    "Tiebreak_Set3": tiebreak_flags[2],
                    # Despite the name, these hold the number from the parentheses
                    "Games_Set1": tiebreak_values[0],
                    "Games_Set2": tiebreak_values[1],
                    "Games_Set3": tiebreak_values[2],
                    "Stage": stage,
                })

                player_data_list.append(player_data)

        # Create a Pandas DataFrame from the list of player data
        df = pd.DataFrame(player_data_list)

        # Display the DataFrame
        print(df)

    else:
        print("Table not found on the webpage.")
else:
    print("Failed to fetch the webpage. Status code:", response.status_code)


Error retrieving table data for URL https://www.ultimatetennisstatistics.com/playerProfile?playerId=46888&tab=profile: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF711B37D12+55474]
	(No symbol) [0x00007FF711AA77C2]
	(No symbol) [0x00007FF71195E0EB]
	(No symbol) [0x00007FF71199EBAC]
	(No symbol) [0x00007FF71199ED2C]
	(No symbol) [0x00007FF7119D9F77]
	(No symbol) [0x00007FF7119BF19F]
	(No symbol) [0x00007FF7119D7EF2]
	(No symbol) [0x00007FF7119BEF33]
	(No symbol) [0x00007FF711993D41]
	(No symbol) [0x00007FF711994F84]
	GetHandleVerifier [0x00007FF711E9B762+3609346]
	GetHandleVerifier [0x00007FF711EF1A80+3962400]
	GetHandleVerifier [0x00007FF711EE9F0F+3930799]
	GetHandleVerifier [0x00007FF711BD3CA6+694342]
	(No symbol) [0x00007FF711AB2218]
	(No symbol) [0x00007FF711AAE484]
	(No symbol) [0x00007FF711AAE5B2]
	(No symbol) [0x00007FF711A9EE13]
	BaseThreadInitThunk [0x00007FFB0682257D+29]
	RtlUserThreadStart [0x00007FFB0746AA68+40]

Error retrieving table data for URL https://www.ultimate

KeyboardInterrupt: 

In [38]:
# Show `player_data` as this cell's output.
# NOTE(review): nothing in this cell assigns it; presumably this is the last dict
# left in the kernel by the interrupted scraping loop of the previous cell — confirm.
player_data

{'Age': '26 (14-11-1996)',
 'Country': 'Croatia',
 'Height': '188 cm',
 'Plays': 'Right-handed',
 'Backhand': 'Two-handed',
 'Favorite Surface': 'Slow (H, Cl) 16%',
 'Turned Pro': '2013',
 'Seasons': '11',
 'Active': 'Yes',
 'Prize Money': '$8,509,168',
 'Wikipedia': 'Wikipedia',
 'Player': 'B. Coric',
 'Nationality': 'CRO',
 'Set1': '6',
 'Set2': '7',
 'Set3': None,
 'Tiebreak_Set1': 0,
 'Tiebreak_Set2': 1,
 'Tiebreak_Set3': 0,
 'Games_Set1': None,
 'Games_Set2': 8,
 'Games_Set3': None,
 'Stage': 'quarterfinal'}